chore: Azure AI search - clarify that dataframe is not supported (#1407)
* Azure AI search - clarify dataframe is not supported

* add Azure AI Search to README

* fix

* README fix
anakin87 authored Feb 17, 2025
1 parent 5d8a4d5 commit 95b3542
Showing 3 changed files with 29 additions and 35 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -30,6 +30,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the details
| [amazon-sagemaker-haystack](integrations/amazon_sagemaker/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-sagemaker-haystack.svg)](https://pypi.org/project/amazon-sagemaker-haystack) | [![Test / amazon_sagemaker](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_sagemaker.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_sagemaker.yml) |
| [anthropic-haystack](integrations/anthropic/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/anthropic-haystack.svg)](https://pypi.org/project/anthropic-haystack) | [![Test / anthropic](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/anthropic.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/anthropic.yml) |
| [astra-haystack](integrations/astra/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/astra-haystack.svg)](https://pypi.org/project/astra-haystack) | [![Test / astra](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml) |
| [azure-ai-search-haystack](integrations/azure_ai_search/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/azure-ai-search-haystack.svg)](https://pypi.org/project/azure-ai-search-haystack) | [![Test / azure-ai-search](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_ai_search.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_ai_search.yml) |
| [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) |
| [cohere-haystack](integrations/cohere/) | Embedder, Generator, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) |
| [deepeval-haystack](integrations/deepeval/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/deepeval-haystack.svg)](https://pypi.org/project/deepeval-haystack) | [![Test / deepeval](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml) |
30 changes: 16 additions & 14 deletions integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/document_store.py
@@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0
import logging
import os
from dataclasses import asdict
from datetime import datetime
from typing import Any, Dict, List, Optional

@@ -246,15 +245,6 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
:return: the number of documents added to index.
"""

def _convert_input_document(documents: Document):
document_dict = asdict(documents)
if not isinstance(document_dict["id"], str):
msg = f"Document id {document_dict['id']} is not a string, "
raise TypeError(msg)
index_document = self._convert_haystack_documents_to_azure(document_dict)

return index_document

if len(documents) > 0:
if not isinstance(documents[0], Document):
msg = "param 'documents' must contain a list of objects of type Document"
@@ -266,7 +256,7 @@ def _convert_input_document(documents: Document):
f"but got {policy}. Overwriting duplicates is enabled by default."
)
client = self.client
documents_to_write = [(_convert_input_document(doc)) for doc in documents]
documents_to_write = [self._convert_haystack_document_to_azure(doc) for doc in documents]

if documents_to_write != []:
client.upload_documents(documents_to_write)
@@ -370,11 +360,23 @@ def _get_raw_documents_by_id(self, document_ids: List[str]):
logger.warning(f"Document with ID {doc_id} not found.")
return azure_documents

def _convert_haystack_documents_to_azure(self, document: Dict[str, Any]) -> Dict[str, Any]:
"""Map the document keys to fields of search index"""
def _convert_haystack_document_to_azure(self, document: Document) -> Dict[str, Any]:
"""Convert a Haystack Document to an Azure Search document"""

doc_dict = document.to_dict(flatten=False)

if "dataframe" in doc_dict:
dataframe = doc_dict.pop("dataframe")
if dataframe:
logger.warning(
"Document %s has the `dataframe` field set. "
"AzureAISearchDocumentStore does not support dataframes and this field will be ignored. "
"The `dataframe` field will soon be removed from Haystack Document.",
doc_dict["id"],
)

# Because Azure Search does not allow dynamic fields, we only include fields that are part of the schema
index_document = {k: v for k, v in {**document, **document.get("meta", {})}.items() if k in self._index_fields}
index_document = {k: v for k, v in {**doc_dict, **doc_dict.get("meta", {})}.items() if k in self._index_fields}
if index_document["embedding"] is None:
index_document["embedding"] = self._dummy_vector

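For context on what this change means in practice, here is a minimal usage sketch (not part of the commit). It assumes `azure-ai-search-haystack` is installed, an Azure AI Search service is reachable, and the store picks up its endpoint and API key from the environment; the index name is hypothetical. Writing a legacy Document that still carries a `dataframe` now logs a warning, and the document is indexed without that field.

```python
from haystack import Document
from pandas import DataFrame

from haystack_integrations.document_stores.azure_ai_search import AzureAISearchDocumentStore

# Hypothetical index name; the endpoint and API key are assumed to come from environment
# variables (e.g. AZURE_AI_SEARCH_ENDPOINT / AZURE_AI_SEARCH_API_KEY).
document_store = AzureAISearchDocumentStore(index_name="haystack-docs")

doc = Document(id="1", content="test")
doc.dataframe = DataFrame({"a": [1, 2, 3]})  # legacy field, slated for removal from haystack.Document

# Logs a warning that `dataframe` is unsupported and will be ignored; the document is still written.
document_store.write_documents([doc])

retrieved = document_store.get_documents_by_id(["1"])
print(retrieved[0].content)  # "test"; the dataframe is not stored in the index
```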
33 changes: 12 additions & 21 deletions integrations/azure_ai_search/tests/test_document_store.py
@@ -14,10 +14,10 @@
CountDocumentsTest,
DeleteDocumentsTest,
FilterDocumentsTest,
FilterDocumentsTestWithDataframe,
WriteDocumentsTest,
)
from haystack.utils.auth import EnvVarSecret, Secret
from pandas import DataFrame

from haystack_integrations.document_stores.azure_ai_search import DEFAULT_VECTOR_SEARCH, AzureAISearchDocumentStore

@@ -130,6 +130,16 @@ def test_write_documents_with_meta(self, document_store: AzureAISearchDocumentStore):
doc = document_store.get_documents_by_id(["1"])
assert doc[0] == docs[0]

def test_write_documents_skips_dataframe(self, document_store: AzureAISearchDocumentStore):
doc = Document(id="1", content="test")
doc.dataframe = DataFrame({"a": [1, 2, 3]})

assert document_store.write_documents([doc]) == 1
retrieved_docs = document_store.get_documents_by_id(["1"])
assert retrieved_docs[0].id == "1"
assert retrieved_docs[0].content == "test"
assert not hasattr(retrieved_docs[0], "dataframe") or retrieved_docs[0].dataframe is None

@pytest.mark.skip(reason="Azure AI search index overwrites duplicate documents by default")
def test_write_documents_duplicate_fail(self, document_store: AzureAISearchDocumentStore): ...

@@ -156,10 +166,9 @@ def _random_embeddings(n):
],
indirect=True,
)
class TestFilters(FilterDocumentsTest, FilterDocumentsTestWithDataframe):
class TestFilters(FilterDocumentsTest):

# Overriding to change "date" to compatible ISO 8601 format
# and remove incompatible fields (dataframes) for Azure search index
@pytest.fixture
def filterable_docs(self) -> List[Document]:
"""Fixture that returns a list of Documents that can be used to test filtering."""
@@ -227,24 +236,6 @@ def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
sorted_expected = sorted(expected, key=lambda doc: doc.id)
assert sorted_recieved == sorted_expected

@pytest.mark.skip(reason="Azure AI search index does not support dataframes")
def test_comparison_equal_with_dataframe(self, document_store, filterable_docs): ...

@pytest.mark.skip(reason="Azure AI search index does not support dataframes")
def test_comparison_not_equal_with_dataframe(self, document_store, filterable_docs): ...

@pytest.mark.skip(reason="Azure AI search index does not support dataframes")
def test_comparison_greater_than_with_dataframe(self, document_store, filterable_docs): ...

@pytest.mark.skip(reason="Azure AI search index does not support dataframes")
def test_comparison_less_than_with_dataframe(self, document_store, filterable_docs): ...

@pytest.mark.skip(reason="Azure AI search index does not support dataframes")
def test_comparison_greater_than_equal_with_dataframe(self, document_store, filterable_docs): ...

@pytest.mark.skip(reason="Azure AI search index does not support dataframes")
def test_comparison_less_than_equal_with_dataframe(self, document_store, filterable_docs): ...

# Azure search index supports UTC datetime in ISO 8601 format
def test_comparison_greater_than_with_iso_date(self, document_store, filterable_docs):
"""Test filter_documents() with > comparator and datetime"""
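On the date handling mentioned in the last hunk: the overridden comparison tests use UTC datetimes in ISO 8601 format because that is what the Azure search index accepts. A short filtering sketch under that assumption, reusing the `document_store` from the example above (the meta field name is hypothetical):

```python
# Assumes documents were written with e.g. meta={"date": "2025-02-17T00:00:00Z"} (ISO 8601, UTC).
filters = {"field": "meta.date", "operator": ">", "value": "2025-01-01T00:00:00Z"}
matching_docs = document_store.filter_documents(filters=filters)
```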
