From 3afc3b9a1c20b688155649cf3e437d887b8bfbe2 Mon Sep 17 00:00:00 2001 From: Amna Mubashar Date: Thu, 31 Oct 2024 17:53:07 +0100 Subject: [PATCH] Updated code based on PR comments --- integrations/azure_ai_search/.gitignore | 163 ------------------ integrations/azure_ai_search/pyproject.toml | 3 +- .../azure_ai_search/embedding_retriever.py | 11 +- .../azure_ai_search/document_store.py | 31 ++-- .../tests/test_document_store.py | 48 +++--- .../tests/test_embedding_retriever.py | 2 - 6 files changed, 39 insertions(+), 219 deletions(-) delete mode 100644 integrations/azure_ai_search/.gitignore diff --git a/integrations/azure_ai_search/.gitignore b/integrations/azure_ai_search/.gitignore deleted file mode 100644 index d1c340c1f..000000000 --- a/integrations/azure_ai_search/.gitignore +++ /dev/null @@ -1,163 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ - -# VS Code -.vscode diff --git a/integrations/azure_ai_search/pyproject.toml b/integrations/azure_ai_search/pyproject.toml index c90ebfc5d..49ca623e7 100644 --- a/integrations/azure_ai_search/pyproject.toml +++ b/integrations/azure_ai_search/pyproject.toml @@ -15,14 +15,13 @@ classifiers = [ "License :: OSI Approved :: Apache Software License", "Development Status :: 4 - Beta", "Programming Language :: Python", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai", "azure-search-documents>=11.5", "azure-identity", "torch>=1.11.0"] +dependencies = ["haystack-ai", "azure-search-documents>=11.5", "azure-identity"] [project.urls] Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/azure_ai_search#readme" diff --git a/integrations/azure_ai_search/src/haystack_integrations/components/retrievers/azure_ai_search/embedding_retriever.py b/integrations/azure_ai_search/src/haystack_integrations/components/retrievers/azure_ai_search/embedding_retriever.py index fe23718c8..ab649f874 100644 --- a/integrations/azure_ai_search/src/haystack_integrations/components/retrievers/azure_ai_search/embedding_retriever.py +++ b/integrations/azure_ai_search/src/haystack_integrations/components/retrievers/azure_ai_search/embedding_retriever.py @@ -25,7 +25,6 @@ def __init__( filters: Optional[Dict[str, Any]] = None, top_k: int = 10, filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE, - raise_on_failure: bool = True, ): """ Create the AzureAISearchEmbeddingRetriever component. 
@@ -44,7 +43,6 @@ def __init__(
         self._filter_policy = (
             filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
         )
-        self._raise_on_failure = raise_on_failure
 
         if not isinstance(document_store, AzureAISearchDocumentStore):
             message = "document_store must be an instance of AzureAISearchDocumentStore"
@@ -113,13 +111,6 @@ def run(self, query_embedding: List[float], filters: Optional[Dict[str, Any]] =
                 top_k=top_k,
             )
-        except Exception as e:
-            if self._raise_on_failure:
-                raise e
-            logger.warning(
-                "An error occurred during embedding retrieval and will be ignored, returning empty results: %s",
-                str(e),
-                exc_info=True,
-            )
-            docs = []
+        except Exception:
+            raise
 
         return {"documents": docs}
 
diff --git a/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/document_store.py b/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/document_store.py
index 777efe20d..7b07c81a8 100644
--- a/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/document_store.py
+++ b/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/document_store.py
@@ -73,7 +73,6 @@ def __init__(
         embedding_dimension: int = 768,
         metadata_fields: Optional[Dict[str, type]] = None,
         vector_search_configuration: VectorSearch = None,
-        create_index: bool = True,
         **kwargs,
     ):
         """
@@ -117,7 +116,6 @@ def __init__(
         self._dummy_vector = [-10.0] * self._embedding_dimension
         self._metadata_fields = metadata_fields
         self._vector_search_configuration = vector_search_configuration or DEFAULT_VECTOR_SEARCH
-        self._create_index = create_index
         self._kwargs = kwargs
 
     @property
@@ -133,13 +131,13 @@ def client(self) -> SearchClient:
         try:
             if not self._index_client:
                 self._index_client = SearchIndexClient(resolved_endpoint, credential, **self._kwargs)
-                if not self.index_exists(self._index_name):
+                if not self._index_exists(self._index_name):
                     # Create a new index if it does not exist
                     logger.debug(
                         "The index '%s' does not exist. A new index will be created.",
                         self._index_name,
                     )
-                    self.create_index(self._index_name)
+                    self._create_index(self._index_name)
         except (HttpResponseError, ClientAuthenticationError) as error:
             msg = f"Failed to authenticate with Azure Search: {error}"
             raise AzureAISearchDocumentStoreConfigError(msg) from error
@@ -155,7 +153,7 @@ def client(self) -> SearchClient:
 
         return self._client
 
-    def create_index(self, index_name: str, **kwargs) -> None:
+    def _create_index(self, index_name: str, **kwargs) -> None:
         """
         Creates a new search index.
         :param index_name: Name of the index to create. If None, the index name from the constructor is used.
@@ -201,7 +199,6 @@ def to_dict(self) -> Dict[str, Any]:
             azure_endpoint=self._azure_endpoint.to_dict() if self._azure_endpoint is not None else None,
             api_key=self._api_key.to_dict() if self._api_key is not None else None,
             index_name=self._index_name,
-            create_index=self._create_index,
             embedding_dimension=self._embedding_dimension,
             metadata_fields=self._metadata_fields,
             vector_search_configuration=self._vector_search_configuration.as_dict(),
@@ -225,14 +222,13 @@ def from_dict(cls, data: Dict[str, Any]) -> "AzureAISearchDocumentStore":
         data["init_parameters"]["vector_search_configuration"] = VectorSearch.from_dict(vector_search_configuration)
         return default_from_dict(cls, data)
 
-    def count_documents(self, **kwargs: Any) -> int:
+    def count_documents(self) -> int:
         """
         Returns how many documents are present in the search index.
-        :param kwargs: additional keyword parameters.
-        :returns: list of retrieved documents.
+        :returns: the number of documents in the search index.
         """
-        return self.client.get_document_count(**kwargs)
+        return self.client.get_document_count()
 
     def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> int:
         """
@@ -292,7 +288,7 @@ def delete_documents(self, document_ids: List[str]) -> None:
 
     def get_documents_by_id(self, document_ids: List[str]) -> List[Document]:
         return self._convert_search_result_to_documents(self._get_raw_documents_by_id(document_ids))
 
-    def search_documents(self, search_text: Optional[str] = "*", top_k: Optional[int] = 10) -> List[Document]:
+    def search_documents(self, search_text: str = "*", top_k: int = 10) -> List[Document]:
         """
         Returns all documents that match the provided search_text.
-        If search_text is None, returns all documents.
+        If search_text is not provided, the default "*" matches all documents.
@@ -345,7 +341,7 @@ def _convert_search_result_to_documents(self, azure_docs: List[Dict[str, Any]])
             documents.append(doc)
         return documents
 
-    def index_exists(self, index_name: Optional[str]) -> bool:
+    def _index_exists(self, index_name: Optional[str]) -> bool:
         """
         Check if the index exists in the Azure AI Search service.
 
@@ -403,14 +399,19 @@ def _map_metadata_field_types(self, metadata: Dict[str, type]) -> Dict[str, str]
 
         for key, value_type in metadata.items():
 
-            # Azure Search index only allows field names starting with letters
-            field_name = next((key[i:] for i, char in enumerate(key) if char.isalpha()), key)
+            if not key or not key[0].isalpha():
+                msg = (
+                    f"Azure Search index only allows field names starting with letters. "
+                    f"Invalid key '{key}' will be dropped."
+                )
+                logger.warning(msg)
+                continue
 
             field_type = type_mapping.get(value_type)
             if not field_type:
-                error_message = f"Unsupported field type for key '{field_name}': {value_type}"
+                error_message = f"Unsupported field type for key '{key}': {value_type}"
                 raise ValueError(error_message)
-            metadata_field_mapping[field_name] = field_type
+            metadata_field_mapping[key] = field_type
 
         return metadata_field_mapping
 
diff --git a/integrations/azure_ai_search/tests/test_document_store.py b/integrations/azure_ai_search/tests/test_document_store.py
index 50733a8a4..8e1aaec80 100644
--- a/integrations/azure_ai_search/tests/test_document_store.py
+++ b/integrations/azure_ai_search/tests/test_document_store.py
@@ -35,7 +35,6 @@ def test_to_dict(monkeypatch):
             "index_name": "default",
             "embedding_dimension": 768,
             "metadata_fields": None,
-            "create_index": True,
             "vector_search_configuration": {
                 "profiles": [
                     {"name": "default-vector-config", "algorithm_configuration_name": "cosine-algorithm-config"}
@@ -65,7 +64,6 @@ def test_from_dict(monkeypatch):
             "embedding_dimension": 768,
             "index_name": "default",
             "metadata_fields": None,
-            "create_index": False,
             "vector_search_configuration": DEFAULT_VECTOR_SEARCH,
         },
     }
@@ -75,7 +73,6 @@ def test_from_dict(monkeypatch):
     assert document_store._index_name == "default"
     assert document_store._embedding_dimension == 768
     assert document_store._metadata_fields is None
-    assert document_store._create_index is False
     assert document_store._vector_search_configuration == DEFAULT_VECTOR_SEARCH
 
 
@@ -92,13 +89,11 @@ def test_init(_mock_azure_search_client):
         api_key=Secret.from_token("fake-api-key"),
         azure_endpoint=Secret.from_token("fake_endpoint"),
         index_name="my_index",
-        create_index=False,
         embedding_dimension=15,
         metadata_fields={"Title": str, "Pages": int},
     )
 
     assert document_store._index_name == "my_index"
-    assert document_store._create_index is False
     assert
document_store._embedding_dimension == 15
     assert document_store._metadata_fields == {"Title": str, "Pages": int}
     assert document_store._vector_search_configuration == DEFAULT_VECTOR_SEARCH
 
@@ -156,7 +151,8 @@ def _random_embeddings(n):
 )
 class TestFilters(FilterDocumentsTest):
 
-    # Overriding to change "date" to compatible ISO format and remove incompatible fields (dataframes) for search index
+    # Overriding to change "date" to a compatible ISO 8601 format
+    # and remove incompatible fields (dataframes) for the Azure AI Search index
     @pytest.fixture
     def filterable_docs(self) -> List[Document]:
         """Fixture that returns a list of Documents that can be used to test filtering."""
@@ -224,26 +220,25 @@ def assert_documents_are_equal(self, received: List[Document], expected: List[Do
         sorted_expected = sorted(expected, key=lambda doc: doc.id)
         assert sorted_recieved == sorted_expected
 
-    # Dataframes are not supported in serach index
-    def test_comparison_equal_with_dataframe(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support dataframes")
+    def test_comparison_equal_with_dataframe(self, document_store, filterable_docs): ...
 
-    def test_comparison_not_equal_with_dataframe(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support dataframes")
+    def test_comparison_not_equal_with_dataframe(self, document_store, filterable_docs): ...
 
-    def test_comparison_greater_than_with_dataframe(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support dataframes")
+    def test_comparison_greater_than_with_dataframe(self, document_store, filterable_docs): ...
 
-    def test_comparison_less_than_with_dataframe(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support dataframes")
+    def test_comparison_less_than_with_dataframe(self, document_store, filterable_docs): ...
 
-    def test_comparison_greater_than_equal_with_dataframe(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support dataframes")
+    def test_comparison_greater_than_equal_with_dataframe(self, document_store, filterable_docs): ...
 
-    def test_comparison_less_than_equal_with_dataframe(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support dataframes")
+    def test_comparison_less_than_equal_with_dataframe(self, document_store, filterable_docs): ...
 
-    # Azure search index supports UTC datetime in ISO format
+    # Azure AI Search index supports UTC datetime in ISO 8601 format
     def test_comparison_greater_than_with_iso_date(self, document_store, filterable_docs):
         """Test filter_documents() with > comparator and datetime"""
         document_store.write_documents(filterable_docs)
@@ -346,15 +341,14 @@ def test_comparison_in(self, document_store, filterable_docs):
         expected = [d for d in filterable_docs if d.meta.get("page") is not None and d.meta["page"] in ["100", "123"]]
         self.assert_documents_are_equal(result, expected)
 
-    # not supported
-    def test_comparison_not_in(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support the 'not in' operator")
+    def test_comparison_not_in(self, document_store, filterable_docs): ...
-    def test_comparison_not_in_with_with_non_list(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support the 'not in' operator")
+    def test_comparison_not_in_with_with_non_list(self, document_store, filterable_docs): ...
 
-    def test_comparison_not_in_with_with_non_list_iterable(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support the 'not in' operator")
+    def test_comparison_not_in_with_with_non_list_iterable(self, document_store, filterable_docs): ...
 
     def test_missing_condition_operator_key(self, document_store, filterable_docs):
         """Test filter_documents() with missing operator key"""
 
diff --git a/integrations/azure_ai_search/tests/test_embedding_retriever.py b/integrations/azure_ai_search/tests/test_embedding_retriever.py
index af4b21478..83db9c058 100644
--- a/integrations/azure_ai_search/tests/test_embedding_retriever.py
+++ b/integrations/azure_ai_search/tests/test_embedding_retriever.py
@@ -49,7 +49,6 @@ def test_to_dict():
             },
             "api_key": {"type": "env_var", "env_vars": ["AZURE_SEARCH_API_KEY"], "strict": False},
             "index_name": "default",
-            "create_index": True,
             "embedding_dimension": 768,
             "metadata_fields": None,
             "vector_search_configuration": {
@@ -88,7 +87,6 @@ def test_from_dict():
             },
             "api_key": {"type": "env_var", "env_vars": ["AZURE_SEARCH_API_KEY"], "strict": False},
             "index_name": "default",
-            "create_index": True,
             "embedding_dimension": 768,
             "metadata_fields": None,
             "vector_search_configuration": DEFAULT_VECTOR_SEARCH,
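
Note for reviewers: a minimal usage sketch of the public API after this patch, not part of the diff itself. The index name, document content, embeddings, and the "version" metadata field are illustrative; the import paths assume the classes are re-exported from the integration's package roots, and the store's api_key/azure_endpoint Secrets are assumed to resolve from their default environment variables (AZURE_SEARCH_API_KEY for the key).

# Sketch only: names below marked "illustrative" are not taken from this patch.
from haystack import Document

from haystack_integrations.components.retrievers.azure_ai_search import AzureAISearchEmbeddingRetriever
from haystack_integrations.document_stores.azure_ai_search import AzureAISearchDocumentStore

# The create_index init parameter is gone: the index is now created lazily on
# first client access if it does not already exist.
document_store = AzureAISearchDocumentStore(
    index_name="haystack-docs",  # illustrative
    embedding_dimension=768,
    metadata_fields={"version": str},  # field names must start with a letter
)
document_store.write_documents(
    [Document(content="Azure AI Search integration", embedding=[0.1] * 768, meta={"version": "1.0"})]
)
print(document_store.count_documents())  # no longer accepts **kwargs

# raise_on_failure is gone as well: retrieval errors now always propagate.
retriever = AzureAISearchEmbeddingRetriever(document_store=document_store, top_k=5)
result = retriever.run(query_embedding=[0.1] * 768)
print(result["documents"])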