From 3afc3b9a1c20b688155649cf3e437d887b8bfbe2 Mon Sep 17 00:00:00 2001 From: Amna Mubashar Date: Thu, 31 Oct 2024 17:53:07 +0100 Subject: [PATCH] Updated code based on PR comments --- integrations/azure_ai_search/.gitignore | 163 ------------------ integrations/azure_ai_search/pyproject.toml | 3 +- .../azure_ai_search/embedding_retriever.py | 11 +- .../azure_ai_search/document_store.py | 31 ++-- .../tests/test_document_store.py | 48 +++--- .../tests/test_embedding_retriever.py | 2 - 6 files changed, 39 insertions(+), 219 deletions(-) delete mode 100644 integrations/azure_ai_search/.gitignore diff --git a/integrations/azure_ai_search/.gitignore b/integrations/azure_ai_search/.gitignore deleted file mode 100644 index d1c340c1f..000000000 --- a/integrations/azure_ai_search/.gitignore +++ /dev/null @@ -1,163 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ - -# VS Code -.vscode diff --git a/integrations/azure_ai_search/pyproject.toml b/integrations/azure_ai_search/pyproject.toml index c90ebfc5d..49ca623e7 100644 --- a/integrations/azure_ai_search/pyproject.toml +++ b/integrations/azure_ai_search/pyproject.toml @@ -15,14 +15,13 @@ classifiers = [ "License :: OSI Approved :: Apache Software License", "Development Status :: 4 - Beta", "Programming Language :: Python", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai", "azure-search-documents>=11.5", "azure-identity", "torch>=1.11.0"] +dependencies = ["haystack-ai", "azure-search-documents>=11.5", "azure-identity"] [project.urls] Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/azure_ai_search#readme" diff --git a/integrations/azure_ai_search/src/haystack_integrations/components/retrievers/azure_ai_search/embedding_retriever.py b/integrations/azure_ai_search/src/haystack_integrations/components/retrievers/azure_ai_search/embedding_retriever.py index fe23718c8..ab649f874 100644 --- a/integrations/azure_ai_search/src/haystack_integrations/components/retrievers/azure_ai_search/embedding_retriever.py +++ b/integrations/azure_ai_search/src/haystack_integrations/components/retrievers/azure_ai_search/embedding_retriever.py @@ -25,7 +25,6 @@ def __init__( filters: Optional[Dict[str, Any]] = None, top_k: int = 10, filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE, - raise_on_failure: bool = True, ): """ Create the AzureAISearchEmbeddingRetriever component. 
@@ -44,7 +43,6 @@ def __init__(
         self._filter_policy = (
             filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
         )
-        self._raise_on_failure = raise_on_failure
 
         if not isinstance(document_store, AzureAISearchDocumentStore):
             message = "document_store must be an instance of AzureAISearchDocumentStore"
@@ -113,13 +111,6 @@ def run(self, query_embedding: List[float], filters: Optional[Dict[str, Any]] =
                 top_k=top_k,
             )
-        except Exception as e:
-            if self._raise_on_failure:
-                raise e
-            logger.warning(
-                "An error occurred during embedding retrieval and will be ignored, returning empty results: %s",
-                str(e),
-                exc_info=True,
-            )
-            docs = []
+        except Exception:
+            raise
 
         return {"documents": docs}
 
diff --git a/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/document_store.py b/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/document_store.py
index 777efe20d..7b07c81a8 100644
--- a/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/document_store.py
+++ b/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/document_store.py
@@ -73,7 +73,6 @@ def __init__(
         embedding_dimension: int = 768,
         metadata_fields: Optional[Dict[str, type]] = None,
         vector_search_configuration: VectorSearch = None,
-        create_index: bool = True,
         **kwargs,
     ):
         """
@@ -117,7 +116,6 @@ def __init__(
         self._dummy_vector = [-10.0] * self._embedding_dimension
         self._metadata_fields = metadata_fields
         self._vector_search_configuration = vector_search_configuration or DEFAULT_VECTOR_SEARCH
-        self._create_index = create_index
         self._kwargs = kwargs
 
     @property
@@ -133,13 +131,13 @@ def client(self) -> SearchClient:
         try:
             if not self._index_client:
                 self._index_client = SearchIndexClient(resolved_endpoint, credential, **self._kwargs)
-                if not self.index_exists(self._index_name):
+                if not self._index_exists(self._index_name):
                     # Create a new index if it does not exist
                     logger.debug(
                         "The index '%s' does not exist. A new index will be created.",
                         self._index_name,
                     )
-                    self.create_index(self._index_name)
+                    self._create_index(self._index_name)
         except (HttpResponseError, ClientAuthenticationError) as error:
             msg = f"Failed to authenticate with Azure Search: {error}"
             raise AzureAISearchDocumentStoreConfigError(msg) from error
@@ -155,7 +153,7 @@ def client(self) -> SearchClient:
 
         return self._client
 
-    def create_index(self, index_name: str, **kwargs) -> None:
+    def _create_index(self, index_name: str, **kwargs) -> None:
         """
         Creates a new search index.
         :param index_name: Name of the index to create. If None, the index name from the constructor is used.
@@ -201,7 +199,6 @@ def to_dict(self) -> Dict[str, Any]:
             azure_endpoint=self._azure_endpoint.to_dict() if self._azure_endpoint is not None else None,
             api_key=self._api_key.to_dict() if self._api_key is not None else None,
             index_name=self._index_name,
-            create_index=self._create_index,
             embedding_dimension=self._embedding_dimension,
             metadata_fields=self._metadata_fields,
             vector_search_configuration=self._vector_search_configuration.as_dict(),
@@ -225,14 +222,13 @@ def from_dict(cls, data: Dict[str, Any]) -> "AzureAISearchDocumentStore":
         data["init_parameters"]["vector_search_configuration"] = VectorSearch.from_dict(vector_search_configuration)
         return default_from_dict(cls, data)
 
-    def count_documents(self, **kwargs: Any) -> int:
+    def count_documents(self) -> int:
         """
         Returns how many documents are present in the search index.
-        :param kwargs: additional keyword parameters.
-        :returns: list of retrieved documents.
+        :returns: the number of documents in the search index.
         """
-        return self.client.get_document_count(**kwargs)
+        return self.client.get_document_count()
 
     def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> int:
         """
@@ -292,7 +288,7 @@ def delete_documents(self, document_ids: List[str]) -> None:
 
     def get_documents_by_id(self, document_ids: List[str]) -> List[Document]:
         return self._convert_search_result_to_documents(self._get_raw_documents_by_id(document_ids))
 
-    def search_documents(self, search_text: Optional[str] = "*", top_k: Optional[int] = 10) -> List[Document]:
+    def search_documents(self, search_text: str = "*", top_k: int = 10) -> List[Document]:
         """
         Returns all documents that match the provided search_text.
-        If search_text is None, returns all documents.
+        If search_text is not provided, the default "*" matches all documents.
@@ -345,7 +341,7 @@ def _convert_search_result_to_documents(self, azure_docs: List[Dict[str, Any]])
             documents.append(doc)
         return documents
 
-    def index_exists(self, index_name: Optional[str]) -> bool:
+    def _index_exists(self, index_name: Optional[str]) -> bool:
         """
         Check if the index exists in the Azure AI Search service.
 
@@ -403,14 +399,19 @@ def _map_metadata_field_types(self, metadata: Dict[str, type]) -> Dict[str, str]
 
         for key, value_type in metadata.items():
 
-            # Azure Search index only allows field names starting with letters
-            field_name = next((key[i:] for i, char in enumerate(key) if char.isalpha()), key)
+            if not key or not key[0].isalpha():
+                msg = (
+                    f"Azure Search index only allows field names starting with letters. "
+                    f"Invalid key '{key}' will be dropped."
+                )
+                logger.warning(msg)
+                continue
 
             field_type = type_mapping.get(value_type)
             if not field_type:
-                error_message = f"Unsupported field type for key '{field_name}': {value_type}"
+                error_message = f"Unsupported field type for key '{key}': {value_type}"
                 raise ValueError(error_message)
-            metadata_field_mapping[field_name] = field_type
+            metadata_field_mapping[key] = field_type
 
         return metadata_field_mapping
 
diff --git a/integrations/azure_ai_search/tests/test_document_store.py b/integrations/azure_ai_search/tests/test_document_store.py
index 50733a8a4..8e1aaec80 100644
--- a/integrations/azure_ai_search/tests/test_document_store.py
+++ b/integrations/azure_ai_search/tests/test_document_store.py
@@ -35,7 +35,6 @@ def test_to_dict(monkeypatch):
             "index_name": "default",
             "embedding_dimension": 768,
             "metadata_fields": None,
-            "create_index": True,
             "vector_search_configuration": {
                 "profiles": [
                     {"name": "default-vector-config", "algorithm_configuration_name": "cosine-algorithm-config"}
@@ -65,7 +64,6 @@ def test_from_dict(monkeypatch):
             "embedding_dimension": 768,
             "index_name": "default",
             "metadata_fields": None,
-            "create_index": False,
             "vector_search_configuration": DEFAULT_VECTOR_SEARCH,
         },
     }
@@ -75,7 +73,6 @@ def test_from_dict(monkeypatch):
     assert document_store._index_name == "default"
     assert document_store._embedding_dimension == 768
     assert document_store._metadata_fields is None
-    assert document_store._create_index is False
     assert document_store._vector_search_configuration == DEFAULT_VECTOR_SEARCH
 
 
@@ -92,13 +89,11 @@ def test_init(_mock_azure_search_client):
         api_key=Secret.from_token("fake-api-key"),
         azure_endpoint=Secret.from_token("fake_endpoint"),
         index_name="my_index",
-        create_index=False,
         embedding_dimension=15,
         metadata_fields={"Title": str, "Pages": int},
     )
 
     assert document_store._index_name == "my_index"
-    assert document_store._create_index is False
     assert
document_store._embedding_dimension == 15
     assert document_store._metadata_fields == {"Title": str, "Pages": int}
     assert document_store._vector_search_configuration == DEFAULT_VECTOR_SEARCH
 
@@ -156,7 +151,8 @@ def _random_embeddings(n):
 )
 class TestFilters(FilterDocumentsTest):
 
-    # Overriding to change "date" to compatible ISO format and remove incompatible fields (dataframes) for search index
+    # Overriding to change "date" to a compatible ISO 8601 format
+    # and remove incompatible fields (dataframes) for the Azure AI Search index
     @pytest.fixture
     def filterable_docs(self) -> List[Document]:
         """Fixture that returns a list of Documents that can be used to test filtering."""
@@ -224,26 +220,25 @@ def assert_documents_are_equal(self, received: List[Document], expected: List[Do
         sorted_expected = sorted(expected, key=lambda doc: doc.id)
         assert sorted_recieved == sorted_expected
 
-    # Dataframes are not supported in serach index
-    def test_comparison_equal_with_dataframe(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support dataframes")
+    def test_comparison_equal_with_dataframe(self, document_store, filterable_docs): ...
 
-    def test_comparison_not_equal_with_dataframe(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support dataframes")
+    def test_comparison_not_equal_with_dataframe(self, document_store, filterable_docs): ...
 
-    def test_comparison_greater_than_with_dataframe(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support dataframes")
+    def test_comparison_greater_than_with_dataframe(self, document_store, filterable_docs): ...
 
-    def test_comparison_less_than_with_dataframe(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support dataframes")
+    def test_comparison_less_than_with_dataframe(self, document_store, filterable_docs): ...
 
-    def test_comparison_greater_than_equal_with_dataframe(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support dataframes")
+    def test_comparison_greater_than_equal_with_dataframe(self, document_store, filterable_docs): ...
 
-    def test_comparison_less_than_equal_with_dataframe(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support dataframes")
+    def test_comparison_less_than_equal_with_dataframe(self, document_store, filterable_docs): ...
 
-    # Azure search index supports UTC datetime in ISO format
+    # Azure AI Search index supports UTC datetime in ISO 8601 format
     def test_comparison_greater_than_with_iso_date(self, document_store, filterable_docs):
         """Test filter_documents() with > comparator and datetime"""
         document_store.write_documents(filterable_docs)
@@ -346,15 +341,14 @@ def test_comparison_in(self, document_store, filterable_docs):
         expected = [d for d in filterable_docs if d.meta.get("page") is not None and d.meta["page"] in ["100", "123"]]
         self.assert_documents_are_equal(result, expected)
 
-    # not supported
-    def test_comparison_not_in(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support the 'not in' operator")
+    def test_comparison_not_in(self, document_store, filterable_docs): ...
-    def test_comparison_not_in_with_with_non_list(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support the 'not in' operator")
+    def test_comparison_not_in_with_with_non_list(self, document_store, filterable_docs): ...
 
-    def test_comparison_not_in_with_with_non_list_iterable(self, document_store, filterable_docs):
-        pass
+    @pytest.mark.skip(reason="Azure AI Search index does not support the 'not in' operator")
+    def test_comparison_not_in_with_with_non_list_iterable(self, document_store, filterable_docs): ...
 
     def test_missing_condition_operator_key(self, document_store, filterable_docs):
         """Test filter_documents() with missing operator key"""
 
diff --git a/integrations/azure_ai_search/tests/test_embedding_retriever.py b/integrations/azure_ai_search/tests/test_embedding_retriever.py
index af4b21478..83db9c058 100644
--- a/integrations/azure_ai_search/tests/test_embedding_retriever.py
+++ b/integrations/azure_ai_search/tests/test_embedding_retriever.py
@@ -49,7 +49,6 @@ def test_to_dict():
             },
             "api_key": {"type": "env_var", "env_vars": ["AZURE_SEARCH_API_KEY"], "strict": False},
             "index_name": "default",
-            "create_index": True,
             "embedding_dimension": 768,
             "metadata_fields": None,
             "vector_search_configuration": {
@@ -88,7 +87,6 @@ def test_from_dict():
             },
             "api_key": {"type": "env_var", "env_vars": ["AZURE_SEARCH_API_KEY"], "strict": False},
             "index_name": "default",
-            "create_index": True,
             "embedding_dimension": 768,
             "metadata_fields": None,
             "vector_search_configuration": DEFAULT_VECTOR_SEARCH,
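
Note for reviewers: a minimal usage sketch of the public API after this patch, not part of the diff itself. The index name, document content, embeddings, and the "version" metadata field are illustrative; the import paths assume the classes are re-exported from the integration's package roots, and the store's api_key/azure_endpoint Secrets are assumed to resolve from their default environment variables (AZURE_SEARCH_API_KEY for the key).

# Sketch only: names below marked "illustrative" are not taken from this patch.
from haystack import Document

from haystack_integrations.components.retrievers.azure_ai_search import AzureAISearchEmbeddingRetriever
from haystack_integrations.document_stores.azure_ai_search import AzureAISearchDocumentStore

# The create_index init parameter is gone: the index is now created lazily on
# first client access if it does not already exist.
document_store = AzureAISearchDocumentStore(
    index_name="haystack-docs",  # illustrative
    embedding_dimension=768,
    metadata_fields={"version": str},  # field names must start with a letter
)
document_store.write_documents(
    [Document(content="Azure AI Search integration", embedding=[0.1] * 768, meta={"version": "1.0"})]
)
print(document_store.count_documents())  # no longer accepts **kwargs

# raise_on_failure is gone as well: retrieval errors now always propagate.
retriever = AzureAISearchEmbeddingRetriever(document_store=document_store, top_k=5)
result = retriever.run(query_embedding=[0.1] * 768)
print(result["documents"])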