From 63f20c03e3637ee248083631e978a89d2be48dbe Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Thu, 12 Dec 2024 09:35:30 +0100 Subject: [PATCH] chore: Update docstring and type of fuzziness (#1243) * Update docstring and type of fuzziness * Add test --- .../retrievers/opensearch/bm25_retriever.py | 14 ++++++--- .../opensearch/document_store.py | 12 ++++++-- .../opensearch/tests/test_bm25_retriever.py | 29 +++++++++++++++++++ 3 files changed, 48 insertions(+), 7 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/bm25_retriever.py b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/bm25_retriever.py index 4a8478e2c..69288a5cf 100644 --- a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/bm25_retriever.py +++ b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/bm25_retriever.py @@ -27,7 +27,7 @@ def __init__( *, document_store: OpenSearchDocumentStore, filters: Optional[Dict[str, Any]] = None, - fuzziness: str = "AUTO", + fuzziness: Union[int, str] = "AUTO", top_k: int = 10, scale_score: bool = False, all_terms_must_match: bool = False, @@ -40,8 +40,14 @@ def __init__( :param document_store: An instance of OpenSearchDocumentStore to use with the Retriever. :param filters: Filters to narrow down the search for documents in the Document Store. - :param fuzziness: Fuzziness parameter for full-text queries to apply approximate string matching. - For more information, see [OpenSearch fuzzy query](https://opensearch.org/docs/latest/query-dsl/term/fuzzy/). + :param fuzziness: Determines how approximate string matching is applied in full-text queries. + This parameter sets the maximum number of character edits (insertions, deletions, or substitutions) + allowed when transforming one word into another. For example, the "fuzziness" between the words + "wined" and "wind" is 1 because only one edit is needed to match them. 
+ + Use "AUTO" (the default) for automatic adjustment based on term length, which is optimal for + most scenarios, or pass an integer for a fixed edit distance. For detailed guidance, refer to the + [OpenSearch fuzzy query documentation](https://opensearch.org/docs/latest/query-dsl/term/fuzzy/). :param top_k: Maximum number of documents to return. :param scale_score: If `True`, scales the score of retrieved documents to a range between 0 and 1. This is useful when comparing documents across different indexes. @@ -153,7 +159,7 @@ def run( filters: Optional[Dict[str, Any]] = None, all_terms_must_match: Optional[bool] = None, top_k: Optional[int] = None, - fuzziness: Optional[str] = None, + fuzziness: Optional[Union[int, str]] = None, scale_score: Optional[bool] = None, custom_query: Optional[Dict[str, Any]] = None, ): diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 4ec2420b3..6cb5295f0 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -340,7 +340,7 @@ def _bm25_retrieval( query: str, *, filters: Optional[Dict[str, Any]] = None, - fuzziness: str = "AUTO", + fuzziness: Union[int, str] = "AUTO", top_k: int = 10, scale_score: bool = False, all_terms_must_match: bool = False, @@ -357,8 +357,14 @@ def _bm25_retrieval( :param query: String to search in saved Documents' text. :param filters: Optional filters to narrow down the search space. - :param fuzziness: Fuzziness parameter passed to OpenSearch, defaults to "AUTO". see the official documentation - for valid [fuzziness values](https://www.elastic.co/guide/en/OpenSearch/reference/current/common-options.html#fuzziness) + :param fuzziness: Determines how approximate string matching is applied in full-text queries. 
+ This parameter sets the maximum number of character edits (insertions, deletions, or substitutions) + allowed when transforming one word into another. For example, the "fuzziness" between the words + "wined" and "wind" is 1 because only one edit is needed to match them. + + Use "AUTO" (the default) for automatic adjustment based on term length, which is optimal for + most scenarios, or pass an integer for a fixed edit distance. For detailed guidance, refer to the + [OpenSearch fuzzy query documentation](https://opensearch.org/docs/latest/query-dsl/term/fuzzy/). :param top_k: Maximum number of Documents to return, defaults to 10 :param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False :param all_terms_must_match: If `True` all terms in `query` must be present in the Document, defaults to False diff --git a/integrations/opensearch/tests/test_bm25_retriever.py b/integrations/opensearch/tests/test_bm25_retriever.py index ef3275608..48fc31419 100644 --- a/integrations/opensearch/tests/test_bm25_retriever.py +++ b/integrations/opensearch/tests/test_bm25_retriever.py @@ -121,6 +121,35 @@ def test_from_dict(_mock_opensearch_client): assert retriever._filter_policy == FilterPolicy.REPLACE +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +def test_from_dict_not_defaults(_mock_opensearch_client): + data = { + "type": "haystack_integrations.components.retrievers.opensearch.bm25_retriever.OpenSearchBM25Retriever", + "init_parameters": { + "document_store": { + "init_parameters": {"hosts": "some fake host", "index": "default"}, + "type": "haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore", + }, + "filters": {}, + "fuzziness": 0, + "top_k": 15, + "scale_score": True, + "filter_policy": "replace", + "custom_query": {"some": "custom query"}, + "raise_on_failure": True, + }, + } + retriever = OpenSearchBM25Retriever.from_dict(data) + assert retriever._document_store + assert retriever._filters == {} + assert 
retriever._fuzziness == 0 + assert retriever._top_k == 15 + assert retriever._scale_score + assert retriever._filter_policy == FilterPolicy.REPLACE + assert retriever._custom_query == {"some": "custom query"} + assert retriever._raise_on_failure is True + + def test_run(): mock_store = Mock(spec=OpenSearchDocumentStore) mock_store._bm25_retrieval.return_value = [Document(content="Test doc")]