diff --git a/document_stores/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py b/document_stores/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py index feb39f42a..f48efcf3f 100644 --- a/document_stores/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py +++ b/document_stores/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py @@ -16,6 +16,7 @@ def __init__( *, document_store: ElasticsearchDocumentStore, filters: Optional[Dict[str, Any]] = None, + fuzziness: str = "AUTO", top_k: int = 10, scale_score: bool = True, ): @@ -25,6 +26,7 @@ def __init__( self._document_store = document_store self._filters = filters or {} + self._fuzziness = fuzziness self._top_k = top_k self._scale_score = scale_score @@ -32,6 +34,7 @@ def to_dict(self) -> Dict[str, Any]: return default_to_dict( self, filters=self._filters, + fuzziness=self._fuzziness, top_k=self._top_k, scale_score=self._scale_score, document_store=self._document_store.to_dict(), @@ -47,6 +50,10 @@ def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchBM25Retriever": @component.output_types(documents=List[Document]) def run(self, query: str): docs = self._document_store._bm25_retrieval( - query=query, filters=self._filters, top_k=self._top_k, scale_score=self._scale_score + query=query, + filters=self._filters, + fuzziness=self._fuzziness, + top_k=self._top_k, + scale_score=self._scale_score, ) return {"documents": docs} diff --git a/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py b/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py index de059f685..b428b19e8 100644 --- a/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py +++ b/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py @@ -246,6 +246,7 @@ def _bm25_retrieval( query: str, *, filters: Optional[Dict[str, Any]] = None, + fuzziness: str = "AUTO", top_k: int = 10, scale_score: bool = True, ) -> List[Document]: @@ -263,6 +264,9 @@ def 
_bm25_retrieval( :param query: String to search in saved Documents' text. :param filters: Filters applied to the retrieved Documents, for more info see `ElasticsearchDocumentStore.filter_documents`, defaults to None + :param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO". + See the official documentation for valid values: + https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness :param top_k: Maximum number of Documents to return, defaults to 10 :param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to True :raises ValueError: If `query` is an empty string @@ -281,6 +285,7 @@ def _bm25_retrieval( { "multi_match": { "query": query, + "fuzziness": fuzziness, "type": "most_fields", "operator": "AND", } diff --git a/document_stores/elasticsearch/tests/test_bm25_retriever.py b/document_stores/elasticsearch/tests/test_bm25_retriever.py index 51dfe5140..b9914e69c 100644 --- a/document_stores/elasticsearch/tests/test_bm25_retriever.py +++ b/document_stores/elasticsearch/tests/test_bm25_retriever.py @@ -31,6 +31,7 @@ def test_to_dict(_mock_elasticsearch_client): "type": "ElasticsearchDocumentStore", }, "filters": {}, + "fuzziness": "AUTO", "top_k": 10, "scale_score": True, }, @@ -47,6 +48,7 @@ def test_from_dict(_mock_elasticsearch_client): "type": "ElasticsearchDocumentStore", }, "filters": {}, + "fuzziness": "AUTO", "top_k": 10, "scale_score": True, }, @@ -54,6 +56,7 @@ def test_from_dict(_mock_elasticsearch_client): retriever = ElasticsearchBM25Retriever.from_dict(data) assert retriever._document_store assert retriever._filters == {} + assert retriever._fuzziness == "AUTO" assert retriever._top_k == 10 assert retriever._scale_score @@ -66,6 +69,7 @@ def test_run(): mock_store._bm25_retrieval.assert_called_once_with( query="some query", filters={}, + fuzziness="AUTO", top_k=10, scale_score=True, ) diff --git a/document_stores/elasticsearch/tests/test_document_store.py 
b/document_stores/elasticsearch/tests/test_document_store.py index 215df160e..64c154687 100644 --- a/document_stores/elasticsearch/tests/test_document_store.py +++ b/document_stores/elasticsearch/tests/test_document_store.py @@ -82,6 +82,36 @@ def test_bm25_retrieval(self, docstore: ElasticsearchDocumentStore): assert "functional" in res[1].content assert "functional" in res[2].content + def test_bm25_retrieval_with_fuzziness(self, docstore: ElasticsearchDocumentStore): + docstore.write_documents( + [ + Document(content="Haskell is a functional programming language"), + Document(content="Lisp is a functional programming language"), + Document(content="Exilir is a functional programming language"), + Document(content="F# is a functional programming language"), + Document(content="C# is a functional programming language"), + Document(content="C++ is an object oriented programming language"), + Document(content="Dart is an object oriented programming language"), + Document(content="Go is an object oriented programming language"), + Document(content="Python is a object oriented programming language"), + Document(content="Ruby is a object oriented programming language"), + Document(content="PHP is a object oriented programming language"), + ] + ) + + query_with_typo = "functinal" + # Query without fuzziness to search for the exact match + res = docstore._bm25_retrieval(query_with_typo, top_k=3, fuzziness="0") + # Nothing is found as the query contains a typo + assert res == [] + + # Query with fuzziness with the same query + res = docstore._bm25_retrieval(query_with_typo, top_k=3, fuzziness="1") + assert len(res) == 3 + assert "functional" in res[0].content + assert "functional" in res[1].content + assert "functional" in res[2].content + def test_write_duplicate_fail(self, docstore: ElasticsearchDocumentStore): """ Verify `DuplicateDocumentError` is raised when trying to write duplicate files.