Skip to content

Commit

Permalink
Add fuzziness support in Elasticsearch bm25 retrieval
Browse files Browse the repository at this point in the history
  • Loading branch information
silvanocerza committed Nov 10, 2023
1 parent 0f57ce4 commit b038255
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def __init__(
*,
document_store: ElasticsearchDocumentStore,
filters: Optional[Dict[str, Any]] = None,
fuzziness: str = "AUTO",
top_k: int = 10,
scale_score: bool = True,
):
Expand All @@ -25,13 +26,15 @@ def __init__(

self._document_store = document_store
self._filters = filters or {}
self._fuzziness = fuzziness
self._top_k = top_k
self._scale_score = scale_score

def to_dict(self) -> Dict[str, Any]:
return default_to_dict(
self,
filters=self._filters,
fuzziness=self._fuzziness,
top_k=self._top_k,
scale_score=self._scale_score,
document_store=self._document_store.to_dict(),
Expand All @@ -47,6 +50,10 @@ def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchBM25Retriever":
@component.output_types(documents=List[Document])
def run(self, query: str):
docs = self._document_store._bm25_retrieval(
query=query, filters=self._filters, top_k=self._top_k, scale_score=self._scale_score
query=query,
filters=self._filters,
fuzziness=self._fuzziness,
top_k=self._top_k,
scale_score=self._scale_score,
)
return {"documents": docs}
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ def _bm25_retrieval(
query: str,
*,
filters: Optional[Dict[str, Any]] = None,
fuzziness: str = "AUTO",
top_k: int = 10,
scale_score: bool = True,
) -> List[Document]:
Expand All @@ -263,6 +264,9 @@ def _bm25_retrieval(
:param query: String to search in saved Documents' text.
:param filters: Filters applied to the retrieved Documents, for more info
see `ElasticsearchDocumentStore.filter_documents`, defaults to None
:param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO".
See the official documentation for valid values:
https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
:param top_k: Maximum number of Documents to return, defaults to 10
:param scale_score: If `True`, scales the Documents' scores between 0 and 1, defaults to True
:raises ValueError: If `query` is an empty string
Expand All @@ -281,6 +285,7 @@ def _bm25_retrieval(
{
"multi_match": {
"query": query,
"fuzziness": fuzziness,
"type": "most_fields",
"operator": "AND",
}
Expand Down
4 changes: 4 additions & 0 deletions document_stores/elasticsearch/tests/test_bm25_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def test_to_dict(_mock_elasticsearch_client):
"type": "ElasticsearchDocumentStore",
},
"filters": {},
"fuzziness": "AUTO",
"top_k": 10,
"scale_score": True,
},
Expand All @@ -47,13 +48,15 @@ def test_from_dict(_mock_elasticsearch_client):
"type": "ElasticsearchDocumentStore",
},
"filters": {},
"fuzziness": "AUTO",
"top_k": 10,
"scale_score": True,
},
}
retriever = ElasticsearchBM25Retriever.from_dict(data)
assert retriever._document_store
assert retriever._filters == {}
assert retriever._fuzziness == "AUTO"
assert retriever._top_k == 10
assert retriever._scale_score

Expand All @@ -66,6 +69,7 @@ def test_run():
mock_store._bm25_retrieval.assert_called_once_with(
query="some query",
filters={},
fuzziness="AUTO",
top_k=10,
scale_score=True,
)
Expand Down
30 changes: 30 additions & 0 deletions document_stores/elasticsearch/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,36 @@ def test_bm25_retrieval(self, docstore: ElasticsearchDocumentStore):
assert "functional" in res[1].content
assert "functional" in res[2].content

def test_bm25_retrieval_with_fuzziness(self, docstore: ElasticsearchDocumentStore):
    """
    Verify that the `fuzziness` argument of `_bm25_retrieval` decides whether a
    misspelled query matches the stored Documents.
    """
    contents = [
        "Haskell is a functional programming language",
        "Lisp is a functional programming language",
        "Exilir is a functional programming language",
        "F# is a functional programming language",
        "C# is a functional programming language",
        "C++ is an object oriented programming language",
        "Dart is an object oriented programming language",
        "Go is an object oriented programming language",
        "Python is a object oriented programming language",
        "Ruby is a object oriented programming language",
        "PHP is a object oriented programming language",
    ]
    docstore.write_documents([Document(content=text) for text in contents])

    # "functinal" is "functional" with one letter missing
    misspelled_query = "functinal"

    # With fuzziness "0" only exact matches count, so the typo finds nothing
    exact_hits = docstore._bm25_retrieval(misspelled_query, top_k=3, fuzziness="0")
    assert exact_hits == []

    # The same misspelled query with fuzziness "1" must reach the "functional" Documents
    fuzzy_hits = docstore._bm25_retrieval(misspelled_query, top_k=3, fuzziness="1")
    assert len(fuzzy_hits) == 3
    for hit in fuzzy_hits:
        assert "functional" in hit.content

def test_write_duplicate_fail(self, docstore: ElasticsearchDocumentStore):
"""
Verify `DuplicateDocumentError` is raised when trying to write duplicate files.
Expand Down

0 comments on commit b038255

Please sign in to comment.