Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fuzziness support in Elasticsearch bm25 retrieval #49

Merged
merged 1 commit into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def __init__(
*,
document_store: ElasticsearchDocumentStore,
filters: Optional[Dict[str, Any]] = None,
fuzziness: str = "AUTO",
top_k: int = 10,
scale_score: bool = True,
):
Expand All @@ -25,13 +26,15 @@ def __init__(

self._document_store = document_store
self._filters = filters or {}
self._fuzziness = fuzziness
self._top_k = top_k
self._scale_score = scale_score

def to_dict(self) -> Dict[str, Any]:
return default_to_dict(
self,
filters=self._filters,
fuzziness=self._fuzziness,
top_k=self._top_k,
scale_score=self._scale_score,
document_store=self._document_store.to_dict(),
Expand All @@ -47,6 +50,10 @@ def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchBM25Retriever":
@component.output_types(documents=List[Document])
def run(self, query: str):
docs = self._document_store._bm25_retrieval(
query=query, filters=self._filters, top_k=self._top_k, scale_score=self._scale_score
query=query,
filters=self._filters,
fuzziness=self._fuzziness,
top_k=self._top_k,
scale_score=self._scale_score,
)
return {"documents": docs}
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ def _bm25_retrieval(
query: str,
*,
filters: Optional[Dict[str, Any]] = None,
fuzziness: str = "AUTO",
top_k: int = 10,
scale_score: bool = True,
) -> List[Document]:
Expand All @@ -263,6 +264,9 @@ def _bm25_retrieval(
:param query: String to search in saved Documents' text.
:param filters: Filters applied to the retrieved Documents, for more info
see `ElasticsearchDocumentStore.filter_documents`, defaults to None
:param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO".
See the official documentation for valid values:
https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
:param top_k: Maximum number of Documents to return, defaults to 10
:param scale_score: If `True`, scales the Documents' scores between 0 and 1, defaults to True
:raises ValueError: If `query` is an empty string
Expand All @@ -281,6 +285,7 @@ def _bm25_retrieval(
{
"multi_match": {
"query": query,
"fuzziness": fuzziness,
"type": "most_fields",
"operator": "AND",
}
Expand Down
4 changes: 4 additions & 0 deletions document_stores/elasticsearch/tests/test_bm25_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def test_to_dict(_mock_elasticsearch_client):
"type": "ElasticsearchDocumentStore",
},
"filters": {},
"fuzziness": "AUTO",
"top_k": 10,
"scale_score": True,
},
Expand All @@ -47,13 +48,15 @@ def test_from_dict(_mock_elasticsearch_client):
"type": "ElasticsearchDocumentStore",
},
"filters": {},
"fuzziness": "AUTO",
"top_k": 10,
"scale_score": True,
},
}
retriever = ElasticsearchBM25Retriever.from_dict(data)
assert retriever._document_store
assert retriever._filters == {}
assert retriever._fuzziness == "AUTO"
assert retriever._top_k == 10
assert retriever._scale_score

Expand All @@ -66,6 +69,7 @@ def test_run():
mock_store._bm25_retrieval.assert_called_once_with(
query="some query",
filters={},
fuzziness="AUTO",
top_k=10,
scale_score=True,
)
Expand Down
30 changes: 30 additions & 0 deletions document_stores/elasticsearch/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,36 @@ def test_bm25_retrieval(self, docstore: ElasticsearchDocumentStore):
assert "functional" in res[1].content
assert "functional" in res[2].content

def test_bm25_retrieval_with_fuzziness(self, docstore: ElasticsearchDocumentStore):
    """
    Verify the `fuzziness` argument of `_bm25_retrieval` decides whether a misspelled query matches.

    The same misspelled query is run twice against the same documents: with
    `fuzziness="0"` it must return nothing, with `fuzziness="1"` it must return
    `top_k` Documents that all contain the correctly spelled word.
    """
    contents = [
        "Haskell is a functional programming language",
        "Lisp is a functional programming language",
        "Exilir is a functional programming language",
        "F# is a functional programming language",
        "C# is a functional programming language",
        "C++ is an object oriented programming language",
        "Dart is an object oriented programming language",
        "Go is an object oriented programming language",
        "Python is a object oriented programming language",
        "Ruby is a object oriented programming language",
        "PHP is a object oriented programming language",
    ]
    docstore.write_documents([Document(content=text) for text in contents])

    # "functinal" is a deliberate misspelling of "functional"
    misspelled_query = "functinal"

    # Fuzziness "0" asks for an exact match, so the typo must yield no hits
    exact_hits = docstore._bm25_retrieval(misspelled_query, top_k=3, fuzziness="0")
    assert exact_hits == []

    # Same query with fuzziness "1": the typo is tolerated and top_k hits come back
    fuzzy_hits = docstore._bm25_retrieval(misspelled_query, top_k=3, fuzziness="1")
    assert len(fuzzy_hits) == 3
    for hit in fuzzy_hits:
        assert "functional" in hit.content

def test_write_duplicate_fail(self, docstore: ElasticsearchDocumentStore):
"""
Verify `DuplicateDocumentError` is raised when trying to write duplicate files.
Expand Down