-
Notifications
You must be signed in to change notification settings - Fork 127
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Elasticsearch - refactor `_search_documents` (#57)
* set scale_score default to False
* unrelated: replace text w content
* first implementation
* test
* fix some tests
* make tests more robust; skip unsupported ones
* rm unsupported test
* ignore import-not-found
* first chunk addressing PR feedback
* improve tests
* use _search_documents also in bm25 retrieval
* improve logic and tests
* fix format
* better format
* Update document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py
  Co-authored-by: Silvano Cerza <[email protected]>
* Update document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py
  Co-authored-by: Silvano Cerza <[email protected]>
* remove wrong increment
* move ruff ignore error

---------

Co-authored-by: Silvano Cerza <[email protected]>
- Loading branch information
1 parent
48c0d5f
commit 1c6410e
Showing
2 changed files
with
58 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,8 @@ | ||
# SPDX-FileCopyrightText: 2023-present Silvano Cerza <[email protected]> | ||
# | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import random | ||
from typing import List | ||
from unittest.mock import patch | ||
|
||
|
@@ -92,6 +94,34 @@ def test_bm25_retrieval(self, docstore: ElasticsearchDocumentStore): | |
assert "functional" in res[1].content | ||
assert "functional" in res[2].content | ||
|
||
def test_bm25_retrieval_pagination(self, docstore: ElasticsearchDocumentStore):
    """
    Check that BM25 retrieval pages through results correctly when more than
    10 documents match the query.

    Fifteen documents containing the word "programming" are written, and the
    retrieval is asked for the top 11 of them.
    """
    contents = [
        "Haskell is a functional programming language",
        "Lisp is a functional programming language",
        "Exilir is a functional programming language",
        "F# is a functional programming language",
        "C# is a functional programming language",
        "C++ is an object oriented programming language",
        "Dart is an object oriented programming language",
        "Go is an object oriented programming language",
        "Python is a object oriented programming language",
        "Ruby is a object oriented programming language",
        "PHP is a object oriented programming language",
        "Java is an object oriented programming language",
        "Javascript is a programming language",
        "Typescript is a programming language",
        "C is a programming language",
    ]
    docstore.write_documents([Document(content=text) for text in contents])

    hits = docstore._bm25_retrieval("programming", top_k=11)

    # Exactly top_k documents must come back, and each one must be a real match.
    assert len(hits) == 11
    assert all("programming" in hit.content for hit in hits)
|
||
def test_bm25_retrieval_with_fuzziness(self, docstore: ElasticsearchDocumentStore): | ||
docstore.write_documents( | ||
[ | ||
|
@@ -282,6 +312,20 @@ def test_embedding_retrieval_w_filters(self, docstore: ElasticsearchDocumentStor | |
assert len(results) == 1 | ||
assert results[0].content == "Not very similar document with meta field" | ||
|
||
def test_embedding_retrieval_pagination(self, docstore: ElasticsearchDocumentStore):
    """
    Check that embedding retrieval pages through results correctly when more
    than 10 documents match the query.

    Twenty documents with random 4-dimensional embeddings are written, and the
    retrieval is asked for the top 11 of them.
    """
    # Random vectors are fine here: only the number of returned documents is checked.
    embeddings = [[random.random() for _ in range(4)] for _ in range(20)]  # noqa: S311
    docs = [
        Document(content=f"Document {idx}", embedding=vector)
        for idx, vector in enumerate(embeddings)
    ]
    docstore.write_documents(docs)

    results = docstore._embedding_retrieval(
        query_embedding=[0.1, 0.1, 0.1, 0.1],
        top_k=11,
        filters={},
    )
    assert len(results) == 11
|
||
def test_embedding_retrieval_query_documents_different_embedding_sizes(self, docstore: ElasticsearchDocumentStore): | ||
""" | ||
Test that the retrieval fails if the query embedding and the documents have different embedding sizes. | ||
|