diff --git a/haystack/nodes/other/join_docs.py b/haystack/nodes/other/join_docs.py
index 274e90a38d..b94d7d14e5 100644
--- a/haystack/nodes/other/join_docs.py
+++ b/haystack/nodes/other/join_docs.py
@@ -155,8 +155,25 @@ def _calculate_rrf(self, results):
         K = 61
 
         scores_map = defaultdict(int)
-        for result in results:
+        if not results:
+            # Nothing to fuse. Returning early also avoids a ZeroDivisionError
+            # when building the default (uniform) weights below.
+            return scores_map
+
+        num_results = len(results)
+        # Fall back to uniform weights when no weights are configured.
+        weights = self.weights if self.weights else [1 / num_results] * num_results
+
+        # Calculate weighted reciprocal rank fusion score
+        for result, weight in zip(results, weights):
             for rank, doc in enumerate(result):
-                scores_map[doc.id] += 1 / (K + rank)
+                scores_map[doc.id] += (weight * num_results) / (K + rank)
+
+        # Normalize scores. Note: num_results / K is the maximum possible score
+        # (assuming the weights sum to 1), achieved by being ranked first in
+        # all results with non-zero weight.
+        max_score = num_results / K
+        for doc_id in scores_map:
+            scores_map[doc_id] = scores_map[doc_id] / max_score
 
         return scores_map
diff --git a/releasenotes/notes/join-docs-weighting-rrf-c52ba00a25004fd4.yaml b/releasenotes/notes/join-docs-weighting-rrf-c52ba00a25004fd4.yaml
new file mode 100644
index 0000000000..23a31d911e
--- /dev/null
+++ b/releasenotes/notes/join-docs-weighting-rrf-c52ba00a25004fd4.yaml
@@ -0,0 +1,6 @@
+---
+enhancements:
+  - |
+    Make `JoinDocuments` sensitive to `weights` parameter when
+    `join_mode` is reciprocal rank fusion. Add score normalization
+    for `JoinDocuments` when `join_mode` is reciprocal rank fusion.
diff --git a/test/nodes/test_join_documents.py b/test/nodes/test_join_documents.py
index 463aeaa577..ae809f4994 100644
--- a/test/nodes/test_join_documents.py
+++ b/test/nodes/test_join_documents.py
@@ -3,6 +3,7 @@
 
 from haystack import Document
 from haystack.nodes.other.join_docs import JoinDocuments
+from copy import deepcopy
 
 
 @pytest.mark.unit
@@ -113,3 +114,36 @@ def test_joindocuments_concatenate_duplicate_docs_null_score():
     result, _ = join_docs.run(inputs)
     assert len(result["documents"]) == 3
     assert result["documents"] == expected_outputs["documents"]
+
+
+@pytest.mark.unit
+def test_joindocuments_rrf_weights():
+    """
+    Reciprocal rank fusion must take the `weights` parameter into account.
+
+    Omitting the weights has to give the same documents as passing equal weights,
+    whereas unequal weights must change the fused documents and raise the top score.
+    """
+    first_ranking = [
+        Document(content="text document 1", content_type="text", score=0.2),
+        Document(content="text document 2", content_type="text", score=0.3),
+    ]
+    second_ranking = [
+        Document(content="text document 3", content_type="text", score=0.7),
+        Document(content="text document 4", content_type="text", score=None),
+    ]
+    base_inputs = [{"documents": first_ranking}, {"documents": second_ranking}]
+
+    def fuse(**join_kwargs):
+        # Run on a fresh deep copy of the inputs so the runs stay independent of each other.
+        node = JoinDocuments(join_mode="reciprocal_rank_fusion", **join_kwargs)
+        output, _ = node.run(deepcopy(base_inputs))
+        return output["documents"]
+
+    docs_none = fuse()
+    docs_even = fuse(weights=[0.5, 0.5])
+    docs_uneven = fuse(weights=[0.7, 0.3])
+
+    assert docs_none == docs_even
+    assert docs_uneven != docs_none
+    assert docs_uneven[0].score > docs_none[0].score