From 7ff13192b1f460bfe1f1ec7a4a78b080b2fd8d03 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Mon, 11 Nov 2024 05:49:07 +0100 Subject: [PATCH] CrateDB: Vector Store -- make _euclidean_relevance_score_fn identity function. We don't need anything on top of it, i.e. we don't need this function and should instead use the value from CrateDB as is. The similarity is already in the (0,1] interval, and dividing by math.sqrt(2) won't normalize it but returns a wrong result; for example, 1 will become 0.707. --- .../langchain_community/vectorstores/cratedb/base.py | 5 ++--- .../tests/integration_tests/vectorstores/test_cratedb.py | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/cratedb/base.py b/libs/community/langchain_community/vectorstores/cratedb/base.py index e9109764ecfc7e..eb05571cbf945d 100644 --- a/libs/community/langchain_community/vectorstores/cratedb/base.py +++ b/libs/community/langchain_community/vectorstores/cratedb/base.py @@ -1,7 +1,6 @@ from __future__ import annotations import enum -import math from typing import ( Any, Callable, @@ -466,10 +465,10 @@ def _euclidean_relevance_score_fn(similarity: float) -> float: # others are not!) # - embedding dimensionality # - etc. 
- # This function converts the euclidean norm of normalized embeddings + # This function converts the Euclidean norm of normalized embeddings # (0 is most similar, sqrt(2) most dissimilar) # to a similarity function (0 to 1) # Original: # return 1.0 - distance / math.sqrt(2) - return similarity / math.sqrt(2) + return similarity diff --git a/libs/community/tests/integration_tests/vectorstores/test_cratedb.py b/libs/community/tests/integration_tests/vectorstores/test_cratedb.py index acc03547fe5650..52aad4a0a85371 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_cratedb.py +++ b/libs/community/tests/integration_tests/vectorstores/test_cratedb.py @@ -470,9 +470,9 @@ def test_cratedb_relevance_score() -> None: output = docsearch.similarity_search_with_relevance_scores("foo", k=3) # Original score values: 1.0, 0.9996744261675065, 0.9986996093328621 assert output == [ - (Document(page_content="foo", metadata={"page": "0"}), 0.7071067811865475), - (Document(page_content="bar", metadata={"page": "1"}), 0.35355339059327373), - (Document(page_content="baz", metadata={"page": "2"}), 0.1414213562373095), + (Document(page_content="foo", metadata={"page": "0"}), 1.0), + (Document(page_content="bar", metadata={"page": "1"}), 0.5), + (Document(page_content="baz", metadata={"page": "2"}), 0.2), ]