Skip to content

Commit

Permalink
squash
Browse files Browse the repository at this point in the history
  • Loading branch information
anakin87 committed Jan 31, 2024
1 parent ae80056 commit 8d1088b
Show file tree
Hide file tree
Showing 3 changed files with 239 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,10 @@
meta = EXCLUDED.meta
"""

# Names accepted for the `vector_function` argument; both the constructor and
# `_embedding_retrieval` raise ValueError for any value outside this list.
VALID_VECTOR_FUNCTIONS = ["cosine_similarity", "inner_product", "l2_distance"]

# Maps a vector function name to an operator-class name
# (presumably the pgvector operator class used when building the HNSW index — confirm at the usage site).
# NOTE(review): "cosine_distance" is not listed in VALID_VECTOR_FUNCTIONS, so that entry can never be
# selected through a validated `vector_function`; confirm whether it is a leftover to be removed.
VECTOR_FUNCTION_TO_POSTGRESQL_OPS = {
"cosine_distance": "vector_cosine_ops",
"cosine_similarity": "vector_cosine_ops",
"inner_product": "vector_ip_ops",
"l2_distance": "vector_l2_ops",
}
Expand All @@ -70,7 +72,7 @@ def __init__(
connection_string: str,
table_name: str = "haystack_documents",
embedding_dimension: int = 768,
vector_function: Literal["cosine_distance", "inner_product", "l2_distance"] = "cosine_distance",
vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] = "cosine_similarity",
recreate_table: bool = False,
search_strategy: Literal["exact_nearest_neighbor", "hnsw"] = "exact_nearest_neighbor",
hnsw_recreate_index_if_exists: bool = False,
Expand All @@ -87,12 +89,16 @@ def __init__(
:param table_name: The name of the table to use to store Haystack documents. Defaults to "haystack_documents".
:param embedding_dimension: The dimension of the embedding. Defaults to 768.
:param vector_function: The similarity function to use when searching for similar embeddings.
Defaults to "cosine_distance". Set it to one of the following values:
:type vector_function: Literal["cosine_distance", "inner_product", "l2_distance"]
Defaults to "cosine_similarity". "cosine_similarity" and "inner_product" are similarity functions,
so the most similar documents are the ones with the highest score.
"l2_distance" is a distance function, so the most similar documents are the ones with the smallest score.
When using the "hnsw" search strategy, the vector_function value is used to build an appropriate index.
:type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
:param recreate_table: Whether to recreate the table if it already exists. Defaults to False.
:param search_strategy: The search strategy to use when searching for similar embeddings.
Defaults to "exact_nearest_neighbor". "hnsw" is an approximate nearest neighbor search strategy,
which trades off some accuracy for speed; it is recommended for large numbers of documents.
When using the "hnsw" search strategy, the vector_function value is used to build an appropriate index.
:type search_strategy: Literal["exact_nearest_neighbor", "hnsw"]
:param hnsw_recreate_index_if_exists: Whether to recreate the HNSW index if it already exists.
Defaults to False. Only used if search_strategy is set to "hnsw".
Expand All @@ -107,6 +113,9 @@ def __init__(
self.connection_string = connection_string
self.table_name = table_name
self.embedding_dimension = embedding_dimension
if vector_function not in VALID_VECTOR_FUNCTIONS:
msg = f"vector_function must be one of {VALID_VECTOR_FUNCTIONS}, but got {vector_function}"
raise ValueError(msg)
self.vector_function = vector_function
self.recreate_table = recreate_table
self.search_strategy = search_strategy
Expand Down Expand Up @@ -423,3 +432,98 @@ def delete_documents(self, document_ids: List[str]) -> None:
)

self._execute_sql(delete_sql, error_msg="Could not delete documents from PgvectorDocumentStore")

def _embedding_retrieval(
    self,
    query_embedding: List[float],
    *,
    filters: Optional[Dict[str, Any]] = None,
    top_k: int = 10,
    vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None,
) -> List[Document]:
    """
    Retrieves documents that are most similar to the query embedding using a vector similarity metric.

    This method is not meant to be part of the public interface of
    `PgvectorDocumentStore` nor called directly.
    `PgvectorEmbeddingRetriever` uses this method directly and is the public interface for it.

    :param query_embedding: Embedding of the query.
    :param filters: Filters applied to the retrieved Documents. Defaults to None.
        When using the "hnsw" search strategy, filters are applied after the most similar Documents are retrieved,
        so the number of results may be less than `top_k`.
        To better understand HNSW index creation and configuration, refer to the pgvector documentation:
        https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw
    :param top_k: Maximum number of Documents to return, defaults to 10
    :param vector_function: The similarity function to use when searching for similar embeddings.
        Defaults to the PgvectorDocumentStore's vector_function.
        Since vector_function is used to build the HNSW index (when using the "hnsw" search strategy),
        if a vector_function other than the one used to build the index is chosen,
        the index will not be used and the search will be slower.
        "cosine_similarity" and "inner_product" are similarity functions,
        so the most similar documents are the ones with the highest score.
        "l2_distance" is a distance function, so the most similar documents are the ones with the smallest score.
    :type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
    :raises ValueError: If `query_embedding` is empty, contains non-numeric values or does not match
        the embedding dimension of the store, or if `vector_function` is not a valid vector function.
    :return: List of Documents that are most similar to `query_embedding`, most similar first.
    """

    if not query_embedding:
        msg = "query_embedding must be a non-empty list of floats"
        raise ValueError(msg)
    if len(query_embedding) != self.embedding_dimension:
        msg = (
            f"query_embedding dimension ({len(query_embedding)}) does not match PgvectorDocumentStore "
            f"embedding dimension ({self.embedding_dimension})."
        )
        raise ValueError(msg)

    vector_function = vector_function or self.vector_function
    if vector_function not in VALID_VECTOR_FUNCTIONS:
        msg = f"vector_function must be one of {VALID_VECTOR_FUNCTIONS}, but got {vector_function}"
        raise ValueError(msg)

    # The embedding is embedded in the SQL text below, so every element is coerced to float first:
    # this guarantees that only numeric literals can ever reach the query string.
    try:
        embedding_values = [float(el) for el in query_embedding]
    except (TypeError, ValueError) as e:
        msg = "query_embedding must contain only numeric values"
        raise ValueError(msg) from e

    # the vector must be a string with this format: "'[3,1,2]'"
    query_embedding_for_postgres = f"'[{','.join(str(el) for el in embedding_values)}]'"

    # to compute the scores, we use the approach described in pgvector README:
    # https://github.com/pgvector/pgvector?tab=readme-ov-file#distances
    # cosine_similarity and inner_product are modified from the result of the operator
    score_definitions = {
        "cosine_similarity": f"1 - (embedding <=> {query_embedding_for_postgres}) AS score",
        "inner_product": f"(embedding <#> {query_embedding_for_postgres}) * -1 AS score",
        "l2_distance": f"embedding <-> {query_embedding_for_postgres} AS score",
    }
    score_definition = score_definitions[vector_function]

    sql_select = SQL("SELECT *, {score} FROM {table_name}").format(
        table_name=Identifier(self.table_name),
        score=SQL(score_definition),
    )

    sql_where_clause = SQL("")
    params = ()
    if filters:
        sql_where_clause, params = _convert_filters_to_where_clause_and_params(filters)

    # we always want to return the most similar documents first:
    # l2_distance is a distance (smaller is better), the other functions are similarities (larger is better)
    sort_order = "ASC" if vector_function == "l2_distance" else "DESC"

    sql_sort = SQL(" ORDER BY score {sort_order} LIMIT {top_k}").format(
        top_k=SQLLiteral(top_k),
        sort_order=SQL(sort_order),
    )

    sql_query = sql_select + sql_where_clause + sql_sort

    result = self._execute_sql(
        sql_query,
        params,
        error_msg="Could not retrieve documents from PgvectorDocumentStore.",
        cursor=self._dict_cursor,
    )

    records = result.fetchall()
    docs = self._from_pg_to_haystack_documents(records)
    return docs
2 changes: 1 addition & 1 deletion integrations/pgvector/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def document_store(request):
connection_string = "postgresql://postgres:postgres@localhost:5432/postgres"
table_name = f"haystack_{request.node.name}"
embedding_dimension = 768
vector_function = "cosine_distance"
vector_function = "cosine_similarity"
recreate_table = True
search_strategy = "exact_nearest_neighbor"

Expand Down
130 changes: 130 additions & 0 deletions integrations/pgvector/tests/test_embedding_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from typing import List

import pytest
from haystack.dataclasses.document import Document
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
from numpy.random import rand


class TestEmbeddingRetrieval:
    """
    Tests for `PgvectorDocumentStore._embedding_retrieval`.

    The retrieval tests are parametrized over two stores: the shared `document_store` fixture
    and a store created with the "hnsw" search strategy.
    """

    # Names of the fixtures that the retrieval tests are run against.
    STORE_FIXTURE_NAMES = ["document_store", "document_store_w_hnsw_index"]

    @pytest.fixture
    def document_store_w_hnsw_index(self, request):
        """Yield a freshly created store that uses the "hnsw" search strategy; its table is deleted afterwards."""
        connection_string = "postgresql://postgres:postgres@localhost:5432/postgres"
        table_name = f"haystack_hnsw_{request.node.name}"
        embedding_dimension = 768
        vector_function = "cosine_similarity"
        recreate_table = True
        search_strategy = "hnsw"

        store = PgvectorDocumentStore(
            connection_string=connection_string,
            table_name=table_name,
            embedding_dimension=embedding_dimension,
            vector_function=vector_function,
            recreate_table=recreate_table,
            search_strategy=search_strategy,
        )
        yield store

        store.delete_table()

    @pytest.fixture
    def any_document_store(self, request):
        """
        Return the store produced by the fixture whose name is given as the indirect parameter.

        Parametrizing a fixture indirectly only hands the parameter to that fixture as `request.param`;
        it does not swap in another fixture. Resolving the name explicitly makes sure that the
        HNSW-backed store is really instantiated for the "document_store_w_hnsw_index" case.
        """
        return request.getfixturevalue(request.param)

    @pytest.mark.parametrize("any_document_store", STORE_FIXTURE_NAMES, indirect=True)
    def test_embedding_retrieval_cosine_similarity(self, any_document_store: PgvectorDocumentStore):
        document_store = any_document_store
        query_embedding = [0.1] * 768
        most_similar_embedding = [0.8] * 768
        second_best_embedding = [0.8] * 700 + [0.1] * 3 + [0.2] * 65
        another_embedding = rand(768).tolist()

        docs = [
            Document(content="Most similar document (cosine sim)", embedding=most_similar_embedding),
            Document(content="2nd best document (cosine sim)", embedding=second_best_embedding),
            Document(content="Not very similar document (cosine sim)", embedding=another_embedding),
        ]

        document_store.write_documents(docs)

        results = document_store._embedding_retrieval(
            query_embedding=query_embedding, top_k=2, filters={}, vector_function="cosine_similarity"
        )
        assert len(results) == 2
        assert results[0].content == "Most similar document (cosine sim)"
        assert results[1].content == "2nd best document (cosine sim)"
        # similarity function: higher score means more similar
        assert results[0].score > results[1].score

    @pytest.mark.parametrize("any_document_store", STORE_FIXTURE_NAMES, indirect=True)
    def test_embedding_retrieval_inner_product(self, any_document_store: PgvectorDocumentStore):
        document_store = any_document_store
        query_embedding = [0.1] * 768
        most_similar_embedding = [0.8] * 768
        second_best_embedding = [0.8] * 700 + [0.1] * 3 + [0.2] * 65
        another_embedding = rand(768).tolist()

        docs = [
            Document(content="Most similar document (inner product)", embedding=most_similar_embedding),
            Document(content="2nd best document (inner product)", embedding=second_best_embedding),
            Document(content="Not very similar document (inner product)", embedding=another_embedding),
        ]

        document_store.write_documents(docs)

        results = document_store._embedding_retrieval(
            query_embedding=query_embedding, top_k=2, filters={}, vector_function="inner_product"
        )
        assert len(results) == 2
        assert results[0].content == "Most similar document (inner product)"
        assert results[1].content == "2nd best document (inner product)"
        # similarity function: higher score means more similar
        assert results[0].score > results[1].score

    @pytest.mark.parametrize("any_document_store", STORE_FIXTURE_NAMES, indirect=True)
    def test_embedding_retrieval_l2_distance(self, any_document_store: PgvectorDocumentStore):
        document_store = any_document_store
        query_embedding = [0.1] * 768
        most_similar_embedding = [0.1] * 765 + [0.15] * 3
        second_best_embedding = [0.1] * 703 + [0.2] * 65
        another_embedding = rand(768).tolist()

        docs = [
            Document(content="Most similar document (l2 dist)", embedding=most_similar_embedding),
            Document(content="2nd best document (l2 dist)", embedding=second_best_embedding),
            Document(content="Not very similar document (l2 dist)", embedding=another_embedding),
        ]

        document_store.write_documents(docs)

        results = document_store._embedding_retrieval(
            query_embedding=query_embedding, top_k=2, filters={}, vector_function="l2_distance"
        )
        assert len(results) == 2
        assert results[0].content == "Most similar document (l2 dist)"
        assert results[1].content == "2nd best document (l2 dist)"
        # distance function: lower score means more similar
        assert results[0].score < results[1].score

    @pytest.mark.parametrize("any_document_store", STORE_FIXTURE_NAMES, indirect=True)
    def test_embedding_retrieval_with_filters(self, any_document_store: PgvectorDocumentStore):
        document_store = any_document_store
        docs = [Document(content=f"Document {i}", embedding=rand(768).tolist()) for i in range(10)]

        # documents with an even index get the value that the filter below selects
        for i, doc in enumerate(docs):
            doc.meta["meta_field"] = "custom_value" if i % 2 == 0 else "other_value"

        document_store.write_documents(docs)

        query_embedding = [0.1] * 768
        filters = {"field": "meta.meta_field", "operator": "==", "value": "custom_value"}

        results = document_store._embedding_retrieval(query_embedding=query_embedding, top_k=3, filters=filters)
        assert len(results) == 3
        for result in results:
            assert result.meta["meta_field"] == "custom_value"
        assert results[0].score > results[1].score > results[2].score

    def test_empty_query_embedding(self, document_store: PgvectorDocumentStore):
        query_embedding: List[float] = []
        with pytest.raises(ValueError):
            document_store._embedding_retrieval(query_embedding=query_embedding)

    def test_query_embedding_wrong_dimension(self, document_store: PgvectorDocumentStore):
        query_embedding = [0.1] * 4
        with pytest.raises(ValueError):
            document_store._embedding_retrieval(query_embedding=query_embedding)

0 comments on commit 8d1088b

Please sign in to comment.