Skip to content

Commit

Permalink
Pgvector - embedding retrieval (#298)
Browse files Browse the repository at this point in the history
* squash

* Update integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py

Co-authored-by: Massimiliano Pippi <[email protected]>

* Update integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py

Co-authored-by: Massimiliano Pippi <[email protected]>

* Update integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py

Co-authored-by: Massimiliano Pippi <[email protected]>

* Update integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py

Co-authored-by: Massimiliano Pippi <[email protected]>

* fix fmt

---------

Co-authored-by: Massimiliano Pippi <[email protected]>
  • Loading branch information
anakin87 and masci authored Jan 31, 2024
1 parent ae80056 commit 0d15e36
Show file tree
Hide file tree
Showing 3 changed files with 229 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,10 @@
meta = EXCLUDED.meta
"""

VALID_VECTOR_FUNCTIONS = ["cosine_similarity", "inner_product", "l2_distance"]

VECTOR_FUNCTION_TO_POSTGRESQL_OPS = {
"cosine_distance": "vector_cosine_ops",
"cosine_similarity": "vector_cosine_ops",
"inner_product": "vector_ip_ops",
"l2_distance": "vector_l2_ops",
}
Expand All @@ -70,7 +72,7 @@ def __init__(
connection_string: str,
table_name: str = "haystack_documents",
embedding_dimension: int = 768,
vector_function: Literal["cosine_distance", "inner_product", "l2_distance"] = "cosine_distance",
vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] = "cosine_similarity",
recreate_table: bool = False,
search_strategy: Literal["exact_nearest_neighbor", "hnsw"] = "exact_nearest_neighbor",
hnsw_recreate_index_if_exists: bool = False,
Expand All @@ -87,12 +89,23 @@ def __init__(
:param table_name: The name of the table to use to store Haystack documents. Defaults to "haystack_documents".
:param embedding_dimension: The dimension of the embedding. Defaults to 768.
:param vector_function: The similarity function to use when searching for similar embeddings.
Defaults to "cosine_distance". Set it to one of the following values:
:type vector_function: Literal["cosine_distance", "inner_product", "l2_distance"]
Defaults to "cosine_similarity". "cosine_similarity" and "inner_product" are similarity functions and
higher scores indicate greater similarity between the documents.
"l2_distance" returns the straight-line distance between vectors,
and the most similar documents are the ones with the smallest score.
Important: when using the "hnsw" search strategy, an index will be created that depends on the
`vector_function` passed here. Make sure subsequent queries will keep using the same
vector similarity function in order to take advantage of the index.
:type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
:param recreate_table: Whether to recreate the table if it already exists. Defaults to False.
:param search_strategy: The search strategy to use when searching for similar embeddings.
Defaults to "exact_nearest_neighbor". "hnsw" is an approximate nearest neighbor search strategy,
which trades off some accuracy for speed; it is recommended for large numbers of documents.
Important: when using the "hnsw" search strategy, an index will be created that depends on the
`vector_function` passed here. Make sure subsequent queries will keep using the same
vector similarity function in order to take advantage of the index.
:type search_strategy: Literal["exact_nearest_neighbor", "hnsw"]
:param hnsw_recreate_index_if_exists: Whether to recreate the HNSW index if it already exists.
Defaults to False. Only used if search_strategy is set to "hnsw".
Expand All @@ -107,6 +120,9 @@ def __init__(
self.connection_string = connection_string
self.table_name = table_name
self.embedding_dimension = embedding_dimension
if vector_function not in VALID_VECTOR_FUNCTIONS:
msg = f"vector_function must be one of {VALID_VECTOR_FUNCTIONS}, but got {vector_function}"
raise ValueError(msg)
self.vector_function = vector_function
self.recreate_table = recreate_table
self.search_strategy = search_strategy
Expand Down Expand Up @@ -423,3 +439,81 @@ def delete_documents(self, document_ids: List[str]) -> None:
)

self._execute_sql(delete_sql, error_msg="Could not delete documents from PgvectorDocumentStore")

def _embedding_retrieval(
    self,
    query_embedding: List[float],
    *,
    filters: Optional[Dict[str, Any]] = None,
    top_k: int = 10,
    vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None,
) -> List[Document]:
    """
    Retrieves documents that are most similar to the query embedding using a vector similarity metric.

    This method is not meant to be part of the public interface of
    `PgvectorDocumentStore` and it should not be called directly.
    `PgvectorEmbeddingRetriever` uses this method directly and is the public interface for it.

    :raises ValueError
    :return: List of Documents that are most similar to `query_embedding`
    """
    # Validate the query vector before touching the database.
    if not query_embedding:
        msg = "query_embedding must be a non-empty list of floats"
        raise ValueError(msg)
    if len(query_embedding) != self.embedding_dimension:
        msg = (
            f"query_embedding dimension ({len(query_embedding)}) does not match PgvectorDocumentStore "
            f"embedding dimension ({self.embedding_dimension})."
        )
        raise ValueError(msg)

    # Fall back to the store-level vector function when none is given per call.
    vector_function = vector_function or self.vector_function
    if vector_function not in VALID_VECTOR_FUNCTIONS:
        msg = f"vector_function must be one of {VALID_VECTOR_FUNCTIONS}, but got {vector_function}"
        raise ValueError(msg)

    # pgvector expects the vector as a string literal like "'[3,1,2]'"
    pg_vector_literal = "'[" + ",".join(str(component) for component in query_embedding) + "]'"

    # Score expressions follow the pgvector README:
    # https://github.com/pgvector/pgvector?tab=readme-ov-file#distances
    # cosine_similarity and inner_product are derived from the raw operator result.
    score_expression_by_function = {
        "cosine_similarity": f"1 - (embedding <=> {pg_vector_literal}) AS score",
        "inner_product": f"(embedding <#> {pg_vector_literal}) * -1 AS score",
        "l2_distance": f"embedding <-> {pg_vector_literal} AS score",
    }
    score_definition = score_expression_by_function[vector_function]

    sql_select = SQL("SELECT *, {score} FROM {table_name}").format(
        table_name=Identifier(self.table_name),
        score=SQL(score_definition),
    )

    # Only build a WHERE clause when filters are provided.
    if filters:
        sql_where_clause, params = _convert_filters_to_where_clause_and_params(filters)
    else:
        sql_where_clause, params = SQL(""), ()

    # Most similar documents come first: l2_distance is a distance (smaller is
    # better, sort ASC); the other two are similarities (larger is better, DESC).
    if vector_function == "l2_distance":
        sort_order = "ASC"
    else:
        sort_order = "DESC"

    sql_sort = SQL(" ORDER BY score {sort_order} LIMIT {top_k}").format(
        top_k=SQLLiteral(top_k),
        sort_order=SQL(sort_order),
    )

    sql_query = sql_select + sql_where_clause + sql_sort

    result = self._execute_sql(
        sql_query,
        params,
        error_msg="Could not retrieve documents from PgvectorDocumentStore.",
        cursor=self._dict_cursor,
    )

    # Convert the raw database rows back into Haystack Documents.
    return self._from_pg_to_haystack_documents(result.fetchall())
2 changes: 1 addition & 1 deletion integrations/pgvector/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def document_store(request):
connection_string = "postgresql://postgres:postgres@localhost:5432/postgres"
table_name = f"haystack_{request.node.name}"
embedding_dimension = 768
vector_function = "cosine_distance"
vector_function = "cosine_similarity"
recreate_table = True
search_strategy = "exact_nearest_neighbor"

Expand Down
130 changes: 130 additions & 0 deletions integrations/pgvector/tests/test_embedding_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from typing import List

import pytest
from haystack.dataclasses.document import Document
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
from numpy.random import rand


class TestEmbeddingRetrieval:
    """Integration tests for PgvectorDocumentStore._embedding_retrieval.

    Each similarity test runs against both the exact-nearest-neighbor store
    (the `document_store` fixture from conftest) and an HNSW-indexed store.
    """

    @pytest.fixture
    def document_store_w_hnsw_index(self, request):
        # Same configuration as the conftest store, but with the "hnsw"
        # search strategy and a per-test table name.
        store = PgvectorDocumentStore(
            connection_string="postgresql://postgres:postgres@localhost:5432/postgres",
            table_name=f"haystack_hnsw_{request.node.name}",
            embedding_dimension=768,
            vector_function="cosine_similarity",
            recreate_table=True,
            search_strategy="hnsw",
        )
        yield store

        store.delete_table()

    @pytest.mark.parametrize("document_store", ["document_store", "document_store_w_hnsw_index"], indirect=True)
    def test_embedding_retrieval_cosine_similarity(self, document_store: PgvectorDocumentStore):
        query_vec = [0.1] * 768
        document_store.write_documents(
            [
                Document(content="Most similar document (cosine sim)", embedding=[0.8] * 768),
                Document(content="2nd best document (cosine sim)", embedding=[0.8] * 700 + [0.1] * 3 + [0.2] * 65),
                Document(content="Not very similar document (cosine sim)", embedding=rand(768).tolist()),
            ]
        )

        results = document_store._embedding_retrieval(
            query_embedding=query_vec, top_k=2, filters={}, vector_function="cosine_similarity"
        )
        # Higher cosine similarity scores should come first.
        assert [doc.content for doc in results] == [
            "Most similar document (cosine sim)",
            "2nd best document (cosine sim)",
        ]
        assert results[0].score > results[1].score

    @pytest.mark.parametrize("document_store", ["document_store", "document_store_w_hnsw_index"], indirect=True)
    def test_embedding_retrieval_inner_product(self, document_store: PgvectorDocumentStore):
        query_vec = [0.1] * 768
        document_store.write_documents(
            [
                Document(content="Most similar document (inner product)", embedding=[0.8] * 768),
                Document(content="2nd best document (inner product)", embedding=[0.8] * 700 + [0.1] * 3 + [0.2] * 65),
                Document(content="Not very similar document (inner product)", embedding=rand(768).tolist()),
            ]
        )

        results = document_store._embedding_retrieval(
            query_embedding=query_vec, top_k=2, filters={}, vector_function="inner_product"
        )
        # Higher inner-product scores should come first.
        assert [doc.content for doc in results] == [
            "Most similar document (inner product)",
            "2nd best document (inner product)",
        ]
        assert results[0].score > results[1].score

    @pytest.mark.parametrize("document_store", ["document_store", "document_store_w_hnsw_index"], indirect=True)
    def test_embedding_retrieval_l2_distance(self, document_store: PgvectorDocumentStore):
        query_vec = [0.1] * 768
        document_store.write_documents(
            [
                Document(content="Most similar document (l2 dist)", embedding=[0.1] * 765 + [0.15] * 3),
                Document(content="2nd best document (l2 dist)", embedding=[0.1] * 700 + [0.1] * 3 + [0.2] * 65),
                Document(content="Not very similar document (l2 dist)", embedding=rand(768).tolist()),
            ]
        )

        results = document_store._embedding_retrieval(
            query_embedding=query_vec, top_k=2, filters={}, vector_function="l2_distance"
        )
        # For l2_distance, a SMALLER score means a closer (more similar) document.
        assert [doc.content for doc in results] == [
            "Most similar document (l2 dist)",
            "2nd best document (l2 dist)",
        ]
        assert results[0].score < results[1].score

    @pytest.mark.parametrize("document_store", ["document_store", "document_store_w_hnsw_index"], indirect=True)
    def test_embedding_retrieval_with_filters(self, document_store: PgvectorDocumentStore):
        # Even-indexed documents get "custom_value"; odd-indexed get "other_value".
        docs = []
        for i in range(10):
            doc = Document(content=f"Document {i}", embedding=rand(768).tolist())
            doc.meta["meta_field"] = "custom_value" if i % 2 == 0 else "other_value"
            docs.append(doc)

        document_store.write_documents(docs)

        results = document_store._embedding_retrieval(
            query_embedding=[0.1] * 768,
            top_k=3,
            filters={"field": "meta.meta_field", "operator": "==", "value": "custom_value"},
        )
        assert len(results) == 3
        for result in results:
            assert result.meta["meta_field"] == "custom_value"
        # Results remain sorted by decreasing similarity even with filters applied.
        assert results[0].score > results[1].score > results[2].score

    def test_empty_query_embedding(self, document_store: PgvectorDocumentStore):
        # An empty embedding is rejected before any database access.
        with pytest.raises(ValueError):
            document_store._embedding_retrieval(query_embedding=[])

    def test_query_embedding_wrong_dimension(self, document_store: PgvectorDocumentStore):
        # A vector that does not match the store's embedding_dimension is rejected.
        with pytest.raises(ValueError):
            document_store._embedding_retrieval(query_embedding=[0.1] * 4)

0 comments on commit 0d15e36

Please sign in to comment.