Skip to content

Commit

Permalink
pgvector - review docstrings and API reference (#502)
Browse files Browse the repository at this point in the history
* pgvector - docstrings and api ref

* rm os.environ from usage example
  • Loading branch information
anakin87 authored Feb 29, 2024
1 parent f49523e commit e5ee06e
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 46 deletions.
4 changes: 2 additions & 2 deletions integrations/pgvector/examples/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
# git clone https://github.com/anakin87/neural-search-pills

import glob
import os

from haystack import Pipeline
from haystack.components.converters import MarkdownToDocument
Expand All @@ -21,7 +20,8 @@
from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore

os.environ["PG_CONN_STR"] = "postgresql://postgres:postgres@localhost:5432/postgres"
# Set an environment variable `PG_CONN_STR` with the connection string to your PostgreSQL database.
# e.g., "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"

# Initialize PgvectorDocumentStore
document_store = PgvectorDocumentStore(
Expand Down
1 change: 0 additions & 1 deletion integrations/pgvector/pydoc/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ loaders:
modules: [
"haystack_integrations.components.retrievers.pgvector.embedding_retriever",
"haystack_integrations.document_stores.pgvector.document_store",
"haystack_integrations.document_stores.pgvector.filters",
]
ignore_when_discovered: ["__init__"]
processors:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,47 @@
@component
class PgvectorEmbeddingRetriever:
"""
Retrieves documents from the PgvectorDocumentStore, based on their dense embeddings.
Retrieves documents from the `PgvectorDocumentStore`, based on their dense embeddings.
Needs to be connected to the PgvectorDocumentStore.
Example usage:
```python
from haystack.document_stores import DuplicatePolicy
from haystack import Document, Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever
# Set an environment variable `PG_CONN_STR` with the connection string to your PostgreSQL database.
# e.g., "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
document_store = PgvectorDocumentStore(
embedding_dimension=768,
vector_function="cosine_similarity",
recreate_table=True,
)
documents = [Document(content="There are over 7,000 languages spoken around the world today."),
Document(content="Elephants have been observed to behave in a way that indicates..."),
Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")]
document_embedder = SentenceTransformersDocumentEmbedder()
document_embedder.warm_up()
documents_with_embeddings = document_embedder.run(documents)
document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.OVERWRITE)
query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
query_pipeline.add_component("retriever", PgvectorEmbeddingRetriever(document_store=document_store))
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
query = "How many languages are there?"
res = query_pipeline.run({"text_embedder": {"text": query}})
assert res['retriever']['documents'][0].content == "There are over 7,000 languages spoken around the world today."
```
"""

def __init__(
Expand All @@ -26,23 +64,20 @@ def __init__(
vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None,
):
"""
Create the PgvectorEmbeddingRetriever component.
:param document_store: An instance of PgvectorDocumentStore.
:param filters: Filters applied to the retrieved Documents. Defaults to None.
:param top_k: Maximum number of Documents to return, defaults to 10.
:param document_store: An instance of `PgvectorDocumentStore`.
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.
:param vector_function: The similarity function to use when searching for similar embeddings.
Defaults to the one set in the `document_store` instance.
"cosine_similarity" and "inner_product" are similarity functions and
`"cosine_similarity"` and `"inner_product"` are similarity functions and
higher scores indicate greater similarity between the documents.
"l2_distance" returns the straight-line distance between vectors,
`"l2_distance"` returns the straight-line distance between vectors,
and the most similar documents are the ones with the smallest score.
Important: if the document store is using the "hnsw" search strategy, the vector function
**Important**: if the document store is using the `"hnsw"` search strategy, the vector function
should match the one utilized during index creation to take advantage of the index.
:type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
:raises ValueError: If `document_store` is not an instance of PgvectorDocumentStore.
:raises ValueError: If `document_store` is not an instance of `PgvectorDocumentStore` or if `vector_function`
is not one of the valid options.
"""
if not isinstance(document_store, PgvectorDocumentStore):
msg = "document_store must be an instance of PgvectorDocumentStore"
Expand All @@ -58,6 +93,12 @@ def __init__(
self.vector_function = vector_function or document_store.vector_function

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
filters=self.filters,
Expand All @@ -68,6 +109,14 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PgvectorEmbeddingRetriever":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
doc_store_params = data["init_parameters"]["document_store"]
data["init_parameters"]["document_store"] = PgvectorDocumentStore.from_dict(doc_store_params)
return default_from_dict(cls, data)
Expand All @@ -81,14 +130,14 @@ def run(
vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None,
):
"""
Retrieve documents from the PgvectorDocumentStore, based on their embeddings.
Retrieve documents from the `PgvectorDocumentStore`, based on their embeddings.
:param query_embedding: Embedding of the query.
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.
:param vector_function: The similarity function to use when searching for similar embeddings.
:type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
:return: List of Documents similar to `query_embedding`.
:returns: List of Documents similar to `query_embedding`.
"""
filters = filters or self.filters
top_k = top_k or self.top_k
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@


class PgvectorDocumentStore:
"""
A Document Store using PostgreSQL with the [pgvector extension](https://github.com/pgvector/pgvector) installed.
"""

def __init__(
self,
*,
Expand All @@ -86,36 +90,33 @@ def __init__(
A specific table to store Haystack documents will be created if it doesn't exist yet.
:param connection_string: The connection string to use to connect to the PostgreSQL database, defined as an
environment variable, e.g.: PG_CONN_STR="postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
:param table_name: The name of the table to use to store Haystack documents. Defaults to "haystack_documents".
:param embedding_dimension: The dimension of the embedding. Defaults to 768.
environment variable, e.g.: `PG_CONN_STR="postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"`
:param table_name: The name of the table to use to store Haystack documents.
:param embedding_dimension: The dimension of the embedding.
:param vector_function: The similarity function to use when searching for similar embeddings.
Defaults to "cosine_similarity". "cosine_similarity" and "inner_product" are similarity functions and
`"cosine_similarity"` and `"inner_product"` are similarity functions and
higher scores indicate greater similarity between the documents.
"l2_distance" returns the straight-line distance between vectors,
`"l2_distance"` returns the straight-line distance between vectors,
and the most similar documents are the ones with the smallest score.
Important: when using the "hnsw" search strategy, an index will be created that depends on the
**Important**: when using the `"hnsw"` search strategy, an index will be created that depends on the
`vector_function` passed here. Make sure subsequent queries will keep using the same
vector similarity function in order to take advantage of the index.
:type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
:param recreate_table: Whether to recreate the table if it already exists. Defaults to False.
:param recreate_table: Whether to recreate the table if it already exists.
:param search_strategy: The search strategy to use when searching for similar embeddings.
Defaults to "exact_nearest_neighbor". "hnsw" is an approximate nearest neighbor search strategy,
`"exact_nearest_neighbor"` provides perfect recall but can be slow for large numbers of documents.
`"hnsw"` is an approximate nearest neighbor search strategy,
which trades off some accuracy for speed; it is recommended for large numbers of documents.
Important: when using the "hnsw" search strategy, an index will be created that depends on the
**Important**: when using the `"hnsw"` search strategy, an index will be created that depends on the
`vector_function` passed here. Make sure subsequent queries will keep using the same
vector similarity function in order to take advantage of the index.
:type search_strategy: Literal["exact_nearest_neighbor", "hnsw"]
:param hnsw_recreate_index_if_exists: Whether to recreate the HNSW index if it already exists.
Defaults to False. Only used if search_strategy is set to "hnsw".
Only used if search_strategy is set to `"hnsw"`.
:param hnsw_index_creation_kwargs: Additional keyword arguments to pass to the HNSW index creation.
Only used if search_strategy is set to "hnsw". You can find the list of valid arguments in the
pgvector documentation: https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw
:param hnsw_ef_search: The ef_search parameter to use at query time. Only used if search_strategy is set to
"hnsw". You can find more information about this parameter in the pgvector documentation:
https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw
Only used if search_strategy is set to `"hnsw"`. You can find the list of valid arguments in the
[pgvector documentation](https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw).
:param hnsw_ef_search: The `ef_search` parameter to use at query time. Only used if search_strategy is set to
`"hnsw"`. You can find more information about this parameter in the
[pgvector documentation](https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw).
"""

self.connection_string = connection_string
Expand Down Expand Up @@ -150,6 +151,12 @@ def __init__(
self._handle_hnsw()

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
connection_string=self.connection_string.to_dict(),
Expand All @@ -165,6 +172,14 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PgvectorDocumentStore":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
deserialize_secrets_inplace(data["init_parameters"], ["connection_string"])
return default_from_dict(cls, data)

Expand Down Expand Up @@ -209,6 +224,7 @@ def _create_table_if_not_exists(self):
def delete_table(self):
"""
Deletes the table used to store Haystack documents.
The name of the table (`table_name`) is defined when initializing the `PgvectorDocumentStore`.
"""

delete_sql = SQL("DROP TABLE IF EXISTS {table_name}").format(table_name=Identifier(self.table_name))
Expand All @@ -218,7 +234,7 @@ def delete_table(self):
def _handle_hnsw(self):
"""
Internal method to handle the HNSW index creation.
It also sets the hnsw.ef_search parameter for queries if it is specified.
It also sets the `hnsw.ef_search` parameter for queries if it is specified.
"""

if self.hnsw_ef_search:
Expand Down Expand Up @@ -295,7 +311,8 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering)
:param filters: The filters to apply to the document list.
:return: A list of Documents that match the given filters.
:raises TypeError: If `filters` is not a dictionary.
:returns: A list of Documents that match the given filters.
"""
if filters:
if not isinstance(filters, dict):
Expand Down Expand Up @@ -324,13 +341,13 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc

def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
"""
Writes documents into to PgvectorDocumentStore.
Writes documents to the document store.
:param documents: A list of Documents to write to the document store.
:param policy: The duplicate policy to use when writing documents.
:raises DuplicateDocumentError: If a document with the same id already exists in the document store
and the policy is set to DuplicatePolicy.FAIL (or not specified).
:return: The number of documents written to the document store.
and the policy is set to `DuplicatePolicy.FAIL` (or not specified).
:returns: The number of documents written to the document store.
"""

if len(documents) > 0:
Expand Down Expand Up @@ -432,7 +449,7 @@ def _from_pg_to_haystack_documents(documents: List[Dict[str, Any]]) -> List[Docu

def delete_documents(self, document_ids: List[str]) -> None:
"""
Deletes all documents with a matching document_ids from the document store.
Deletes documents that match the provided `document_ids` from the document store.
:param document_ids: the document ids to delete
"""
Expand Down Expand Up @@ -462,8 +479,7 @@ def _embedding_retrieval(
This method is not meant to be part of the public interface of
`PgvectorDocumentStore` and it should not be called directly.
`PgvectorEmbeddingRetriever` uses this method directly and is the public interface for it.
:raises ValueError
:return: List of Documents that are most similar to `query_embedding`
:returns: List of Documents that are most similar to `query_embedding`
"""

if not query_embedding:
Expand Down

0 comments on commit e5ee06e

Please sign in to comment.