From e5ee06e6ea21983d62d2d56b7e3371448e34a9a3 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Thu, 29 Feb 2024 15:27:14 +0100 Subject: [PATCH] pgvector - review docstrings and API reference (#502) * pgvector - docstrings and api ref * rm os.environ from usage example --- integrations/pgvector/examples/example.py | 4 +- integrations/pgvector/pydoc/config.yml | 1 - .../pgvector/embedding_retriever.py | 81 +++++++++++++++---- .../pgvector/document_store.py | 70 +++++++++------- 4 files changed, 110 insertions(+), 46 deletions(-) diff --git a/integrations/pgvector/examples/example.py b/integrations/pgvector/examples/example.py index 764c915d1..37ea88929 100644 --- a/integrations/pgvector/examples/example.py +++ b/integrations/pgvector/examples/example.py @@ -11,7 +11,6 @@ # git clone https://github.com/anakin87/neural-search-pills import glob -import os from haystack import Pipeline from haystack.components.converters import MarkdownToDocument @@ -21,7 +20,8 @@ from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore -os.environ["PG_CONN_STR"] = "postgresql://postgres:postgres@localhost:5432/postgres" +# Set an environment variable `PG_CONN_STR` with the connection string to your PostgreSQL database. 
+# e.g., "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME" # Initialize PgvectorDocumentStore document_store = PgvectorDocumentStore( diff --git a/integrations/pgvector/pydoc/config.yml b/integrations/pgvector/pydoc/config.yml index ea354c14b..449937629 100644 --- a/integrations/pgvector/pydoc/config.yml +++ b/integrations/pgvector/pydoc/config.yml @@ -4,7 +4,6 @@ loaders: modules: [ "haystack_integrations.components.retrievers.pgvector.embedding_retriever", "haystack_integrations.document_stores.pgvector.document_store", - "haystack_integrations.document_stores.pgvector.filters", ] ignore_when_discovered: ["__init__"] processors: diff --git a/integrations/pgvector/src/haystack_integrations/components/retrievers/pgvector/embedding_retriever.py b/integrations/pgvector/src/haystack_integrations/components/retrievers/pgvector/embedding_retriever.py index 4b8df868b..6085545cb 100644 --- a/integrations/pgvector/src/haystack_integrations/components/retrievers/pgvector/embedding_retriever.py +++ b/integrations/pgvector/src/haystack_integrations/components/retrievers/pgvector/embedding_retriever.py @@ -12,9 +12,47 @@ @component class PgvectorEmbeddingRetriever: """ - Retrieves documents from the PgvectorDocumentStore, based on their dense embeddings. + Retrieves documents from the `PgvectorDocumentStore`, based on their dense embeddings. - Needs to be connected to the PgvectorDocumentStore. + Example usage: + ```python + from haystack.document_stores import DuplicatePolicy + from haystack import Document, Pipeline + from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder + + from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore + from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever + + # Set an environment variable `PG_CONN_STR` with the connection string to your PostgreSQL database. 
+ # e.g., "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME" + + document_store = PgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, + ) + + documents = [Document(content="There are over 7,000 languages spoken around the world today."), + Document(content="Elephants have been observed to behave in a way that indicates..."), + Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")] + + document_embedder = SentenceTransformersDocumentEmbedder() + document_embedder.warm_up() + documents_with_embeddings = document_embedder.run(documents) + + document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.OVERWRITE) + + query_pipeline = Pipeline() + query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) + query_pipeline.add_component("retriever", PgvectorEmbeddingRetriever(document_store=document_store)) + query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") + + query = "How many languages are there?" + + res = query_pipeline.run({"text_embedder": {"text": query}}) + + assert res['retriever']['documents'][0].content == "There are over 7,000 languages spoken around the world today." + ``` """ def __init__( @@ -26,23 +64,20 @@ def __init__( vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None, ): """ - Create the PgvectorEmbeddingRetriever component. - - :param document_store: An instance of PgvectorDocumentStore. - :param filters: Filters applied to the retrieved Documents. Defaults to None. - :param top_k: Maximum number of Documents to return, defaults to 10. + :param document_store: An instance of `PgvectorDocumentStore`. + :param filters: Filters applied to the retrieved Documents. + :param top_k: Maximum number of Documents to return. :param vector_function: The similarity function to use when searching for similar embeddings. 
Defaults to the one set in the `document_store` instance. - "cosine_similarity" and "inner_product" are similarity functions and + `"cosine_similarity"` and `"inner_product"` are similarity functions and higher scores indicate greater similarity between the documents. - "l2_distance" returns the straight-line distance between vectors, + `"l2_distance"` returns the straight-line distance between vectors, and the most similar documents are the ones with the smallest score. - - Important: if the document store is using the "hnsw" search strategy, the vector function + **Important**: if the document store is using the `"hnsw"` search strategy, the vector function should match the one utilized during index creation to take advantage of the index. - :type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] - :raises ValueError: If `document_store` is not an instance of PgvectorDocumentStore. + :raises ValueError: If `document_store` is not an instance of `PgvectorDocumentStore` or if `vector_function` + is not one of the valid options. """ if not isinstance(document_store, PgvectorDocumentStore): msg = "document_store must be an instance of PgvectorDocumentStore" @@ -58,6 +93,12 @@ def __init__( self.vector_function = vector_function or document_store.vector_function def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ return default_to_dict( self, filters=self.filters, @@ -68,6 +109,14 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "PgvectorEmbeddingRetriever": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. 
+ """ doc_store_params = data["init_parameters"]["document_store"] data["init_parameters"]["document_store"] = PgvectorDocumentStore.from_dict(doc_store_params) return default_from_dict(cls, data) @@ -81,14 +130,14 @@ def run( vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None, ): """ - Retrieve documents from the PgvectorDocumentStore, based on their embeddings. + Retrieve documents from the `PgvectorDocumentStore`, based on their embeddings. :param query_embedding: Embedding of the query. :param filters: Filters applied to the retrieved Documents. :param top_k: Maximum number of Documents to return. :param vector_function: The similarity function to use when searching for similar embeddings. - :type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] - :return: List of Documents similar to `query_embedding`. + + :returns: List of Documents similar to `query_embedding`. """ filters = filters or self.filters top_k = top_k or self.top_k diff --git a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py index 798c75276..3396c15ea 100644 --- a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py +++ b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py @@ -67,6 +67,10 @@ class PgvectorDocumentStore: + """ + A Document Store using PostgreSQL with the [pgvector extension](https://github.com/pgvector/pgvector) installed. + """ + def __init__( self, *, @@ -86,36 +90,33 @@ def __init__( A specific table to store Haystack documents will be created if it doesn't exist yet. 
:param connection_string: The connection string to use to connect to the PostgreSQL database, defined as an - environment variable, e.g.: PG_CONN_STR="postgresql://USER:PASSWORD@HOST:PORT/DB_NAME" - :param table_name: The name of the table to use to store Haystack documents. Defaults to "haystack_documents". - :param embedding_dimension: The dimension of the embedding. Defaults to 768. + environment variable, e.g.: `PG_CONN_STR="postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"` + :param table_name: The name of the table to use to store Haystack documents. + :param embedding_dimension: The dimension of the embedding. :param vector_function: The similarity function to use when searching for similar embeddings. - Defaults to "cosine_similarity". "cosine_similarity" and "inner_product" are similarity functions and + `"cosine_similarity"` and `"inner_product"` are similarity functions and higher scores indicate greater similarity between the documents. - "l2_distance" returns the straight-line distance between vectors, + `"l2_distance"` returns the straight-line distance between vectors, and the most similar documents are the ones with the smallest score. - - Important: when using the "hnsw" search strategy, an index will be created that depends on the + **Important**: when using the `"hnsw"` search strategy, an index will be created that depends on the `vector_function` passed here. Make sure subsequent queries will keep using the same vector similarity function in order to take advantage of the index. - :type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] - :param recreate_table: Whether to recreate the table if it already exists. Defaults to False. + :param recreate_table: Whether to recreate the table if it already exists. :param search_strategy: The search strategy to use when searching for similar embeddings. - Defaults to "exact_nearest_neighbor". 
"hnsw" is an approximate nearest neighbor search strategy, + `"exact_nearest_neighbor"` provides perfect recall but can be slow for large numbers of documents. + `"hnsw"` is an approximate nearest neighbor search strategy, which trades off some accuracy for speed; it is recommended for large numbers of documents. - - Important: when using the "hnsw" search strategy, an index will be created that depends on the + **Important**: when using the `"hnsw"` search strategy, an index will be created that depends on the `vector_function` passed here. Make sure subsequent queries will keep using the same vector similarity function in order to take advantage of the index. - :type search_strategy: Literal["exact_nearest_neighbor", "hnsw"] :param hnsw_recreate_index_if_exists: Whether to recreate the HNSW index if it already exists. - Defaults to False. Only used if search_strategy is set to "hnsw". + Only used if search_strategy is set to `"hnsw"`. :param hnsw_index_creation_kwargs: Additional keyword arguments to pass to the HNSW index creation. - Only used if search_strategy is set to "hnsw". You can find the list of valid arguments in the - pgvector documentation: https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw - :param hnsw_ef_search: The ef_search parameter to use at query time. Only used if search_strategy is set to - "hnsw". You can find more information about this parameter in the pgvector documentation: - https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw + Only used if search_strategy is set to `"hnsw"`. You can find the list of valid arguments in the + [pgvector documentation](https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw) + :param hnsw_ef_search: The `ef_search` parameter to use at query time. Only used if search_strategy is set to + `"hnsw"`. 
You can find more information about this parameter in the + [pgvector documentation](https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw) """ self.connection_string = connection_string @@ -150,6 +151,12 @@ def __init__( self._handle_hnsw() def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ return default_to_dict( self, connection_string=self.connection_string.to_dict(), @@ -165,6 +172,14 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "PgvectorDocumentStore": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ deserialize_secrets_inplace(data["init_parameters"], ["connection_string"]) return default_from_dict(cls, data) @@ -209,6 +224,7 @@ def _create_table_if_not_exists(self): def delete_table(self): """ Deletes the table used to store Haystack documents. + The name of the table (`table_name`) is defined when initializing the `PgvectorDocumentStore`. """ delete_sql = SQL("DROP TABLE IF EXISTS {table_name}").format(table_name=Identifier(self.table_name)) @@ -218,7 +234,7 @@ def delete_table(self): def _handle_hnsw(self): """ Internal method to handle the HNSW index creation. - It also sets the hnsw.ef_search parameter for queries if it is specified. + It also sets the `hnsw.ef_search` parameter for queries if it is specified. """ if self.hnsw_ef_search: @@ -295,7 +311,8 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering) :param filters: The filters to apply to the document list. - :return: A list of Documents that match the given filters. + :raises TypeError: If `filters` is not a dictionary. + :returns: A list of Documents that match the given filters. 
""" if filters: if not isinstance(filters, dict): @@ -324,13 +341,13 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int: """ - Writes documents into to PgvectorDocumentStore. + Writes documents to the document store. :param documents: A list of Documents to write to the document store. :param policy: The duplicate policy to use when writing documents. :raises DuplicateDocumentError: If a document with the same id already exists in the document store - and the policy is set to DuplicatePolicy.FAIL (or not specified). - :return: The number of documents written to the document store. + and the policy is set to `DuplicatePolicy.FAIL` (or not specified). + :returns: The number of documents written to the document store. """ if len(documents) > 0: @@ -432,7 +449,7 @@ def _from_pg_to_haystack_documents(documents: List[Dict[str, Any]]) -> List[Docu def delete_documents(self, document_ids: List[str]) -> None: """ - Deletes all documents with a matching document_ids from the document store. + Deletes documents that match the provided `document_ids` from the document store. :param document_ids: the document ids to delete """ @@ -462,8 +479,7 @@ def _embedding_retrieval( This method is not meant to be part of the public interface of `PgvectorDocumentStore` and it should not be called directly. `PgvectorEmbeddingRetriever` uses this method directly and is the public interface for it. - :raises ValueError - :return: List of Documents that are most similar to `query_embedding` + :returns: List of Documents that are most similar to `query_embedding` """ if not query_embedding: