pgvector - review docstrings and API reference (#502)

* pgvector - docstrings and api ref * rm os.environ from usage example
deepset-ai · Feb 29, 2024 · e5ee06e · e5ee06e
1 parent f49523e
commit e5ee06e
Show file tree

Hide file tree

Showing 4 changed files with 110 additions and 46 deletions.
diff --git a/integrations/pgvector/examples/example.py b/integrations/pgvector/examples/example.py
@@ -11,7 +11,6 @@
 # git clone https://github.com/anakin87/neural-search-pills
 
 import glob
-import os
 
 from haystack import Pipeline
 from haystack.components.converters import MarkdownToDocument
@@ -21,7 +20,8 @@
 from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever
 from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
 
-os.environ["PG_CONN_STR"] = "postgresql://postgres:postgres@localhost:5432/postgres"
+# Set an environment variable `PG_CONN_STR` with the connection string to your PostgreSQL database.
+# e.g., "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
 
 # Initialize PgvectorDocumentStore
 document_store = PgvectorDocumentStore(

diff --git a/integrations/pgvector/pydoc/config.yml b/integrations/pgvector/pydoc/config.yml
@@ -4,7 +4,6 @@ loaders:
     modules: [
       "haystack_integrations.components.retrievers.pgvector.embedding_retriever",
       "haystack_integrations.document_stores.pgvector.document_store",
-      "haystack_integrations.document_stores.pgvector.filters",
     ]
     ignore_when_discovered: ["__init__"]
 processors:

diff --git a/.../pgvector/src/haystack_integrations/components/retrievers/pgvector/embedding_retriever.py b/.../pgvector/src/haystack_integrations/components/retrievers/pgvector/embedding_retriever.py
@@ -12,9 +12,47 @@
 @component
 class PgvectorEmbeddingRetriever:
     """
-    Retrieves documents from the PgvectorDocumentStore, based on their dense embeddings.
+    Retrieves documents from the `PgvectorDocumentStore`, based on their dense embeddings.
 
-    Needs to be connected to the PgvectorDocumentStore.
+    Example usage:
+    ```python
+    from haystack.document_stores import DuplicatePolicy
+    from haystack import Document, Pipeline
+    from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
+
+    from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
+    from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever
+
+    # Set an environment variable `PG_CONN_STR` with the connection string to your PostgreSQL database.
+    # e.g., "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
+
+    document_store = PgvectorDocumentStore(
+        embedding_dimension=768,
+        vector_function="cosine_similarity",
+        recreate_table=True,
+    )
+
+    documents = [Document(content="There are over 7,000 languages spoken around the world today."),
+                 Document(content="Elephants have been observed to behave in a way that indicates..."),
+                 Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")]
+
+    document_embedder = SentenceTransformersDocumentEmbedder()
+    document_embedder.warm_up()
+    documents_with_embeddings = document_embedder.run(documents)
+
+    document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.OVERWRITE)
+
+    query_pipeline = Pipeline()
+    query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
+    query_pipeline.add_component("retriever", PgvectorEmbeddingRetriever(document_store=document_store))
+    query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
+
+    query = "How many languages are there?"
+
+    res = query_pipeline.run({"text_embedder": {"text": query}})
+
+    assert res['retriever']['documents'][0].content == "There are over 7,000 languages spoken around the world today."
+    ```
     """
 
     def __init__(
@@ -26,23 +64,20 @@ def __init__(
         vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None,
     ):
         """
-        Create the PgvectorEmbeddingRetriever component.
-
-        :param document_store: An instance of PgvectorDocumentStore.
-        :param filters: Filters applied to the retrieved Documents. Defaults to None.
-        :param top_k: Maximum number of Documents to return, defaults to 10.
+        :param document_store: An instance of `PgvectorDocumentStore`.
+        :param filters: Filters applied to the retrieved Documents.
+        :param top_k: Maximum number of Documents to return.
         :param vector_function: The similarity function to use when searching for similar embeddings.
             Defaults to the one set in the `document_store` instance.
-            "cosine_similarity" and "inner_product" are similarity functions and
+            `"cosine_similarity"` and `"inner_product"` are similarity functions and
             higher scores indicate greater similarity between the documents.
-            "l2_distance" returns the straight-line distance between vectors,
+            `"l2_distance"` returns the straight-line distance between vectors,
             and the most similar documents are the ones with the smallest score.
-
-            Important: if the document store is using the "hnsw" search strategy, the vector function
+            **Important**: if the document store is using the `"hnsw"` search strategy, the vector function
             should match the one utilized during index creation to take advantage of the index.
-        :type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
 
-        :raises ValueError: If `document_store` is not an instance of PgvectorDocumentStore.
+        :raises ValueError: If `document_store` is not an instance of `PgvectorDocumentStore` or if `vector_function`
+            is not one of the valid options.
         """
         if not isinstance(document_store, PgvectorDocumentStore):
             msg = "document_store must be an instance of PgvectorDocumentStore"
@@ -58,6 +93,12 @@ def __init__(
         self.vector_function = vector_function or document_store.vector_function
 
     def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
         return default_to_dict(
             self,
             filters=self.filters,
@@ -68,6 +109,14 @@ def to_dict(self) -> Dict[str, Any]:
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "PgvectorEmbeddingRetriever":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
+        """
         doc_store_params = data["init_parameters"]["document_store"]
         data["init_parameters"]["document_store"] = PgvectorDocumentStore.from_dict(doc_store_params)
         return default_from_dict(cls, data)
@@ -81,14 +130,14 @@ def run(
         vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None,
     ):
         """
-        Retrieve documents from the PgvectorDocumentStore, based on their embeddings.
+        Retrieve documents from the `PgvectorDocumentStore`, based on their embeddings.
 
         :param query_embedding: Embedding of the query.
         :param filters: Filters applied to the retrieved Documents.
         :param top_k: Maximum number of Documents to return.
         :param vector_function: The similarity function to use when searching for similar embeddings.
-        :type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
-        :return: List of Documents similar to `query_embedding`.
+
+        :returns: List of Documents similar to `query_embedding`.
         """
         filters = filters or self.filters
         top_k = top_k or self.top_k

diff --git a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py
@@ -67,6 +67,10 @@
 
 
 class PgvectorDocumentStore:
+    """
+    A Document Store using PostgreSQL with the [pgvector extension](https://github.com/pgvector/pgvector) installed.
+    """
+
     def __init__(
         self,
         *,
@@ -86,36 +90,33 @@ def __init__(
         A specific table to store Haystack documents will be created if it doesn't exist yet.
 
         :param connection_string: The connection string to use to connect to the PostgreSQL database, defined as an
-            environment variable, e.g.: PG_CONN_STR="postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
-        :param table_name: The name of the table to use to store Haystack documents. Defaults to "haystack_documents".
-        :param embedding_dimension: The dimension of the embedding. Defaults to 768.
+            environment variable, e.g.: `PG_CONN_STR="postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"`
+        :param table_name: The name of the table to use to store Haystack documents.
+        :param embedding_dimension: The dimension of the embedding.
         :param vector_function: The similarity function to use when searching for similar embeddings.
-            Defaults to "cosine_similarity". "cosine_similarity" and "inner_product" are similarity functions and
+            `"cosine_similarity"` and `"inner_product"` are similarity functions and
             higher scores indicate greater similarity between the documents.
-            "l2_distance" returns the straight-line distance between vectors,
+            `"l2_distance"` returns the straight-line distance between vectors,
             and the most similar documents are the ones with the smallest score.
-
-            Important: when using the "hnsw" search strategy, an index will be created that depends on the
+            **Important**: when using the `"hnsw"` search strategy, an index will be created that depends on the
             `vector_function` passed here. Make sure subsequent queries will keep using the same
             vector similarity function in order to take advantage of the index.
-        :type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
-        :param recreate_table: Whether to recreate the table if it already exists. Defaults to False.
+        :param recreate_table: Whether to recreate the table if it already exists.
         :param search_strategy: The search strategy to use when searching for similar embeddings.
-            Defaults to "exact_nearest_neighbor". "hnsw" is an approximate nearest neighbor search strategy,
+            `"exact_nearest_neighbor"` provides perfect recall but can be slow for large numbers of documents.
+            `"hnsw"` is an approximate nearest neighbor search strategy,
             which trades off some accuracy for speed; it is recommended for large numbers of documents.
-
-            Important: when using the "hnsw" search strategy, an index will be created that depends on the
+            **Important**: when using the `"hnsw"` search strategy, an index will be created that depends on the
             `vector_function` passed here. Make sure subsequent queries will keep using the same
             vector similarity function in order to take advantage of the index.
-        :type search_strategy: Literal["exact_nearest_neighbor", "hnsw"]
         :param hnsw_recreate_index_if_exists: Whether to recreate the HNSW index if it already exists.
-            Defaults to False. Only used if search_strategy is set to "hnsw".
+            Only used if search_strategy is set to `"hnsw"`.
         :param hnsw_index_creation_kwargs: Additional keyword arguments to pass to the HNSW index creation.
-            Only used if search_strategy is set to "hnsw". You can find the list of valid arguments in the
-            pgvector documentation: https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw
-        :param hnsw_ef_search: The ef_search parameter to use at query time. Only used if search_strategy is set to
-            "hnsw". You can find more information about this parameter in the pgvector documentation:
-            https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw
+            Only used if search_strategy is set to `"hnsw"`. You can find the list of valid arguments in the
+            [pgvector documentation](https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw).
+        :param hnsw_ef_search: The `ef_search` parameter to use at query time. Only used if search_strategy is set to
+            `"hnsw"`. You can find more information about this parameter in the
+            [pgvector documentation](https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw).
         """
 
         self.connection_string = connection_string
@@ -150,6 +151,12 @@ def __init__(
             self._handle_hnsw()
 
     def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
         return default_to_dict(
             self,
             connection_string=self.connection_string.to_dict(),
@@ -165,6 +172,14 @@ def to_dict(self) -> Dict[str, Any]:
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "PgvectorDocumentStore":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
+        """
         deserialize_secrets_inplace(data["init_parameters"], ["connection_string"])
         return default_from_dict(cls, data)
 
@@ -209,6 +224,7 @@ def _create_table_if_not_exists(self):
     def delete_table(self):
         """
         Deletes the table used to store Haystack documents.
+        The name of the table (`table_name`) is defined when initializing the `PgvectorDocumentStore`.
         """
 
         delete_sql = SQL("DROP TABLE IF EXISTS {table_name}").format(table_name=Identifier(self.table_name))
@@ -218,7 +234,7 @@ def delete_table(self):
     def _handle_hnsw(self):
         """
         Internal method to handle the HNSW index creation.
-        It also sets the hnsw.ef_search parameter for queries if it is specified.
+        It also sets the `hnsw.ef_search` parameter for queries if it is specified.
         """
 
         if self.hnsw_ef_search:
@@ -295,7 +311,8 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
         refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering)
 
         :param filters: The filters to apply to the document list.
-        :return: A list of Documents that match the given filters.
+        :raises TypeError: If `filters` is not a dictionary.
+        :returns: A list of Documents that match the given filters.
         """
         if filters:
             if not isinstance(filters, dict):
@@ -324,13 +341,13 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
 
     def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
         """
-        Writes documents into to PgvectorDocumentStore.
+        Writes documents to the document store.
 
         :param documents: A list of Documents to write to the document store.
         :param policy: The duplicate policy to use when writing documents.
         :raises DuplicateDocumentError: If a document with the same id already exists in the document store
-             and the policy is set to DuplicatePolicy.FAIL (or not specified).
-        :return: The number of documents written to the document store.
+             and the policy is set to `DuplicatePolicy.FAIL` (or not specified).
+        :returns: The number of documents written to the document store.
         """
 
         if len(documents) > 0:
@@ -432,7 +449,7 @@ def _from_pg_to_haystack_documents(documents: List[Dict[str, Any]]) -> List[Docu
 
     def delete_documents(self, document_ids: List[str]) -> None:
         """
-        Deletes all documents with a matching document_ids from the document store.
+        Deletes documents that match the provided `document_ids` from the document store.
 
         :param document_ids: the document ids to delete
         """
@@ -462,8 +479,7 @@ def _embedding_retrieval(
         This method is not meant to be part of the public interface of
         `PgvectorDocumentStore` and it should not be called directly.
         `PgvectorEmbeddingRetriever` uses this method directly and is the public interface for it.
-        :raises ValueError
-        :return: List of Documents that are most similar to `query_embedding`
+        :returns: List of Documents that are most similar to `query_embedding`
         """
 
         if not query_embedding: