docs: Docstring update (#525)

* Docstring update * PR review - Julian * pylint fixes
deepset-ai · Mar 4, 2024 · 9b98f60 · 9b98f60
1 parent 710ac4d
commit 9b98f60
Show file tree

Hide file tree

Showing 3 changed files with 121 additions and 31 deletions.
diff --git a/...ticsearch/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py b/...ticsearch/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py
@@ -11,8 +11,9 @@
 @component
 class ElasticsearchBM25Retriever:
     """
-    ElasticsearchBM25Retriever is a keyword-based retriever that uses BM25 to find the most
-    similar documents to a user's query.
+    ElasticsearchBM25Retriever retrieves documents from the ElasticsearchDocumentStore using the BM25 algorithm to find
+    the most similar documents to a user's query.
+
     This retriever is only compatible with ElasticsearchDocumentStore.
 
     Usage example:
@@ -35,7 +36,7 @@ class ElasticsearchBM25Retriever:
 
     result = retriever.run(query="Who lives in Berlin?")
     for doc in result["documents"]:
-        print(doc.text)
+        print(doc.content)
     ```
     """
 
@@ -55,8 +56,9 @@ def __init__(
         :param filters: Filters applied to the retrieved Documents, for more info
                         see `ElasticsearchDocumentStore.filter_documents`, defaults to None
         :param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO".
-                          see the official documentation for valid values:
-                          https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
+                          See the official
+        [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
+        for more details.
         :param top_k: Maximum number of Documents to return, defaults to 10
         :param scale_score: If `True`, scales the Documents' scores between 0 and 1, defaults to False
         """
@@ -72,6 +74,12 @@ def __init__(
         self._scale_score = scale_score
 
     def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
         return default_to_dict(
             self,
             filters=self._filters,
@@ -83,6 +91,14 @@ def to_dict(self) -> Dict[str, Any]:
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchBM25Retriever":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
+        """
         data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
             data["init_parameters"]["document_store"]
         )
@@ -96,7 +112,8 @@ def run(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optio
         :param query: String to search in Documents' text.
         :param filters: Filters applied to the retrieved Documents.
         :param top_k: Maximum number of Documents to return.
-        :return: List of Documents that match the query.
+        :returns: A dictionary with the following keys:
+            - `documents`: List of Documents that match the query.
         """
         docs = self._document_store._bm25_retrieval(
             query=query,

diff --git a/...arch/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py b/...arch/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py
@@ -11,9 +11,35 @@
 @component
 class ElasticsearchEmbeddingRetriever:
     """
-    Uses a vector similarity metric to retrieve documents from the ElasticsearchDocumentStore.
+    ElasticsearchEmbeddingRetriever retrieves documents from the ElasticsearchDocumentStore using vector similarity.
 
-    Needs to be connected to the ElasticsearchDocumentStore to run.
+    Usage example:
+    ```python
+    from haystack import Document
+    from haystack.components.embedders import SentenceTransformersTextEmbedder
+    from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
+    from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever
+
+    document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
+    retriever = ElasticsearchEmbeddingRetriever(document_store=document_store)
+
+    # Add documents to DocumentStore
+    documents = [
+        Document(content="My name is Carla and I live in Berlin"),
+        Document(content="My name is Paul and I live in New York"),
+        Document(content="My name is Silvano and I live in Matera"),
+        Document(content="My name is Usagi Tsukino and I live in Tokyo"),
+    ]
+    document_store.write_documents(documents)
+
+    te = SentenceTransformersTextEmbedder()
+    te.warm_up()
+    query_embeddings = te.run("Who lives in Berlin?")["embedding"]
+
+    result = retriever.run(query_embedding=query_embeddings)
+    for doc in result["documents"]:
+        print(doc.content)
+    ```
     """
 
     def __init__(
@@ -33,8 +59,8 @@ def __init__(
         :param top_k: Maximum number of Documents to return, defaults to 10
         :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
             Increasing this value will improve search accuracy at the cost of slower search speeds.
-            You can read more about it in the Elasticsearch documentation:
-            https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy
+            You can read more about it in the Elasticsearch
+            [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
         :raises ValueError: If `document_store` is not an instance of ElasticsearchDocumentStore.
         """
         if not isinstance(document_store, ElasticsearchDocumentStore):
@@ -47,6 +73,12 @@ def __init__(
         self._num_candidates = num_candidates
 
     def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
         return default_to_dict(
             self,
             filters=self._filters,
@@ -57,6 +89,14 @@ def to_dict(self) -> Dict[str, Any]:
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchEmbeddingRetriever":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
+        """
         data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
             data["init_parameters"]["document_store"]
         )
@@ -70,7 +110,8 @@ def run(self, query_embedding: List[float], filters: Optional[Dict[str, Any]] =
         :param query_embedding: Embedding of the query.
         :param filters: Filters applied to the retrieved Documents.
         :param top_k: Maximum number of Documents to return.
-        :return: List of Documents similar to `query_embedding`.
+        :returns: A dictionary with the following keys:
+            - `documents`: List of Documents most similar to the given query_embedding
         """
         docs = self._document_store._embedding_retrieval(
             query_embedding=query_embedding,

diff --git a/...s/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py b/...s/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py
@@ -35,16 +35,16 @@
 
 class ElasticsearchDocumentStore:
     """
-    ElasticsearchDocumentStore is a Document Store for Elasticsearch.
-    It can be used with Elastic Cloud or your own Elasticsearch cluster.
+    ElasticsearchDocumentStore is a Document Store for Elasticsearch. It can be used with Elastic Cloud or your own
+    Elasticsearch cluster.
 
-    Simple usage with Elastic Cloud:
+    Usage example with Elastic Cloud:
     ```python
     from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
     document_store = ElasticsearchDocumentStore(cloud_id="YOUR_CLOUD_ID", api_key="YOUR_API_KEY")
     ```
 
-    One can also connect to a self-hosted Elasticsearch instance:
+    Usage example with a self-hosted Elasticsearch instance:
     ```python
     from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
     document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
@@ -53,8 +53,8 @@ class ElasticsearchDocumentStore:
     We strongly recommend enabling security so that only authorized users can access your data.
 
     For more details on how to connect to Elasticsearch and configure security,
-    see the official Elasticsearch documentation:
-    https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
+    see the official Elasticsearch
+    [documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html)
 
     All extra keyword arguments will be passed to the Elasticsearch client.
     """
@@ -75,19 +75,19 @@ def __init__(
         One can also set the similarity function used to compare Documents embeddings. This is mostly useful
         when using the `ElasticsearchDocumentStore` in a Pipeline with an `ElasticsearchEmbeddingRetriever`.
 
-        For more information on connection parameters, see the official Elasticsearch documentation:
-        https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
+        For more information on connection parameters, see the official Elasticsearch
+        [documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html)
 
-        For the full list of supported kwargs, see the official Elasticsearch reference:
-        https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch
+        For the full list of supported kwargs, see the official Elasticsearch
+        [reference](https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch)
 
         :param hosts: List of hosts running the Elasticsearch client. Defaults to None
         :param index: Name of index in Elasticsearch, if it doesn't exist it will be created. Defaults to "default"
         :param embedding_similarity_function: The similarity function used to compare Documents embeddings.
             Defaults to "cosine". This parameter only takes effect if the index does not yet exist and is created.
             To choose the most appropriate function, look for information about your embedding model.
-            To understand how document scores are computed, see the Elasticsearch documentation:
-            https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params
+            To understand how document scores are computed, see the Elasticsearch
+            [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params)
         :param **kwargs: Optional arguments that ``Elasticsearch`` takes.
         """
         self._hosts = hosts
@@ -115,6 +115,12 @@ def __init__(
             self._client.indices.create(index=index, mappings=mappings)
 
     def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
         # This is not the best solution to serialise this class but is the fastest to implement.
         # Not all kwargs types can be serialised to text so this can fail. We must serialise each
         # type explicitly to handle this properly.
@@ -128,11 +134,20 @@ def to_dict(self) -> Dict[str, Any]:
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchDocumentStore":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
+        """
         return default_from_dict(cls, data)
 
     def count_documents(self) -> int:
         """
         Returns how many documents are present in the document store.
+        :returns: Number of documents in the document store.
         """
         return self._client.count(index=self._index)["count"]
 
@@ -165,6 +180,14 @@ def _search_documents(self, **kwargs) -> List[Document]:
         return documents
 
     def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
+        """
+        The main query method for the document store. It retrieves all documents that match the filters.
+
+        :param filters: A dictionary of filters to apply. For more information on the structure of the filters,
+            see the official Elasticsearch
+            [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
+        :returns: List of Documents that match the filters.
+        """
         if filters and "operator" not in filters and "conditions" not in filters:
             filters = convert(filters)
 
@@ -175,8 +198,13 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
     def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
         """
         Writes Documents to Elasticsearch.
+
         If policy is not specified or set to DuplicatePolicy.NONE, it will raise an exception if a document with the
         same ID already exists in the document store.
+
+        :param documents: List of Documents to write to the document store.
+        :param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
+        :returns: Number of documents written to the document store.
         """
         if len(documents) > 0:
             if not isinstance(documents[0], Document):
@@ -229,6 +257,9 @@ def _deserialize_document(self, hit: Dict[str, Any]) -> Document:
         """
         Creates a Document from the search hit provided.
         This is mostly useful in self.filter_documents().
+
+        :param hit: A search hit from Elasticsearch.
+        :returns: Document created from the search hit.
         """
         data = hit["_source"]
 
@@ -242,7 +273,7 @@ def delete_documents(self, document_ids: List[str]) -> None:
         """
         Deletes all documents with matching document_ids from the document store.
 
-        :param object_ids: the object_ids to delete
+        :param document_ids: The IDs of the documents to delete.
         """
 
         #
@@ -272,18 +303,19 @@ def _bm25_retrieval(
         `ElasticsearchDocumentStore` nor called directly.
         `ElasticsearchBM25Retriever` uses this method directly and is the public interface for it.
 
-        `query` must be a non empty string, otherwise a `ValueError` will be raised.
+        `query` must be a non-empty string, otherwise a `ValueError` will be raised.
 
         :param query: String to search in saved Documents' text.
         :param filters: Filters applied to the retrieved Documents, for more info
                         see `ElasticsearchDocumentStore.filter_documents`, defaults to None
         :param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO".
-                          see the official documentation for valid values:
-                          https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
+                          see the official
+        [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
+        for valid values.
         :param top_k: Maximum number of Documents to return, defaults to 10
         :param scale_score: If `True`, scales the Documents' scores between 0 and 1, defaults to False
         :raises ValueError: If `query` is an empty string
-        :return: List of Document that match `query`
+        :returns: List of Documents that match `query`
         """
 
         if not query:
@@ -341,10 +373,10 @@ def _embedding_retrieval(
         :param top_k: Maximum number of Documents to return, defaults to 10
         :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
             Increasing this value will improve search accuracy at the cost of slower search speeds.
-            You can read more about it in the Elasticsearch documentation:
-            https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy
+            You can read more about it in the Elasticsearch
+        [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
         :raises ValueError: If `query_embedding` is an empty list
-        :return: List of Document that are most similar to `query_embedding`
+        :returns: List of Documents that are most similar to `query_embedding`
         """
 
         if not query_embedding: