From d7ad329dac59b667de1726f21ed91db3580fd28b Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Wed, 6 Mar 2024 13:28:56 +0100 Subject: [PATCH] docs: review Elastic (#541) * docs: review Elastic * docs: correctly describe `DocumentStoreError` Co-authored-by: Stefano Fiorucci --------- Co-authored-by: Stefano Fiorucci --- .../elasticsearch/bm25_retriever.py | 24 +++--- .../elasticsearch/embedding_retriever.py | 16 ++-- .../elasticsearch/document_store.py | 81 ++++++++++--------- 3 files changed, 61 insertions(+), 60 deletions(-) diff --git a/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py index df1cb4a26..867d49c0e 100644 --- a/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +++ b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py @@ -54,13 +54,13 @@ def __init__( :param document_store: An instance of ElasticsearchDocumentStore. :param filters: Filters applied to the retrieved Documents, for more info - see `ElasticsearchDocumentStore.filter_documents`, defaults to None - :param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO". - See the official - [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness) - for more details. - :param top_k: Maximum number of Documents to return, defaults to 10 - :param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False + see `ElasticsearchDocumentStore.filter_documents`. + :param fuzziness: Fuzziness parameter passed to Elasticsearch. See the official + [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness) + for more details. + :param top_k: Maximum number of Documents to return. 
+ :param scale_score: If `True` scales the Document's scores between 0 and 1. + :raises ValueError: If `document_store` is not an instance of `ElasticsearchDocumentStore`. """ if not isinstance(document_store, ElasticsearchDocumentStore): @@ -97,7 +97,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchBM25Retriever": :param data: Dictionary to deserialize from. :returns: - Deserialized component. + Deserialized component. """ data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict( data["init_parameters"]["document_store"] @@ -109,11 +109,11 @@ def run(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optio """ Retrieve documents using the BM25 keyword-based algorithm. - :param query: String to search in Documents' text. - :param filters: Filters applied to the retrieved Documents. - :param top_k: Maximum number of Documents to return. + :param query: String to search in `Document`s' text. + :param filters: Filters applied to the retrieved `Document`s. + :param top_k: Maximum number of `Document`s to return. :returns: A dictionary with the following keys: - - `documents`: List of Documents that match the query. + - `documents`: List of `Document`s that match the query. 
""" docs = self._document_store._bm25_retrieval( query=query, diff --git a/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py index d9f7f1fe6..fa292fe63 100644 --- a/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +++ b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py @@ -38,7 +38,7 @@ class ElasticsearchEmbeddingRetriever: result = retriever.run(query=query_embeddings) for doc in result["documents"]: - print(doc.content) + print(doc.content) ``` """ @@ -54,9 +54,9 @@ def __init__( Create the ElasticsearchEmbeddingRetriever component. :param document_store: An instance of ElasticsearchDocumentStore. - :param filters: Filters applied to the retrieved Documents. Defaults to None. - Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned. - :param top_k: Maximum number of Documents to return, defaults to 10 + :param filters: Filters applied to the retrieved Documents. + Filters are applied during the approximate KNN search to ensure that top_k matching documents are returned. + :param top_k: Maximum number of Documents to return. :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10. Increasing this value will improve search accuracy at the cost of slower search speeds. You can read more about it in the Elasticsearch @@ -95,7 +95,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchEmbeddingRetriever": :param data: Dictionary to deserialize from. :returns: - Deserialized component. + Deserialized component. 
""" data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict( data["init_parameters"]["document_store"] @@ -108,10 +108,10 @@ def run(self, query_embedding: List[float], filters: Optional[Dict[str, Any]] = Retrieve documents using a vector similarity metric. :param query_embedding: Embedding of the query. - :param filters: Filters applied to the retrieved Documents. - :param top_k: Maximum number of Documents to return. + :param filters: Filters applied to the retrieved `Document`s. + :param top_k: Maximum number of `Document`s to return. :returns: A dictionary with the following keys: - - `documents`: List of Documents most similar to the given query_embedding + - `documents`: List of `Document`s most similar to the given `query_embedding` """ docs = self._document_store._embedding_retrieval( query_embedding=query_embedding, diff --git a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py index f50e2b1b3..0429f8811 100644 --- a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py +++ b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py @@ -38,13 +38,13 @@ class ElasticsearchDocumentStore: ElasticsearchDocumentStore is a Document Store for Elasticsearch. It can be used with Elastic Cloud or your own Elasticsearch cluster. 
- Usage example with Elastic Cloud: + Usage example (Elastic Cloud): ```python from haystack.document_store.elasticsearch import ElasticsearchDocumentStore document_store = ElasticsearchDocumentStore(cloud_id="YOUR_CLOUD_ID", api_key="YOUR_API_KEY") ``` - Usage example with a self-hosted Elasticsearch instance: + Usage example (self-hosted Elasticsearch instance): ```python from haystack.document_store.elasticsearch import ElasticsearchDocumentStore document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200") @@ -69,8 +69,8 @@ def __init__( ): """ Creates a new ElasticsearchDocumentStore instance. - When no index is explicitly specified, it will use the default index "default". - It will also try to create that index if it doesn't exist yet. Otherwise it will use the existing one. + + It will also try to create that index if it doesn't exist yet. Otherwise, it will use the existing one. One can also set the similarity function used to compare Documents embeddings. This is mostly useful when using the `ElasticsearchDocumentStore` in a Pipeline with an `ElasticsearchEmbeddingRetriever`. @@ -81,14 +81,14 @@ def __init__( For the full list of supported kwargs, see the official Elasticsearch [reference](https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch) - :param hosts: List of hosts running the Elasticsearch client. Defaults to None - :param index: Name of index in Elasticsearch, if it doesn't exist it will be created. Defaults to "default" + :param hosts: List of hosts running the Elasticsearch client. + :param index: Name of index in Elasticsearch. :param embedding_similarity_function: The similarity function used to compare Documents embeddings. - Defaults to "cosine". This parameter only takes effect if the index does not yet exist and is created. + This parameter only takes effect if the index does not yet exist and is created. To choose the most appropriate function, look for information about your embedding model. 
To understand how document scores are computed, see the Elasticsearch [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params) - :param **kwargs: Optional arguments that ``Elasticsearch`` takes. + :param **kwargs: Optional arguments that `Elasticsearch` takes. """ self._hosts = hosts self._client = Elasticsearch( @@ -140,7 +140,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchDocumentStore": :param data: Dictionary to deserialize from. :returns: - Deserialized component. + Deserialized component. """ return default_from_dict(cls, data) @@ -186,7 +186,7 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc :param filters: A dictionary of filters to apply. For more information on the structure of the filters, see the official Elasticsearch [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) - :returns: List of Documents that match the filters. + :returns: List of `Document`s that match the filters. """ if filters and "operator" not in filters and "conditions" not in filters: filters = convert(filters) @@ -197,13 +197,14 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int: """ - Writes Documents to Elasticsearch. - - If policy is not specified or set to DuplicatePolicy.NONE, it will raise an exception if a document with the - same ID already exists in the document store. + Writes `Document`s to Elasticsearch. :param documents: List of Documents to write to the document store. :param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store. + :raises ValueError: If `documents` is not a list of `Document`s. 
+ :raises DuplicateDocumentError: If a document with the same ID already exists in the document store and + `policy` is set to `DuplicatePolicy.FAIL` or `DuplicatePolicy.NONE`. + :raises DocumentStoreError: If an error occurs while writing the documents to the document store. :returns: Number of documents written to the document store. """ if len(documents) > 0: @@ -253,13 +254,15 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D return documents_written - def _deserialize_document(self, hit: Dict[str, Any]) -> Document: + @staticmethod + def _deserialize_document(hit: Dict[str, Any]) -> Document: """ - Creates a Document from the search hit provided. + Creates a `Document` from the search hit provided. + This is mostly useful in self.filter_documents(). :param hit: A search hit from Elasticsearch. - :returns: Document created from the search hit. + :returns: `Document` created from the search hit. """ data = hit["_source"] @@ -271,12 +274,11 @@ def _deserialize_document(self, hit: Dict[str, Any]) -> Document: def delete_documents(self, document_ids: List[str]) -> None: """ - Deletes all documents with a matching document_ids from the document store. + Deletes all `Document`s with a matching `document_ids` from the document store. - :param document_ids: the object_ids to delete + :param document_ids: the object IDs to delete """ - # helpers.bulk( client=self._client, actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids), @@ -295,27 +297,25 @@ def _bm25_retrieval( scale_score: bool = False, ) -> List[Document]: """ - Elasticsearch by defaults uses BM25 search algorithm. + Retrieves `Document`s from Elasticsearch using the BM25 search algorithm. + Even though this method is called `bm25_retrieval` it searches for `query` using the search algorithm `_client` was configured with. 
- This method is not mean to be part of the public interface of + This method is not meant to be part of the public interface of `ElasticsearchDocumentStore` nor called directly. `ElasticsearchBM25Retriever` uses this method directly and is the public interface for it. - `query` must be a non-empty string, otherwise a `ValueError` will be raised. - - :param query: String to search in saved Documents' text. - :param filters: Filters applied to the retrieved Documents, for more info - see `ElasticsearchDocumentStore.filter_documents`, defaults to None - :param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO". - see the official - [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness) - for valid values. - :param top_k: Maximum number of Documents to return, defaults to 10 - :param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False + :param query: String to search in saved `Document`s' text. + :param filters: Filters applied to the retrieved `Document`s, for more info + see `ElasticsearchDocumentStore.filter_documents`. + :param fuzziness: Fuzziness parameter passed to Elasticsearch. See the official + [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness) + for valid values. + :param top_k: Maximum number of `Document`s to return. + :param scale_score: If `True` scales the `Document`'s scores between 0 and 1. :raises ValueError: If `query` is an empty string - :returns: List of Document that match `query` + :returns: List of `Document`s that match `query` """ if not query: @@ -361,22 +361,23 @@ def _embedding_retrieval( ) -> List[Document]: """ Retrieves documents that are most similar to the query embedding using a vector similarity metric. + It uses Elasticsearch's Approximate k-Nearest Neighbors search algorithm. 
- This method is not mean to be part of the public interface of + This method is not meant to be part of the public interface of `ElasticsearchDocumentStore` nor called directly. `ElasticsearchEmbeddingRetriever` uses this method directly and is the public interface for it. :param query_embedding: Embedding of the query. - :param filters: Filters applied to the retrieved Documents. Defaults to None. + :param filters: Filters applied to the retrieved `Document`s. Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned. - :param top_k: Maximum number of Documents to return, defaults to 10 + :param top_k: Maximum number of `Document`s to return. :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10. Increasing this value will improve search accuracy at the cost of slower search speeds. You can read more about it in the Elasticsearch - [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy) - :raises ValueError: If `query_embedding` is an empty list - :returns: List of Document that are most similar to `query_embedding` + [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy) + :raises ValueError: If `query_embedding` is an empty list. + :returns: List of `Document`s that are most similar to `query_embedding`. """ if not query_embedding: