From 7a1e1182afc434b53946b0640300c20abfb2d61f Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Mon, 4 Mar 2024 09:22:23 +0100 Subject: [PATCH 1/7] cohere: fix linting (#509) --- .../components/embedders/cohere/document_embedder.py | 4 ++-- .../components/embedders/cohere/text_embedder.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/integrations/cohere/src/haystack_integrations/components/embedders/cohere/document_embedder.py b/integrations/cohere/src/haystack_integrations/components/embedders/cohere/document_embedder.py index c7a249f6c..6342425fe 100644 --- a/integrations/cohere/src/haystack_integrations/components/embedders/cohere/document_embedder.py +++ b/integrations/cohere/src/haystack_integrations/components/embedders/cohere/document_embedder.py @@ -58,8 +58,8 @@ def __init__( [model documentation](https://docs.cohere.com/docs/models#representation). :param input_type: specifies the type of input you're giving to the model. Supported values are "search_document", "search_query", "classification" and "clustering". Not - required for older versions of the embedding models (meaning anything lower than v3), but is required for more - recent versions (meaning anything bigger than v2). + required for older versions of the embedding models (meaning anything lower than v3), but is required for + more recent versions (meaning anything bigger than v2). :param api_base_url: the Cohere API Base url. :param truncate: truncate embeddings that are too long from start or end, ("NONE"|"START"|"END"). Passing "START" will discard the start of the input. "END" will discard the end of the input. In both diff --git a/integrations/cohere/src/haystack_integrations/components/embedders/cohere/text_embedder.py b/integrations/cohere/src/haystack_integrations/components/embedders/cohere/text_embedder.py index 2aa779771..305743126 100644 --- a/integrations/cohere/src/haystack_integrations/components/embedders/cohere/text_embedder.py +++ b/integrations/cohere/src/haystack_integrations/components/embedders/cohere/text_embedder.py @@ -51,11 +51,11 @@ def __init__( [model documentation](https://docs.cohere.com/docs/models#representation). :param input_type: specifies the type of input you're giving to the model. Supported values are "search_document", "search_query", "classification" and "clustering". Not - required for older versions of the embedding models (meaning anything lower than v3), but is required for more - recent versions (meaning anything bigger than v2). + required for older versions of the embedding models (meaning anything lower than v3), but is required for + more recent versions (meaning anything bigger than v2). :param api_base_url: the Cohere API Base url. - :param truncate: truncate embeddings that are too long from start or end, ("NONE"|"START"|"END"), defaults to - `"END"`. Passing "START" will discard the start of the input. "END" will discard the end of the input. In both + :param truncate: truncate embeddings that are too long from start or end, ("NONE"|"START"|"END"). + Passing "START" will discard the start of the input. "END" will discard the end of the input. In both cases, input is discarded until the remaining input is exactly the maximum input token length for the model. If "NONE" is selected, when the input exceeds the maximum input token length an error will be returned. :param use_async_client: flag to select the AsyncClient. 
It is recommended to use From 4df46344df2d1ef4f19187d69d31edf0cef05c28 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Mon, 4 Mar 2024 13:23:19 +0100 Subject: [PATCH 2/7] feat: Add example for Optimum integration, fix docs, CI (#526) * feat: Add example for Optimum integration, fix docs, CI * Apply suggestions from code review Co-authored-by: Stefano Fiorucci --------- Co-authored-by: Stefano Fiorucci --- .github/workflows/optimum.yml | 6 +- README.md | 1 + integrations/optimum/README.md | 20 ++-- integrations/optimum/example/example.py | 34 ++++++ .../embedders/optimum/optimization.py | 2 +- .../optimum/optimum_document_embedder.py | 63 ++++++----- .../optimum/optimum_text_embedder.py | 103 +++++++++--------- .../components/embedders/optimum/pooling.py | 2 +- .../embedders/optimum/quantization.py | 2 +- 9 files changed, 136 insertions(+), 97 deletions(-) create mode 100644 integrations/optimum/example/example.py diff --git a/.github/workflows/optimum.yml b/.github/workflows/optimum.yml index f5f59ec89..3b0d137da 100644 --- a/.github/workflows/optimum.yml +++ b/.github/workflows/optimum.yml @@ -52,9 +52,9 @@ jobs: if: matrix.python-version == '3.9' && runner.os == 'Linux' run: hatch run lint:all - # - name: Generate docs - # if: matrix.python-version == '3.9' && runner.os == 'Linux' - # run: hatch run docs + - name: Generate docs + if: matrix.python-version == '3.9' && runner.os == 'Linux' + run: hatch run docs - name: Run tests run: hatch run cov diff --git a/README.md b/README.md index b502f3f66..2db267f6b 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [nvidia-haystack](integrations/nvidia/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/nvidia-haystack.svg?color=orange)](https://pypi.org/project/nvidia-haystack) | [![Test / nvidia](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/nvidia.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/nvidia.yml) | | [ollama-haystack](integrations/ollama/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/ollama-haystack) | [![Test / ollama](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml) | | [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | +| [optimum-haystack](integrations/optimum/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack) | [![Test / optimum](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml) | | [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / 
pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) | | [pgvector-haystack](integrations/pgvector/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pgvector-haystack.svg?color=orange)](https://pypi.org/project/pgvector-haystack) | [![Test / pgvector](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml) | | [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) | diff --git a/integrations/optimum/README.md b/integrations/optimum/README.md index 1438f6e92..d620b1bb3 100644 --- a/integrations/optimum/README.md +++ b/integrations/optimum/README.md @@ -1,30 +1,24 @@ # optimum -[![PyPI - Version](https://img.shields.io/pypi/v/optimum.svg)](https://pypi.org/project/optimum-haystack) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/optimum.svg)](https://pypi.org/project/optimum-haystack) +[![PyPI - Version](https://img.shields.io/pypi/v/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack) ------ +--- Component to embed strings and Documents using models loaded with the HuggingFace Optimum library. This component is designed to seamlessly inference models using the high speed ONNX runtime. **Table of Contents** -- [Installation](#installation) -- [License](#license) +- [optimum](#optimum) + - [Installation](#installation) + - [License](#license) ## Installation -To use the ONNX runtime for CPU, use the CPU version: ```console -pip install optimum-haystack[cpu] +pip install optimum-haystack ``` -For using the GPU runtimes: -```console -pip install optimum-haystack[gpu] -``` - - ## License `optimum-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. diff --git a/integrations/optimum/example/example.py b/integrations/optimum/example/example.py new file mode 100644 index 000000000..0d86ce99b --- /dev/null +++ b/integrations/optimum/example/example.py @@ -0,0 +1,34 @@ +# This example requires GPU support to execute. 
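+# Note: "CUDAExecutionProvider" assumes a CUDA-capable GPU and the onnxruntime-gpu package;
+# on a CPU-only machine, "CPUExecutionProvider" should work as a slower substitute
+# (the O4 optimization mode used below targets GPUs).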
+ + from haystack import Pipeline + + from haystack_integrations.components.embedders.optimum import ( + OptimumTextEmbedder, + OptimumEmbedderPooling, + OptimumEmbedderOptimizationConfig, + OptimumEmbedderOptimizationMode, + ) + + pipeline = Pipeline() + embedder = OptimumTextEmbedder( + model="intfloat/e5-base-v2", + normalize_embeddings=True, + onnx_execution_provider="CUDAExecutionProvider", + optimizer_settings=OptimumEmbedderOptimizationConfig( + mode=OptimumEmbedderOptimizationMode.O4, + for_gpu=True, + ), + working_dir="/tmp/optimum", + pooling_mode=OptimumEmbedderPooling.MEAN, + ) + pipeline.add_component("embedder", embedder) + + results = pipeline.run( + { + "embedder": { + "text": "Ex profunditate antiquae doctrinae, Ad caelos supra semper, Hoc incantamentum evoco, draco apparet, Incantamentum iam transactum est" + }, + } + ) + + print(results["embedder"]["embedding"]) diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py index 5a4447570..17e553b83 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py @@ -7,7 +7,7 @@ class OptimumEmbedderOptimizationMode(Enum): """ - [ONXX Optimization Modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization.html) + [ONNX Optimization modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization) supported by the Optimum Embedders. """ diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py index 783e4ac5e..27f533430 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py @@ -70,41 +70,42 @@ def __init__( The [execution provider](https://onnxruntime.ai/docs/execution-providers/) to use for ONNX models. - Note: Using the TensorRT execution provider - TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model - optimization and nodes fusion. To avoid rebuilding the engine every time the model is loaded, ONNX Runtime - provides a pair of options to save the engine: `trt_engine_cache_enable` and `trt_engine_cache_path`. We - recommend setting these two provider options using the model_kwargs parameter, when using the TensorRT - execution provider. The usage is as follows: - ```python - embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", - onnx_execution_provider="TensorrtExecutionProvider", - model_kwargs={ - "provider_options": { - "trt_engine_cache_enable": True, - "trt_engine_cache_path": "tmp/trt_cache", - } - }, - ) - ``` + Note: Using the TensorRT execution provider + TensorRT needs to build its inference engine ahead of inference, + which takes some time due to the model optimization and nodes fusion. + To avoid rebuilding the engine every time the model is loaded, ONNX + Runtime provides a pair of options to save the engine: `trt_engine_cache_enable` + and `trt_engine_cache_path`.
We recommend setting these two provider + options using the `model_kwargs` parameter, when using the TensorRT execution provider. + The usage is as follows: + ```python + embedder = OptimumDocumentEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + onnx_execution_provider="TensorrtExecutionProvider", + model_kwargs={ + "provider_options": { + "trt_engine_cache_enable": True, + "trt_engine_cache_path": "tmp/trt_cache", + } + }, + ) + ``` :param pooling_mode: The pooling mode to use. When `None`, pooling mode will be inferred from the model config. :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model. In case of duplication, these kwargs override `model`, `onnx_execution_provider` and `token` initialization parameters. - :param working_dir: - The directory to use for storing intermediate files - generated during model optimization/quantization. - - Required for optimization and quantization. - :param optimizer_settings: - Configuration for Optimum Embedder Optimization. - If `None`, no additional optimization is be applied. - :param quantizer_settings: - Configuration for Optimum Embedder Quantization. - If `None`, no quantization is be applied. + :param working_dir: + The directory to use for storing intermediate files + generated during model optimization/quantization. Required + for optimization and quantization. + :param optimizer_settings: + Configuration for Optimum Embedder Optimization. + If `None`, no additional optimization is applied. + :param quantizer_settings: + Configuration for Optimum Embedder Quantization. + If `None`, no quantization is applied. :param batch_size: Number of Documents to encode at once. :param progress_bar: @@ -199,6 +200,10 @@ def run(self, documents: List[Document]): A list of Documents to embed. :returns: The updated Documents with their embeddings. + :raises RuntimeError: + If the component was not initialized. + :raises TypeError: + If the input is not a list of Documents. """ if not self._initialized: msg = "The embedding model has not been loaded. Please call warm_up() before running." diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py index 976a7bfa8..e3cffe183 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py @@ -46,57 +46,58 @@ quantizer_settings: Optional[OptimumEmbedderQuantizationConfig] = None, ): """ - Create a OptimumTextEmbedder component. + Create an OptimumTextEmbedder component. :param model: - A string representing the model id on HF Hub. - :param token: - The HuggingFace token to use as HTTP bearer authorization. - :param prefix: - A string to add to the beginning of each text. - :param suffix: - A string to add to the end of each text. - :param normalize_embeddings: - Whether to normalize the embeddings to unit length. - :param onnx_execution_provider: - The [execution provider](https://onnxruntime.ai/docs/execution-providers/) - to use for ONNX models. - - Note: Using the TensorRT execution provider - TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model - optimization and nodes fusion.
To avoid rebuilding the engine every time the model is loaded, ONNX Runtime - provides a pair of options to save the engine: `trt_engine_cache_enable` and `trt_engine_cache_path`. We - recommend setting these two provider options using the model_kwargs parameter, when using the TensorRT - execution provider. The usage is as follows: - ```python - embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", - onnx_execution_provider="TensorrtExecutionProvider", - model_kwargs={ - "provider_options": { - "trt_engine_cache_enable": True, - "trt_engine_cache_path": "tmp/trt_cache", - } - }, - ) - ``` - :param pooling_mode: - The pooling mode to use. When `None`, pooling mode will be inferred from the model config. - :param model_kwargs: - Dictionary containing additional keyword arguments to pass to the model. - In case of duplication, these kwargs override `model`, `onnx_execution_provider` - and `token` initialization parameters. - :param working_dir: - The directory to use for storing intermediate files - generated during model optimization/quantization. - - Required for optimization and quantization. - :param optimizer_settings: - Configuration for Optimum Embedder Optimization. - If `None`, no additional optimization is applied. - :param quantizer_settings: - Configuration for Optimum Embedder Quantization. - If `None`, no quantization is applied. + A string representing the model id on HF Hub. + :param token: + The HuggingFace token to use as HTTP bearer authorization. + :param prefix: + A string to add to the beginning of each text. + :param suffix: + A string to add to the end of each text. + :param normalize_embeddings: + Whether to normalize the embeddings to unit length. + :param onnx_execution_provider: + The [execution provider](https://onnxruntime.ai/docs/execution-providers/) + to use for ONNX models. + + Note: Using the TensorRT execution provider + TensorRT needs to build its inference engine ahead of inference, + which takes some time due to the model optimization and nodes fusion. + To avoid rebuilding the engine every time the model is loaded, ONNX + Runtime provides a pair of options to save the engine: `trt_engine_cache_enable` + and `trt_engine_cache_path`. We recommend setting these two provider + options using the `model_kwargs` parameter, when using the TensorRT execution provider. + The usage is as follows: + ```python + embedder = OptimumTextEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + onnx_execution_provider="TensorrtExecutionProvider", + model_kwargs={ + "provider_options": { + "trt_engine_cache_enable": True, + "trt_engine_cache_path": "tmp/trt_cache", + } + }, + ) + ``` + :param pooling_mode: + The pooling mode to use. When `None`, pooling mode will be inferred from the model config. + :param model_kwargs: + Dictionary containing additional keyword arguments to pass to the model. + In case of duplication, these kwargs override `model`, `onnx_execution_provider` + and `token` initialization parameters. + :param working_dir: + The directory to use for storing intermediate files + generated during model optimization/quantization. Required + for optimization and quantization. + :param optimizer_settings: + Configuration for Optimum Embedder Optimization. + If `None`, no additional optimization is applied. + :param quantizer_settings: + Configuration for Optimum Embedder Quantization. + If `None`, no quantization is applied.
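+
+ Usage sketch (illustrative model and prefix values; `warm_up()` must be
+ called before `run`, as noted in the `run` method below):
+ ```python
+ from haystack_integrations.components.embedders.optimum import OptimumTextEmbedder
+
+ embedder = OptimumTextEmbedder(model="intfloat/e5-base-v2", prefix="query: ")
+ embedder.warm_up()
+ result = embedder.run(text="What is the capital of France?")
+ print(result["embedding"])
+ ```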
""" params = _EmbedderParams( model=model, @@ -161,6 +162,10 @@ def run(self, text: str): The text to embed. :returns: The embeddings of the text. + :raises RuntimeError: + If the component was not initialized. + :raises TypeError: + If the input is not a string. """ if not self._initialized: msg = "The embedding model has not been loaded. Please call warm_up() before running." diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py index 41aa24d64..2c8bbd967 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py @@ -3,7 +3,7 @@ class OptimumEmbedderPooling(Enum): """ - Pooling Modes support by the Optimum Embedders. + Pooling modes support by the Optimum Embedders. """ #: Perform CLS Pooling on the output of the embedding model diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py index 2e68081b5..d45369544 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py @@ -7,7 +7,7 @@ class OptimumEmbedderQuantizationMode(Enum): """ - [Dynamic Quantization Modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization) + [Dynamic Quantization modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization) support by the Optimum Embedders. """ From b721907ab45fb9f4769c7be665a03c4bf84fe236 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 4 Mar 2024 14:48:22 +0100 Subject: [PATCH 3/7] docs: fixing opensearch docstrings (#521) * initial import * fixing line limit * fixing typo --- .../retrievers/opensearch/bm25_retriever.py | 21 +++++++++++- .../opensearch/embedding_retriever.py | 19 ++++++++++- .../opensearch/document_store.py | 34 ++++++++++++------- 3 files changed, 59 insertions(+), 15 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/bm25_retriever.py b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/bm25_retriever.py index d1b1e6ada..0ad257b42 100644 --- a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/bm25_retriever.py +++ b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/bm25_retriever.py @@ -46,6 +46,12 @@ def __init__( self._all_terms_must_match = all_terms_must_match def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ return default_to_dict( self, filters=self._filters, @@ -57,6 +63,15 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "OpenSearchBM25Retriever": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + + :returns: + Deserialized component. + """ data["init_parameters"]["document_store"] = OpenSearchDocumentStore.from_dict( data["init_parameters"]["document_store"] ) @@ -82,7 +97,11 @@ def run( :param fuzziness: Fuzziness parameter for full-text queries. 
:param scale_score: Whether to scale the score of retrieved documents between 0 and 1. This is useful when comparing documents across different indexes. - :return: A dictionary containing the retrieved documents. + + :returns: + A dictionary containing the retrieved documents with the following structure: + - documents: List of retrieved Documents. + """ if filters is None: filters = self._filters diff --git a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/embedding_retriever.py b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/embedding_retriever.py index 15c8313ab..50b30d7f1 100644 --- a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/embedding_retriever.py +++ b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/embedding_retriever.py @@ -41,6 +41,12 @@ def __init__( self._top_k = top_k def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ return default_to_dict( self, filters=self._filters, @@ -50,6 +56,15 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "OpenSearchEmbeddingRetriever": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + + :returns: + Deserialized component. + """ data["init_parameters"]["document_store"] = OpenSearchDocumentStore.from_dict( data["init_parameters"]["document_store"] ) @@ -63,7 +78,9 @@ def run(self, query_embedding: List[float], filters: Optional[Dict[str, Any]] = :param query_embedding: Embedding of the query. :param filters: Optional filters to narrow down the search space. :param top_k: Maximum number of Documents to return. - :return: List of Document similar to `query_embedding`. + :returns: + Dictionary with key "documents" containing the retrieved Documents. + - documents: List of Document similar to `query_embedding`. """ if filters is None: filters = self._filters diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index dc6941854..e91347728 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -39,11 +39,9 @@ def __init__( """ Creates a new OpenSearchDocumentStore instance. - For more information on connection parameters, see the official OpenSearch documentation: - https://opensearch.org/docs/latest/clients/python-low-level/#connecting-to-opensearch + For more information on connection parameters, see the [official OpenSearch documentation](https://opensearch.org/docs/latest/clients/python-low-level/#connecting-to-opensearch) - For the full list of supported kwargs, see the official OpenSearch reference: - https://opensearch-project.github.io/opensearch-py/api-ref/clients/opensearch_client.html + For the full list of supported kwargs, see the [official OpenSearch reference](https://opensearch-project.github.io/opensearch-py/api-ref/clients/opensearch_client.html) :param hosts: List of hosts running the OpenSearch client. Defaults to None :param index: Name of index in OpenSearch, if it doesn't exist it will be created. 
Defaults to "default" @@ -94,6 +92,12 @@ def to_dict(self) -> Dict[str, Any]: # This is not the best solution to serialise this class but is the fastest to implement. # Not all kwargs types can be serialised to text so this can fail. We must serialise each # type explicitly to handle this properly. + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ return default_to_dict( self, hosts=self._hosts, @@ -103,6 +107,15 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "OpenSearchDocumentStore": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + + :returns: + Deserialized component. + """ return default_from_dict(cls, data) def count_documents(self) -> int: @@ -230,7 +243,7 @@ def _bm25_retrieval( Even though this method is called `bm25_retrieval` it searches for `query` using the search algorithm `_client` was configured with. - This method is not mean to be part of the public interface of + This method is not meant to be part of the public interface of `OpenSearchDocumentStore` nor called directly. `OpenSearchBM25Retriever` uses this method directly and is the public interface for it. @@ -238,9 +251,8 @@ def _bm25_retrieval( :param query: String to search in saved Documents' text. :param filters: Optional filters to narrow down the search space. - :param fuzziness: Fuzziness parameter passed to OpenSearch, defaults to "AUTO". - see the official documentation for valid values: - https://www.elastic.co/guide/en/OpenSearch/reference/current/common-options.html#fuzziness + :param fuzziness: Fuzziness parameter passed to OpenSearch, defaults to "AUTO". see the official documentation + for valid [fuzziness values](https://www.elastic.co/guide/en/OpenSearch/reference/current/common-options.html#fuzziness) :param top_k: Maximum number of Documents to return, defaults to 10 :param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False :param all_terms_must_match: If `True` all terms in `query` must be present in the Document, defaults to False @@ -293,7 +305,7 @@ def _embedding_retrieval( Retrieves documents that are most similar to the query embedding using a vector similarity metric. It uses the OpenSearch's Approximate k-Nearest Neighbors search algorithm. - This method is not mean to be part of the public interface of + This method is not meant to be part of the public interface of `OpenSearchDocumentStore` nor called directly. `OpenSearchEmbeddingRetriever` uses this method directly and is the public interface for it. @@ -301,10 +313,6 @@ def _embedding_retrieval( :param filters: Filters applied to the retrieved Documents. Defaults to None. Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned. :param top_k: Maximum number of Documents to return, defaults to 10 - :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10. - Increasing this value will improve search accuracy at the cost of slower search speeds. 
- You can read more about it in the OpenSearch documentation: - https://www.elastic.co/guide/en/OpenSearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy :raises ValueError: If `query_embedding` is an empty list :return: List of Document that are most similar to `query_embedding` """ if not query_embedding: From a67e0b73ec73cf06ac1d5dec7acba3b76d3471d1 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Mon, 4 Mar 2024 15:00:31 +0100 Subject: [PATCH 4/7] docs: Update `deepeval-haystack` docstrings (#527) * docs: Update `deepeval-haystack` docstrings * Move note about required init params --- .../evaluators/deepeval/evaluator.py | 75 +++++++++++-------- .../components/evaluators/deepeval/metrics.py | 20 +++-- 2 files changed, 56 insertions(+), 39 deletions(-) diff --git a/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/evaluator.py b/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/evaluator.py index 03e226d0e..082ae15fd 100644 --- a/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/evaluator.py +++ b/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/evaluator.py @@ -17,10 +17,31 @@ @component class DeepEvalEvaluator: """ - A component that uses the DeepEval framework to evaluate inputs against a specific metric. - - The supported metrics are defined by :class:`DeepEvalMetric`. The inputs of the component - metric-dependent. + A component that uses the [DeepEval framework](https://docs.confident-ai.com/docs/evaluation-introduction) + to evaluate inputs against a specific metric. Supported metrics are defined by `DeepEvalMetric`. + + Usage example: + ```python + from haystack_integrations.components.evaluators.deepeval import DeepEvalEvaluator, DeepEvalMetric + + evaluator = DeepEvalEvaluator( + metric=DeepEvalMetric.FAITHFULNESS, + metric_params={"model": "gpt-4"}, + ) + output = evaluator.run( + questions=["Which is the most popular global sport?"], + contexts=[ + [ + "Football is undoubtedly the world's most popular sport with " + "major events like the FIFA World Cup and sports personalities " + "like Ronaldo and Messi, drawing a followership of more than 4 " + "billion people." + ] + ], + responses=["Football is the most popular sport with around 4 billion " "followers worldwide"], + ) + print(output["results"]) + ``` """ _backend_metric: BaseMetric @@ -39,6 +60,8 @@ def __init__( The metric to use for evaluation. :param metric_params: Parameters to pass to the metric's constructor. + Refer to the `DeepEvalMetric` class for more details + on required parameters. """ self.metric = metric if isinstance(metric, DeepEvalMetric) else DeepEvalMetric.from_str(metric) self.metric_params = metric_params @@ -51,37 +74,20 @@ def __init__( @component.output_types(results=List[List[Dict[str, Any]]]) def run(self, **inputs) -> Dict[str, Any]: """ - Run the DeepEval evaluator. - - Example: - ```python - pipeline = Pipeline() - evaluator = DeepEvalEvaluator( - metric=DeepEvalMetric.ANSWER_RELEVANCY, - metric_params={"model": "gpt-4"}, - ) - pipeline.add_component("evaluator", evaluator) - - # Each metric expects a specific set of parameters as input. Refer to the - # DeepEvalMetric class' documentation for more details. - output = pipeline.run({"evaluator": { - "questions": ["question], - "contexts": [["context"]], - "responses": ["response"] - }}) - ``` + Run the DeepEval evaluator on the provided inputs. :param inputs: The inputs to evaluate.
These are determined by the - metric being calculated. See :class:`DeepEvalMetric` for more + metric being calculated. See `DeepEvalMetric` for more information. :returns: - A nested list of metric results. Each input can have one or more + A dictionary with a single `results` entry that contains + a nested list of metric results. Each input can have one or more results, depending on the metric. Each result is a dictionary containing the following keys and values: - * `name` - The name of the metric. - * `score` - The score of the metric. - * `explanation` - An optional explanation of the score. + - `name` - The name of the metric. + - `score` - The score of the metric. + - `explanation` - An optional explanation of the score. """ InputConverters.validate_input_parameters(self.metric, self.descriptor.input_parameters, inputs) converted_inputs: List[LLMTestCase] = list(self.descriptor.input_converter(**inputs)) # type: ignore @@ -93,7 +99,12 @@ def run(self, **inputs) -> Dict[str, Any]: def to_dict(self) -> Dict[str, Any]: """ - Serialize this component to a dictionary. + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + :raises DeserializationError: + If the component cannot be serialized. """ def check_serializable(obj: Any): @@ -116,10 +127,12 @@ def check_serializable(obj: Any): @classmethod def from_dict(cls, data: Dict[str, Any]) -> "DeepEvalEvaluator": """ - Deserialize a component from a dictionary. + Deserializes the component from a dictionary. :param data: - The dictionary to deserialize from. + Dictionary to deserialize from. + :returns: + Deserialized component. """ return default_from_dict(cls, data) diff --git a/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/metrics.py b/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/metrics.py index 5ca6922fc..7fb5db5b0 100644 --- a/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/metrics.py +++ b/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/metrics.py @@ -20,27 +20,31 @@ class DeepEvalMetric(Enum): """ Metrics supported by DeepEval. + + All metrics require a `model` parameter, which specifies + the model to use for evaluation. Refer to the DeepEval + documentation for information on the supported models. """ - #: Answer relevancy. + #: Answer relevancy.\ #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]` ANSWER_RELEVANCY = "answer_relevancy" - #: Faithfulness. + #: Faithfulness.\ #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]` FAITHFULNESS = "faithfulness" - #: Contextual precision. - #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str], ground_truths: List[str]` + #: Contextual precision.\ + #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str], ground_truths: List[str]`\ #: The ground truth is the expected response. CONTEXTUAL_PRECISION = "contextual_precision" - #: Contextual recall. - #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str], ground_truths: List[str]` - #: The ground truth is the expected response. + #: Contextual recall.\ + #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str], ground_truths: List[str]`\ + #: The ground truth is the expected response.\ CONTEXTUAL_RECALL = "contextual_recall" - #: Contextual relevance. 
+ #: Contextual relevance.\ #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]` CONTEXTUAL_RELEVANCE = "contextual_relevance" From 710ac4d942ec1cd6d5255b15286c3513a6a24939 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Mon, 4 Mar 2024 15:00:54 +0100 Subject: [PATCH 5/7] docs: Update `ragas-haystack` docstrings (#529) --- .../components/evaluators/ragas/evaluator.py | 70 +++++++++++-------- .../components/evaluators/ragas/metrics.py | 34 +++++---- 2 files changed, 61 insertions(+), 43 deletions(-) diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py index 7e088b47d..71dacd6c7 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py @@ -20,12 +20,30 @@ @component class RagasEvaluator: """ - A component that uses the Ragas framework to evaluate inputs against a specific metric. - - The supported metrics are defined by `RagasMetric`. - Most of them require an OpenAI API key to be provided as an environment variable "OPENAI_API_KEY". - The inputs of the component are metric-dependent. - The output is a nested list of evaluation results where each inner list contains the results for a single input. + A component that uses the [Ragas framework](https://docs.ragas.io/) to evaluate + inputs against a specific metric. Supported metrics are defined by `RagasMetric`. + + Usage example: + ```python + from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric + + evaluator = RagasEvaluator( + metric=RagasMetric.CONTEXT_PRECISION, + ) + output = evaluator.run( + questions=["Which is the most popular global sport?"], + contexts=[ + [ + "Football is undoubtedly the world's most popular sport with " + "major events like the FIFA World Cup and sports personalities " + "like Ronaldo and Messi, drawing a followership of more than 4 " + "billion people." + ] + ], + ground_truths=["Football is the most popular sport with around 4 billion " "followers worldwide"], + ) + print(output["results"]) + ``` """ # Wrapped for easy mocking. _backend_callable: Callable _backend_metric: Metric @@ -44,6 +62,8 @@ def __init__( The metric to use for evaluation. :param metric_params: Parameters to pass to the metric's constructor. + Refer to the `RagasMetric` class for more details + on required parameters. """ self.metric = metric if isinstance(metric, RagasMetric) else RagasMetric.from_str(metric) self.metric_params = metric_params or {} @@ -56,9 +76,6 @@ def __init__( component.set_input_types(self, **expected_inputs) def _init_backend(self): - """ - Initialize the Ragas backend and validate inputs. - """ self._backend_callable = RagasEvaluator._invoke_evaluate def _init_metric(self): @@ -74,29 +91,19 @@ def _invoke_evaluate(dataset: Dataset, metric: Metric) -> Result: @component.output_types(results=List[List[Dict[str, Any]]]) def run(self, **inputs) -> Dict[str, Any]: """ - Run the Ragas evaluator. - - Example: - ```python - p = Pipeline() - evaluator = RagasEvaluator( - metric=RagasMetric.CONTEXT_PRECISION, - ) - p.add_component("evaluator", evaluator) - - results = p.run({"evaluator": {"questions": QUESTIONS, "contexts": CONTEXTS, "ground_truths": GROUND_TRUTHS}}) - ``` + Run the Ragas evaluator on the provided inputs. :param inputs: The inputs to evaluate. These are determined by the
See :class:`RagasMetric` for more + metric being calculated. See `RagasMetric` for more information. :returns: - A nested list of metric results. Each input can have one or more + A dictionary with a single `results` entry that contains + a nested list of metric results. Each input can have one or more results, depending on the metric. Each result is a dictionary containing the following keys and values: - * `name` - The name of the metric. - * `score` - The score of the metric. + - `name` - The name of the metric. + - `score` - The score of the metric. """ InputConverters.validate_input_parameters(self.metric, self.descriptor.input_parameters, inputs) converted_inputs: List[Dict[str, str]] = list(self.descriptor.input_converter(**inputs)) # type: ignore @@ -113,7 +120,12 @@ def run(self, **inputs) -> Dict[str, Any]: def to_dict(self) -> Dict[str, Any]: """ - Serialize this component to a dictionary. + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + :raises DeserializationError: + If the component cannot be serialized. """ def check_serializable(obj: Any): @@ -136,9 +148,11 @@ def check_serializable(obj: Any): @classmethod def from_dict(cls, data: Dict[str, Any]) -> "RagasEvaluator": """ - Deserialize a component from a dictionary. + Deserializes the component from a dictionary. :param data: - The dictionary to deserialize from. + Dictionary to deserialize from. + :returns: + Deserialized component. """ return default_from_dict(cls, data) diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py index 8d1f53593..72f3e8a3b 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py @@ -50,40 +50,44 @@ class RagasMetric(RagasBaseEnum): Metrics supported by Ragas. """ - #: Answer correctness - #: Inputs - `questions: List[str], responses: List[str], ground_truths: List[str]` + #: Answer correctness.\ + #: Inputs - `questions: List[str], responses: List[str], ground_truths: List[str]`\ + #: Parameters - `weights: Tuple[float, float]` ANSWER_CORRECTNESS = "answer_correctness" - #: Faithfulness + #: Faithfulness.\ #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]` FAITHFULNESS = "faithfulness" - #: Answer similarity - #: Inputs - `responses: List[str], ground_truths: List[str]` + #: Answer similarity.\ + #: Inputs - `responses: List[str], ground_truths: List[str]`\ + #: Parameters - `threshold: float` ANSWER_SIMILARITY = "answer_similarity" - #: Context precision + #: Context precision.\ #: Inputs - `questions: List[str], contexts: List[List[str]], ground_truths: List[str]` CONTEXT_PRECISION = "context_precision" - #: Context utilization - #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]` + #: Context utilization. + #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`\ CONTEXT_UTILIZATION = "context_utilization" - #: Context recall - #: Inputs - `questions: List[str], contexts: List[List[str]], ground_truths: List[str]` + #: Context recall. + #: Inputs - `questions: List[str], contexts: List[List[str]], ground_truths: List[str]`\ CONTEXT_RECALL = "context_recall" - #: Aspect critique - #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]` + #: Aspect critique. 
+ #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`\ + #: Parameters - `name: str, definition: str, strictness: int` ASPECT_CRITIQUE = "aspect_critique" - #: Context relevancy + #: Context relevancy.\ #: Inputs - `questions: List[str], contexts: List[List[str]]` CONTEXT_RELEVANCY = "context_relevancy" - #: Answer relevancy - #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]` + #: Answer relevancy.\ + #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`\ + #: Parameters - `strictness: int` ANSWER_RELEVANCY = "answer_relevancy" From 9b98f60d93061b24e934a7bbacfb2ec3dc5fbc64 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Mon, 4 Mar 2024 15:06:14 +0100 Subject: [PATCH 6/7] docs: Docstring update (#525) * Docstring update * PR review - Julian * pylint fixes --- .../elasticsearch/bm25_retriever.py | 29 ++++++-- .../elasticsearch/embedding_retriever.py | 51 +++++++++++-- .../elasticsearch/document_store.py | 72 +++++++++++++------ 3 files changed, 121 insertions(+), 31 deletions(-) diff --git a/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py index 0416389a2..df1cb4a26 100644 --- a/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +++ b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py @@ -11,8 +11,9 @@ @component class ElasticsearchBM25Retriever: """ - ElasticsearchBM25Retriever is a keyword-based retriever that uses BM25 to find the most - similar documents to a user's query. + ElasticsearchBM25Retriever retrieves documents from the ElasticsearchDocumentStore using BM25 algorithm to find the + most similar documents to a user's query. + This retriever is only compatible with ElasticsearchDocumentStore. Usage example: @@ -35,7 +36,7 @@ class ElasticsearchBM25Retriever: result = retriever.run(query="Who lives in Berlin?") for doc in result["documents"]: - print(doc.text) + print(doc.content) ``` """ @@ -55,8 +56,9 @@ def __init__( :param filters: Filters applied to the retrieved Documents, for more info see `ElasticsearchDocumentStore.filter_documents`, defaults to None :param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO". - see the official documentation for valid values: - https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness + See the official + [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness) + for more details. :param top_k: Maximum number of Documents to return, defaults to 10 :param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False """ @@ -72,6 +74,12 @@ def __init__( self._scale_score = scale_score def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ return default_to_dict( self, filters=self._filters, @@ -83,6 +91,14 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchBM25Retriever": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. 
+ """ data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict( data["init_parameters"]["document_store"] ) @@ -96,7 +112,8 @@ def run(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optio :param query: String to search in Documents' text. :param filters: Filters applied to the retrieved Documents. :param top_k: Maximum number of Documents to return. - :return: List of Documents that match the query. + :returns: A dictionary with the following keys: + - `documents`: List of Documents that match the query. """ docs = self._document_store._bm25_retrieval( query=query, diff --git a/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py index 2ba68916f..d9f7f1fe6 100644 --- a/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +++ b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py @@ -11,9 +11,35 @@ @component class ElasticsearchEmbeddingRetriever: """ - Uses a vector similarity metric to retrieve documents from the ElasticsearchDocumentStore. + ElasticsearchEmbeddingRetriever retrieves documents from the ElasticsearchDocumentStore using vector similarity. - Needs to be connected to the ElasticsearchDocumentStore to run. + Usage example: + ```python + from haystack import Document + from haystack.components.embedders import SentenceTransformersTextEmbedder + from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore + from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever + + document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200") + retriever = ElasticsearchEmbeddingRetriever(document_store=document_store) + + # Add documents to DocumentStore + documents = [ + Document(text="My name is Carla and I live in Berlin"), + Document(text="My name is Paul and I live in New York"), + Document(text="My name is Silvano and I live in Matera"), + Document(text="My name is Usagi Tsukino and I live in Tokyo"), + ] + document_store.write_documents(documents) + + te = SentenceTransformersTextEmbedder() + te.warm_up() + query_embeddings = te.run("Who lives in Berlin?")["embedding"] + + result = retriever.run(query=query_embeddings) + for doc in result["documents"]: + print(doc.content) + ``` """ def __init__( @@ -33,8 +59,8 @@ def __init__( :param top_k: Maximum number of Documents to return, defaults to 10 :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10. Increasing this value will improve search accuracy at the cost of slower search speeds. - You can read more about it in the Elasticsearch documentation: - https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy + You can read more about it in the Elasticsearch + [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy) :raises ValueError: If `document_store` is not an instance of ElasticsearchDocumentStore. """ if not isinstance(document_store, ElasticsearchDocumentStore): @@ -47,6 +73,12 @@ def __init__( self._num_candidates = num_candidates def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. 
+ + :returns: + Dictionary with serialized data. + """ return default_to_dict( self, filters=self._filters, @@ -57,6 +89,14 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchEmbeddingRetriever": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict( data["init_parameters"]["document_store"] ) @@ -70,7 +110,8 @@ def run(self, query_embedding: List[float], filters: Optional[Dict[str, Any]] = :param query_embedding: Embedding of the query. :param filters: Filters applied to the retrieved Documents. :param top_k: Maximum number of Documents to return. - :return: List of Documents similar to `query_embedding`. + :returns: A dictionary with the following keys: + - `documents`: List of Documents most similar to the given query_embedding """ docs = self._document_store._embedding_retrieval( query_embedding=query_embedding, diff --git a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py index e60c7fd8f..f50e2b1b3 100644 --- a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py +++ b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py @@ -35,16 +35,16 @@ class ElasticsearchDocumentStore: """ - ElasticsearchDocumentStore is a Document Store for Elasticsearch. - It can be used with Elastic Cloud or your own Elasticsearch cluster. + ElasticsearchDocumentStore is a Document Store for Elasticsearch. It can be used with Elastic Cloud or your own + Elasticsearch cluster. - Simple usage with Elastic Cloud: + Usage example with Elastic Cloud: ```python from haystack.document_store.elasticsearch import ElasticsearchDocumentStore document_store = ElasticsearchDocumentStore(cloud_id="YOUR_CLOUD_ID", api_key="YOUR_API_KEY") ``` - One can also connect to a self-hosted Elasticsearch instance: + Usage example with a self-hosted Elasticsearch instance: ```python from haystack.document_store.elasticsearch import ElasticsearchDocumentStore document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200") @@ -53,8 +53,8 @@ class ElasticsearchDocumentStore: We strongly recommend to enable security so that only authorized users can access your data. For more details on how to connect to Elasticsearch and configure security, - see the official Elasticsearch documentation: - https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html + see the official Elasticsearch + [documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html) All extra keyword arguments will be passed to the Elasticsearch client. """ @@ -75,19 +75,19 @@ def __init__( One can also set the similarity function used to compare Documents embeddings. This is mostly useful when using the `ElasticsearchDocumentStore` in a Pipeline with an `ElasticsearchEmbeddingRetriever`. 
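For example (illustrative values; `"dot_product"` is one of the `dense_vector` similarities accepted by Elasticsearch alongside the default `"cosine"`):
```python
document_store = ElasticsearchDocumentStore(
    hosts="http://localhost:9200",
    index="my_index",
    embedding_similarity_function="dot_product",
)
```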
- For more information on connection parameters, see the official Elasticsearch documentation: - https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html + For more information on connection parameters, see the official Elasticsearch + [documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html) - For the full list of supported kwargs, see the official Elasticsearch reference: - https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch + For the full list of supported kwargs, see the official Elasticsearch + [reference](https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch) :param hosts: List of hosts running the Elasticsearch client. Defaults to None :param index: Name of index in Elasticsearch, if it doesn't exist it will be created. Defaults to "default" :param embedding_similarity_function: The similarity function used to compare Documents embeddings. Defaults to "cosine". This parameter only takes effect if the index does not yet exist and is created. To choose the most appropriate function, look for information about your embedding model. - To understand how document scores are computed, see the Elasticsearch documentation: - https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params + To understand how document scores are computed, see the Elasticsearch + [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params) :param **kwargs: Optional arguments that ``Elasticsearch`` takes. """ self._hosts = hosts @@ -115,6 +115,12 @@ def __init__( self._client.indices.create(index=index, mappings=mappings) def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ # This is not the best solution to serialise this class but is the fastest to implement. # Not all kwargs types can be serialised to text so this can fail. We must serialise each # type explicitly to handle this properly. @@ -128,11 +134,20 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchDocumentStore": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ return default_from_dict(cls, data) def count_documents(self) -> int: """ Returns how many documents are present in the document store. + :returns: Number of documents in the document store. """ return self._client.count(index=self._index)["count"] @@ -165,6 +180,14 @@ def _search_documents(self, **kwargs) -> List[Document]: return documents def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: + """ + The main query method for the document store. It retrieves all documents that match the filters. + + :param filters: A dictionary of filters to apply. For more information on the structure of the filters, + see the official Elasticsearch + [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) + :returns: List of Documents that match the filters. 
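+
+ Example (an illustrative filter in Haystack's `field`/`operator`/`value` syntax,
+ which the code below accepts directly, converting only legacy-style filters):
+ ```python
+ filters = {"field": "meta.category", "operator": "==", "value": "news"}
+ docs = document_store.filter_documents(filters=filters)
+ ```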
+        """
         if filters and "operator" not in filters and "conditions" not in filters:
             filters = convert(filters)
 
@@ -175,8 +198,13 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
     def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
         """
         Writes Documents to Elasticsearch.
+
         If policy is not specified or set to DuplicatePolicy.NONE, it will raise an exception if a document with
         the same ID already exists in the document store.
+
+        :param documents: List of Documents to write to the document store.
+        :param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
+        :returns: Number of documents written to the document store.
         """
         if len(documents) > 0:
             if not isinstance(documents[0], Document):
@@ -229,6 +257,9 @@ def _deserialize_document(self, hit: Dict[str, Any]) -> Document:
         """
         Creates a Document from the search hit provided.
         This is mostly useful in self.filter_documents().
+
+        :param hit: A search hit from Elasticsearch.
+        :returns: Document created from the search hit.
         """
         data = hit["_source"]
 
@@ -242,7 +273,7 @@ def delete_documents(self, document_ids: List[str]) -> None:
         """
         Deletes all documents with a matching document_ids from the document store.
 
-        :param object_ids: the object_ids to delete
+        :param document_ids: the IDs of the Documents to delete
         """
 
         #
@@ -272,18 +303,19 @@ def _bm25_retrieval(
         `ElasticsearchDocumentStore` nor called directly.
         `ElasticsearchBM25Retriever` uses this method directly and is the public interface for it.
 
-        `query` must be a non empty string, otherwise a `ValueError` will be raised.
+        `query` must be a non-empty string, otherwise a `ValueError` will be raised.
 
         :param query: String to search in saved Documents' text.
         :param filters: Filters applied to the retrieved Documents, for more info see
             `ElasticsearchDocumentStore.filter_documents`, defaults to None
         :param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO".
-            see the official documentation for valid values:
-            https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
+            See the official
+            [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
+            for valid values.
         :param top_k: Maximum number of Documents to return, defaults to 10
         :param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False
         :raises ValueError: If `query` is an empty string
-        :return: List of Document that match `query`
+        :returns: List of Documents that match `query`
 
         """
         if not query:
@@ -341,10 +373,10 @@ def _embedding_retrieval(
         :param top_k: Maximum number of Documents to return, defaults to 10
         :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
             Increasing this value will improve search accuracy at the cost of slower search speeds.
-            You can read more about it in the Elasticsearch documentation:
-            https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy
+            You can read more about it in the Elasticsearch
+            [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
         :raises ValueError: If `query_embedding` is an empty list
-        :return: List of Document that are most similar to `query_embedding`
+        :returns: List of Documents that are most similar to `query_embedding`
         """
 
         if not query_embedding:

From de56507070b32101b0afcde29263089eff7ed305 Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci
Date: Mon, 4 Mar 2024 16:13:22 +0100
Subject: [PATCH 7/7] Unstructured: review docstrings (#531)

* unstructured: review docstrings

* Update integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py

Co-authored-by: Madeesh Kannan

---------

Co-authored-by: Madeesh Kannan
---
 integrations/unstructured/pyproject.toml      |  3 +-
 .../converters/unstructured/converter.py      | 71 ++++++++++++-------
 2 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/integrations/unstructured/pyproject.toml b/integrations/unstructured/pyproject.toml
index 298fdb993..5d14fcfe1 100644
--- a/integrations/unstructured/pyproject.toml
+++ b/integrations/unstructured/pyproject.toml
@@ -180,6 +180,7 @@ markers = [
 module = [
     "haystack.*",
     "haystack_integrations.*",
-    "pytest.*"
+    "pytest.*",
+    "unstructured.*",
 ]
 ignore_missing_imports = true
diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py
index a4a132437..0eff7bc82 100644
--- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py
+++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py
@@ -13,8 +13,8 @@
 from haystack.utils import Secret
 from tqdm import tqdm
 
-from unstructured.documents.elements import Element  # type: ignore[import]
-from unstructured.partition.api import partition_via_api  # type: ignore[import]
+from unstructured.documents.elements import Element
+from unstructured.partition.api import partition_via_api
 
 logger = logging.getLogger(__name__)
 
@@ -24,7 +24,23 @@
 @component
 class UnstructuredFileConverter:
     """
-    Convert files to Haystack Documents using the Unstructured API (hosted or running locally).
+    A component for converting files to Haystack Documents using the Unstructured API (hosted or running locally).
+
+    For the supported file types and the specific API parameters, see
+    [Unstructured docs](https://unstructured-io.github.io/unstructured/api.html).
+
+    Usage example:
+    ```python
+    from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter
+
+    # make sure to either set the environment variable UNSTRUCTURED_API_KEY
+    # or run the Unstructured API locally:
+    # docker run -p 8000:8000 -d --rm --name unstructured-api quay.io/unstructured-io/unstructured-api:latest
+    # --port 8000 --host 0.0.0.0
+
+    converter = UnstructuredFileConverter()
+    documents = converter.run(paths=["a/file/path.pdf", "a/directory/path"])["documents"]
+    ```
     """
 
     def __init__(
@@ -39,22 +55,21 @@ def __init__(
         progress_bar: bool = True,  # noqa: FBT001, FBT002
     ):
         """
-        :param api_url: URL of the Unstructured API. Defaults to the hosted version.
-            If you run the API locally, specify the URL of your local API (e.g. http://localhost:8000/general/v0/general).
-            See https://unstructured-io.github.io/unstructured/api.html#using-the-api-locally for more information.
-        :param api_key: API key for the Unstructured API (https://unstructured.io/#get-api-key).
+        :param api_url: URL of the Unstructured API. Defaults to the URL of the hosted version.
+            If you run the API locally, specify the URL of your local API (e.g. `"http://localhost:8000/general/v0/general"`).
+        :param api_key: API key for the Unstructured API.
+            It can be explicitly passed or read from the environment variable `UNSTRUCTURED_API_KEY` (recommended).
             If you run the API locally, it is not needed.
-            If you use the hosted version, it defaults to the environment variable UNSTRUCTURED_API_KEY.
         :param document_creation_mode: How to create Haystack Documents from the elements returned by Unstructured.
-          - "one-doc-per-file": One Haystack Document per file. All elements are concatenated into one text field.
-          - "one-doc-per-page": One Haystack Document per page.
-            All elements on a page are concatenated into one text field.
-          - "one-doc-per-element": One Haystack Document per element.
-            Each element is converted to a Haystack Document.
+            - `"one-doc-per-file"`: One Haystack Document per file. All elements are concatenated into one text field.
+            - `"one-doc-per-page"`: One Haystack Document per page.
+              All elements on a page are concatenated into one text field.
+            - `"one-doc-per-element"`: One Haystack Document per element. Each element is converted to a Haystack Document.
         :param separator: Separator between elements when concatenating them into one text field.
-        :param unstructured_kwargs: Additional keyword arguments that are passed to the Unstructured API.
-            See https://unstructured-io.github.io/unstructured/api.html.
-        :param progress_bar: Show a progress bar for the conversion. Defaults to True.
+        :param unstructured_kwargs: Additional parameters that are passed to the Unstructured API.
+            For the available parameters, see
+            [Unstructured API docs](https://unstructured-io.github.io/unstructured/apis/api_parameters.html).
+        :param progress_bar: Whether to show a progress bar during the conversion.
         """
 
         self.api_url = api_url
@@ -77,10 +92,12 @@ def __init__(
 
     def to_dict(self) -> Dict[str, Any]:
         """
-        Serialize this component to a dictionary.
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
         """
-
         # do not serialize api_key
         return default_to_dict(
             self,
             api_url=self.api_url,
@@ -98,17 +115,21 @@ def run(
         meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     ):
         """
-        Convert files to Haystack Documents using the Unstructured API (hosted or running locally).
+        Convert files to Haystack Documents using the Unstructured API.
 
         :param paths: List of paths to convert. Paths can be files or directories.
            If a path is a directory, all files in the directory are converted. Subdirectories are ignored.
         :param meta: Optional metadata to attach to the Documents.
-            This value can be either a list of dictionaries or a single dictionary.
-            If it's a single dictionary, its content is added to the metadata of all produced Documents.
-            If it's a list, the length of the list must match the number of paths, because the two lists will be zipped.
-            Please note that if the paths contain directories, meta can only be a single dictionary
-            (same metadata for all files).
-            Defaults to `None`.
+            This value can be either a list of dictionaries or a single dictionary.
+ If it's a single dictionary, its content is added to the metadata of all produced Documents. + If it's a list, the length of the list must match the number of paths, because the two lists will be zipped. + Please note that if the paths contain directories, `meta` can only be a single dictionary + (same metadata for all files). + + :returns: A dictionary with the following key: + - "documents": List of Haystack Documents. + + :raises ValueError: If `meta` is a list and `paths` contains directories. """ paths_obj = [Path(path) for path in paths] filepaths = [path for path in paths_obj if path.is_file()]
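
As a sketch of the `meta` semantics documented in `run` above (the file names are illustrative, and the Unstructured API must be reachable as described in the class docstring):

```python
from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter

converter = UnstructuredFileConverter()

# a single dictionary: the same metadata is attached to every produced Document
docs = converter.run(paths=["report.pdf", "notes.html"], meta={"project": "alpha"})["documents"]

# a list of dictionaries: zipped one-to-one with `paths`, so the lengths must match
# (not allowed when `paths` contains directories)
docs = converter.run(
    paths=["report.pdf", "notes.html"],
    meta=[{"author": "jane"}, {"author": "john"}],
)["documents"]
```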