Commit

…tegrations into weaviate-client-v4
hsm207 committed Mar 4, 2024
2 parents 6be11af + de56507 commit 6a6013b
Showing 23 changed files with 487 additions and 257 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/optimum.yml
@@ -52,9 +52,9 @@ jobs:
        if: matrix.python-version == '3.9' && runner.os == 'Linux'
        run: hatch run lint:all

-      # - name: Generate docs
-      #   if: matrix.python-version == '3.9' && runner.os == 'Linux'
-      #   run: hatch run docs
+      - name: Generate docs
+        if: matrix.python-version == '3.9' && runner.os == 'Linux'
+        run: hatch run docs

      - name: Run tests
        run: hatch run cov
1 change: 1 addition & 0 deletions README.md
@@ -43,6 +43,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta
| [nvidia-haystack](integrations/nvidia/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/nvidia-haystack.svg?color=orange)](https://pypi.org/project/nvidia-haystack) | [![Test / nvidia](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/nvidia.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/nvidia.yml) |
| [ollama-haystack](integrations/ollama/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/ollama-haystack) | [![Test / ollama](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml) |
| [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) |
| [optimum-haystack](integrations/optimum/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack) | [![Test / optimum](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml) |
| [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) |
+| [pgvector-haystack](integrations/pgvector/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pgvector-haystack.svg?color=orange)](https://pypi.org/project/pgvector-haystack) | [![Test / pgvector](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml) |
| [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) |
@@ -58,8 +58,8 @@ def __init__(
            [model documentation](https://docs.cohere.com/docs/models#representation).
        :param input_type: specifies the type of input you're giving to the model. Supported values are
            "search_document", "search_query", "classification" and "clustering". Not
-           required for older versions of the embedding models (meaning anything lower than v3), but is required for more
-           recent versions (meaning anything bigger than v2).
+           required for older versions of the embedding models (meaning anything lower than v3), but is required for
+           more recent versions (meaning anything bigger than v2).
        :param api_base_url: the Cohere API Base url.
        :param truncate: truncate embeddings that are too long from start or end, ("NONE"|"START"|"END").
            Passing "START" will discard the start of the input. "END" will discard the end of the input. In both
@@ -51,11 +51,11 @@ def __init__(
            [model documentation](https://docs.cohere.com/docs/models#representation).
        :param input_type: specifies the type of input you're giving to the model. Supported values are
            "search_document", "search_query", "classification" and "clustering". Not
-           required for older versions of the embedding models (meaning anything lower than v3), but is required for more
-           recent versions (meaning anything bigger than v2).
+           required for older versions of the embedding models (meaning anything lower than v3), but is required for
+           more recent versions (meaning anything bigger than v2).
        :param api_base_url: the Cohere API Base url.
-       :param truncate: truncate embeddings that are too long from start or end, ("NONE"|"START"|"END"), defaults to
-           `"END"`. Passing "START" will discard the start of the input. "END" will discard the end of the input. In both
+       :param truncate: truncate embeddings that are too long from start or end, ("NONE"|"START"|"END").
+           Passing "START" will discard the start of the input. "END" will discard the end of the input. In both
            cases, input is discarded until the remaining input is exactly the maximum input token length for the model.
            If "NONE" is selected, when the input exceeds the maximum input token length an error will be returned.
        :param use_async_client: flag to select the AsyncClient. It is recommended to use
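The "START"/"END"/"NONE" semantics of `truncate` described in the docstring can be illustrated with a small token-level sketch. This is a hypothetical pure-Python illustration of the behavior — Cohere performs the actual truncation server-side, and `truncate_tokens` is not a real client function:

```python
def truncate_tokens(tokens, max_len, mode="END"):
    """Illustrative truncation: keep at most max_len tokens."""
    if len(tokens) <= max_len:
        return tokens
    if mode == "START":
        return tokens[-max_len:]  # discard the start of the input
    if mode == "END":
        return tokens[:max_len]  # discard the end of the input
    # mode == "NONE": over-long input is an error rather than being cut
    raise ValueError("input exceeds the maximum input token length")


tokens = ["the", "quick", "brown", "fox", "jumps"]
print(truncate_tokens(tokens, 3, "END"))    # ['the', 'quick', 'brown']
print(truncate_tokens(tokens, 3, "START"))  # ['brown', 'fox', 'jumps']
```

In both truncating modes, tokens are discarded until the remaining input is exactly the model's maximum input length, matching the docstring above.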
@@ -17,10 +17,31 @@
@component
class DeepEvalEvaluator:
    """
-   A component that uses the DeepEval framework to evaluate inputs against a specific metric.
-   The supported metrics are defined by :class:`DeepEvalMetric`. The inputs of the component
-   are metric-dependent.
+   A component that uses the [DeepEval framework](https://docs.confident-ai.com/docs/evaluation-introduction)
+   to evaluate inputs against a specific metric. Supported metrics are defined by `DeepEvalMetric`.
+
+   Usage example:
+   ```python
+   from haystack_integrations.components.evaluators.deepeval import DeepEvalEvaluator, DeepEvalMetric
+
+   evaluator = DeepEvalEvaluator(
+       metric=DeepEvalMetric.FAITHFULNESS,
+       metric_params={"model": "gpt-4"},
+   )
+   output = evaluator.run(
+       questions=["Which is the most popular global sport?"],
+       contexts=[
+           [
+               "Football is undoubtedly the world's most popular sport with "
+               "major events like the FIFA World Cup and sports personalities "
+               "like Ronaldo and Messi, drawing a followership of more than 4 "
+               "billion people."
+           ]
+       ],
+       responses=["Football is the most popular sport with around 4 billion " "followers worldwide"],
+   )
+   print(output["results"])
+   ```
    """

_backend_metric: BaseMetric
@@ -39,6 +60,8 @@ def __init__(
            The metric to use for evaluation.
        :param metric_params:
            Parameters to pass to the metric's constructor.
+           Refer to the `DeepEvalMetric` class for more details
+           on required parameters.
        """
        self.metric = metric if isinstance(metric, DeepEvalMetric) else DeepEvalMetric.from_str(metric)
        self.metric_params = metric_params
@@ -51,37 +74,20 @@ def __init__(
    @component.output_types(results=List[List[Dict[str, Any]]])
    def run(self, **inputs) -> Dict[str, Any]:
        """
-       Run the DeepEval evaluator.
-
-       Example:
-       ```python
-       pipeline = Pipeline()
-       evaluator = DeepEvalEvaluator(
-           metric=DeepEvalMetric.ANSWER_RELEVANCY,
-           metric_params={"model": "gpt-4"},
-       )
-       pipeline.add_component("evaluator", evaluator)
-
-       # Each metric expects a specific set of parameters as input. Refer to the
-       # DeepEvalMetric class' documentation for more details.
-       output = pipeline.run({"evaluator": {
-           "questions": ["question"],
-           "contexts": [["context"]],
-           "responses": ["response"]
-       }})
-       ```
+       Run the DeepEval evaluator on the provided inputs.

        :param inputs:
            The inputs to evaluate. These are determined by the
-           metric being calculated. See :class:`DeepEvalMetric` for more
+           metric being calculated. See `DeepEvalMetric` for more
            information.
        :returns:
-           A nested list of metric results. Each input can have one or more
+           A dictionary with a single `results` entry that contains
+           a nested list of metric results. Each input can have one or more
            results, depending on the metric. Each result is a dictionary
            containing the following keys and values:
-           * `name` - The name of the metric.
-           * `score` - The score of the metric.
-           * `explanation` - An optional explanation of the score.
+           - `name` - The name of the metric.
+           - `score` - The score of the metric.
+           - `explanation` - An optional explanation of the score.
        """
        InputConverters.validate_input_parameters(self.metric, self.descriptor.input_parameters, inputs)
        converted_inputs: List[LLMTestCase] = list(self.descriptor.input_converter(**inputs))  # type: ignore
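The nested `results` structure described in the new docstring can be sketched with hypothetical data. The metric names, scores, and explanations below are made up for illustration — they are not real evaluator output:

```python
# One inner list of results per evaluated input; each result is a dict
# with a "name", a "score", and an optional "explanation".
output = {
    "results": [
        [{"name": "faithfulness", "score": 1.0, "explanation": "Claims are grounded in the context."}],
        [{"name": "faithfulness", "score": 0.5, "explanation": None}],
    ]
}

# Walk the structure: outer index identifies the input, inner dicts hold scores.
for input_idx, results_for_input in enumerate(output["results"]):
    for result in results_for_input:
        print(input_idx, result["name"], result["score"])
```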
@@ -93,7 +99,12 @@ def run(self, **inputs) -> Dict[str, Any]:

    def to_dict(self) -> Dict[str, Any]:
        """
-       Serialize this component to a dictionary.
+       Serializes the component to a dictionary.
+
+       :returns:
+           Dictionary with serialized data.
+       :raises DeserializationError:
+           If the component cannot be serialized.
        """

def check_serializable(obj: Any):
@@ -116,10 +127,12 @@ def check_serializable(obj: Any):
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DeepEvalEvaluator":
        """
-       Deserialize a component from a dictionary.
+       Deserializes the component from a dictionary.

        :param data:
-           The dictionary to deserialize from.
+           Dictionary to deserialize from.
+       :returns:
+           Deserialized component.
        """
        return default_from_dict(cls, data)

@@ -20,27 +20,31 @@
class DeepEvalMetric(Enum):
    """
    Metrics supported by DeepEval.
+
+   All metrics require a `model` parameter, which specifies
+   the model to use for evaluation. Refer to the DeepEval
+   documentation for information on the supported models.
    """

-   #: Answer relevancy.
+   #: Answer relevancy.\
    #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
    ANSWER_RELEVANCY = "answer_relevancy"

-   #: Faithfulness.
+   #: Faithfulness.\
    #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
    FAITHFULNESS = "faithfulness"

-   #: Contextual precision.
-   #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str], ground_truths: List[str]`
+   #: Contextual precision.\
+   #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str], ground_truths: List[str]`\
    #: The ground truth is the expected response.
    CONTEXTUAL_PRECISION = "contextual_precision"

-   #: Contextual recall.
-   #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str], ground_truths: List[str]`
-   #: The ground truth is the expected response.
+   #: Contextual recall.\
+   #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str], ground_truths: List[str]`\
+   #: The ground truth is the expected response.
    CONTEXTUAL_RECALL = "contextual_recall"

-   #: Contextual relevance.
+   #: Contextual relevance.\
    #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
    CONTEXTUAL_RELEVANCE = "contextual_relevance"

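The string-or-enum pattern seen earlier in `DeepEvalEvaluator.__init__` (`metric if isinstance(metric, DeepEvalMetric) else DeepEvalMetric.from_str(metric)`) can be sketched in plain Python. `Metric` below is a minimal stand-in, and its `from_str` is a guess at the real helper's behavior rather than the actual implementation:

```python
from enum import Enum


class Metric(Enum):
    """Minimal stand-in for DeepEvalMetric."""

    ANSWER_RELEVANCY = "answer_relevancy"
    FAITHFULNESS = "faithfulness"

    @classmethod
    def from_str(cls, value: str) -> "Metric":
        # Enum lookup by value; unknown strings raise a clear error.
        try:
            return cls(value)
        except ValueError as e:
            msg = f"Unknown metric '{value}'"
            raise ValueError(msg) from e


metric = "faithfulness"
resolved = metric if isinstance(metric, Metric) else Metric.from_str(metric)
print(resolved)  # Metric.FAITHFULNESS
```

This lets callers pass either the enum member or its string value interchangeably.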
@@ -11,8 +11,9 @@
@component
class ElasticsearchBM25Retriever:
    """
-   ElasticsearchBM25Retriever is a keyword-based retriever that uses BM25 to find the most
-   similar documents to a user's query.
+   ElasticsearchBM25Retriever retrieves documents from the ElasticsearchDocumentStore using the BM25 algorithm
+   to find the most similar documents to a user's query.
+   This retriever is only compatible with ElasticsearchDocumentStore.

    Usage example:
@@ -35,7 +36,7 @@ class ElasticsearchBM25Retriever:
    result = retriever.run(query="Who lives in Berlin?")
    for doc in result["documents"]:
-       print(doc.text)
+       print(doc.content)
```
"""

@@ -55,8 +56,9 @@ def __init__(
        :param filters: Filters applied to the retrieved Documents, for more info
            see `ElasticsearchDocumentStore.filter_documents`, defaults to None
        :param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO".
-           see the official documentation for valid values:
-           https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
+           See the official
+           [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
+           for more details.
        :param top_k: Maximum number of Documents to return, defaults to 10
        :param scale_score: If `True` scales the Document's scores between 0 and 1, defaults to False
@@ -72,6 +74,12 @@ def __init__(
        self._scale_score = scale_score

    def to_dict(self) -> Dict[str, Any]:
+       """
+       Serializes the component to a dictionary.
+
+       :returns:
+           Dictionary with serialized data.
+       """
return default_to_dict(
self,
filters=self._filters,
@@ -83,6 +91,14 @@ def to_dict(self) -> Dict[str, Any]:

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchBM25Retriever":
+       """
+       Deserializes the component from a dictionary.
+
+       :param data:
+           Dictionary to deserialize from.
+       :returns:
+           Deserialized component.
+       """
data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
data["init_parameters"]["document_store"]
)
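The `to_dict`/`from_dict` pair documented above amounts to a round-trip through the component's init parameters (Haystack's `default_to_dict`/`default_from_dict` helpers). A minimal pure-Python mimic of that round-trip — `TinyRetriever` is a made-up class, not Haystack's actual implementation:

```python
from typing import Any, Dict


class TinyRetriever:
    """Minimal mimic of the serialization round-trip pattern."""

    def __init__(self, top_k: int = 10, fuzziness: str = "AUTO"):
        self._top_k = top_k
        self._fuzziness = fuzziness

    def to_dict(self) -> Dict[str, Any]:
        # Record a type identifier plus everything needed to call __init__ again.
        return {
            "type": f"{type(self).__module__}.{type(self).__name__}",
            "init_parameters": {"top_k": self._top_k, "fuzziness": self._fuzziness},
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TinyRetriever":
        # Rebuild the component from its recorded init parameters.
        return cls(**data["init_parameters"])


original = TinyRetriever(top_k=5, fuzziness="2")
restored = TinyRetriever.from_dict(original.to_dict())
print(restored._top_k, restored._fuzziness)  # 5 2
```

The real `from_dict` shown in the diff additionally rebuilds the nested `document_store` from its own serialized dict before calling the default deserializer.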
@@ -96,7 +112,8 @@ def run(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optio
        :param query: String to search in Documents' text.
        :param filters: Filters applied to the retrieved Documents.
        :param top_k: Maximum number of Documents to return.
-       :return: List of Documents that match the query.
+       :returns: A dictionary with the following keys:
+           - `documents`: List of Documents that match the query.
"""
docs = self._document_store._bm25_retrieval(
query=query,
@@ -11,9 +11,35 @@
@component
class ElasticsearchEmbeddingRetriever:
    """
-   Uses a vector similarity metric to retrieve documents from the ElasticsearchDocumentStore.
+   ElasticsearchEmbeddingRetriever retrieves documents from the ElasticsearchDocumentStore using vector similarity.

    Needs to be connected to the ElasticsearchDocumentStore to run.

+   Usage example:
+   ```python
+   from haystack import Document
+   from haystack.components.embedders import SentenceTransformersTextEmbedder
+   from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
+   from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever
+
+   document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
+   retriever = ElasticsearchEmbeddingRetriever(document_store=document_store)
+
+   # Add documents to DocumentStore
+   documents = [
+       Document(content="My name is Carla and I live in Berlin"),
+       Document(content="My name is Paul and I live in New York"),
+       Document(content="My name is Silvano and I live in Matera"),
+       Document(content="My name is Usagi Tsukino and I live in Tokyo"),
+   ]
+   document_store.write_documents(documents)
+
+   te = SentenceTransformersTextEmbedder()
+   te.warm_up()
+   query_embeddings = te.run("Who lives in Berlin?")["embedding"]
+
+   result = retriever.run(query_embedding=query_embeddings)
+   for doc in result["documents"]:
+       print(doc.content)
+   ```
    """

def __init__(
@@ -33,8 +59,8 @@ def __init__(
        :param top_k: Maximum number of Documents to return, defaults to 10
        :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
            Increasing this value will improve search accuracy at the cost of slower search speeds.
-           You can read more about it in the Elasticsearch documentation:
-           https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy
+           You can read more about it in the Elasticsearch
+           [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
        :raises ValueError: If `document_store` is not an instance of ElasticsearchDocumentStore.
        """
if not isinstance(document_store, ElasticsearchDocumentStore):
Expand All @@ -47,6 +73,12 @@ def __init__(
self._num_candidates = num_candidates
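The `top_k`/`num_candidates` relationship described in the docstring (default `top_k * 10`, with recall improving and speed dropping as candidates grow) can be sketched as a hypothetical approximate-kNN request body. The field names follow Elasticsearch's kNN search API, but the store's real query construction is not shown in this diff, so treat `build_knn_query` and the `"embedding"` field name as assumptions:

```python
from typing import List, Optional


def build_knn_query(
    query_embedding: List[float], top_k: int = 10, num_candidates: Optional[int] = None
) -> dict:
    """Illustrative approximate-kNN body; num_candidates defaults to top_k * 10."""
    if num_candidates is None:
        # More candidates per shard: better recall, slower search.
        num_candidates = top_k * 10
    return {
        "knn": {
            "field": "embedding",
            "query_vector": query_embedding,
            "k": top_k,
            "num_candidates": num_candidates,
        }
    }


body = build_knn_query([0.1, 0.2, 0.3], top_k=10)
print(body["knn"]["num_candidates"])  # 100
```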

    def to_dict(self) -> Dict[str, Any]:
+       """
+       Serializes the component to a dictionary.
+
+       :returns:
+           Dictionary with serialized data.
+       """
return default_to_dict(
self,
filters=self._filters,
@@ -57,6 +89,14 @@ def to_dict(self) -> Dict[str, Any]:

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchEmbeddingRetriever":
+       """
+       Deserializes the component from a dictionary.
+
+       :param data:
+           Dictionary to deserialize from.
+       :returns:
+           Deserialized component.
+       """
data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
data["init_parameters"]["document_store"]
)
@@ -70,7 +110,8 @@ def run(self, query_embedding: List[float], filters: Optional[Dict[str, Any]] =
        :param query_embedding: Embedding of the query.
        :param filters: Filters applied to the retrieved Documents.
        :param top_k: Maximum number of Documents to return.
-       :return: List of Documents similar to `query_embedding`.
+       :returns: A dictionary with the following keys:
+           - `documents`: List of Documents most similar to the given `query_embedding`.
"""
docs = self._document_store._embedding_retrieval(
query_embedding=query_embedding,
