Merge branch 'main' into hfapigenerator
Showing 29 changed files with 1,577 additions and 171 deletions.
__init__.py (haystack.components.evaluators)
@@ -1,4 +1,17 @@
 from .answer_exact_match import AnswerExactMatchEvaluator
+from .document_map import DocumentMAPEvaluator
+from .document_mrr import DocumentMRREvaluator
+from .document_recall import DocumentRecallEvaluator
+from .faithfulness import FaithfulnessEvaluator
 from .llm_evaluator import LLMEvaluator
+from .sas_evaluator import SASEvaluator
 
-__all__ = ["AnswerExactMatchEvaluator", "LLMEvaluator"]
+__all__ = [
+    "AnswerExactMatchEvaluator",
+    "DocumentMAPEvaluator",
+    "DocumentMRREvaluator",
+    "DocumentRecallEvaluator",
+    "FaithfulnessEvaluator",
+    "LLMEvaluator",
+    "SASEvaluator",
+]
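With these re-exports in place, the new evaluators can be imported directly from the package. A minimal usage sketch, assuming the `haystack.components.evaluators` package path used in the docstrings below:

```python
# Sketch: import the newly exported evaluators through the package's public API,
# which the expanded __all__ above declares.
from haystack.components.evaluators import (
    DocumentMAPEvaluator,
    DocumentMRREvaluator,
    DocumentRecallEvaluator,
    FaithfulnessEvaluator,
    SASEvaluator,
)

map_evaluator = DocumentMAPEvaluator()
mrr_evaluator = DocumentMRREvaluator()
```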
document_map.py (haystack.components.evaluators, new file)
@@ -0,0 +1,84 @@
from typing import Any, Dict, List

from haystack import Document, component


@component
class DocumentMAPEvaluator:
    """
    Evaluator that calculates the mean average precision of the retrieved documents, a metric
    that measures how high retrieved documents are ranked.
    Each question can have multiple ground truth documents and multiple retrieved documents.

    `DocumentMAPEvaluator` doesn't normalize its inputs; the `DocumentCleaner` component
    should be used to clean and normalize the documents before passing them to this evaluator.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.evaluators import DocumentMAPEvaluator

    evaluator = DocumentMAPEvaluator()
    result = evaluator.run(
        ground_truth_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="9th")],
        ],
        retrieved_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
        ],
    )
    print(result["individual_scores"])
    # [1.0, 0.8333333333333333]
    print(result["score"])
    # 0.9166666666666666
    ```
    """

    @component.output_types(score=float, individual_scores=List[float])
    def run(
        self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
    ) -> Dict[str, Any]:
        """
        Run the DocumentMAPEvaluator on the given inputs.
        All lists must have the same length.

        :param ground_truth_documents:
            A list of expected documents for each question.
        :param retrieved_documents:
            A list of retrieved documents for each question.
        :returns:
            A dictionary with the following outputs:
            - `score` - The average of calculated scores.
            - `individual_scores` - A list of numbers from 0.0 to 1.0 that represents how high retrieved documents are ranked.
        """
        if len(ground_truth_documents) != len(retrieved_documents):
            msg = "The length of ground_truth_documents and retrieved_documents must be the same."
            raise ValueError(msg)

        individual_scores = []

        for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
            score = 0.0
            for ground_document in ground_truth:
                if ground_document.content is None:
                    continue

                average_precision = 0.0
                relevant_documents = 0

                # A retrieved document counts as relevant when the ground-truth content
                # is contained in its content; precision is accumulated at each hit.
                for rank, retrieved_document in enumerate(retrieved):
                    if retrieved_document.content is None:
                        continue

                    if ground_document.content in retrieved_document.content:
                        relevant_documents += 1
                        average_precision += relevant_documents / (rank + 1)
                if relevant_documents > 0:
                    score = average_precision / relevant_documents
            individual_scores.append(score)

        score = sum(individual_scores) / len(retrieved_documents)

        return {"score": score, "individual_scores": individual_scores}
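To make the docstring numbers concrete, here is a small standalone sketch of the same average-precision calculation for the second query of the example (ground truth `"9th century"` and `"9th"` against three retrieved documents). The helper name `average_precision_for` is illustrative only and not part of Haystack; it mirrors the substring-containment relevance test used by the evaluator above.

```python
# Illustrative re-derivation of the 0.8333... score from the docstring example.
def average_precision_for(ground_truth: str, retrieved: list[str]) -> float:
    relevant = 0
    precision_sum = 0.0
    for rank, doc in enumerate(retrieved):
        if ground_truth in doc:  # same containment test as the evaluator
            relevant += 1
            precision_sum += relevant / (rank + 1)
    return precision_sum / relevant if relevant else 0.0

retrieved = ["9th century", "10th century", "9th"]
print(average_precision_for("9th century", retrieved))  # 1.0: single hit at rank 1
print(average_precision_for("9th", retrieved))          # (1/1 + 2/3) / 2 = 0.8333...
# The evaluator overwrites `score` for each ground-truth document, so the last
# one ("9th") determines the second query's score of 0.8333..., and the mean
# over both queries is 0.9166...
```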
document_mrr.py (haystack.components.evaluators, new file)
@@ -0,0 +1,79 @@
from typing import Any, Dict, List

from haystack import Document, component


@component
class DocumentMRREvaluator:
    """
    Evaluator that calculates the mean reciprocal rank of the retrieved documents.
    MRR measures how high the first relevant retrieved document is ranked.
    Each question can have multiple ground truth documents and multiple retrieved documents.

    `DocumentMRREvaluator` doesn't normalize its inputs; the `DocumentCleaner` component
    should be used to clean and normalize the documents before passing them to this evaluator.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.evaluators import DocumentMRREvaluator

    evaluator = DocumentMRREvaluator()
    result = evaluator.run(
        ground_truth_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="9th")],
        ],
        retrieved_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
        ],
    )
    print(result["individual_scores"])
    # [1.0, 1.0]
    print(result["score"])
    # 1.0
    ```
    """

    @component.output_types(score=float, individual_scores=List[float])
    def run(
        self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
    ) -> Dict[str, Any]:
        """
        Run the DocumentMRREvaluator on the given inputs.
        `ground_truth_documents` and `retrieved_documents` must have the same length.

        :param ground_truth_documents:
            A list of expected documents for each question.
        :param retrieved_documents:
            A list of retrieved documents for each question.
        :returns:
            A dictionary with the following outputs:
            - `score` - The average of calculated scores.
            - `individual_scores` - A list of numbers from 0.0 to 1.0 that represents how high the first relevant retrieved document is ranked.
        """
        if len(ground_truth_documents) != len(retrieved_documents):
            msg = "The length of ground_truth_documents and retrieved_documents must be the same."
            raise ValueError(msg)

        individual_scores = []

        for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
            score = 0.0
            for ground_document in ground_truth:
                if ground_document.content is None:
                    continue

                # The score is the reciprocal rank of the first retrieved document
                # that contains the ground-truth content.
                for rank, retrieved_document in enumerate(retrieved):
                    if retrieved_document.content is None:
                        continue

                    if ground_document.content in retrieved_document.content:
                        score = 1 / (rank + 1)
                        break
            individual_scores.append(score)

        score = sum(individual_scores) / len(retrieved_documents)

        return {"score": score, "individual_scores": individual_scores}
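For the same second query as in the docstring example, the reciprocal-rank logic can be traced with a small standalone sketch. The helper name `reciprocal_rank` is hypothetical and not part of Haystack; it follows the evaluator's substring-containment test.

```python
# Illustrative sketch of the reciprocal-rank computation used above.
def reciprocal_rank(ground_truth: str, retrieved: list[str]) -> float:
    for rank, doc in enumerate(retrieved):
        if ground_truth in doc:  # same containment test as the evaluator
            return 1 / (rank + 1)
    return 0.0

retrieved = ["9th century", "10th century", "9th"]
print(reciprocal_rank("9th century", retrieved))  # 1.0: first match at rank 1
print(reciprocal_rank("9th", retrieved))          # 1.0: "9th" already appears in "9th century" at rank 1
# Both ground-truth documents of the second query score 1.0, which is why the
# example output above is [1.0, 1.0] with an overall score of 1.0.
```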