docs: Fix eval metric examples in docstrings (#7505)
* fix eval metric docstrings, change type of individual scores

* change import order

* change exactmatch docstring to single ground truth answer

* change exactmatch comment to single ground truth answer

* reverted changing docs to single ground truth

* add warm up in SASEvaluator example

* fix FaithfulnessEvaluator docstring example

* extend FaithfulnessEvaluator docstring example

* Update FaithfulnessEvaluator init docstring

* Remove outdated default from LLMEvaluator docstring

* Add examples param to LLMEvaluator docstring example

* Add import and print to LLMEvaluator docstring example
julian-risch authored Apr 10, 2024
1 parent 932213e commit e974a23
Showing 7 changed files with 47 additions and 24 deletions.
haystack/components/evaluators/document_map.py: 2 additions & 1 deletion
@@ -15,7 +15,8 @@ class DocumentMAPEvaluator:
Usage example:
```python
-from haystack.components.evaluators import AnswerExactMatchEvaluator
+from haystack import Document
+from haystack.components.evaluators import DocumentMAPEvaluator
evaluator = DocumentMAPEvaluator()
result = evaluator.run(
haystack/components/evaluators/document_mrr.py: 5 additions & 3 deletions
@@ -16,7 +16,9 @@ class DocumentMRREvaluator:
Usage example:
```python
-from haystack.components.evaluators import AnswerExactMatchEvaluator
+from haystack import Document
+from haystack.components.evaluators import DocumentMRREvaluator
evaluator = DocumentMRREvaluator()
result = evaluator.run(
ground_truth_documents=[
@@ -29,9 +31,9 @@ class DocumentMRREvaluator:
],
)
print(result["individual_scores"])
-# [1.0, 0.8333333333333333]
+# [1.0, 1.0]
print(result["score"])
-# 0.9166666666666666
+# 1.0
```
"""

haystack/components/evaluators/document_recall.py: 15 additions & 7 deletions
@@ -37,16 +37,24 @@ class DocumentRecallEvaluator:
Usage example:
```python
+from haystack import Document
from haystack.components.evaluators import DocumentRecallEvaluator
evaluator = DocumentRecallEvaluator()
result = evaluator.run(
-    ground_truth_answers=[["Berlin"], ["Paris"]],
-    predicted_answers=[["Paris"], ["London"]],
+    ground_truth_documents=[
+        [Document(content="France")],
+        [Document(content="9th century"), Document(content="9th")],
+    ],
+    retrieved_documents=[
+        [Document(content="France")],
+        [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
+    ],
)
print(result["individual_scores"])
-# [0.0, 0.0]
+# [1.0, 1.0]
print(result["score"])
-# 0.0
+# 1.0
```
"""

@@ -63,12 +71,12 @@ def __init__(self, mode: Union[str, RecallMode] = RecallMode.SINGLE_HIT):
mode_functions = {RecallMode.SINGLE_HIT: self._recall_single_hit, RecallMode.MULTI_HIT: self._recall_multi_hit}
self.mode_function = mode_functions[mode]

-def _recall_single_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> bool:
+def _recall_single_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> float:
    unique_truths = {g.content for g in ground_truth_documents}
    unique_retrievals = {p.content for p in retrieved_documents}
    retrieved_ground_truths = unique_truths.intersection(unique_retrievals)

-    return len(retrieved_ground_truths) > 0
+    return float(len(retrieved_ground_truths) > 0)

def _recall_multi_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> float:
    unique_truths = {g.content for g in ground_truth_documents}
@@ -92,7 +100,7 @@ def run(
A dictionary with the following outputs:
- `score` - The average of calculated scores.
- `invididual_scores` - A list of numbers from 0.0 to 1.0 that represents the proportion of matching documents retrieved.
-If the mode is `single_hit`, the individual scores are True or False.
+If the mode is `single_hit`, the individual scores are 0 or 1.
"""
if len(ground_truth_documents) != len(retrieved_documents):
msg = "The length of ground_truth_documents and retrieved_documents must be the same."
haystack/components/evaluators/faithfulness.py: 8 additions & 5 deletions
@@ -29,11 +29,14 @@ class FaithfulnessEvaluator(LLMEvaluator):
responses = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
evaluator = FaithfulnessEvaluator()
result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
-print(results["evaluator"])
-# {'results': [{'statements': ['Python is a high-level general-purpose programming language.',
-# 'Python was created by George Lucas.'], 'statement_scores':
-# [1, 0], 'score': 0.5}], 'score': 0.5, 'individual_scores': [0.5]}
+print(result["individual_scores"])
+# [0.5]
+print(result["score"])
+# 0.5
+print(result["results"])
+# [{'statements': ['Python is a high-level general-purpose programming language.',
+# 'Python was created by George Lucas.'], 'statement_scores': [1, 0], 'score': 0.5}]
```
"""

@@ -44,7 +47,7 @@ def __init__(
api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
):
"""
-Creates an instance of LLMEvaluator.
+Creates an instance of FaithfulnessEvaluator.
:param examples:
Few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
haystack/components/evaluators/llm_evaluator.py: 7 additions & 1 deletion
@@ -19,17 +19,24 @@ class LLMEvaluator:
Usage example:
```python
+from typing import List
from haystack.components.evaluators import LLMEvaluator
evaluator = LLMEvaluator(
    instructions="Is this answer problematic for children?",
    inputs=[("responses", List[str])],
    outputs=["score"],
+    examples=[
+        {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+        {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+    ],
)
RESPONSES = [
"Football is the most popular sport with around 4 billion followers worldwide",
"Python language was created by Guido van Rossum.",
]
results = evaluator.run(responses=RESPONSES)
+print(results)
+# {'results': [{'score': 0}, {'score': 0}]}
```
"""

@@ -54,7 +61,6 @@ def __init__(
Each input is a tuple of an input name and input type. Input types must be lists.
:param outputs:
Output names of the evaluation results. They correspond to keys in the output dictionary.
-The default is a single key "score".
:param examples:
Few-shot examples conforming to the expected input and output format as defined in the `inputs` and
`outputs` parameters.
haystack/components/evaluators/sas_evaluator.py: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ class SASEvaluator:
from haystack.components.evaluators.sas_evaluator import SASEvaluator
evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
+evaluator.warm_up()
ground_truths = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
test/components/evaluators/test_document_recall.py: 9 additions & 7 deletions
@@ -19,23 +19,23 @@ def test_run_with_all_matching(self, evaluator):
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
)
-
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}

def test_run_with_no_matching(self, evaluator):
result = evaluator.run(
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
)
-
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}

def test_run_with_partial_matching(self, evaluator):
result = evaluator.run(
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
)
-
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}

def test_run_with_complex_data(self, evaluator):
@@ -62,7 +62,8 @@ def test_run_with_complex_data(self, evaluator):
],
],
)
assert result == {"individual_scores": [True, True, True, True, False, True], "score": 0.8333333333333334}
assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [1, 1, 1, 1, 0, 1], "score": 0.8333333333333334}

def test_run_with_different_lengths(self, evaluator):
with pytest.raises(ValueError):
@@ -88,23 +89,23 @@ def test_run_with_all_matching(self, evaluator):
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
)
-
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}

def test_run_with_no_matching(self, evaluator):
result = evaluator.run(
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
)
-
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}

def test_run_with_partial_matching(self, evaluator):
result = evaluator.run(
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
)
-
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}

def test_run_with_complex_data(self, evaluator):
Expand Down Expand Up @@ -136,6 +137,7 @@ def test_run_with_complex_data(self, evaluator):
],
],
)
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [1.0, 1.0, 0.5, 1.0, 0.75, 1.0], "score": 0.875}

def test_run_with_different_lengths(self, evaluator):