From e974a23fa306a38cf482d24b1b7e4efd7789b32a Mon Sep 17 00:00:00 2001
From: Julian Risch
Date: Wed, 10 Apr 2024 11:00:20 +0200
Subject: [PATCH] docs: Fix eval metric examples in docstrings (#7505)

* fix eval metric docstrings, change type of individual scores
* change import order
* change exactmatch docstring to single ground truth answer
* change exactmatch comment to single ground truth answer
* reverted changing docs to single ground truth
* add warm up in SASEvaluator example
* fix FaithfulnessEvaluator docstring example
* extend FaithfulnessEvaluator docstring example
* Update FaithfulnessEvaluator init docstring
* Remove outdated default from LLMEvaluator docstring
* Add examples param to LLMEvaluator docstring example
* Add import and print to LLMEvaluator docstring example
---
 .../components/evaluators/document_map.py    |  3 ++-
 .../components/evaluators/document_mrr.py    |  8 ++++---
 .../components/evaluators/document_recall.py | 22 +++++++++++++------
 .../components/evaluators/faithfulness.py    | 13 ++++++-----
 .../components/evaluators/llm_evaluator.py   |  8 ++++++-
 .../components/evaluators/sas_evaluator.py   |  1 +
 .../evaluators/test_document_recall.py       | 16 ++++++++------
 7 files changed, 47 insertions(+), 24 deletions(-)

diff --git a/haystack/components/evaluators/document_map.py b/haystack/components/evaluators/document_map.py
index 6d30ebbdb6..303d7c4dfa 100644
--- a/haystack/components/evaluators/document_map.py
+++ b/haystack/components/evaluators/document_map.py
@@ -15,7 +15,8 @@ class DocumentMAPEvaluator:
 
     Usage example:
     ```python
-    from haystack.components.evaluators import AnswerExactMatchEvaluator
+    from haystack import Document
+    from haystack.components.evaluators import DocumentMAPEvaluator
 
     evaluator = DocumentMAPEvaluator()
     result = evaluator.run(
diff --git a/haystack/components/evaluators/document_mrr.py b/haystack/components/evaluators/document_mrr.py
index ab8bcc1389..f65cf8fe57 100644
--- a/haystack/components/evaluators/document_mrr.py
+++ b/haystack/components/evaluators/document_mrr.py
@@ -16,7 +16,9 @@ class DocumentMRREvaluator:
 
     Usage example:
     ```python
-    from haystack.components.evaluators import AnswerExactMatchEvaluator
+    from haystack import Document
+    from haystack.components.evaluators import DocumentMRREvaluator
+
     evaluator = DocumentMRREvaluator()
     result = evaluator.run(
         ground_truth_documents=[
@@ -29,9 +31,9 @@ class DocumentMRREvaluator:
         ],
     )
     print(result["individual_scores"])
-    # [1.0, 0.8333333333333333]
+    # [1.0, 1.0]
     print(result["score"])
-    # 0.9166666666666666
+    # 1.0
     ```
     """
 
diff --git a/haystack/components/evaluators/document_recall.py b/haystack/components/evaluators/document_recall.py
index 4102aa1ff5..3bd9a767b3 100644
--- a/haystack/components/evaluators/document_recall.py
+++ b/haystack/components/evaluators/document_recall.py
@@ -37,16 +37,24 @@ class DocumentRecallEvaluator:
 
     Usage example:
     ```python
+    from haystack import Document
     from haystack.components.evaluators import DocumentRecallEvaluator
+
     evaluator = DocumentRecallEvaluator()
     result = evaluator.run(
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Paris"], ["London"]],
+        ground_truth_documents=[
+            [Document(content="France")],
+            [Document(content="9th century"), Document(content="9th")],
+        ],
+        retrieved_documents=[
+            [Document(content="France")],
+            [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
+        ],
     )
     print(result["individual_scores"])
-    # [0.0, 0.0]
+    # [1.0, 1.0]
     print(result["score"])
-    # 0.0
+    # 1.0
     ```
""" @@ -63,12 +71,12 @@ def __init__(self, mode: Union[str, RecallMode] = RecallMode.SINGLE_HIT): mode_functions = {RecallMode.SINGLE_HIT: self._recall_single_hit, RecallMode.MULTI_HIT: self._recall_multi_hit} self.mode_function = mode_functions[mode] - def _recall_single_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> bool: + def _recall_single_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> float: unique_truths = {g.content for g in ground_truth_documents} unique_retrievals = {p.content for p in retrieved_documents} retrieved_ground_truths = unique_truths.intersection(unique_retrievals) - return len(retrieved_ground_truths) > 0 + return float(len(retrieved_ground_truths) > 0) def _recall_multi_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> float: unique_truths = {g.content for g in ground_truth_documents} @@ -92,7 +100,7 @@ def run( A dictionary with the following outputs: - `score` - The average of calculated scores. - `invididual_scores` - A list of numbers from 0.0 to 1.0 that represents the proportion of matching documents retrieved. - If the mode is `single_hit`, the individual scores are True or False. + If the mode is `single_hit`, the individual scores are 0 or 1. """ if len(ground_truth_documents) != len(retrieved_documents): msg = "The length of ground_truth_documents and retrieved_documents must be the same." diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index 9ceb997330..7722995b44 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -29,11 +29,14 @@ class FaithfulnessEvaluator(LLMEvaluator): responses = ["Python is a high-level general-purpose programming language that was created by George Lucas."] evaluator = FaithfulnessEvaluator() result = evaluator.run(questions=questions, contexts=contexts, responses=responses) - print(results["evaluator"]) - # {'results': [{'statements': ['Python is a high-level general-purpose programming language.', - # 'Python was created by George Lucas.'], 'statement_scores': - # [1, 0], 'score': 0.5}], 'score': 0.5, 'individual_scores': [0.5]} + print(result["individual_scores"]) + # [0.5] + print(result["score"]) + # 0.5 + print(result["results"]) + # [{'statements': ['Python is a high-level general-purpose programming language.', + 'Python was created by George Lucas.'], 'statement_scores': [1, 0], 'score': 0.5}] ``` """ @@ -44,7 +47,7 @@ def __init__( api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), ): """ - Creates an instance of LLMEvaluator. + Creates an instance of FaithfulnessEvaluator. :param examples: Few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator. 
diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py
index d8f266360e..34a69e3b11 100644
--- a/haystack/components/evaluators/llm_evaluator.py
+++ b/haystack/components/evaluators/llm_evaluator.py
@@ -19,17 +19,24 @@ class LLMEvaluator:
 
     Usage example:
     ```python
+    from typing import List
     from haystack.components.evaluators import LLMEvaluator
 
     evaluator = LLMEvaluator(
         instructions="Is this answer problematic for children?",
         inputs=[("responses", List[str])],
         outputs=["score"],
+        examples=[
+            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+        ],
     )
     RESPONSES = [
         "Football is the most popular sport with around 4 billion followers worldwide",
         "Python language was created by Guido van Rossum.",
     ]
     results = evaluator.run(responses=RESPONSES)
+    print(results)
+    # {'results': [{'score': 0}, {'score': 0}]}
     ```
     """
@@ -54,7 +61,6 @@ def __init__(
             Each input is a tuple of an input name and input type. Input types must be lists.
         :param outputs:
             Output names of the evaluation results. They correspond to keys in the output dictionary.
-            The default is a single key "score".
         :param examples:
             Few-shot examples conforming to the expected input and output format as defined in the `inputs` and
             `outputs` parameters.
diff --git a/haystack/components/evaluators/sas_evaluator.py b/haystack/components/evaluators/sas_evaluator.py
index 555b1def46..6590d25d7d 100644
--- a/haystack/components/evaluators/sas_evaluator.py
+++ b/haystack/components/evaluators/sas_evaluator.py
@@ -26,6 +26,7 @@ class SASEvaluator:
     from haystack.components.evaluators.sas_evaluator import SASEvaluator
 
     evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
+    evaluator.warm_up()
     ground_truths = [
         "A construction budget of US $2.3 billion",
         "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
diff --git a/test/components/evaluators/test_document_recall.py b/test/components/evaluators/test_document_recall.py
index 56e77f02c3..b3335957e2 100644
--- a/test/components/evaluators/test_document_recall.py
+++ b/test/components/evaluators/test_document_recall.py
@@ -19,7 +19,7 @@ def test_run_with_all_matching(self, evaluator):
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
         )
-
+        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
         assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}
 
     def test_run_with_no_matching(self, evaluator):
@@ -27,7 +27,7 @@ def test_run_with_no_matching(self, evaluator):
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
         )
-
+        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
         assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}
 
     def test_run_with_partial_matching(self, evaluator):
@@ -35,7 +35,7 @@ def test_run_with_partial_matching(self, evaluator):
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
         )
-
+        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
         assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}
{"individual_scores": [1.0, 0.0], "score": 0.5} def test_run_with_complex_data(self, evaluator): @@ -62,7 +62,8 @@ def test_run_with_complex_data(self, evaluator): ], ], ) - assert result == {"individual_scores": [True, True, True, True, False, True], "score": 0.8333333333333334} + assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"]) + assert result == {"individual_scores": [1, 1, 1, 1, 0, 1], "score": 0.8333333333333334} def test_run_with_different_lengths(self, evaluator): with pytest.raises(ValueError): @@ -88,7 +89,7 @@ def test_run_with_all_matching(self, evaluator): ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]], retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]], ) - + assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"]) assert result == {"individual_scores": [1.0, 1.0], "score": 1.0} def test_run_with_no_matching(self, evaluator): @@ -96,7 +97,7 @@ def test_run_with_no_matching(self, evaluator): ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]], retrieved_documents=[[Document(content="Paris")], [Document(content="London")]], ) - + assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"]) assert result == {"individual_scores": [0.0, 0.0], "score": 0.0} def test_run_with_partial_matching(self, evaluator): @@ -104,7 +105,7 @@ def test_run_with_partial_matching(self, evaluator): ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]], retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]], ) - + assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"]) assert result == {"individual_scores": [1.0, 0.0], "score": 0.5} def test_run_with_complex_data(self, evaluator): @@ -136,6 +137,7 @@ def test_run_with_complex_data(self, evaluator): ], ], ) + assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"]) assert result == {"individual_scores": [1.0, 1.0, 0.5, 1.0, 0.75, 1.0], "score": 0.875} def test_run_with_different_lengths(self, evaluator):