docs: Fix eval metric examples in docstrings (#7505)
* fix eval metric docstrings, change type of individual scores

* change import order

* change exactmatch docstring to single ground truth answer

* change exactmatch comment to single ground truth answer

* reverted changing docs to single ground truth

* add warm up in SASEvaluator example

* fix FaithfulnessEvaluator docstring example

* extend FaithfulnessEvaluator docstring example

* Update FaithfulnessEvaluator init docstring

* Remove outdated default from LLMEvaluator docstring

* Add examples param to LLMEvaluator docstring example

* Add import and print to LLMEvaluator docstring example
julian-risch authored Apr 10, 2024
1 parent 932213e commit e974a23
Showing 7 changed files with 47 additions and 24 deletions.
haystack/components/evaluators/document_map.py: 2 additions & 1 deletion
@@ -15,7 +15,8 @@ class DocumentMAPEvaluator:
Usage example:
```python
-from haystack.components.evaluators import AnswerExactMatchEvaluator
+from haystack import Document
+from haystack.components.evaluators import DocumentMAPEvaluator
evaluator = DocumentMAPEvaluator()
result = evaluator.run(
haystack/components/evaluators/document_mrr.py: 5 additions & 3 deletions
@@ -16,7 +16,9 @@ class DocumentMRREvaluator:
Usage example:
```python
-from haystack.components.evaluators import AnswerExactMatchEvaluator
+from haystack import Document
+from haystack.components.evaluators import DocumentMRREvaluator
evaluator = DocumentMRREvaluator()
result = evaluator.run(
ground_truth_documents=[
@@ -29,9 +31,9 @@ class DocumentMRREvaluator:
],
)
print(result["individual_scores"])
-# [1.0, 0.8333333333333333]
+# [1.0, 1.0]
print(result["score"])
-# 0.9166666666666666
+# 1.0
```
"""

haystack/components/evaluators/document_recall.py: 15 additions & 7 deletions
@@ -37,16 +37,24 @@ class DocumentRecallEvaluator:
Usage example:
```python
+from haystack import Document
from haystack.components.evaluators import DocumentRecallEvaluator
evaluator = DocumentRecallEvaluator()
result = evaluator.run(
-    ground_truth_answers=[["Berlin"], ["Paris"]],
-    predicted_answers=[["Paris"], ["London"]],
+    ground_truth_documents=[
+        [Document(content="France")],
+        [Document(content="9th century"), Document(content="9th")],
+    ],
+    retrieved_documents=[
+        [Document(content="France")],
+        [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
+    ],
)
print(result["individual_scores"])
-# [0.0, 0.0]
+# [1.0, 1.0]
print(result["score"])
-# 0.0
+# 1.0
```
"""

@@ -63,12 +71,12 @@ def __init__(self, mode: Union[str, RecallMode] = RecallMode.SINGLE_HIT):
mode_functions = {RecallMode.SINGLE_HIT: self._recall_single_hit, RecallMode.MULTI_HIT: self._recall_multi_hit}
self.mode_function = mode_functions[mode]

-def _recall_single_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> bool:
+def _recall_single_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> float:
    unique_truths = {g.content for g in ground_truth_documents}
    unique_retrievals = {p.content for p in retrieved_documents}
    retrieved_ground_truths = unique_truths.intersection(unique_retrievals)

-    return len(retrieved_ground_truths) > 0
+    return float(len(retrieved_ground_truths) > 0)

def _recall_multi_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> float:
    unique_truths = {g.content for g in ground_truth_documents}
@@ -92,7 +100,7 @@ def run(
A dictionary with the following outputs:
- `score` - The average of calculated scores.
- `invididual_scores` - A list of numbers from 0.0 to 1.0 that represents the proportion of matching documents retrieved.
-If the mode is `single_hit`, the individual scores are True or False.
+If the mode is `single_hit`, the individual scores are 0 or 1.
"""
if len(ground_truth_documents) != len(retrieved_documents):
msg = "The length of ground_truth_documents and retrieved_documents must be the same."
haystack/components/evaluators/faithfulness.py: 8 additions & 5 deletions
@@ -29,11 +29,14 @@ class FaithfulnessEvaluator(LLMEvaluator):
responses = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
evaluator = FaithfulnessEvaluator()
result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
-print(results["evaluator"])
-# {'results': [{'statements': ['Python is a high-level general-purpose programming language.',
-# 'Python was created by George Lucas.'], 'statement_scores':
-# [1, 0], 'score': 0.5}], 'score': 0.5, 'individual_scores': [0.5]}
+print(result["individual_scores"])
+# [0.5]
+print(result["score"])
+# 0.5
+print(result["results"])
+# [{'statements': ['Python is a high-level general-purpose programming language.',
+# 'Python was created by George Lucas.'], 'statement_scores': [1, 0], 'score': 0.5}]
```
"""

@@ -44,7 +47,7 @@ def __init__(
api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
):
"""
-Creates an instance of LLMEvaluator.
+Creates an instance of FaithfulnessEvaluator.
:param examples:
Few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
haystack/components/evaluators/llm_evaluator.py: 7 additions & 1 deletion
@@ -19,17 +19,24 @@ class LLMEvaluator:
Usage example:
```python
+from typing import List
from haystack.components.evaluators import LLMEvaluator
evaluator = LLMEvaluator(
    instructions="Is this answer problematic for children?",
    inputs=[("responses", List[str])],
    outputs=["score"],
+    examples=[
+        {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+        {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+    ],
)
RESPONSES = [
"Football is the most popular sport with around 4 billion followers worldwide",
"Python language was created by Guido van Rossum.",
]
results = evaluator.run(responses=RESPONSES)
+print(results)
+# {'results': [{'score': 0}, {'score': 0}]}
```
"""

@@ -54,7 +61,6 @@ def __init__(
Each input is a tuple of an input name and input type. Input types must be lists.
:param outputs:
Output names of the evaluation results. They correspond to keys in the output dictionary.
-The default is a single key "score".
:param examples:
Few-shot examples conforming to the expected input and output format as defined in the `inputs` and
`outputs` parameters.
haystack/components/evaluators/sas_evaluator.py: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ class SASEvaluator:
from haystack.components.evaluators.sas_evaluator import SASEvaluator
evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
+evaluator.warm_up()
ground_truths = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
test/components/evaluators/test_document_recall.py: 9 additions & 7 deletions
@@ -19,23 +19,23 @@ def test_run_with_all_matching(self, evaluator):
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
)
-
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}

def test_run_with_no_matching(self, evaluator):
result = evaluator.run(
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
)
-
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}

def test_run_with_partial_matching(self, evaluator):
result = evaluator.run(
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
)
-
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}

def test_run_with_complex_data(self, evaluator):
@@ -62,7 +62,8 @@ def test_run_with_complex_data(self, evaluator):
],
],
)
assert result == {"individual_scores": [True, True, True, True, False, True], "score": 0.8333333333333334}
assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [1, 1, 1, 1, 0, 1], "score": 0.8333333333333334}

def test_run_with_different_lengths(self, evaluator):
with pytest.raises(ValueError):
@@ -88,23 +89,23 @@ def test_run_with_all_matching(self, evaluator):
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
)
-
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}

def test_run_with_no_matching(self, evaluator):
result = evaluator.run(
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
)
-
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}

def test_run_with_partial_matching(self, evaluator):
result = evaluator.run(
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
)
-
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}

def test_run_with_complex_data(self, evaluator):
Expand Down Expand Up @@ -136,6 +137,7 @@ def test_run_with_complex_data(self, evaluator):
],
],
)
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
assert result == {"individual_scores": [1.0, 1.0, 0.5, 1.0, 0.75, 1.0], "score": 0.875}

def test_run_with_different_lengths(self, evaluator):