Commit
feat: implementing evaluation results API (#7520)
* initial import

* adding tests

* addressing PR comments

* fixing tests

* updating tests

* updating tests and code

* renaming

* fixing linting issues

* adding release notes

* adding docstrings

* latest fixes
davidsbatista authored Apr 10, 2024
1 parent e974a23 commit 9a9c8aa
Showing 5 changed files with 275 additions and 0 deletions.
2 changes: 2 additions & 0 deletions haystack/components/evaluators/__init__.py
@@ -2,6 +2,7 @@
from .document_map import DocumentMAPEvaluator
from .document_mrr import DocumentMRREvaluator
from .document_recall import DocumentRecallEvaluator
from .evaluation_result import EvaluationResult
from .faithfulness import FaithfulnessEvaluator
from .llm_evaluator import LLMEvaluator
from .sas_evaluator import SASEvaluator
@@ -11,6 +12,7 @@
"DocumentMAPEvaluator",
"DocumentMRREvaluator",
"DocumentRecallEvaluator",
"EvaluationResult",
"FaithfulnessEvaluator",
"LLMEvaluator",
"SASEvaluator",
98 changes: 98 additions & 0 deletions haystack/components/evaluators/evaluation_result.py
@@ -0,0 +1,98 @@
from typing import Any, Dict

from pandas import DataFrame
from pandas import concat as pd_concat


class EvaluationResult:
"""
A class to store the results of an evaluation pipeline.
data = {
"inputs": {
"question": ["What is the capital of France?", "What is the capital of Spain?"],
"contexts": ["wiki_France", "wiki_Spain"],
"predicted_answer": ["Paris", "Madrid"],
},
"metrics": [
{"name": "reciprocal_rank", "scores": [0.378064, 0.534964, 0.216058, 0.778642]},
{"name": "context_relevance", "scores": [0.805466, 0.410251, 0.750070, 0.361332]},
],
}
eval_result = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)
eval_result.to_pandas()
"""

def __init__(self, pipeline_name: str, results: Dict[str, Any]):
"""
Initialize the EvaluationResult object.
:param pipeline_name: The name of the pipeline that generated the results.
:param results: A dictionary containing the results of the evaluators used in the EvaluationPipeline.
it should have the following keys:
- inputs: A dictionary containing the inputs used in the evaluation.
- metrics: A list of dictionaries each containing the following keys:
'name': The name of the metric.
'score': The aggregated score for the metric.
'individual_scores': A list of scores for each query.
"""
self.results = results
self.pipeline_name = pipeline_name

def score_report(self) -> DataFrame:
"""
Transforms the results into a DataFrame with the aggregated scores for each metric.
:returns:
A DataFrame with the aggregated scores.
"""
results = {entry["name"]: entry["score"] for entry in self.results["metrics"]}
return DataFrame.from_dict(results, orient="index", columns=["score"])

def to_pandas(self) -> DataFrame:
"""
Creates a DataFrame containing the scores for each query and each metric.
:returns:
A DataFrame with the scores.
"""
inputs_columns = list(self.results["inputs"].keys())
inputs_values = list(self.results["inputs"].values())
inputs_values = list(map(list, zip(*inputs_values))) # transpose the values
df_inputs = DataFrame(inputs_values, columns=inputs_columns)

scores_columns = [entry["name"] for entry in self.results["metrics"]]
scores_values = [entry["individual_scores"] for entry in self.results["metrics"]]
scores_values = list(map(list, zip(*scores_values))) # transpose the values
df_scores = DataFrame(scores_values, columns=scores_columns)

return df_inputs.join(df_scores)

def comparative_individual_scores_report(self, other: "EvaluationResult") -> DataFrame:
"""
Creates a DataFrame with the scores for each metric in the results of two different pipelines.
:param other: The other EvaluationResults object to compare with.
:returns:
A DataFrame with the scores from both EvaluationResults objects.
"""
pipe_a_df = self.to_pandas()
pipe_b_df = other.to_pandas()

# check that the columns match, i.e. both results cover the same queries and the same evaluators
columns_a = list(pipe_a_df.columns)
columns_b = list(pipe_b_df.columns)
if columns_a != columns_b:
raise ValueError(f"The two evaluation results do not have the same columns: {columns_a} != {columns_b}")

# drop the shared input columns from the second result and prefix the remaining columns with the pipeline names
ignore = ["query_id", "question", "contexts", "answer"]
pipe_b_df.drop(columns=ignore, inplace=True, errors="ignore")
pipe_b_df.columns = [f"{other.pipeline_name}_{column}" for column in pipe_b_df.columns]
pipe_a_df.columns = [f"{self.pipeline_name}_{col}" if col not in ignore else col for col in pipe_a_df.columns]

results_df = pd_concat([pipe_a_df, pipe_b_df], axis=1)

return results_df
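For reference, a minimal usage sketch of the API added in this file (the pipeline name and metric values below are illustrative, not taken from the commit's tests):

from haystack.components.evaluators import EvaluationResult

data = {
    "inputs": {
        "question": ["What is the capital of France?", "What is the capital of Spain?"],
        "contexts": ["wiki_France", "wiki_Spain"],
        "predicted_answer": ["Paris", "Madrid"],
    },
    "metrics": [
        # each metric carries an aggregated "score" and per-query "individual_scores"
        {"name": "reciprocal_rank", "score": 0.456514, "individual_scores": [0.378064, 0.534964]},
        {"name": "faithfulness", "score": 0.4157775, "individual_scores": [0.135581, 0.695974]},
    ],
}

eval_result = EvaluationResult(pipeline_name="rag_pipeline", results=data)

print(eval_result.score_report())  # one row per metric with its aggregated score
print(eval_result.to_pandas())     # one row per query: inputs plus individual metric scores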
@@ -0,0 +1,5 @@
---
features:
- |
Added a new EvaluationResult component.
This is a wrapper for all the results coming from the Evaluators, presenting the metric scores as a DataFrame.
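For example, results from two pipelines over the same queries can be placed side by side with comparative_individual_scores_report. A minimal sketch, assuming both pipelines report the same metrics (pipeline names and scores are illustrative):

from haystack.components.evaluators import EvaluationResult

inputs = {
    "question": ["What is the capital of France?", "What is the capital of Spain?"],
    "contexts": ["wiki_France", "wiki_Spain"],
    "predicted_answer": ["Paris", "Madrid"],
}
result_a = EvaluationResult(
    pipeline_name="pipeline_a",
    results={"inputs": inputs, "metrics": [{"name": "faithfulness", "score": 0.70, "individual_scores": [0.6, 0.8]}]},
)
result_b = EvaluationResult(
    pipeline_name="pipeline_b",
    results={"inputs": inputs, "metrics": [{"name": "faithfulness", "score": 0.80, "individual_scores": [0.7, 0.9]}]},
)

# Shared columns such as "question" and "contexts" appear once; "predicted_answer" and the
# metric columns are prefixed with each pipeline's name, e.g. "pipeline_a_faithfulness".
print(result_a.comparative_individual_scores_report(result_b))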
Empty file.
170 changes: 170 additions & 0 deletions test/components/evaluators/test_results_evaluator.py
@@ -0,0 +1,170 @@
from haystack.components.evaluators.evaluation_result import EvaluationResult


def test_init_results_evaluator():
data = {
"inputs": {
"query_id": ["53c3b3e6", "225f87f7"],
"question": ["What is the capital of France?", "What is the capital of Spain?"],
"contexts": ["wiki_France", "wiki_Spain"],
"answer": ["Paris", "Madrid"],
"predicted_answer": ["Paris", "Madrid"],
},
"metrics": [
{"name": "reciprocal_rank", "scores": [0.378064, 0.534964, 0.216058, 0.778642]},
{"name": "single_hit", "scores": [1, 1, 0, 1]},
{"name": "multi_hit", "scores": [0.706125, 0.454976, 0.445512, 0.250522]},
{"name": "context_relevance", "scores": [0.805466, 0.410251, 0.750070, 0.361332]},
{"name": "faithfulness", "scores": [0.135581, 0.695974, 0.749861, 0.041999]},
{"name": "semantic_answer_similarity", "scores": [0.971241, 0.159320, 0.019722, 1]},
],
}

_ = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)


def test_score_report():
data = {
"inputs": {
"query_id": ["53c3b3e6", "225f87f7"],
"question": ["What is the capital of France?", "What is the capital of Spain?"],
"contexts": ["wiki_France", "wiki_Spain"],
"answer": ["Paris", "Madrid"],
"predicted_answer": ["Paris", "Madrid"],
},
"metrics": [
{
"name": "reciprocal_rank",
"individual_scores": [0.378064, 0.534964, 0.216058, 0.778642],
"score": 0.476932,
},
{"name": "single_hit", "individual_scores": [1, 1, 0, 1], "score": 0.75},
{"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522], "score": 0.46428375},
{
"name": "context_relevance",
"individual_scores": [0.805466, 0.410251, 0.750070, 0.361332],
"score": 0.58177975,
},
{
"name": "faithfulness",
"individual_scores": [0.135581, 0.695974, 0.749861, 0.041999],
"score": 0.40585375,
},
{
"name": "semantic_answer_similarity",
"individual_scores": [0.971241, 0.159320, 0.019722, 1],
"score": 0.53757075,
},
],
}

evaluator = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)
result = evaluator.score_report().to_json()
assert result == (
'{"score":{"reciprocal_rank":0.476932,"single_hit":0.75,"multi_hit":0.46428375,'
'"context_relevance":0.58177975,"faithfulness":0.40585375,'
'"semantic_answer_similarity":0.53757075}}'
)


def test_to_pandas():
data = {
"inputs": {
"query_id": ["53c3b3e6", "225f87f7", "53c3b3e6", "225f87f7"],
"question": [
"What is the capital of France?",
"What is the capital of Spain?",
"What is the capital of Luxembourg?",
"What is the capital of Portugal?",
],
"contexts": ["wiki_France", "wiki_Spain", "wiki_Luxembourg", "wiki_Portugal"],
"answer": ["Paris", "Madrid", "Luxembourg", "Lisbon"],
"predicted_answer": ["Paris", "Madrid", "Luxembourg", "Lisbon"],
},
"metrics": [
{"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642]},
{"name": "single_hit", "individual_scores": [1, 1, 0, 1]},
{"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522]},
{"name": "context_relevance", "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332]},
{"name": "faithfulness", "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999]},
{"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320, 0.019722, 1]},
],
}

evaluator = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)
assert evaluator.to_pandas().to_json() == (
'{"query_id":{"0":"53c3b3e6","1":"225f87f7","2":"53c3b3e6","3":"225f87f7"},'
'"question":{"0":"What is the capital of France?","1":"What is the capital of Spain?",'
'"2":"What is the capital of Luxembourg?","3":"What is the capital of Portugal?"},'
'"contexts":{"0":"wiki_France","1":"wiki_Spain","2":"wiki_Luxembourg","3":"wiki_Portugal"},'
'"answer":{"0":"Paris","1":"Madrid","2":"Luxembourg","3":"Lisbon"},'
'"predicted_answer":{"0":"Paris","1":"Madrid","2":"Luxembourg","3":"Lisbon"},'
'"reciprocal_rank":{"0":0.378064,"1":0.534964,"2":0.216058,"3":0.778642},'
'"single_hit":{"0":1,"1":1,"2":0,"3":1},'
'"multi_hit":{"0":0.706125,"1":0.454976,"2":0.445512,"3":0.250522},'
'"context_relevance":{"0":0.805466,"1":0.410251,"2":0.75007,"3":0.361332},'
'"faithfulness":{"0":0.135581,"1":0.695974,"2":0.749861,"3":0.041999},'
'"semantic_answer_similarity":{"0":0.971241,"1":0.15932,"2":0.019722,"3":1.0}}'
)


def test_comparative_individual_scores_report():
data_1 = {
"inputs": {
"query_id": ["53c3b3e6", "225f87f7"],
"question": ["What is the capital of France?", "What is the capital of Spain?"],
"contexts": ["wiki_France", "wiki_Spain"],
"answer": ["Paris", "Madrid"],
"predicted_answer": ["Paris", "Madrid"],
},
"metrics": [
{"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642]},
{"name": "single_hit", "individual_scores": [1, 1, 0, 1]},
{"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522]},
{"name": "context_relevance", "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332]},
{"name": "faithfulness", "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999]},
{"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320, 0.019722, 1]},
],
}

data_2 = {
"inputs": {
"query_id": ["53c3b3e6", "225f87f7"],
"question": ["What is the capital of France?", "What is the capital of Spain?"],
"contexts": ["wiki_France", "wiki_Spain"],
"answer": ["Paris", "Madrid"],
"predicted_answer": ["Paris", "Madrid"],
},
"metrics": [
{"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642]},
{"name": "single_hit", "individual_scores": [1, 1, 0, 1]},
{"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522]},
{"name": "context_relevance", "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332]},
{"name": "faithfulness", "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999]},
{"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320, 0.019722, 1]},
],
}

evaluator_1 = EvaluationResult(pipeline_name="testing_pipeline_1", results=data_1)
evaluator_2 = EvaluationResult(pipeline_name="testing_pipeline_2", results=data_2)
results = evaluator_1.comparative_individual_scores_report(evaluator_2)

assert results.to_json() == (
'{"query_id":{"0":"53c3b3e6","1":"225f87f7"},'
'"question":{"0":"What is the capital of France?","1":"What is the capital of Spain?"},'
'"contexts":{"0":"wiki_France","1":"wiki_Spain"},"answer":{"0":"Paris","1":"Madrid"},'
'"testing_pipeline_1_predicted_answer":{"0":"Paris","1":"Madrid"},'
'"testing_pipeline_1_reciprocal_rank":{"0":0.378064,"1":0.534964},'
'"testing_pipeline_1_single_hit":{"0":1,"1":1},'
'"testing_pipeline_1_multi_hit":{"0":0.706125,"1":0.454976},'
'"testing_pipeline_1_context_relevance":{"0":0.805466,"1":0.410251},'
'"testing_pipeline_1_faithfulness":{"0":0.135581,"1":0.695974},'
'"testing_pipeline_1_semantic_answer_similarity":{"0":0.971241,"1":0.15932},'
'"testing_pipeline_2_predicted_answer":{"0":"Paris","1":"Madrid"},'
'"testing_pipeline_2_reciprocal_rank":{"0":0.378064,"1":0.534964},'
'"testing_pipeline_2_single_hit":{"0":1,"1":1},'
'"testing_pipeline_2_multi_hit":{"0":0.706125,"1":0.454976},'
'"testing_pipeline_2_context_relevance":{"0":0.805466,"1":0.410251},'
'"testing_pipeline_2_faithfulness":{"0":0.135581,"1":0.695974},'
'"testing_pipeline_2_semantic_answer_similarity":{"0":0.971241,"1":0.15932}}'
)
