-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: implementing evaluation results API (#7520)
* initial import * adding tests * attending PR comments * fixing tests * updating tests * updating tests and code * renaming * fixing linting issues * adding release notes * adding docstrings * latest fixes
- Loading branch information
1 parent
e974a23
commit 9a9c8aa
Showing
5 changed files
with
275 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
from typing import Any, Dict | ||
|
||
from pandas import DataFrame | ||
from pandas import concat as pd_concat | ||
|
||
|
||
class EvaluationResult:
    """
    A class to store the results of an evaluation pipeline.

    Usage example:

        data = {
            "inputs": {
                "question": ["What is the capital of France?", "What is the capital of Spain?"],
                "contexts": ["wiki_France", "wiki_Spain"],
                "predicted_answer": ["Paris", "Madrid"],
            },
            "metrics": [
                {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964], "score": 0.456514},
                {"name": "context_relevance", "individual_scores": [0.805466, 0.410251], "score": 0.6078585},
            ],
        }
        eval_result = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)
        eval_result.to_pandas()
    """

    def __init__(self, pipeline_name: str, results: Dict[str, Any]):
        """
        Initialize the EvaluationResult object.

        :param pipeline_name: The name of the pipeline that generated the results.
        :param results: A dictionary containing the results of the evaluators used in the EvaluationPipeline.
            It should have the following keys:
                - inputs: A dictionary mapping each input name to the list of its values, one per query.
                - metrics: A list of dictionaries each containing the following keys:
                    'name': The name of the metric.
                    'score': The aggregated score for the metric (read by `score_report`).
                    'individual_scores': A list of scores for each query (read by `to_pandas`).
        """
        # The dictionary is stored as-is; its keys are only accessed when a report is requested.
        self.results = results
        self.pipeline_name = pipeline_name

    def score_report(self) -> DataFrame:
        """
        Transforms the results into a DataFrame with the aggregated scores for each metric.

        :returns:
            A DataFrame indexed by metric name with a single "score" column.
        :raises KeyError:
            If a metric entry has no 'name' or 'score' key.
        """
        aggregated = {entry["name"]: entry["score"] for entry in self.results["metrics"]}
        return DataFrame.from_dict(aggregated, orient="index", columns=["score"])

    def to_pandas(self) -> DataFrame:
        """
        Creates a DataFrame containing the scores for each query and each metric.

        The input columns come first, followed by one column per metric. The scores are
        joined onto the inputs by row position.

        :returns:
            A DataFrame with the inputs and the individual scores.
        :raises KeyError:
            If a metric entry has no 'name' or 'individual_scores' key.
        """
        inputs = self.results["inputs"]
        metrics = self.results["metrics"]

        # zip(*columns) turns the per-column lists into per-row lists (a transpose).
        input_rows = [list(row) for row in zip(*inputs.values())]
        df_inputs = DataFrame(input_rows, columns=list(inputs.keys()))

        score_rows = [list(row) for row in zip(*(entry["individual_scores"] for entry in metrics))]
        df_scores = DataFrame(score_rows, columns=[entry["name"] for entry in metrics])

        return df_inputs.join(df_scores)

    def comparative_individual_scores_report(self, other: "EvaluationResult") -> DataFrame:
        """
        Creates a DataFrame with the scores for each metric in the results of two different pipelines.

        The columns "query_id", "question", "contexts" and "answer" are kept once and unprefixed;
        every other column appears twice, prefixed with the name of the pipeline it belongs to.

        :param other: The other EvaluationResult object to compare with.
        :returns:
            A DataFrame with the scores from both EvaluationResult objects.
        :raises ValueError:
            If the two results do not produce the same columns in the same order.
        """
        pipe_a_df = self.to_pandas()
        pipe_b_df = other.to_pandas()

        # check if the columns are the same, i.e.: the same queries and evaluation pipeline
        columns_a = list(pipe_a_df.columns)
        columns_b = list(pipe_b_df.columns)
        if columns_a != columns_b:
            raise ValueError(f"The two evaluation results do not have the same columns: {columns_a} != {columns_b}")

        # Columns shared by both pipelines: kept once (from this object) and never prefixed.
        ignore = ["query_id", "question", "contexts", "answer"]

        # errors="ignore" because not every results dictionary provides all of the shared columns.
        pipe_b_df = pipe_b_df.drop(columns=ignore, errors="ignore")

        # add the pipeline name to the column
        pipe_b_df.columns = [f"{other.pipeline_name}_{column}" for column in pipe_b_df.columns]
        pipe_a_df.columns = [f"{self.pipeline_name}_{col}" if col not in ignore else col for col in pipe_a_df.columns]

        return pd_concat([pipe_a_df, pipe_b_df], axis=1)
5 changes: 5 additions & 0 deletions
5
releasenotes/notes/implemeting-eval-results-API-25b2f8707495bea0.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
--- | ||
features: | ||
- | | ||
Added a new EvaluationResult component. | ||
This is a wrapper for all the results coming from the Evaluators, presenting the metric scores as a DataFrame. |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
from haystack.components.evaluators.evaluation_result import EvaluationResult | ||
|
||
|
||
def test_init_results_evaluator():
    """Constructing an EvaluationResult from a results dictionary must not raise."""
    inputs = {
        "query_id": ["53c3b3e6", "225f87f7"],
        "question": ["What is the capital of France?", "What is the capital of Spain?"],
        "contexts": ["wiki_France", "wiki_Spain"],
        "answer": ["Paris", "Madrid"],
        "predicted_answer": ["Paris", "Madrid"],
    }
    metrics = [
        {"name": "reciprocal_rank", "scores": [0.378064, 0.534964, 0.216058, 0.778642]},
        {"name": "single_hit", "scores": [1, 1, 0, 1]},
        {"name": "multi_hit", "scores": [0.706125, 0.454976, 0.445512, 0.250522]},
        {"name": "context_relevance", "scores": [0.805466, 0.410251, 0.750070, 0.361332]},
        {"name": "faithfulness", "scores": [0.135581, 0.695974, 0.749861, 0.041999]},
        {"name": "semantic_answer_similarity", "scores": [0.971241, 0.159320, 0.019722, 1]},
    ]
    payload = {"inputs": inputs, "metrics": metrics}

    # Only the constructor is exercised here; no report is built from the payload.
    _ = EvaluationResult(pipeline_name="testing_pipeline_1", results=payload)
|
||
|
||
def test_score_report():
    """score_report() must expose the aggregated 'score' of every metric, keyed by metric name."""
    inputs = {
        "query_id": ["53c3b3e6", "225f87f7"],
        "question": ["What is the capital of France?", "What is the capital of Spain?"],
        "contexts": ["wiki_France", "wiki_Spain"],
        "answer": ["Paris", "Madrid"],
        "predicted_answer": ["Paris", "Madrid"],
    }
    metrics = [
        {
            "name": "reciprocal_rank",
            "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642],
            "score": 0.476932,
        },
        {"name": "single_hit", "individual_scores": [1, 1, 0, 1], "score": 0.75},
        {"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522], "score": 0.46428375},
        {
            "name": "context_relevance",
            "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332],
            "score": 0.58177975,
        },
        {
            "name": "faithfulness",
            "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999],
            "score": 0.40585375,
        },
        {
            "name": "semantic_answer_similarity",
            "individual_scores": [0.971241, 0.159320, 0.019722, 1],
            "score": 0.53757075,
        },
    ]
    payload = {"inputs": inputs, "metrics": metrics}

    expected_json = (
        '{"score":{"reciprocal_rank":0.476932,"single_hit":0.75,"multi_hit":0.46428375,'
        '"context_relevance":0.58177975,"faithfulness":0.40585375,'
        '"semantic_answer_similarity":0.53757075}}'
    )

    report = EvaluationResult(pipeline_name="testing_pipeline_1", results=payload).score_report()
    assert report.to_json() == expected_json
|
||
|
||
def test_to_pandas():
    """to_pandas() must return one row per query holding the inputs followed by every metric's score."""
    inputs = {
        "query_id": ["53c3b3e6", "225f87f7", "53c3b3e6", "225f87f7"],
        "question": [
            "What is the capital of France?",
            "What is the capital of Spain?",
            "What is the capital of Luxembourg?",
            "What is the capital of Portugal?",
        ],
        "contexts": ["wiki_France", "wiki_Spain", "wiki_Luxembourg", "wiki_Portugal"],
        "answer": ["Paris", "Madrid", "Luxembourg", "Lisbon"],
        "predicted_answer": ["Paris", "Madrid", "Luxembourg", "Lisbon"],
    }
    metrics = [
        {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642]},
        {"name": "single_hit", "individual_scores": [1, 1, 0, 1]},
        {"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522]},
        {"name": "context_relevance", "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332]},
        {"name": "faithfulness", "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999]},
        {"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320, 0.019722, 1]},
    ]
    payload = {"inputs": inputs, "metrics": metrics}

    expected_json = (
        '{"query_id":{"0":"53c3b3e6","1":"225f87f7","2":"53c3b3e6","3":"225f87f7"},'
        '"question":{"0":"What is the capital of France?","1":"What is the capital of Spain?",'
        '"2":"What is the capital of Luxembourg?","3":"What is the capital of Portugal?"},'
        '"contexts":{"0":"wiki_France","1":"wiki_Spain","2":"wiki_Luxembourg","3":"wiki_Portugal"},'
        '"answer":{"0":"Paris","1":"Madrid","2":"Luxembourg","3":"Lisbon"},'
        '"predicted_answer":{"0":"Paris","1":"Madrid","2":"Luxembourg","3":"Lisbon"},'
        '"reciprocal_rank":{"0":0.378064,"1":0.534964,"2":0.216058,"3":0.778642},'
        '"single_hit":{"0":1,"1":1,"2":0,"3":1},'
        '"multi_hit":{"0":0.706125,"1":0.454976,"2":0.445512,"3":0.250522},'
        '"context_relevance":{"0":0.805466,"1":0.410251,"2":0.75007,"3":0.361332},'
        '"faithfulness":{"0":0.135581,"1":0.695974,"2":0.749861,"3":0.041999},'
        '"semantic_answer_similarity":{"0":0.971241,"1":0.15932,"2":0.019722,"3":1.0}}'
    )

    frame = EvaluationResult(pipeline_name="testing_pipeline_1", results=payload).to_pandas()
    assert frame.to_json() == expected_json
|
||
|
||
def test_comparative_individual_scores_report():
    """Comparing two results must keep the shared columns once and prefix the rest with each pipeline name."""

    def build_results():
        # Returns a fresh dictionary on every call so the two pipelines never share state.
        return {
            "inputs": {
                "query_id": ["53c3b3e6", "225f87f7"],
                "question": ["What is the capital of France?", "What is the capital of Spain?"],
                "contexts": ["wiki_France", "wiki_Spain"],
                "answer": ["Paris", "Madrid"],
                "predicted_answer": ["Paris", "Madrid"],
            },
            "metrics": [
                {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642]},
                {"name": "single_hit", "individual_scores": [1, 1, 0, 1]},
                {"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522]},
                {"name": "context_relevance", "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332]},
                {"name": "faithfulness", "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999]},
                {"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320, 0.019722, 1]},
            ],
        }

    first = EvaluationResult(pipeline_name="testing_pipeline_1", results=build_results())
    second = EvaluationResult(pipeline_name="testing_pipeline_2", results=build_results())

    expected_json = (
        '{"query_id":{"0":"53c3b3e6","1":"225f87f7"},'
        '"question":{"0":"What is the capital of France?","1":"What is the capital of Spain?"},'
        '"contexts":{"0":"wiki_France","1":"wiki_Spain"},"answer":{"0":"Paris","1":"Madrid"},'
        '"testing_pipeline_1_predicted_answer":{"0":"Paris","1":"Madrid"},'
        '"testing_pipeline_1_reciprocal_rank":{"0":0.378064,"1":0.534964},'
        '"testing_pipeline_1_single_hit":{"0":1,"1":1},'
        '"testing_pipeline_1_multi_hit":{"0":0.706125,"1":0.454976},'
        '"testing_pipeline_1_context_relevance":{"0":0.805466,"1":0.410251},'
        '"testing_pipeline_1_faithfulness":{"0":0.135581,"1":0.695974},'
        '"testing_pipeline_1_semantic_answer_similarity":{"0":0.971241,"1":0.15932},'
        '"testing_pipeline_2_predicted_answer":{"0":"Paris","1":"Madrid"},'
        '"testing_pipeline_2_reciprocal_rank":{"0":0.378064,"1":0.534964},'
        '"testing_pipeline_2_single_hit":{"0":1,"1":1},'
        '"testing_pipeline_2_multi_hit":{"0":0.706125,"1":0.454976},'
        '"testing_pipeline_2_context_relevance":{"0":0.805466,"1":0.410251},'
        '"testing_pipeline_2_faithfulness":{"0":0.135581,"1":0.695974},'
        '"testing_pipeline_2_semantic_answer_similarity":{"0":0.971241,"1":0.15932}}'
    )

    comparison = first.comparative_individual_scores_report(second)
    assert comparison.to_json() == expected_json