diff --git a/haystack/components/evaluators/__init__.py b/haystack/components/evaluators/__init__.py
index 0da03f913c..f69c8257a9 100644
--- a/haystack/components/evaluators/__init__.py
+++ b/haystack/components/evaluators/__init__.py
@@ -2,6 +2,7 @@
 from .document_map import DocumentMAPEvaluator
 from .document_mrr import DocumentMRREvaluator
 from .document_recall import DocumentRecallEvaluator
+from .evaluation_result import EvaluationResult
 from .faithfulness import FaithfulnessEvaluator
 from .llm_evaluator import LLMEvaluator
 from .sas_evaluator import SASEvaluator
@@ -11,6 +12,7 @@
     "DocumentMAPEvaluator",
     "DocumentMRREvaluator",
     "DocumentRecallEvaluator",
+    "EvaluationResult",
     "FaithfulnessEvaluator",
     "LLMEvaluator",
     "SASEvaluator",
diff --git a/haystack/components/evaluators/evaluation_result.py b/haystack/components/evaluators/evaluation_result.py
new file mode 100644
index 0000000000..d2a146f3f1
--- /dev/null
+++ b/haystack/components/evaluators/evaluation_result.py
@@ -0,0 +1,98 @@
+from typing import Any, Dict
+
+from pandas import DataFrame
+from pandas import concat as pd_concat
+
+
+class EvaluationResult:
+    """
+    A class to store the results of an evaluation pipeline and expose them as pandas DataFrames.
+
+    data = {
+        "inputs": {
+            "question": ["What is the capital of France?", "What is the capital of Spain?"],
+            "contexts": ["wiki_France", "wiki_Spain"],
+            "predicted_answer": ["Paris", "Madrid"],
+        },
+        "metrics": [
+            {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964], "score": 0.456514},
+            {"name": "context_relevance", "individual_scores": [0.805466, 0.410251], "score": 0.6078585},
+        ],
+    }
+
+    eval_result = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)
+    eval_result.to_pandas()
+    """
+
+    def __init__(self, pipeline_name: str, results: Dict[str, Any]):
+        """
+        Initialize the EvaluationResult object; `results` is stored as-is, without validation.
+
+        :param pipeline_name: The name of the pipeline that generated the results.
+        :param results: A dictionary containing the results of the evaluators used in the EvaluationPipeline.
+            It should have the following keys:
+                - inputs: A dictionary mapping each input name to a list with one value per query.
+                - metrics: A list of dictionaries, each containing the following keys:
+                    'name': The name of the metric.
+                    'score': The aggregated score for the metric (read by `score_report`).
+                    'individual_scores': A list of scores for each query (read by `to_pandas`).
+        """
+        self.results = results
+        self.pipeline_name = pipeline_name
+
+    def score_report(self) -> DataFrame:
+        """
+        Transforms the results into a DataFrame with the aggregated scores for each metric.
+
+        :returns:
+            A DataFrame indexed by metric name with a single "score" column.
+        :raises KeyError: If a metric entry has no "name" or "score" key.
+        """
+        results = {entry["name"]: entry["score"] for entry in self.results["metrics"]}
+        return DataFrame.from_dict(results, orient="index", columns=["score"])
+
+    def to_pandas(self) -> DataFrame:
+        """
+        Creates a DataFrame containing the scores for each query and each metric.
+
+        :returns:
+            A DataFrame with one row per query: the input columns followed by one column per metric.
+        """
+        inputs_columns = list(self.results["inputs"].keys())
+        inputs_values = list(self.results["inputs"].values())
+        inputs_values = list(map(list, zip(*inputs_values)))  # transpose the values (zip stops at the shortest list)
+        df_inputs = DataFrame(inputs_values, columns=inputs_columns)
+
+        scores_columns = [entry["name"] for entry in self.results["metrics"]]
+        scores_values = [entry["individual_scores"] for entry in self.results["metrics"]]
+        scores_values = list(map(list, zip(*scores_values)))  # transpose the values (zip stops at the shortest list)
+        df_scores = DataFrame(scores_values, columns=scores_columns)
+
+        return df_inputs.join(df_scores)  # index-aligned join: the result keeps the rows of df_inputs
+
+    def comparative_individual_scores_report(self, other: "EvaluationResult") -> DataFrame:
+        """
+        Creates a DataFrame with the scores for each metric in the results of two different pipelines.
+
+        :param other: The other EvaluationResult object to compare with.
+        :returns: A DataFrame with the shared input columns plus the pipeline-prefixed columns of both objects.
+        :raises ValueError: If the two results do not produce the same column names, in the same order.
+        """
+        pipe_a_df = self.to_pandas()
+        pipe_b_df = other.to_pandas()
+
+        # only the column names (and their order) are compared; the row contents are not checked
+        columns_a = list(pipe_a_df.columns)
+        columns_b = list(pipe_b_df.columns)
+        if columns_a != columns_b:
+            raise ValueError(f"The two evaluation results do not have the same columns: {columns_a} != {columns_b}")
+
+        # prefix every column with its pipeline name, except the shared input columns, which are kept once (from self)
+        ignore = ["query_id", "question", "contexts", "answer"]
+        pipe_b_df.drop(columns=ignore, inplace=True, errors="ignore")
+        pipe_b_df.columns = [f"{other.pipeline_name}_{column}" for column in pipe_b_df.columns]
+        pipe_a_df.columns = [f"{self.pipeline_name}_{col}" if col not in ignore else col for col in pipe_a_df.columns]
+
+        results_df = pd_concat([pipe_a_df, pipe_b_df], axis=1)
+
+        return results_df
diff --git a/releasenotes/notes/implemeting-eval-results-API-25b2f8707495bea0.yaml b/releasenotes/notes/implemeting-eval-results-API-25b2f8707495bea0.yaml
new file mode 100644
index 0000000000..2f02640cae
--- /dev/null
+++ b/releasenotes/notes/implemeting-eval-results-API-25b2f8707495bea0.yaml
@@ -0,0 +1,5 @@
+---
+features:
+  - |
+    Added a new EvaluationResult component.
+    This is a wrapper for all the results coming from the Evaluators, presenting the metric scores as a DataFrame.
diff --git a/test/components/evaluators/__init__.py b/test/components/evaluators/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/components/evaluators/test_results_evaluator.py b/test/components/evaluators/test_results_evaluator.py
new file mode 100644
index 0000000000..4e07c9679c
--- /dev/null
+++ b/test/components/evaluators/test_results_evaluator.py
@@ -0,0 +1,170 @@
+from haystack.components.evaluators.evaluation_result import EvaluationResult
+
+
+def test_init_results_evaluator():
+    data = {
+        "inputs": {
+            "query_id": ["53c3b3e6", "225f87f7"],
+            "question": ["What is the capital of France?", "What is the capital of Spain?"],
+            "contexts": ["wiki_France", "wiki_Spain"],
+            "answer": ["Paris", "Madrid"],
+            "predicted_answer": ["Paris", "Madrid"],
+        },
+        "metrics": [  # NOTE(review): "scores" is never read by EvaluationResult; harmless for a smoke test
+            {"name": "reciprocal_rank", "scores": [0.378064, 0.534964, 0.216058, 0.778642]},
+            {"name": "single_hit", "scores": [1, 1, 0, 1]},
+            {"name": "multi_hit", "scores": [0.706125, 0.454976, 0.445512, 0.250522]},
+            {"name": "context_relevance", "scores": [0.805466, 0.410251, 0.750070, 0.361332]},
+            {"name": "faithfulness", "scores": [0.135581, 0.695974, 0.749861, 0.041999]},
+            {"name": "semantic_answer_similarity", "scores": [0.971241, 0.159320, 0.019722, 1]},
+        ],
+    }
+
+    _ = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)  # smoke test: must not raise
+
+
+def test_score_report():
+    data = {
+        "inputs": {
+            "query_id": ["53c3b3e6", "225f87f7"],
+            "question": ["What is the capital of France?", "What is the capital of Spain?"],
+            "contexts": ["wiki_France", "wiki_Spain"],
+            "answer": ["Paris", "Madrid"],
+            "predicted_answer": ["Paris", "Madrid"],
+        },
+        "metrics": [
+            {
+                "name": "reciprocal_rank",
+                "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642],
+                "score": 0.476932,
+            },
+            {"name": "single_hit", "individual_scores": [1, 1, 0, 1], "score": 0.75},
+            {"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522], "score": 0.46428375},
+            {
+                "name": "context_relevance",
+                "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332],
+                "score": 0.58177975,
+            },
+            {
+                "name": "faithfulness",
+                "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999],
+                "score": 0.40585375,
+            },
+            {
+                "name": "semantic_answer_similarity",
+                "individual_scores": [0.971241, 0.159320, 0.019722, 1],
+                "score": 0.53757075,
+            },
+        ],
+    }
+
+    evaluator = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)
+    result = evaluator.score_report().to_json()  # only each metric's aggregated "score" is reported
+    assert result == (
+        '{"score":{"reciprocal_rank":0.476932,"single_hit":0.75,"multi_hit":0.46428375,'
+        '"context_relevance":0.58177975,"faithfulness":0.40585375,'
+        '"semantic_answer_similarity":0.53757075}}'
+    )
+
+
+def test_to_pandas():
+    data = {
+        "inputs": {
+            "query_id": ["53c3b3e6", "225f87f7", "53c3b3e6", "225f87f7"],
+            "question": [
+                "What is the capital of France?",
+                "What is the capital of Spain?",
+                "What is the capital of Luxembourg?",
+                "What is the capital of Portugal?",
+            ],
+            "contexts": ["wiki_France", "wiki_Spain", "wiki_Luxembourg", "wiki_Portugal"],
+            "answer": ["Paris", "Madrid", "Luxembourg", "Lisbon"],
+            "predicted_answer": ["Paris", "Madrid", "Luxembourg", "Lisbon"],
+        },
+        "metrics": [
+            {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642]},
+            {"name": "single_hit", "individual_scores": [1, 1, 0, 1]},
+            {"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522]},
+            {"name": "context_relevance", "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332]},
+            {"name": "faithfulness", "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999]},
+            {"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320, 0.019722, 1]},
+        ],
+    }
+
+    evaluator = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)
+    assert evaluator.to_pandas().to_json() == (  # input columns first, then one column per metric
+        '{"query_id":{"0":"53c3b3e6","1":"225f87f7","2":"53c3b3e6","3":"225f87f7"},'
+        '"question":{"0":"What is the capital of France?","1":"What is the capital of Spain?",'
+        '"2":"What is the capital of Luxembourg?","3":"What is the capital of Portugal?"},'
+        '"contexts":{"0":"wiki_France","1":"wiki_Spain","2":"wiki_Luxembourg","3":"wiki_Portugal"},'
+        '"answer":{"0":"Paris","1":"Madrid","2":"Luxembourg","3":"Lisbon"},'
+        '"predicted_answer":{"0":"Paris","1":"Madrid","2":"Luxembourg","3":"Lisbon"},'
+        '"reciprocal_rank":{"0":0.378064,"1":0.534964,"2":0.216058,"3":0.778642},'
+        '"single_hit":{"0":1,"1":1,"2":0,"3":1},'
+        '"multi_hit":{"0":0.706125,"1":0.454976,"2":0.445512,"3":0.250522},'
+        '"context_relevance":{"0":0.805466,"1":0.410251,"2":0.75007,"3":0.361332},'
+        '"faithfulness":{"0":0.135581,"1":0.695974,"2":0.749861,"3":0.041999},'
+        '"semantic_answer_similarity":{"0":0.971241,"1":0.15932,"2":0.019722,"3":1.0}}'  # int 1 comes out as 1.0
+    )
+
+
+def test_comparative_individual_scores_report():
+    data_1 = {
+        "inputs": {  # 2 queries vs. 4 scores per metric: the expected frame below has only rows 0 and 1
+            "query_id": ["53c3b3e6", "225f87f7"],
+            "question": ["What is the capital of France?", "What is the capital of Spain?"],
+            "contexts": ["wiki_France", "wiki_Spain"],
+            "answer": ["Paris", "Madrid"],
+            "predicted_answer": ["Paris", "Madrid"],
+        },
+        "metrics": [
+            {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642]},
+            {"name": "single_hit", "individual_scores": [1, 1, 0, 1]},
+            {"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522]},
+            {"name": "context_relevance", "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332]},
+            {"name": "faithfulness", "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999]},
+            {"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320, 0.019722, 1]},
+        ],
+    }
+
+    data_2 = {
+        "inputs": {  # same shape as data_1 (2 queries, 4 scores per metric)
+            "query_id": ["53c3b3e6", "225f87f7"],
+            "question": ["What is the capital of France?", "What is the capital of Spain?"],
+            "contexts": ["wiki_France", "wiki_Spain"],
+            "answer": ["Paris", "Madrid"],
+            "predicted_answer": ["Paris", "Madrid"],
+        },
+        "metrics": [
+            {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642]},
+            {"name": "single_hit", "individual_scores": [1, 1, 0, 1]},
+            {"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522]},
+            {"name": "context_relevance", "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332]},
+            {"name": "faithfulness", "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999]},
+            {"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320, 0.019722, 1]},
+        ],
+    }
+
+    evaluator_1 = EvaluationResult(pipeline_name="testing_pipeline_1", results=data_1)
+    evaluator_2 = EvaluationResult(pipeline_name="testing_pipeline_2", results=data_2)
+    results = evaluator_1.comparative_individual_scores_report(evaluator_2)
+
+    assert results.to_json() == (  # shared input columns appear once; the rest are prefixed per pipeline
+        '{"query_id":{"0":"53c3b3e6","1":"225f87f7"},'
+        '"question":{"0":"What is the capital of France?","1":"What is the capital of Spain?"},'
+        '"contexts":{"0":"wiki_France","1":"wiki_Spain"},"answer":{"0":"Paris","1":"Madrid"},'
+        '"testing_pipeline_1_predicted_answer":{"0":"Paris","1":"Madrid"},'
+        '"testing_pipeline_1_reciprocal_rank":{"0":0.378064,"1":0.534964},'
+        '"testing_pipeline_1_single_hit":{"0":1,"1":1},'
+        '"testing_pipeline_1_multi_hit":{"0":0.706125,"1":0.454976},'
+        '"testing_pipeline_1_context_relevance":{"0":0.805466,"1":0.410251},'
+        '"testing_pipeline_1_faithfulness":{"0":0.135581,"1":0.695974},'
+        '"testing_pipeline_1_semantic_answer_similarity":{"0":0.971241,"1":0.15932},'
+        '"testing_pipeline_2_predicted_answer":{"0":"Paris","1":"Madrid"},'
+        '"testing_pipeline_2_reciprocal_rank":{"0":0.378064,"1":0.534964},'
+        '"testing_pipeline_2_single_hit":{"0":1,"1":1},'
+        '"testing_pipeline_2_multi_hit":{"0":0.706125,"1":0.454976},'
+        '"testing_pipeline_2_context_relevance":{"0":0.805466,"1":0.410251},'
+        '"testing_pipeline_2_faithfulness":{"0":0.135581,"1":0.695974},'
+        '"testing_pipeline_2_semantic_answer_similarity":{"0":0.971241,"1":0.15932}}'
+    )