From 9a9c8aa1c86aa43fb239e053d70cebdb4734d4d0 Mon Sep 17 00:00:00 2001
From: "David S. Batista" <dsbatista@gmail.com>
Date: Wed, 10 Apr 2024 15:34:03 +0200
Subject: [PATCH] feat: implementing evaluation results API (#7520)

* initial import

* adding tests

* attending PR comments

* fixing tests

* updating tests

* updating tests and code

* renaming

* fixing linting issues

* adding release notes

* adding docstrings

* latest fixes
---
 haystack/components/evaluators/__init__.py    |   2 +
 .../evaluators/evaluation_result.py           |  98 ++++++++++
 ...ing-eval-results-API-25b2f8707495bea0.yaml |   5 +
 test/components/evaluators/__init__.py        |   0
 .../evaluators/test_results_evaluator.py      | 170 ++++++++++++++++++
 5 files changed, 275 insertions(+)
 create mode 100644 haystack/components/evaluators/evaluation_result.py
 create mode 100644 releasenotes/notes/implemeting-eval-results-API-25b2f8707495bea0.yaml
 create mode 100644 test/components/evaluators/__init__.py
 create mode 100644 test/components/evaluators/test_results_evaluator.py

diff --git a/haystack/components/evaluators/__init__.py b/haystack/components/evaluators/__init__.py
index 0da03f913c..f69c8257a9 100644
--- a/haystack/components/evaluators/__init__.py
+++ b/haystack/components/evaluators/__init__.py
@@ -2,6 +2,7 @@
 from .document_map import DocumentMAPEvaluator
 from .document_mrr import DocumentMRREvaluator
 from .document_recall import DocumentRecallEvaluator
+from .evaluation_result import EvaluationResult
 from .faithfulness import FaithfulnessEvaluator
 from .llm_evaluator import LLMEvaluator
 from .sas_evaluator import SASEvaluator
@@ -11,6 +12,7 @@
     "DocumentMAPEvaluator",
     "DocumentMRREvaluator",
     "DocumentRecallEvaluator",
+    "EvaluationResult",
     "FaithfulnessEvaluator",
     "LLMEvaluator",
     "SASEvaluator",
diff --git a/haystack/components/evaluators/evaluation_result.py b/haystack/components/evaluators/evaluation_result.py
new file mode 100644
index 0000000000..d2a146f3f1
--- /dev/null
+++ b/haystack/components/evaluators/evaluation_result.py
@@ -0,0 +1,98 @@
+from typing import Any, Dict
+
+from pandas import DataFrame
+from pandas import concat as pd_concat
+
+
+class EvaluationResult:
+    """
+    A class to store the results of an evaluation pipeline.
+
+    data = {
+        "inputs": {
+            "question": ["What is the capital of France?", "What is the capital of Spain?"],
+            "contexts": ["wiki_France", "wiki_Spain"],
+            "predicted_answer": ["Paris", "Madrid"],
+        },
+        "metrics": [
+            {"name": "reciprocal_rank", "score": 0.456514, "individual_scores": [0.378064, 0.534964]},
+            {"name": "context_relevance", "score": 0.607859, "individual_scores": [0.805466, 0.410251]},
+        ],
+    }
+
+    eval_result = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)
+    eval_result.to_pandas()
+    """
+
+    def __init__(self, pipeline_name: str, results: Dict[str, Any]):
+        """
+        Initialize the EvaluationResult object.
+
+        :param pipeline_name: The name of the pipeline that generated the results.
+        :param results: A dictionary containing the results of the evaluators used in the EvaluationPipeline.
+                        It should have the following keys:
+                        - inputs: A dictionary containing the inputs used in the evaluation.
+                        - metrics: A list of dictionaries each containing the following keys:
+                            'name': The name of the metric.
+                            'score': The aggregated score for the metric.
+                            'individual_scores': A list of scores for each query.
+        """
+        self.results = results
+        self.pipeline_name = pipeline_name
+
+    def score_report(self) -> DataFrame:
+        """
+        Transforms the results into a DataFrame with the aggregated scores for each metric.
+
+        :returns:
+            A DataFrame with the aggregated scores.
+
+        """
+        results = {entry["name"]: entry["score"] for entry in self.results["metrics"]}
+        return DataFrame.from_dict(results, orient="index", columns=["score"])
+
+    def to_pandas(self) -> DataFrame:
+        """
+        Creates a DataFrame containing the scores for each query and each metric.
+
+        :returns:
+            A DataFrame with the scores.
+        """
+        inputs_columns = list(self.results["inputs"].keys())
+        inputs_values = list(self.results["inputs"].values())
+        inputs_values = list(map(list, zip(*inputs_values)))  # transpose the values
+        df_inputs = DataFrame(inputs_values, columns=inputs_columns)
+
+        scores_columns = [entry["name"] for entry in self.results["metrics"]]
+        scores_values = [entry["individual_scores"] for entry in self.results["metrics"]]
+        scores_values = list(map(list, zip(*scores_values)))  # transpose the values
+        df_scores = DataFrame(scores_values, columns=scores_columns)
+
+        return df_inputs.join(df_scores)
+
+    def comparative_individual_scores_report(self, other: "EvaluationResult") -> DataFrame:
+        """
+        Creates a DataFrame with the scores for each metric in the results of two different pipelines.
+
+        :param other: The other EvaluationResult object to compare with.
+        :returns:
+            A DataFrame with the scores from both EvaluationResult objects.
+        """
+        pipe_a_df = self.to_pandas()
+        pipe_b_df = other.to_pandas()
+
+        # check that both results have the same columns, i.e.: the same input fields and metric names
+        columns_a = list(pipe_a_df.columns)
+        columns_b = list(pipe_b_df.columns)
+        if columns_a != columns_b:
+            raise ValueError(f"The two evaluation results do not have the same columns: {columns_a} != {columns_b}")
+
+        # add the pipeline name to the column
+        ignore = ["query_id", "question", "contexts", "answer"]
+        pipe_b_df.drop(columns=ignore, inplace=True, errors="ignore")
+        pipe_b_df.columns = [f"{other.pipeline_name}_{column}" for column in pipe_b_df.columns]
+        pipe_a_df.columns = [f"{self.pipeline_name}_{col}" if col not in ignore else col for col in pipe_a_df.columns]
+
+        results_df = pd_concat([pipe_a_df, pipe_b_df], axis=1)
+
+        return results_df
diff --git a/releasenotes/notes/implemeting-eval-results-API-25b2f8707495bea0.yaml b/releasenotes/notes/implemeting-eval-results-API-25b2f8707495bea0.yaml
new file mode 100644
index 0000000000..2f02640cae
--- /dev/null
+++ b/releasenotes/notes/implemeting-eval-results-API-25b2f8707495bea0.yaml
@@ -0,0 +1,5 @@
+---
+features:
+  - |
+    Added a new EvaluationResult component.
+    This is a wrapper for all the results coming from the Evaluators, presenting the metric scores as a DataFrame.
diff --git a/test/components/evaluators/__init__.py b/test/components/evaluators/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/components/evaluators/test_results_evaluator.py b/test/components/evaluators/test_results_evaluator.py
new file mode 100644
index 0000000000..4e07c9679c
--- /dev/null
+++ b/test/components/evaluators/test_results_evaluator.py
@@ -0,0 +1,170 @@
+from haystack.components.evaluators.evaluation_result import EvaluationResult
+
+
+def test_init_results_evaluator():
+    data = {
+        "inputs": {
+            "query_id": ["53c3b3e6", "225f87f7"],
+            "question": ["What is the capital of France?", "What is the capital of Spain?"],
+            "contexts": ["wiki_France", "wiki_Spain"],
+            "answer": ["Paris", "Madrid"],
+            "predicted_answer": ["Paris", "Madrid"],
+        },
+        "metrics": [
+            {"name": "reciprocal_rank", "scores": [0.378064, 0.534964, 0.216058, 0.778642]},
+            {"name": "single_hit", "scores": [1, 1, 0, 1]},
+            {"name": "multi_hit", "scores": [0.706125, 0.454976, 0.445512, 0.250522]},
+            {"name": "context_relevance", "scores": [0.805466, 0.410251, 0.750070, 0.361332]},
+            {"name": "faithfulness", "scores": [0.135581, 0.695974, 0.749861, 0.041999]},
+            {"name": "semantic_answer_similarity", "scores": [0.971241, 0.159320, 0.019722, 1]},
+        ],
+    }
+
+    _ = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)
+
+
+def test_score_report():
+    data = {
+        "inputs": {
+            "query_id": ["53c3b3e6", "225f87f7"],
+            "question": ["What is the capital of France?", "What is the capital of Spain?"],
+            "contexts": ["wiki_France", "wiki_Spain"],
+            "answer": ["Paris", "Madrid"],
+            "predicted_answer": ["Paris", "Madrid"],
+        },
+        "metrics": [
+            {
+                "name": "reciprocal_rank",
+                "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642],
+                "score": 0.476932,
+            },
+            {"name": "single_hit", "individual_scores": [1, 1, 0, 1], "score": 0.75},
+            {"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522], "score": 0.46428375},
+            {
+                "name": "context_relevance",
+                "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332],
+                "score": 0.58177975,
+            },
+            {
+                "name": "faithfulness",
+                "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999],
+                "score": 0.40585375,
+            },
+            {
+                "name": "semantic_answer_similarity",
+                "individual_scores": [0.971241, 0.159320, 0.019722, 1],
+                "score": 0.53757075,
+            },
+        ],
+    }
+
+    evaluator = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)
+    result = evaluator.score_report().to_json()
+    assert result == (
+        '{"score":{"reciprocal_rank":0.476932,"single_hit":0.75,"multi_hit":0.46428375,'
+        '"context_relevance":0.58177975,"faithfulness":0.40585375,'
+        '"semantic_answer_similarity":0.53757075}}'
+    )
+
+
+def test_to_pandas():
+    data = {
+        "inputs": {
+            "query_id": ["53c3b3e6", "225f87f7", "53c3b3e6", "225f87f7"],
+            "question": [
+                "What is the capital of France?",
+                "What is the capital of Spain?",
+                "What is the capital of Luxembourg?",
+                "What is the capital of Portugal?",
+            ],
+            "contexts": ["wiki_France", "wiki_Spain", "wiki_Luxembourg", "wiki_Portugal"],
+            "answer": ["Paris", "Madrid", "Luxembourg", "Lisbon"],
+            "predicted_answer": ["Paris", "Madrid", "Luxembourg", "Lisbon"],
+        },
+        "metrics": [
+            {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642]},
+            {"name": "single_hit", "individual_scores": [1, 1, 0, 1]},
+            {"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522]},
+            {"name": "context_relevance", "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332]},
+            {"name": "faithfulness", "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999]},
+            {"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320, 0.019722, 1]},
+        ],
+    }
+
+    evaluator = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)
+    assert evaluator.to_pandas().to_json() == (
+        '{"query_id":{"0":"53c3b3e6","1":"225f87f7","2":"53c3b3e6","3":"225f87f7"},'
+        '"question":{"0":"What is the capital of France?","1":"What is the capital of Spain?",'
+        '"2":"What is the capital of Luxembourg?","3":"What is the capital of Portugal?"},'
+        '"contexts":{"0":"wiki_France","1":"wiki_Spain","2":"wiki_Luxembourg","3":"wiki_Portugal"},'
+        '"answer":{"0":"Paris","1":"Madrid","2":"Luxembourg","3":"Lisbon"},'
+        '"predicted_answer":{"0":"Paris","1":"Madrid","2":"Luxembourg","3":"Lisbon"},'
+        '"reciprocal_rank":{"0":0.378064,"1":0.534964,"2":0.216058,"3":0.778642},'
+        '"single_hit":{"0":1,"1":1,"2":0,"3":1},'
+        '"multi_hit":{"0":0.706125,"1":0.454976,"2":0.445512,"3":0.250522},'
+        '"context_relevance":{"0":0.805466,"1":0.410251,"2":0.75007,"3":0.361332},'
+        '"faithfulness":{"0":0.135581,"1":0.695974,"2":0.749861,"3":0.041999},'
+        '"semantic_answer_similarity":{"0":0.971241,"1":0.15932,"2":0.019722,"3":1.0}}'
+    )
+
+
+def test_comparative_individual_scores_report():
+    data_1 = {
+        "inputs": {
+            "query_id": ["53c3b3e6", "225f87f7"],
+            "question": ["What is the capital of France?", "What is the capital of Spain?"],
+            "contexts": ["wiki_France", "wiki_Spain"],
+            "answer": ["Paris", "Madrid"],
+            "predicted_answer": ["Paris", "Madrid"],
+        },
+        "metrics": [
+            {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642]},
+            {"name": "single_hit", "individual_scores": [1, 1, 0, 1]},
+            {"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522]},
+            {"name": "context_relevance", "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332]},
+            {"name": "faithfulness", "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999]},
+            {"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320, 0.019722, 1]},
+        ],
+    }
+
+    data_2 = {
+        "inputs": {
+            "query_id": ["53c3b3e6", "225f87f7"],
+            "question": ["What is the capital of France?", "What is the capital of Spain?"],
+            "contexts": ["wiki_France", "wiki_Spain"],
+            "answer": ["Paris", "Madrid"],
+            "predicted_answer": ["Paris", "Madrid"],
+        },
+        "metrics": [
+            {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642]},
+            {"name": "single_hit", "individual_scores": [1, 1, 0, 1]},
+            {"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522]},
+            {"name": "context_relevance", "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332]},
+            {"name": "faithfulness", "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999]},
+            {"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320, 0.019722, 1]},
+        ],
+    }
+
+    evaluator_1 = EvaluationResult(pipeline_name="testing_pipeline_1", results=data_1)
+    evaluator_2 = EvaluationResult(pipeline_name="testing_pipeline_2", results=data_2)
+    results = evaluator_1.comparative_individual_scores_report(evaluator_2)
+
+    assert results.to_json() == (
+        '{"query_id":{"0":"53c3b3e6","1":"225f87f7"},'
+        '"question":{"0":"What is the capital of France?","1":"What is the capital of Spain?"},'
+        '"contexts":{"0":"wiki_France","1":"wiki_Spain"},"answer":{"0":"Paris","1":"Madrid"},'
+        '"testing_pipeline_1_predicted_answer":{"0":"Paris","1":"Madrid"},'
+        '"testing_pipeline_1_reciprocal_rank":{"0":0.378064,"1":0.534964},'
+        '"testing_pipeline_1_single_hit":{"0":1,"1":1},'
+        '"testing_pipeline_1_multi_hit":{"0":0.706125,"1":0.454976},'
+        '"testing_pipeline_1_context_relevance":{"0":0.805466,"1":0.410251},'
+        '"testing_pipeline_1_faithfulness":{"0":0.135581,"1":0.695974},'
+        '"testing_pipeline_1_semantic_answer_similarity":{"0":0.971241,"1":0.15932},'
+        '"testing_pipeline_2_predicted_answer":{"0":"Paris","1":"Madrid"},'
+        '"testing_pipeline_2_reciprocal_rank":{"0":0.378064,"1":0.534964},'
+        '"testing_pipeline_2_single_hit":{"0":1,"1":1},'
+        '"testing_pipeline_2_multi_hit":{"0":0.706125,"1":0.454976},'
+        '"testing_pipeline_2_context_relevance":{"0":0.805466,"1":0.410251},'
+        '"testing_pipeline_2_faithfulness":{"0":0.135581,"1":0.695974},'
+        '"testing_pipeline_2_semantic_answer_similarity":{"0":0.971241,"1":0.15932}}'
+    )