From 22d3d7016876a96b236d2106c81a2d3e0cdf1bd1 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan
Date: Fri, 21 Jun 2024 11:42:32 +0200
Subject: [PATCH] feat: Add ground truth documents and answers to RAG eval run
 results as inputs (#17)

---
 .../evaluation/harness/rag/harness.py        | 36 +++++++-----
 test/evaluation/harness/rag/test_harness.py | 56 +++++++++++++++++++
 2 files changed, 78 insertions(+), 14 deletions(-)

diff --git a/haystack_experimental/evaluation/harness/rag/harness.py b/haystack_experimental/evaluation/harness/rag/harness.py
index 50dd446f..9ea3cfae 100644
--- a/haystack_experimental/evaluation/harness/rag/harness.py
+++ b/haystack_experimental/evaluation/harness/rag/harness.py
@@ -131,23 +131,31 @@ def run(  # noqa: D102
             pipeline_outputs["second"],
         )
 
+        result_inputs = {
+            "questions": inputs.queries,
+            "contexts": [
+                [doc.content for doc in docs]
+                for docs in self._lookup_component_output(
+                    RAGExpectedComponent.DOCUMENT_RETRIEVER,
+                    rag_outputs,
+                    "retrieved_documents",
+                )
+            ],
+            "responses": self._lookup_component_output(
+                RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
+            ),
+        }
+        if inputs.ground_truth_answers is not None:
+            result_inputs["ground_truth_answers"] = inputs.ground_truth_answers
+        if inputs.ground_truth_documents is not None:
+            result_inputs["ground_truth_documents"] = [
+                [doc.content for doc in docs] for docs in inputs.ground_truth_documents
+            ]
+
         assert run_name is not None
         run_results = EvaluationRunResult(
             run_name,
-            inputs={
-                "questions": inputs.queries,
-                "contexts": [
-                    [doc.content for doc in docs]
-                    for docs in self._lookup_component_output(
-                        RAGExpectedComponent.DOCUMENT_RETRIEVER,
-                        rag_outputs,
-                        "retrieved_documents",
-                    )
-                ],
-                "responses": self._lookup_component_output(
-                    RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
-                ),
-            },
+            inputs=result_inputs,
             results=eval_outputs,
         )
 
diff --git a/test/evaluation/harness/rag/test_harness.py b/test/evaluation/harness/rag/test_harness.py
index 1ec9763f..b36e51d9 100644
--- a/test/evaluation/harness/rag/test_harness.py
+++ b/test/evaluation/harness/rag/test_harness.py
@@ -664,6 +664,62 @@ def test_run_model_based_metrics(self, monkeypatch):
 
         assert output.inputs == inputs
         assert output.results.run_name == "test_run"
+        assert output.results.inputs == {
+            "questions": ["What is the capital of France?"] * 6,
+            "contexts": [
+                ["France"],
+                [
+                    "9th century",
+                    "10th century",
+                    "9th",
+                ],
+                [
+                    "classical",
+                    "rock music",
+                    "dubstep",
+                ],
+                [
+                    "11th",
+                    "the 11th",
+                    "11th century",
+                ],
+                [
+                    "Denmark",
+                    "Norway",
+                    "Iceland",
+                ],
+                [
+                    "10th century",
+                    "the first half of the 10th century",
+                    "10th",
+                    "10th",
+                ],
+            ],
+            "responses": [
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+            ],
+            "ground_truth_documents": [
+                ["France"],
+                ["9th century", "9th"],
+                ["classical music", "classical"],
+                ["11th century", "the 11th"],
+                ["Denmark, Iceland and Norway"],
+                ["10th century", "10th"],
+            ],
+            "ground_truth_answers": [
+                "Paris is the capital of France.",
+                "9th century",
+                "classical music",
+                "11th century",
+                "Denmark, Iceland and Norway",
+                "10th century",
+            ],
+        }
         assert output.results.results == {
             "metric_answer_faithfulness": MockModelBasedEvaluator.default_output(
                 RAGEvaluationMetric.ANSWER_FAITHFULNESS
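
For reviewers, a minimal usage sketch of the behaviour this patch adds. Only the run() internals are shown in the diff above, so the construction below is an assumption for illustration: RAGEvaluationHarness, RAGEvaluationInput, the default_with_embedding_retrieval helper, and the rag_pipeline object are not confirmed by this diff.

    # Hedged sketch, not part of the patch. Assumed names: RAGEvaluationHarness,
    # RAGEvaluationInput, default_with_embedding_retrieval; `rag_pipeline` stands
    # in for an already-assembled Haystack RAG pipeline.
    from haystack import Document
    from haystack_experimental.evaluation.harness.rag import (
        RAGEvaluationHarness,  # assumed export
        RAGEvaluationInput,    # assumed export
        RAGEvaluationMetric,
    )

    harness = RAGEvaluationHarness.default_with_embedding_retrieval(  # assumed helper
        rag_pipeline, metrics={RAGEvaluationMetric.ANSWER_FAITHFULNESS}
    )
    eval_input = RAGEvaluationInput(
        queries=["What is the capital of France?"],
        ground_truth_answers=["Paris is the capital of France."],
        ground_truth_documents=[[Document(content="France")]],
    )
    output = harness.run(inputs=eval_input, run_name="test_run")

    # New with this patch: the ground truth is echoed back on the run result,
    # next to the questions, retrieved contexts and generated responses.
    assert output.results.inputs["ground_truth_answers"] == ["Paris is the capital of France."]
    assert output.results.inputs["ground_truth_documents"] == [["France"]]

Note that ground truth documents are flattened to their content strings, mirroring how retrieved contexts are already stored, so every entry in EvaluationRunResult's inputs stays a plain list of strings (or lists of strings) suitable for tabular reporting.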