feat: Add ground truth documents and answers to RAG eval run results as inputs (#17)
deepset-ai · Jun 21, 2024 · 22d3d70 · 22d3d70
1 parent 252e9d1
commit 22d3d70
Show file tree

Hide file tree

Showing 2 changed files with 78 additions and 14 deletions.
diff --git a/haystack_experimental/evaluation/harness/rag/harness.py b/haystack_experimental/evaluation/harness/rag/harness.py
@@ -131,23 +131,31 @@ def run(  # noqa: D102
             pipeline_outputs["second"],
         )
 
+        result_inputs = {
+            "questions": inputs.queries,
+            "contexts": [
+                [doc.content for doc in docs]
+                for docs in self._lookup_component_output(
+                    RAGExpectedComponent.DOCUMENT_RETRIEVER,
+                    rag_outputs,
+                    "retrieved_documents",
+                )
+            ],
+            "responses": self._lookup_component_output(
+                RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
+            ),
+        }
+        if inputs.ground_truth_answers is not None:
+            result_inputs["ground_truth_answers"] = inputs.ground_truth_answers
+        if inputs.ground_truth_documents is not None:
+            result_inputs["ground_truth_documents"] = [
+                [doc.content for doc in docs] for docs in inputs.ground_truth_documents
+            ]
+
         assert run_name is not None
         run_results = EvaluationRunResult(
             run_name,
-            inputs={
-                "questions": inputs.queries,
-                "contexts": [
-                    [doc.content for doc in docs]
-                    for docs in self._lookup_component_output(
-                        RAGExpectedComponent.DOCUMENT_RETRIEVER,
-                        rag_outputs,
-                        "retrieved_documents",
-                    )
-                ],
-                "responses": self._lookup_component_output(
-                    RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
-                ),
-            },
+            inputs=result_inputs,
             results=eval_outputs,
         )
 

diff --git a/test/evaluation/harness/rag/test_harness.py b/test/evaluation/harness/rag/test_harness.py
@@ -664,6 +664,62 @@ def test_run_model_based_metrics(self, monkeypatch):
 
         assert output.inputs == inputs
         assert output.results.run_name == "test_run"
+        assert output.results.inputs == {
+            "questions": ["What is the capital of France?"] * 6,
+            "contexts": [
+                ["France"],
+                [
+                    "9th century",
+                    "10th century",
+                    "9th",
+                ],
+                [
+                    "classical",
+                    "rock music",
+                    "dubstep",
+                ],
+                [
+                    "11th",
+                    "the 11th",
+                    "11th century",
+                ],
+                [
+                    "Denmark",
+                    "Norway",
+                    "Iceland",
+                ],
+                [
+                    "10th century",
+                    "the first half of the 10th century",
+                    "10th",
+                    "10th",
+                ],
+            ],
+            "responses": [
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+            ],
+            "ground_truth_documents": [
+                ["France"],
+                ["9th century", "9th"],
+                ["classical music", "classical"],
+                ["11th century", "the 11th"],
+                ["Denmark, Iceland and Norway"],
+                ["10th century", "10th"],
+            ],
+            "ground_truth_answers": [
+                "Paris is the capital of France.",
+                "9th century",
+                "classical music",
+                "11th century",
+                "Denmark, Iceland and Norway",
+                "10th century",
+            ],
+        }
         assert output.results.results == {
             "metric_answer_faithfulness": MockModelBasedEvaluator.default_output(
                 RAGEvaluationMetric.ANSWER_FAITHFULNESS