From 22d3d7016876a96b236d2106c81a2d3e0cdf1bd1 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan
Date: Fri, 21 Jun 2024 11:42:32 +0200
Subject: [PATCH] feat: Add ground truth documents and answers to RAG eval run
 results as inputs (#17)

---
 .../evaluation/harness/rag/harness.py        | 36 +++++++-----
 test/evaluation/harness/rag/test_harness.py | 56 +++++++++++++++++++
 2 files changed, 78 insertions(+), 14 deletions(-)

diff --git a/haystack_experimental/evaluation/harness/rag/harness.py b/haystack_experimental/evaluation/harness/rag/harness.py
index 50dd446f..9ea3cfae 100644
--- a/haystack_experimental/evaluation/harness/rag/harness.py
+++ b/haystack_experimental/evaluation/harness/rag/harness.py
@@ -131,23 +131,31 @@ def run(  # noqa: D102
             pipeline_outputs["second"],
         )
 
+        result_inputs = {
+            "questions": inputs.queries,
+            "contexts": [
+                [doc.content for doc in docs]
+                for docs in self._lookup_component_output(
+                    RAGExpectedComponent.DOCUMENT_RETRIEVER,
+                    rag_outputs,
+                    "retrieved_documents",
+                )
+            ],
+            "responses": self._lookup_component_output(
+                RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
+            ),
+        }
+        if inputs.ground_truth_answers is not None:
+            result_inputs["ground_truth_answers"] = inputs.ground_truth_answers
+        if inputs.ground_truth_documents is not None:
+            result_inputs["ground_truth_documents"] = [
+                [doc.content for doc in docs] for docs in inputs.ground_truth_documents
+            ]
+
         assert run_name is not None
         run_results = EvaluationRunResult(
             run_name,
-            inputs={
-                "questions": inputs.queries,
-                "contexts": [
-                    [doc.content for doc in docs]
-                    for docs in self._lookup_component_output(
-                        RAGExpectedComponent.DOCUMENT_RETRIEVER,
-                        rag_outputs,
-                        "retrieved_documents",
-                    )
-                ],
-                "responses": self._lookup_component_output(
-                    RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
-                ),
-            },
+            inputs=result_inputs,
             results=eval_outputs,
         )
 
diff --git a/test/evaluation/harness/rag/test_harness.py b/test/evaluation/harness/rag/test_harness.py
index 1ec9763f..b36e51d9 100644
--- a/test/evaluation/harness/rag/test_harness.py
+++ b/test/evaluation/harness/rag/test_harness.py
@@ -664,6 +664,62 @@ def test_run_model_based_metrics(self, monkeypatch):
 
         assert output.inputs == inputs
         assert output.results.run_name == "test_run"
+        assert output.results.inputs == {
+            "questions": ["What is the capital of France?"] * 6,
+            "contexts": [
+                ["France"],
+                [
+                    "9th century",
+                    "10th century",
+                    "9th",
+                ],
+                [
+                    "classical",
+                    "rock music",
+                    "dubstep",
+                ],
+                [
+                    "11th",
+                    "the 11th",
+                    "11th century",
+                ],
+                [
+                    "Denmark",
+                    "Norway",
+                    "Iceland",
+                ],
+                [
+                    "10th century",
+                    "the first half of the 10th century",
+                    "10th",
+                    "10th",
+                ],
+            ],
+            "responses": [
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+            ],
+            "ground_truth_documents": [
+                ["France"],
+                ["9th century", "9th"],
+                ["classical music", "classical"],
+                ["11th century", "the 11th"],
+                ["Denmark, Iceland and Norway"],
+                ["10th century", "10th"],
+            ],
+            "ground_truth_answers": [
+                "Paris is the capital of France.",
+                "9th century",
+                "classical music",
+                "11th century",
+                "Denmark, Iceland and Norway",
+                "10th century",
+            ],
+        }
         assert output.results.results == {
             "metric_answer_faithfulness": MockModelBasedEvaluator.default_output(
                 RAGEvaluationMetric.ANSWER_FAITHFULNESS
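
For reviewers, a minimal usage sketch of the behaviour this patch adds. Only the run() internals are shown in the diff above, so the construction below is an assumption for illustration: RAGEvaluationHarness, RAGEvaluationInput, the default_with_embedding_retrieval helper, and the rag_pipeline object are not confirmed by this diff.

    # Hedged sketch, not part of the patch. Assumed names: RAGEvaluationHarness,
    # RAGEvaluationInput, default_with_embedding_retrieval; `rag_pipeline` stands
    # in for an already-assembled Haystack RAG pipeline.
    from haystack import Document
    from haystack_experimental.evaluation.harness.rag import (
        RAGEvaluationHarness,  # assumed export
        RAGEvaluationInput,    # assumed export
        RAGEvaluationMetric,
    )

    harness = RAGEvaluationHarness.default_with_embedding_retrieval(  # assumed helper
        rag_pipeline, metrics={RAGEvaluationMetric.ANSWER_FAITHFULNESS}
    )
    eval_input = RAGEvaluationInput(
        queries=["What is the capital of France?"],
        ground_truth_answers=["Paris is the capital of France."],
        ground_truth_documents=[[Document(content="France")]],
    )
    output = harness.run(inputs=eval_input, run_name="test_run")

    # New with this patch: the ground truth is echoed back on the run result,
    # next to the questions, retrieved contexts and generated responses.
    assert output.results.inputs["ground_truth_answers"] == ["Paris is the capital of France."]
    assert output.results.inputs["ground_truth_documents"] == [["France"]]

Note that ground truth documents are flattened to their content strings, mirroring how retrieved contexts are already stored, so every entry in EvaluationRunResult's inputs stays a plain list of strings (or lists of strings) suitable for tabular reporting.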