Skip to content

Commit

Permalink
feat: Add ground truth documents and answers to RAG eval run results as inputs (#17)
Browse files Browse the repository at this point in the history
  • Loading branch information
shadeMe authored Jun 21, 2024
1 parent 252e9d1 commit 22d3d70
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 14 deletions.
36 changes: 22 additions & 14 deletions haystack_experimental/evaluation/harness/rag/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,23 +131,31 @@ def run( # noqa: D102
pipeline_outputs["second"],
)

result_inputs = {
"questions": inputs.queries,
"contexts": [
[doc.content for doc in docs]
for docs in self._lookup_component_output(
RAGExpectedComponent.DOCUMENT_RETRIEVER,
rag_outputs,
"retrieved_documents",
)
],
"responses": self._lookup_component_output(
RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
),
}
if inputs.ground_truth_answers is not None:
result_inputs["ground_truth_answers"] = inputs.ground_truth_answers
if inputs.ground_truth_documents is not None:
result_inputs["ground_truth_documents"] = [
[doc.content for doc in docs] for docs in inputs.ground_truth_documents
]

assert run_name is not None
run_results = EvaluationRunResult(
run_name,
inputs={
"questions": inputs.queries,
"contexts": [
[doc.content for doc in docs]
for docs in self._lookup_component_output(
RAGExpectedComponent.DOCUMENT_RETRIEVER,
rag_outputs,
"retrieved_documents",
)
],
"responses": self._lookup_component_output(
RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
),
},
inputs=result_inputs,
results=eval_outputs,
)

Expand Down
56 changes: 56 additions & 0 deletions test/evaluation/harness/rag/test_harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,62 @@ def test_run_model_based_metrics(self, monkeypatch):

assert output.inputs == inputs
assert output.results.run_name == "test_run"
assert output.results.inputs == {
"questions": ["What is the capital of France?"] * 6,
"contexts": [
["France"],
[
"9th century",
"10th century",
"9th",
],
[
"classical",
"rock music",
"dubstep",
],
[
"11th",
"the 11th",
"11th century",
],
[
"Denmark",
"Norway",
"Iceland",
],
[
"10th century",
"the first half of the 10th century",
"10th",
"10th",
],
],
"responses": [
"placeholder",
"placeholder",
"placeholder",
"placeholder",
"placeholder",
"placeholder",
],
"ground_truth_documents": [
["France"],
["9th century", "9th"],
["classical music", "classical"],
["11th century", "the 11th"],
["Denmark, Iceland and Norway"],
["10th century", "10th"],
],
"ground_truth_answers": [
"Paris is the capital of France.",
"9th century",
"classical music",
"11th century",
"Denmark, Iceland and Norway",
"10th century",
],
}
assert output.results.results == {
"metric_answer_faithfulness": MockModelBasedEvaluator.default_output(
RAGEvaluationMetric.ANSWER_FAITHFULNESS
Expand Down

0 comments on commit 22d3d70

Please sign in to comment.