feat: Add telemetry to RAG eval harness (#42)
shadeMe authored Jul 22, 2024
1 parent 3258230 commit 2d61e0a
Showing 2 changed files with 106 additions and 0 deletions.
89 changes: 89 additions & 0 deletions haystack_experimental/evaluation/harness/rag/_telemetry.py
@@ -0,0 +1,89 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from dataclasses import asdict, dataclass, replace
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple

from haystack.telemetry._telemetry import send_telemetry

from .parameters import RAGEvaluationInput, RAGEvaluationMetric, RAGEvaluationOverrides

if TYPE_CHECKING:
    from .harness import DefaultRAGArchitecture, RAGEvaluationHarness


@dataclass
class TelemetryPayload:  # pylint: disable=too-many-instance-attributes
    """
    Represents the telemetry payload for evaluating a RAG model.
    :param eval_metrics:
        Active evaluation metrics and per-metric metadata.
    :param num_queries:
        Number of queries used for evaluation.
    :param execution_time_sec:
        Execution time in seconds for the evaluation.
    :param default_architecture:
        Default RAG architecture used for the RAG pipeline.
    :param num_gt_answers:
        Number of ground truth answers used in evaluation.
    :param num_gt_contexts:
        Number of ground truth contexts used in evaluation.
    :param rag_pipeline_overrides:
        Indicates if the RAG pipeline has any overrides.
    :param eval_pipeline_overrides:
        Indicates if the evaluation pipeline has any overrides.
    """

    eval_metrics: Dict[RAGEvaluationMetric, Optional[Dict[str, Any]]]
    num_queries: int
    execution_time_sec: float

    default_architecture: Optional["DefaultRAGArchitecture"] = None
    num_gt_answers: Optional[int] = None
    num_gt_contexts: Optional[int] = None
    rag_pipeline_overrides: Optional[bool] = None
    eval_pipeline_overrides: Optional[bool] = None

    def serialize(self) -> Dict[str, Any]:
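        """
        Serialize the payload into a flat dictionary suitable for telemetry,
        converting enum members to their string values.
        """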
        out = asdict(self)

        out["eval_metrics"] = {k.value: v for k, v in self.eval_metrics.items()}
        out["default_architecture"] = (
            self.default_architecture.value if self.default_architecture else None
        )

        return out


@send_telemetry
def harness_eval_run_complete(
harness: "RAGEvaluationHarness",
inputs: RAGEvaluationInput,
execution_time_sec: float,
overrides: Optional[RAGEvaluationOverrides] = None,
) -> Optional[Tuple[str, Dict[str, Any]]]:
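    """
    Collect telemetry for a completed evaluation run of the given harness.

    Returns the event name and the serialized payload; the `send_telemetry`
    decorator takes care of actually dispatching them.
    """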
    payload = harness._telemetry_payload

    payload = replace(
        payload,
        num_queries=len(inputs.queries),
        execution_time_sec=execution_time_sec,
        num_gt_answers=(
            len(inputs.ground_truth_answers) if inputs.ground_truth_answers else None
        ),
        num_gt_contexts=(
            len(inputs.ground_truth_documents)
            if inputs.ground_truth_documents
            else None
        ),
        rag_pipeline_overrides=(
            overrides.rag_pipeline is not None if overrides else None
        ),
        eval_pipeline_overrides=(
            overrides.eval_pipeline is not None if overrides else None
        ),
    )

    return "RAG evaluation harness eval run", payload.serialize()
17 changes: 17 additions & 0 deletions haystack_experimental/evaluation/harness/rag/harness.py
@@ -2,6 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0

import time
from copy import deepcopy
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Union
@@ -15,6 +16,7 @@
)
from ...util.pipeline_pair import PipelinePair
from ..evaluation_harness import EvaluationHarness
from ._telemetry import TelemetryPayload, harness_eval_run_complete
from .evaluation_pipeline import default_rag_evaluation_pipeline
from .parameters import (
    RAGEvaluationInput,
@@ -136,6 +138,17 @@ def __init__(
"""
super().__init__()

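        # Seed the payload with static information known at construction time; the
        # run-specific fields are filled in by harness_eval_run_complete() after each eval run.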
        self._telemetry_payload = TelemetryPayload(
            eval_metrics={m: None for m in metrics},
            num_queries=0,
            execution_time_sec=0.0,
            default_architecture=(
                rag_components
                if isinstance(rag_components, DefaultRAGArchitecture)
                else None
            ),
        )

        if isinstance(rag_components, DefaultRAGArchitecture):
            rag_components = rag_components.expected_components

@@ -154,6 +167,8 @@ def run( # noqa: D102
        overrides: Optional[RAGEvaluationOverrides] = None,
        run_name: Optional[str] = "RAG Evaluation",
    ) -> RAGEvaluationOutput:
        start_time = time.time()

        rag_inputs = self._prepare_rag_pipeline_inputs(inputs)
        eval_inputs = self._prepare_eval_pipeline_additional_inputs(inputs)
        pipeline_pair = self._generate_eval_run_pipelines(overrides)
@@ -198,6 +213,8 @@ def run( # noqa: D102
            results=eval_outputs,
        )

        harness_eval_run_complete(self, inputs, time.time() - start_time, overrides)

        return RAGEvaluationOutput(
            evaluated_pipeline=pipeline_pair.first.dumps(),
            evaluation_pipeline=pipeline_pair.second.dumps(),
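
Note that harness_eval_run_complete() does not dispatch anything itself: it only returns the event name and serialized payload, and the send_telemetry decorator from haystack.telemetry is responsible for sending them. The sketch below illustrates that assumed contract (including the assumption that telemetry failures are swallowed rather than propagated to the caller); it is a simplified illustration, not Haystack's actual implementation.

import functools
from typing import Any, Callable, Dict, Optional, Tuple


def send_telemetry_sketch(
    func: Callable[..., Optional[Tuple[str, Dict[str, Any]]]],
) -> Callable[..., None]:
    # Assumed contract: the wrapped function returns (event name, properties) or None,
    # and the decorator dispatches the event without ever raising into the caller.
    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> None:
        try:
            event = func(*args, **kwargs)
            if event is not None:
                event_name, properties = event
                # Stand-in for the real sender (e.g. a call to the telemetry backend).
                print(f"telemetry event {event_name!r}: {properties}")
        except Exception:
            # Telemetry should never break the caller, so failures are swallowed here.
            pass

    return wrapper


@send_telemetry_sketch
def dummy_run_complete() -> Tuple[str, Dict[str, Any]]:
    return "RAG evaluation harness eval run", {"num_queries": 10}


dummy_run_complete()  # prints: telemetry event 'RAG evaluation harness eval run': {'num_queries': 10}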
