feat: Port evaluation harness from haystack-experimental #8595

Open · wants to merge 5 commits into base: main
5 changes: 4 additions & 1 deletion docs/pydoc/config/evaluation_api.yml
@@ -3,8 +3,11 @@ loaders:
     search_path: [../../../haystack/evaluation]
     modules:
       [
-        "base",
+        "base_eval_run_result",
         "eval_run_result",
+        "harness.evaluation_harness",
+        "harness.rag.harness",
+        "harness.rag.parameters",
       ]
     ignore_when_discovered: ["__init__"]
 processors:
27 changes: 25 additions & 2 deletions haystack/evaluation/__init__.py
@@ -2,7 +2,30 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from .base import BaseEvaluationRunResult
+from .base_eval_run_result import BaseEvaluationRunResult
 from .eval_run_result import EvaluationRunResult
+from .harness import EvaluationHarness, EvaluationRunOverrides
+from .harness.rag import DefaultRAGArchitecture, RAGEvaluationHarness
+from .harness.rag.parameters import (
+    RAGEvaluationInput,
+    RAGEvaluationMetric,
+    RAGEvaluationOutput,
+    RAGEvaluationOverrides,
+    RAGExpectedComponent,
+    RAGExpectedComponentMetadata,
+)
 
-__all__ = ["BaseEvaluationRunResult", "EvaluationRunResult"]
+__all__ = [
+    "BaseEvaluationRunResult",
+    "EvaluationRunResult",
+    "EvaluationHarness",
+    "EvaluationRunOverrides",
+    "DefaultRAGArchitecture",
+    "RAGEvaluationHarness",
+    "RAGExpectedComponent",
+    "RAGExpectedComponentMetadata",
+    "RAGEvaluationMetric",
+    "RAGEvaluationOutput",
+    "RAGEvaluationOverrides",
+    "RAGEvaluationInput",
+]
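
With the expanded __all__, the harness types become importable from the package root. A minimal import sketch (illustrative only, not part of the diff):

# Import sketch based on the __all__ list above; the symbols themselves are
# defined in the harness modules added later in this diff.
from haystack.evaluation import (
    EvaluationHarness,
    EvaluationRunOverrides,
    RAGEvaluationHarness,
    RAGEvaluationMetric,
)
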
2 changes: 1 addition & 1 deletion haystack/evaluation/eval_run_result.py
@@ -9,7 +9,7 @@
 from pandas import DataFrame
 from pandas import concat as pd_concat
 
-from .base import BaseEvaluationRunResult
+from .base_eval_run_result import BaseEvaluationRunResult
 
 
 class EvaluationRunResult(BaseEvaluationRunResult):
7 changes: 7 additions & 0 deletions haystack/evaluation/harness/__init__.py
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from .evaluation_harness import EvaluationHarness, EvaluationRunOverrides

__all__ = ["EvaluationHarness", "EvaluationRunOverrides"]
83 changes: 83 additions & 0 deletions haystack/evaluation/harness/evaluation_harness.py
@@ -0,0 +1,83 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, Generic, Optional, Type, TypeVar

from haystack import Pipeline
from haystack.core.serialization import DeserializationCallbacks


@dataclass
class EvaluationRunOverrides:
    """
    Overrides for an evaluation run.

    Use it to override the init parameters of components in the evaluated
    pipeline, the evaluation pipeline, or both. Each key is a component name,
    and its value is a dictionary of init parameters to override.

    :param evaluated_pipeline_overrides:
        Overrides for the evaluated pipeline.
    :param evaluation_pipeline_overrides:
        Overrides for the evaluation pipeline.
    """

    evaluated_pipeline_overrides: Optional[Dict[str, Dict[str, Any]]] = None
    evaluation_pipeline_overrides: Optional[Dict[str, Dict[str, Any]]] = None


EvalRunInputT = TypeVar("EvalRunInputT")
EvalRunOutputT = TypeVar("EvalRunOutputT")
EvalRunOverridesT = TypeVar("EvalRunOverridesT")


class EvaluationHarness(ABC, Generic[EvalRunInputT, EvalRunOverridesT, EvalRunOutputT]):
    """
    Executes a pipeline with specified parameters and inputs, then evaluates its outputs using an evaluation pipeline.
    """

    @staticmethod
    def _override_pipeline(pipeline: Pipeline, parameter_overrides: Optional[Dict[str, Any]]) -> Pipeline:
        def component_pre_init_callback(name: str, cls: Type, init_params: Dict[str, Any]):  # pylint: disable=unused-argument
            assert parameter_overrides is not None
            overrides = parameter_overrides.get(name)
            if overrides:
                init_params.update(overrides)

        def validate_overrides():
            if parameter_overrides is None:
                return

            pipeline_components = pipeline.inputs(include_components_with_connected_inputs=True).keys()
            for component_name in parameter_overrides.keys():
                if component_name not in pipeline_components:
                    raise ValueError(f"Cannot override non-existent component '{component_name}'")

        callbacks = DeserializationCallbacks(component_pre_init_callback)
        if parameter_overrides:
            validate_overrides()
            serialized_pipeline = pipeline.dumps()
            pipeline = Pipeline.loads(serialized_pipeline, callbacks=callbacks)

        return pipeline

    @abstractmethod
    def run(
        self, inputs: EvalRunInputT, *, overrides: Optional[EvalRunOverridesT] = None, run_name: Optional[str] = None
    ) -> EvalRunOutputT:
        """
        Launch an evaluation run.

        :param inputs:
            Inputs to the evaluated and evaluation pipelines.
        :param overrides:
            Overrides for the harness.
        :param run_name:
            A name for the evaluation run.
        :returns:
            The output of the evaluation pipeline.
        """
24 changes: 24 additions & 0 deletions haystack/evaluation/harness/rag/__init__.py
@@ -0,0 +1,24 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from .harness import DefaultRAGArchitecture, RAGEvaluationHarness
from .parameters import (
    RAGEvaluationInput,
    RAGEvaluationMetric,
    RAGEvaluationOutput,
    RAGEvaluationOverrides,
    RAGExpectedComponent,
    RAGExpectedComponentMetadata,
)

__all__ = [
    "DefaultRAGArchitecture",
    "RAGEvaluationHarness",
    "RAGExpectedComponent",
    "RAGExpectedComponentMetadata",
    "RAGEvaluationMetric",
    "RAGEvaluationOutput",
    "RAGEvaluationOverrides",
    "RAGEvaluationInput",
]
77 changes: 77 additions & 0 deletions haystack/evaluation/harness/rag/_telemetry.py
@@ -0,0 +1,77 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from dataclasses import asdict, dataclass, replace
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple

from haystack.telemetry._telemetry import send_telemetry

from .parameters import RAGEvaluationInput, RAGEvaluationMetric, RAGEvaluationOverrides

if TYPE_CHECKING:
    from .harness import DefaultRAGArchitecture, RAGEvaluationHarness


@dataclass
class TelemetryPayload:  # pylint: disable=too-many-instance-attributes
    """
    Represents the telemetry payload for evaluating a RAG pipeline.

    :param eval_metrics:
        Active evaluation metrics and per-metric metadata.
    :param num_queries:
        Number of queries used for evaluation.
    :param execution_time_sec:
        Execution time in seconds for the evaluation.
    :param default_architecture:
        Default RAG architecture used for the RAG pipeline.
    :param num_gt_answers:
        Number of ground truth answers used in evaluation.
    :param num_gt_contexts:
        Number of ground truth contexts used in evaluation.
    :param rag_pipeline_overrides:
        Indicates if the RAG pipeline has any overrides.
    :param eval_pipeline_overrides:
        Indicates if the evaluation pipeline has any overrides.
    """

    eval_metrics: Dict[RAGEvaluationMetric, Optional[Dict[str, Any]]]
    num_queries: int
    execution_time_sec: float

    default_architecture: Optional["DefaultRAGArchitecture"] = None
    num_gt_answers: Optional[int] = None
    num_gt_contexts: Optional[int] = None
    rag_pipeline_overrides: Optional[bool] = None
    eval_pipeline_overrides: Optional[bool] = None

    def serialize(self) -> Dict[str, Any]:
        out = asdict(self)

        out["eval_metrics"] = {k.value: v for k, v in self.eval_metrics.items()}
        out["default_architecture"] = self.default_architecture.value if self.default_architecture else None

        return out


@send_telemetry
def harness_eval_run_complete(
    harness: "RAGEvaluationHarness",
    inputs: RAGEvaluationInput,
    execution_time_sec: float,
    overrides: Optional[RAGEvaluationOverrides] = None,
) -> Optional[Tuple[str, Dict[str, Any]]]:
    payload = harness._telemetry_payload

    payload = replace(
        payload,
        num_queries=len(inputs.queries),
        execution_time_sec=execution_time_sec,
        num_gt_answers=(len(inputs.ground_truth_answers) if inputs.ground_truth_answers else None),
        num_gt_contexts=(len(inputs.ground_truth_documents) if inputs.ground_truth_documents else None),
        rag_pipeline_overrides=(overrides.rag_pipeline is not None if overrides else None),
        eval_pipeline_overrides=(overrides.eval_pipeline is not None if overrides else None),
    )

    return "RAG evaluation harness eval run", payload.serialize()
49 changes: 49 additions & 0 deletions haystack/evaluation/harness/rag/evaluation_pipeline.py
@@ -0,0 +1,49 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from functools import partial
from typing import Callable, Dict, Set

from haystack import Pipeline
from haystack.components.evaluators import (
    ContextRelevanceEvaluator,
    DocumentMAPEvaluator,
    DocumentMRREvaluator,
    DocumentRecallEvaluator,
    FaithfulnessEvaluator,
    SASEvaluator,
)
from haystack.components.evaluators.document_recall import RecallMode

from .parameters import RAGEvaluationMetric


def default_rag_evaluation_pipeline(metrics: Set[RAGEvaluationMetric]) -> Pipeline:
    """
    Builds the default evaluation pipeline for RAG.

    :param metrics:
        The set of metrics to include in the pipeline.
    :returns:
        The evaluation pipeline.
    """
    pipeline = Pipeline()

    metric_ctors: Dict[RAGEvaluationMetric, Callable] = {
        RAGEvaluationMetric.DOCUMENT_MAP: DocumentMAPEvaluator,
        RAGEvaluationMetric.DOCUMENT_MRR: DocumentMRREvaluator,
        RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.SINGLE_HIT),
        RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.MULTI_HIT),
        RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: partial(
            SASEvaluator, model="sentence-transformers/all-MiniLM-L6-v2"
        ),
        RAGEvaluationMetric.FAITHFULNESS: partial(FaithfulnessEvaluator, raise_on_failure=False),
        RAGEvaluationMetric.CONTEXT_RELEVANCE: partial(ContextRelevanceEvaluator, raise_on_failure=False),
    }

    for metric in metrics:
        ctor = metric_ctors[metric]
        pipeline.add_component(metric.value, ctor())

    return pipeline
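
As a usage sketch (not part of the diff), selecting a subset of the statistical metrics yields a pipeline with one evaluator component per metric; the module path below follows the file location shown above:

from haystack.evaluation.harness.rag.evaluation_pipeline import default_rag_evaluation_pipeline
from haystack.evaluation.harness.rag.parameters import RAGEvaluationMetric

eval_pipeline = default_rag_evaluation_pipeline(
    {RAGEvaluationMetric.DOCUMENT_MAP, RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT}
)
# Each selected metric becomes a standalone component named after the metric's
# enum value; the components are not connected to one another, so the harness
# routes the relevant inputs to each evaluator separately.
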