diff --git a/haystack/components/eval/__init__.py b/haystack/components/eval/__init__.py
index 9477cb1242..547eb580b0 100644
--- a/haystack/components/eval/__init__.py
+++ b/haystack/components/eval/__init__.py
@@ -1,3 +1,4 @@
 from .sas_evaluator import SASEvaluator
+from .statistical_evaluator import StatisticalEvaluator
 
-__all__ = ["SASEvaluator"]
+__all__ = ["SASEvaluator", "StatisticalEvaluator"]
diff --git a/haystack/components/eval/statistical_evaluator.py b/haystack/components/eval/statistical_evaluator.py
new file mode 100644
index 0000000000..37bafe5e1e
--- /dev/null
+++ b/haystack/components/eval/statistical_evaluator.py
@@ -0,0 +1,131 @@
+import collections
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+from numpy import array as np_array
+from numpy import mean as np_mean
+
+from haystack import default_from_dict, default_to_dict
+from haystack.core.component import component
+
+from .preprocess import _preprocess_text
+
+
+@component
+class StatisticalEvaluator:
+    """
+    StatisticalEvaluator is a component that evaluates the performance of a model based on statistical metrics.
+    It's usually used in QA and Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the generated answers.
+
+    The supported metrics are:
+    - F1: Measures word overlap between predictions and labels.
+    - Exact Match: Measures the proportion of cases where prediction is identical to the expected label.
+    """
+
+    class Metric(Enum):
+        """Supported metrics; each value is the string that `to_dict` stores for the metric."""
+
+        F1 = "F1"
+        EM = "Exact Match"
+
+    def __init__(
+        self,
+        labels: List[str],
+        metric: Metric,
+        regexes_to_ignore: Optional[List[str]] = None,
+        ignore_case: bool = False,
+        ignore_punctuation: bool = False,
+        ignore_numbers: bool = False,
+    ):
+        """
+        Creates a new instance of StatisticalEvaluator.
+
+        :param labels: The list of expected answers.
+        :param metric: Metric to use for evaluation in this component. Supported metrics are F1 and Exact Match.
+        :param regexes_to_ignore: A list of regular expressions. If provided, it removes substrings
+            matching these regular expressions from both predictions and labels before comparison. Defaults to None.
+        :param ignore_case: If True, performs case-insensitive comparison. Defaults to False.
+        :param ignore_punctuation: If True, removes punctuation from both predictions and labels before
+            comparison. Defaults to False.
+        :param ignore_numbers: If True, removes numerical digits from both predictions and labels
+            before comparison. Defaults to False.
+        """
+        self._labels = labels
+        self._metric = metric
+        self._regexes_to_ignore = regexes_to_ignore
+        self._ignore_case = ignore_case
+        self._ignore_punctuation = ignore_punctuation
+        self._ignore_numbers = ignore_numbers
+
+        self._metric_function = {
+            StatisticalEvaluator.Metric.F1: self._f1,
+            StatisticalEvaluator.Metric.EM: self._exact_match,
+        }[self._metric]
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serializes the component to a dictionary, storing the metric by its string value."""
+        return default_to_dict(
+            self,
+            labels=self._labels,
+            metric=self._metric.value,
+            regexes_to_ignore=self._regexes_to_ignore,
+            ignore_case=self._ignore_case,
+            ignore_punctuation=self._ignore_punctuation,
+            ignore_numbers=self._ignore_numbers,
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "StatisticalEvaluator":
+        """Deserializes the component; the metric entry of `data` is replaced in place by its `Metric` member."""
+        data["init_parameters"]["metric"] = StatisticalEvaluator.Metric(data["init_parameters"]["metric"])
+        return default_from_dict(cls, data)
+
+    @component.output_types(result=float)
+    def run(self, predictions: List[str]) -> Dict[str, Any]:
+        """
+        Computes the configured metric between `predictions` and the labels given at init.
+
+        :param predictions: The answers to evaluate, one per label and in the same order.
+        :return: A dictionary holding the score under the `result` key.
+        :raises ValueError: If the number of predictions differs from the number of labels.
+        """
+        if len(predictions) != len(self._labels):
+            raise ValueError("The number of predictions and labels must be the same.")
+
+        options = (self._regexes_to_ignore, self._ignore_case, self._ignore_punctuation, self._ignore_numbers)
+        predictions = _preprocess_text(predictions, *options)
+        labels = _preprocess_text(self._labels, *options)
+
+        return {"result": self._metric_function(labels, predictions)}
+
+    def _f1(self, labels: List[str], predictions: List[str]) -> float:
+        """
+        Measure word overlap between predictions and labels, averaged over all prediction/label pairs.
+        """
+        if len(predictions) == 0:
+            # We expect callers of this function already checked if predictions and labels are equal length
+            return 0.0
+
+        scores: List[float] = []
+        for label, prediction in zip(labels, predictions):
+            label_tokens, prediction_tokens = label.split(), prediction.split()
+            common = collections.Counter(label_tokens) & collections.Counter(prediction_tokens)
+            num_same = sum(common.values())
+            if len(label_tokens) == 0 or len(prediction_tokens) == 0:
+                # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+                scores.append(float(label_tokens == prediction_tokens))
+            elif num_same == 0:
+                scores.append(0.0)
+            else:
+                precision = 1.0 * num_same / len(prediction_tokens)
+                recall = 1.0 * num_same / len(label_tokens)
+                scores.append((2 * precision * recall) / (precision + recall))
+
+        return float(np_mean(scores))
+
+    def _exact_match(self, labels: List[str], predictions: List[str]) -> float:
+        """Measure the proportion of cases where the prediction is identical to the expected label."""
+        if len(predictions) == 0:
+            # We expect callers of this function already checked if predictions and labels are equal length
+            return 0.0
+        return float(np_mean(np_array(predictions) == np_array(labels)))
diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py
index 500b423e86..2c5c8be657 100644
--- a/haystack/evaluation/eval.py
+++ b/haystack/evaluation/eval.py
@@ -1,11 +1,7 @@
-import collections
-from typing import Any, Callable, Dict, List, Optional, Union
-
-import numpy as np
+from typing import Any, Callable, Dict, List, Union
 
 from haystack import Pipeline
 from haystack.core.component import Component
-from haystack.evaluation.eval_utils import get_answers_from_output, preprocess_text
 from haystack.evaluation.metrics import Metric, MetricsResult
 
 
@@ -45,8 +41,6 @@ def __init__(
             Metric.RECALL: self._calculate_recall,
             Metric.MRR: self._calculate_mrr,
             Metric.MAP: self._calculate_map,
-            Metric.F1: self._calculate_f1,
-            Metric.EM: self._calculate_em,
         }
 
     def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], **kwargs) -> MetricsResult:
@@ -71,119 +65,6 @@ def _calculate_map(self):
     def _calculate_mrr(self):
         return MetricsResult({"mean_reciprocal_rank": None})
 
-    def _compute_f1_single(self, label_toks: List[str], pred_toks: List[str]) -> float:
-        """
-        Compute F1 score for a single sample.
-        """
-        common: collections.Counter = collections.Counter(label_toks) & collections.Counter(pred_toks)
-        num_same = sum(common.values())
-        if len(label_toks) == 0 or len(pred_toks) == 0:
-            # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
-            return int(label_toks == pred_toks)
-        if num_same == 0:
-            return 0
-        precision = 1.0 * num_same / len(pred_toks)
-        recall = 1.0 * num_same / len(label_toks)
-        f1 = (2 * precision * recall) / (precision + recall)
-        return f1
-
-    def _calculate_f1(
-        self,
-        output_key: str,
-        regexes_to_ignore: Optional[List[str]] = None,
-        ignore_case: bool = False,
-        ignore_punctuation: bool = False,
-        ignore_numbers: bool = False,
-    ) -> MetricsResult:
-        """
-        Calculates the F1 score between two lists of predictions and labels.
-        F1 score measures the word overlap between the predicted text and the corresponding ground truth label.
-
-        :param output_key: The key of the output to use for comparison.
-        :param regexes_to_ignore (list, optional): A list of regular expressions. If provided, it removes substrings
-            matching these regular expressions from both predictions and labels before comparison. Defaults to None.
-        :param ignore_case (bool, optional): If True, performs case-insensitive comparison. Defaults to False.
-        :param ignore_punctuation (bool, optional): If True, removes punctuation from both predictions and labels before
-            comparison. Defaults to False.
-        :param ignore_numbers (bool, optional): If True, removes numerical digits from both predictions and labels
-            before comparison. Defaults to False.
-        :return: A MetricsResult object containing the calculated F1 score.
-        """
-
-        predictions = get_answers_from_output(
-            outputs=self.outputs, output_key=output_key, runnable_type=self.runnable_type
-        )
-        labels = get_answers_from_output(
-            outputs=self.expected_outputs, output_key=output_key, runnable_type=self.runnable_type
-        )
-
-        if len(predictions) != len(labels):
-            raise ValueError("The number of predictions and labels must be the same.")
-        if len(predictions) == len(labels) == 0:
-            # Return F1 as 0 for no inputs
-            return MetricsResult({"f1": 0.0})
-
-        predictions = preprocess_text(predictions, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers)
-        labels = preprocess_text(labels, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers)
-
-        # Tokenize by splitting on spaces
-        tokenized_predictions = [pred.split() for pred in predictions]
-        tokenized_labels = [label.split() for label in labels]
-
-        f1_scores = [
-            self._compute_f1_single(label_toks, pred_toks)
-            for label_toks, pred_toks in zip(tokenized_labels, tokenized_predictions)
-        ]
-
-        f1 = np.mean(f1_scores)
-
-        return MetricsResult({"f1": f1})
-
-    def _calculate_em(
-        self,
-        output_key: str,
-        regexes_to_ignore: Optional[List[str]] = None,
-        ignore_case: bool = False,
-        ignore_punctuation: bool = False,
-        ignore_numbers: bool = False,
-    ) -> MetricsResult:
-        """
-        Calculates the Exact Match (EM) score between two lists of predictions and labels.
-        Exact Match (EM) score measures the percentage of samples where the predicted text exactly matches the
-          corresponding ground truth label.
-
-        :param output_key: The key of the output to use for comparison.
-        :param regexes_to_ignore (list, optional): A list of regular expressions. If provided, it removes substrings
-            matching these regular expressions from both predictions and labels before comparison. Defaults to None.
-        :param ignore_case (bool, optional): If True, performs case-insensitive comparison. Defaults to False.
-        :param ignore_punctuation (bool, optional): If True, removes punctuation from both predictions and labels before
-            comparison. Defaults to False.
-        :param ignore_numbers (bool, optional): If True, removes numerical digits from both predictions and labels
-            before comparison. Defaults to False.
-        :return: A MetricsResult object containing the calculated Exact Match (EM) score.
-        """
-
-        predictions = get_answers_from_output(
-            outputs=self.outputs, output_key=output_key, runnable_type=self.runnable_type
-        )
-        labels = get_answers_from_output(
-            outputs=self.expected_outputs, output_key=output_key, runnable_type=self.runnable_type
-        )
-
-        if len(predictions) != len(labels):
-            raise ValueError("The number of predictions and labels must be the same.")
-        if len(predictions) == len(labels) == 0:
-            # Return Exact Match as 0 for no inputs
-            return MetricsResult({"exact_match": 0.0})
-
-        predictions = preprocess_text(predictions, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers)
-        labels = preprocess_text(labels, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers)
-
-        score_list = np.array(predictions) == np.array(labels)
-        exact_match_score = np.mean(score_list)
-
-        return MetricsResult({"exact_match": exact_match_score})
-
 
 def eval(
     runnable: Union[Pipeline, Component], inputs: List[Dict[str, Any]], expected_outputs: List[Dict[str, Any]]
diff --git a/releasenotes/notes/statistical-evaluator-d65b80e3ac24778a.yaml b/releasenotes/notes/statistical-evaluator-d65b80e3ac24778a.yaml
new file mode 100644
index 0000000000..7da31e250e
--- /dev/null
+++ b/releasenotes/notes/statistical-evaluator-d65b80e3ac24778a.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Add `StatisticalEvaluator`, a Component that calculates statistical metrics (F1 and Exact Match) for answers returned by LLMs.
diff --git a/test/components/eval/test_statistical_evaluator.py b/test/components/eval/test_statistical_evaluator.py
new file mode 100644
index 0000000000..3abbeb7220
--- /dev/null
+++ b/test/components/eval/test_statistical_evaluator.py
@@ -0,0 +1,259 @@
+import pytest
+
+from haystack.components.eval import StatisticalEvaluator
+
+
+class TestStatisticalEvaluator:  # Construction and (de)serialization of the component; no metric is computed here.
+    def test_init_default(self):  # Omitted optional arguments are stored as None / False.
+        labels = ["label1", "label2", "label3"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1)
+        assert evaluator._labels == labels
+        assert evaluator._metric == StatisticalEvaluator.Metric.F1
+        assert evaluator._regexes_to_ignore is None
+        assert evaluator._ignore_case is False
+        assert evaluator._ignore_punctuation is False
+        assert evaluator._ignore_numbers is False
+
+    def test_to_dict(self):  # The metric is serialized by its string value ("F1"), not as the enum member.
+        labels = ["label1", "label2", "label3"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1)
+
+        expected_dict = {
+            "type": "haystack.components.eval.statistical_evaluator.StatisticalEvaluator",
+            "init_parameters": {
+                "labels": labels,
+                "metric": "F1",
+                "regexes_to_ignore": None,
+                "ignore_case": False,
+                "ignore_punctuation": False,
+                "ignore_numbers": False,
+            },
+        }
+        assert evaluator.to_dict() == expected_dict
+
+    def test_from_dict(self):  # The serialized "F1" string is turned back into Metric.F1.
+        evaluator = StatisticalEvaluator.from_dict(
+            {
+                "type": "haystack.components.eval.statistical_evaluator.StatisticalEvaluator",
+                "init_parameters": {
+                    "labels": ["label1", "label2", "label3"],
+                    "metric": "F1",
+                    "regexes_to_ignore": None,
+                    "ignore_case": False,
+                    "ignore_punctuation": False,
+                    "ignore_numbers": False,
+                },
+            }
+        )
+
+        assert evaluator._labels == ["label1", "label2", "label3"]
+        assert evaluator._metric == StatisticalEvaluator.Metric.F1
+        assert evaluator._regexes_to_ignore is None
+        assert evaluator._ignore_case is False
+        assert evaluator._ignore_punctuation is False
+        assert evaluator._ignore_numbers is False
+
+
+class TestStatisticalEvaluatorF1:  # F1 metric; expected values are means of the per-pair token-overlap scores.
+    def test_run_with_empty_inputs(self):  # No labels and no predictions yields 0.0.
+        evaluator = StatisticalEvaluator(labels=[], metric=StatisticalEvaluator.Metric.F1)
+        result = evaluator.run(predictions=[])
+        assert len(result) == 1
+        assert result["result"] == 0.0
+
+    def test_run_with_different_lengths(self):  # 2 labels against 3 predictions must raise ValueError.
+        labels = [
+            "A construction budget of US $2.3 billion",
+            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
+        ]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1)
+
+        predictions = [
+            "A construction budget of US $2.3 billion",
+            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
+            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
+        ]
+        with pytest.raises(ValueError):
+            evaluator.run(predictions)
+
+    def test_run_with_matching_predictions(self):  # Identical pairs score 1.0.
+        labels = ["OpenSource", "HaystackAI", "LLMs"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1)
+        predictions = ["OpenSource", "HaystackAI", "LLMs"]
+        result = evaluator.run(predictions=predictions)
+
+        assert len(result) == 1
+        assert result["result"] == 1.0
+
+    def test_run_with_single_prediction(self):  # One shared token: precision 1/2, recall 1, so F1 is 2/3.
+        labels = ["Source"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1)
+
+        result = evaluator.run(predictions=["Open Source"])
+        assert len(result) == 1
+        assert result["result"] == pytest.approx(2 / 3)
+
+    def test_run_with_mismatched_predictions(self):  # Pair scores 2/3 and 1 average to 5/6.
+        labels = ["Source", "HaystackAI"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1)
+        predictions = ["Open Source", "HaystackAI"]
+        result = evaluator.run(predictions=predictions)
+        assert len(result) == 1
+        assert result["result"] == pytest.approx(5 / 6)
+
+    def test_run_with_ignore_case(self):  # Expects the same 5/6 although labels and predictions differ in case.
+        labels = ["source", "HAYSTACKAI"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1, ignore_case=True)
+        predictions = ["Open Source", "HaystackAI"]
+        result = evaluator.run(predictions=predictions)
+        assert len(result) == 1
+        assert result["result"] == pytest.approx(5 / 6)
+
+    def test_run_with_ignore_punctuation(self):  # Expects 5/6, i.e. "!" and "." no longer affect the tokens.
+        labels = ["Source", "HaystackAI"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1, ignore_punctuation=True)
+        predictions = ["Open Source!", "Haystack.AI"]
+        result = evaluator.run(predictions=predictions)
+
+        assert result["result"] == pytest.approx(5 / 6)
+
+    def test_run_with_ignore_numbers(self):  # Expects 5/6, i.e. "Source123" is compared as "Source".
+        labels = ["Source", "HaystackAI"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1, ignore_numbers=True)
+        predictions = ["Open Source123", "HaystackAI"]
+        result = evaluator.run(predictions=predictions)
+        assert result["result"] == pytest.approx(5 / 6)
+
+    def test_run_with_regex_to_ignore(self):  # Expects 5/6 with the digit run matched by r"\d+" ignored.
+        labels = ["Source", "HaystackAI"]
+        evaluator = StatisticalEvaluator(
+            labels=labels, metric=StatisticalEvaluator.Metric.F1, regexes_to_ignore=[r"\d+"]
+        )
+        predictions = ["Open123 Source", "HaystackAI"]
+        result = evaluator.run(predictions=predictions)
+        assert result["result"] == pytest.approx(5 / 6)
+
+    def test_run_with_multiple_regex_to_ignore(self):  # Two patterns at once: digit runs and non-word characters.
+        labels = ["Source", "HaystackAI"]
+        evaluator = StatisticalEvaluator(
+            labels=labels, metric=StatisticalEvaluator.Metric.F1, regexes_to_ignore=[r"\d+", r"[^\w\s]"]
+        )
+        predictions = ["Open123! Source", "Haystack.AI"]
+        result = evaluator.run(predictions=predictions)
+        assert result["result"] == pytest.approx(5 / 6)
+
+    def test_run_with_multiple_ignore_parameters(self):  # All three switches plus a regex combined still give 5/6.
+        labels = ["Source", "HaystackAI"]
+        evaluator = StatisticalEvaluator(
+            labels=labels,
+            metric=StatisticalEvaluator.Metric.F1,
+            ignore_numbers=True,
+            ignore_punctuation=True,
+            ignore_case=True,
+            regexes_to_ignore=[r"[^\w\s\d]+"],
+        )
+        predictions = ["Open%123. !$Source", "Haystack.AI##"]
+        result = evaluator.run(predictions=predictions)
+        assert result["result"] == pytest.approx(5 / 6)
+
+
+class TestStatisticalEvaluatorExactMatch:  # Exact Match metric; expected values are the fraction of identical pairs.
+    def test_run_with_empty_inputs(self):  # No labels and no predictions yields 0.0.
+        evaluator = StatisticalEvaluator(labels=[], metric=StatisticalEvaluator.Metric.EM)
+        result = evaluator.run(predictions=[])
+        assert len(result) == 1
+        assert result["result"] == 0.0
+
+    def test_run_with_different_lengths(self):  # 2 labels against 3 predictions must raise ValueError.
+        labels = [
+            "A construction budget of US $2.3 billion",
+            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
+        ]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM)
+
+        predictions = [
+            "A construction budget of US $2.3 billion",
+            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
+            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
+        ]
+        with pytest.raises(ValueError):
+            evaluator.run(predictions)
+
+    def test_run_with_matching_predictions(self):  # Identical pairs score 1.0.
+        labels = ["OpenSource", "HaystackAI", "LLMs"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM)
+        predictions = ["OpenSource", "HaystackAI", "LLMs"]
+        result = evaluator.run(predictions=predictions)
+
+        assert len(result) == 1
+        assert result["result"] == 1.0
+
+    def test_run_with_single_prediction(self):  # A single identical pair scores 1.0.
+        labels = ["OpenSource"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM)
+
+        result = evaluator.run(predictions=["OpenSource"])
+        assert len(result) == 1
+        assert result["result"] == 1.0
+
+    def test_run_with_mismatched_predictions(self):  # 2 of 3 pairs match; compared with a tolerance like the F1 tests.
+        labels = ["Source", "HaystackAI", "LLMs"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM)
+        predictions = ["OpenSource", "HaystackAI", "LLMs"]
+        result = evaluator.run(predictions=predictions)
+        assert len(result) == 1
+        assert result["result"] == pytest.approx(2 / 3)
+
+    def test_run_with_ignore_case(self):  # Pairs differing only in case are expected to match.
+        labels = ["opensource", "HAYSTACKAI", "llMs"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM, ignore_case=True)
+        predictions = ["OpenSource", "HaystackAI", "LLMs"]
+        result = evaluator.run(predictions=predictions)
+        assert len(result) == 1
+        assert result["result"] == 1.0
+
+    def test_run_with_ignore_punctuation(self):  # Pairs differing only by "!", "." or "," are expected to match.
+        labels = ["OpenSource", "HaystackAI", "LLMs"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM, ignore_punctuation=True)
+        predictions = ["OpenSource!", "Haystack.AI", "LLMs,"]
+        result = evaluator.run(predictions=predictions)
+        assert result["result"] == 1.0
+
+    def test_run_with_ignore_numbers(self):  # Pairs differing only by trailing digits are expected to match.
+        labels = ["OpenSource", "HaystackAI", "LLMs"]
+        evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM, ignore_numbers=True)
+        predictions = ["OpenSource123", "HaystackAI", "LLMs456"]
+        result = evaluator.run(predictions=predictions)
+        assert result["result"] == 1.0
+
+    def test_run_with_regex_to_ignore(self):  # Digit runs matched by r"\d+" are ignored on both sides.
+        labels = ["OpenSource", "HaystackAI", "LLMs"]
+        evaluator = StatisticalEvaluator(
+            labels=labels, metric=StatisticalEvaluator.Metric.EM, regexes_to_ignore=[r"\d+"]
+        )
+        predictions = ["Open123Source", "HaystackAI", "LLMs456"]
+        result = evaluator.run(predictions=predictions)
+        assert result["result"] == 1.0
+
+    def test_run_with_multiple_regex_to_ignore(self):  # Two patterns at once: digit runs and non-word characters.
+        labels = ["OpenSource", "HaystackAI", "LLMs"]
+        evaluator = StatisticalEvaluator(
+            labels=labels, metric=StatisticalEvaluator.Metric.EM, regexes_to_ignore=[r"\d+", r"\W+"]
+        )
+        predictions = ["Open123!Source", "Haystack.AI", "LLMs456,"]
+        result = evaluator.run(predictions=predictions)
+        assert result["result"] == 1.0
+
+    def test_run_with_multiple_ignore_parameters(self):  # All three switches plus a regex combined still match.
+        labels = ["OpenSource", "HaystackAI", "LLMs"]
+        evaluator = StatisticalEvaluator(
+            labels=labels,
+            metric=StatisticalEvaluator.Metric.EM,
+            ignore_numbers=True,
+            ignore_punctuation=True,
+            ignore_case=True,
+            regexes_to_ignore=[r"[^\w\s\d]+"],
+        )
+        predictions = ["Open%123!$Source", "Haystack.AI##", "^^LLMs456,"]
+        result = evaluator.run(predictions=predictions)
+        assert result["result"] == 1.0
diff --git a/test/evaluation/test_eval.py b/test/evaluation/test_eval.py
index 095c51d4f5..f46395db40 100644
--- a/test/evaluation/test_eval.py
+++ b/test/evaluation/test_eval.py
@@ -15,14 +15,6 @@ def test_init(self):
         assert result.outputs == []
         assert result.expected_outputs == []
 
-    def test_supported_metrics_contains_all_metrics(self):
-        runnable = Pipeline()
-        result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])
-
-        supported_metrics = [m.name for m in result._supported_metrics.keys()]
-        all_metric_names = [m.name for m in Metric]
-        assert supported_metrics == all_metric_names
-
     def test_calculate_metrics_with_supported_metric(self):
         runnable = Pipeline()
         result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])
diff --git a/test/evaluation/test_eval_exact_match.py b/test/evaluation/test_eval_exact_match.py
deleted file mode 100644
index ad0b8930d1..0000000000
--- a/test/evaluation/test_eval_exact_match.py
+++ /dev/null
@@ -1,167 +0,0 @@
-from haystack import Pipeline
-from haystack.dataclasses import GeneratedAnswer
-from haystack.evaluation.eval import EvaluationResult
-
-
-class TestExactMatch:
-    def create_evaluation_result(self, predictions, labels):
-        """
-        Creates an evaluation result of a RAG pipeline using the list of predictions and labels for testing the exact match.
-        """
-        runnable = Pipeline()
-        inputs = []
-        outputs = [
-            {"answer_builder": {"answers": [GeneratedAnswer(data=pred, query="", documents=[], meta={})]}}
-            for pred in predictions
-        ]
-        expected_outputs = [
-            {"answer_builder": {"answers": [GeneratedAnswer(data=label, query="", documents=[], meta={})]}}
-            for label in labels
-        ]
-        evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs)
-        return evaluation_result
-
-    def test_exact_match_empty_inputs(self):
-        """
-        Test exact match with empty inputs
-        """
-        runnable = Pipeline()
-        inputs = []
-        outputs = [
-            {"answer_builder": {"answers": []}},
-            {"answer_builder": {"answers": []}},
-            {"answer_builder": {"answers": []}},
-        ]
-        expected_outputs = [
-            {"answer_builder": {"answers": []}},
-            {"answer_builder": {"answers": []}},
-            {"answer_builder": {"answers": []}},
-        ]
-        evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs)
-        # Expecting 0% exact match for empty inputs
-        em_result = evaluation_result._calculate_em(output_key="answers")
-
-        assert em_result["exact_match"] == 0.0
-
-    def test_exact_match_same_inputs(self):
-        """
-        Test exact match with default parameters
-        """
-        predictions = ["OpenSource", "HaystackAI", "LLMs"]
-        labels = ["OpenSource", "HaystackAI", "LLMs"]
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        em_result = evaluation_result._calculate_em(output_key="answers")
-
-        assert em_result["exact_match"] == 1.0
-
-    def test_exact_match_single_word(self):
-        """
-        Test exact match with single-word inputs
-        """
-        predictions = ["OpenSource"]
-        labels = ["OpenSource"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        em_result = evaluation_result._calculate_em(output_key="answers")
-
-        assert em_result["exact_match"] == 1.0
-
-    def test_exact_match_negative_case(self):
-        """
-        Test exact match with deliberately mismatched predictions and labels
-        """
-        predictions = ["OpenSource", "HaystackAI", "LLMs"]
-        labels = ["Source", "HaystackAI", "LLMs"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        # Expecting EM to be 2/3 as 2 out of 3 items match
-        expected_em = 2 / 3
-        em_result = evaluation_result._calculate_em(output_key="answers")
-
-        assert em_result["exact_match"] == expected_em
-
-    def test_exact_match_ignore_case(self):
-        """
-        Test exact match with ignoring case sensitivity
-        """
-        predictions = ["OpenSource", "HaystackAI", "LLMs"]
-        labels = ["opensource", "HAYSTACKAI", "llMs"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        # Exact match after case ignoring
-        em_result = evaluation_result._calculate_em(output_key="answers", ignore_case=True)
-
-        assert em_result["exact_match"] == 1.0
-
-    def test_exact_match_ignore_punctuation(self):
-        """
-        Test exact match with ignoring punctuation
-        """
-        predictions = ["OpenSource!", "Haystack.AI", "LLMs,"]
-        labels = ["OpenSource", "HaystackAI", "LLMs"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        # Exact match after ignoring punctuation
-        em_result = evaluation_result._calculate_em(output_key="answers", ignore_punctuation=True)
-
-        assert em_result["exact_match"] == 1.0
-
-    def test_exact_match_ignore_numbers(self):
-        """
-        Test exact match with ignoring numbers
-        """
-        predictions = ["OpenSource123", "HaystackAI", "LLMs456"]
-        labels = ["OpenSource", "HaystackAI", "LLMs"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        # Exact match after ignoring numbers
-        em_result = evaluation_result._calculate_em(output_key="answers", ignore_numbers=True)
-        assert em_result["exact_match"] == 1.0
-
-    def test_exact_match_regex_ignore(self):
-        """
-        Test exact match with ignoring specific regex patterns
-        """
-        predictions = ["Open123Source", "HaystackAI", "LLMs456"]
-        labels = ["OpenSource", "HaystackAI", "LLMs"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        # Ignore numeric patterns
-        regex_to_ignore = [r"\d+"]
-        em_result = evaluation_result._calculate_em(output_key="answers", regexes_to_ignore=regex_to_ignore)
-
-        assert em_result["exact_match"] == 1.0
-
-    def test_exact_match_multiple_ignore_regex(self):
-        """
-        Test exact match with multiple ignoring parameters
-        """
-        predictions = ["Open123!Source", "Haystack.AI", "LLMs456,"]
-        labels = ["OpenSource", "HaystackAI", "LLMs"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        # Ignore numeric patterns and punctuation using regex
-        regex_to_ignore = [r"\d+", r"\W+"]
-        em_result = evaluation_result._calculate_em(output_key="answers", regexes_to_ignore=regex_to_ignore)
-
-        assert em_result["exact_match"] == 1.0
-
-    def test_exact_match_multiple_ignore_combination(self):
-        """
-        Test exact match with multiple ignoring parameters combined
-        """
-        predictions = ["Open%123!$Source", "Haystack.AI##", "^^LLMs456,"]
-        labels = ["OpenSource", "HaystackAI", "LLMs"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        # Ignore only special characters using regex
-        regex_to_ignore = [r"[^\w\s\d]+"]
-        em_result = evaluation_result._calculate_em(
-            output_key="answers",
-            ignore_numbers=True,
-            ignore_punctuation=True,
-            ignore_case=True,
-            regexes_to_ignore=regex_to_ignore,
-        )
-
-        assert em_result["exact_match"] == 1.0
diff --git a/test/evaluation/test_eval_f1.py b/test/evaluation/test_eval_f1.py
deleted file mode 100644
index cedcda466a..0000000000
--- a/test/evaluation/test_eval_f1.py
+++ /dev/null
@@ -1,178 +0,0 @@
-import pytest
-
-from haystack import Pipeline
-from haystack.dataclasses import GeneratedAnswer
-from haystack.evaluation.eval import EvaluationResult
-
-
-class TestF1:
-    def create_evaluation_result(self, predictions, labels):
-        """
-        Creates an evaluation result of a RAG pipeline using the list of predictions and labels for testing the f1.
-        """
-        runnable = Pipeline()
-        inputs = []
-        outputs = [
-            {"answer_builder": {"answers": [GeneratedAnswer(data=pred, query="", documents=[], meta={})]}}
-            for pred in predictions
-        ]
-        expected_outputs = [
-            {"answer_builder": {"answers": [GeneratedAnswer(data=label, query="", documents=[], meta={})]}}
-            for label in labels
-        ]
-        evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs)
-        return evaluation_result
-
-    def test_f1_empty_inputs(self):
-        """
-        Test f1 with empty inputs
-        """
-        runnable = Pipeline()
-        inputs = []
-        outputs = [
-            {"answer_builder": {"answers": []}},
-            {"answer_builder": {"answers": []}},
-            {"answer_builder": {"answers": []}},
-        ]
-        expected_outputs = [
-            {"answer_builder": {"answers": []}},
-            {"answer_builder": {"answers": []}},
-            {"answer_builder": {"answers": []}},
-        ]
-        evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs)
-        # Expecting 0% f1 for empty inputs
-        f1_result = evaluation_result._calculate_f1(output_key="answers")
-
-        assert f1_result["f1"] == 0.0
-
-    def test_calculate_f1_with_different_lengths(self):
-        """
-        Test f1 with default parameters
-        """
-        predictions = ["OpenSource", "HaystackAI", "LLMs"]
-        labels = ["OpenSource", "HaystackAI"]
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-
-        with pytest.raises(ValueError, match="The number of predictions and labels must be the same."):
-            evaluation_result._calculate_f1(output_key="answers")
-
-    def test_f1_same_inputs(self):
-        """
-        Test f1 with default parameters
-        """
-        predictions = ["OpenSource", "HaystackAI", "LLMs"]
-        labels = ["OpenSource", "HaystackAI", "LLMs"]
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        f1_result = evaluation_result._calculate_f1(output_key="answers")
-
-        assert f1_result["f1"] == 1.0
-
-    def test_f1_single_word(self):
-        """
-        Test f1 with single-word inputs
-        """
-        predictions = ["Open Source"]
-        labels = ["Source"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        f1_result = evaluation_result._calculate_f1(output_key="answers")
-
-        assert f1_result["f1"] == pytest.approx(2 / 3)
-
-    def test_f1_negative_case(self):
-        """
-        Test f1 with deliberately mismatched predictions and labels
-        """
-        predictions = ["Open Source", "HaystackAI"]
-        labels = ["Source", "HaystackAI"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        f1_result = evaluation_result._calculate_f1(output_key="answers")
-
-        assert f1_result["f1"] == pytest.approx(5 / 6)
-
-    def test_f1_ignore_case(self):
-        """
-        Test f1 with ignoring case sensitivity
-        """
-        predictions = ["Open Source", "HaystackAI"]
-        labels = ["source", "HAYSTACKAI"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        # F1 after case ignoring
-        f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_case=True)
-
-        assert f1_result["f1"] == pytest.approx(5 / 6)
-
-    def test_f1_ignore_punctuation(self):
-        """
-        Test f1 with ignoring punctuation
-        """
-        predictions = ["Open Source!", "Haystack.AI"]
-        labels = ["Source", "HaystackAI"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        # F1 after ignoring punctuation
-        f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_punctuation=True)
-
-        assert f1_result["f1"] == pytest.approx(5 / 6)
-
-    def test_f1_ignore_numbers(self):
-        """
-        Test f1 with ignoring numbers
-        """
-        predictions = ["Open Source123", "HaystackAI"]
-        labels = ["Source", "HaystackAI"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        # F1 after ignoring numbers
-        f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_numbers=True)
-        assert f1_result["f1"] == pytest.approx(5 / 6)
-
-    def test_f1_regex_ignore(self):
-        """
-        Test f1 with ignoring specific regex patterns
-        """
-        predictions = ["Open123 Source", "HaystackAI"]
-        labels = ["Source", "HaystackAI"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        # Ignore numeric patterns
-        regex_to_ignore = [r"\d+"]
-        f1_result = evaluation_result._calculate_f1(output_key="answers", regexes_to_ignore=regex_to_ignore)
-
-        assert f1_result["f1"] == pytest.approx(5 / 6)
-
-    def test_f1_multiple_ignore_regex(self):
-        """
-        Test f1 with multiple ignoring parameters
-        """
-        predictions = ["Open123! Source", "Haystack.AI"]
-        labels = ["Source", "HaystackAI"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        # Ignore numeric patterns and punctuation excluding whitespaces
-        regex_to_ignore = [r"\d+", r"[^\w\s]"]
-        f1_result = evaluation_result._calculate_f1(output_key="answers", regexes_to_ignore=regex_to_ignore)
-
-        assert f1_result["f1"] == pytest.approx(5 / 6)
-
-    def test_f1_multiple_ignore_combination(self):
-        """
-        Test f1 with multiple ignoring parameters combined
-        """
-        predictions = ["Open%123. !$Source", "Haystack.AI##"]
-        labels = ["Source", "HaystackAI"]
-
-        evaluation_result = self.create_evaluation_result(predictions, labels)
-        # Ignore only special characters using regex
-        regex_to_ignore = [r"[^\w\s\d]+"]
-        f1_result = evaluation_result._calculate_f1(
-            output_key="answers",
-            ignore_numbers=True,
-            ignore_punctuation=True,
-            ignore_case=True,
-            regexes_to_ignore=regex_to_ignore,
-        )
-
-        assert f1_result["f1"] == pytest.approx(5 / 6)