diff --git a/haystack/components/eval/__init__.py b/haystack/components/eval/__init__.py index 9477cb1242..547eb580b0 100644 --- a/haystack/components/eval/__init__.py +++ b/haystack/components/eval/__init__.py @@ -1,3 +1,4 @@ from .sas_evaluator import SASEvaluator +from .statistical_evaluator import StatisticalEvaluator -__all__ = ["SASEvaluator"] +__all__ = ["SASEvaluator", "StatisticalEvaluator"] diff --git a/haystack/components/eval/statistical_evaluator.py b/haystack/components/eval/statistical_evaluator.py new file mode 100644 index 0000000000..37bafe5e1e --- /dev/null +++ b/haystack/components/eval/statistical_evaluator.py @@ -0,0 +1,131 @@ +import collections +from enum import Enum +from typing import Any, Dict, List, Optional + +from numpy import array as np_array +from numpy import mean as np_mean + +from haystack import default_from_dict, default_to_dict +from haystack.core.component import component + +from .preprocess import _preprocess_text + + +@component +class StatisticalEvaluator: + """ + StatisticalEvaluator is a component that evaluates the performance of a model based on statistical metrics. + It's usually used in QA and Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the generated answers. + + The supported metrics are: + - F1: Measures word overlap between predictions and labels. + - Exact Match: Measures the proportion of cases where prediction is identical to the expected label. + """ + + class Metric(Enum): + """ + Supported metrics + """ + + F1 = "F1" + EM = "Exact Match" + + def __init__( + self, + labels: List[str], + metric: Metric, + regexes_to_ignore: Optional[List[str]] = None, + ignore_case: bool = False, + ignore_punctuation: bool = False, + ignore_numbers: bool = False, + ): + """ + Creates a new instance of StatisticalEvaluator. + + :param labels: The list of expected answers. + :param metric: Metric to use for evaluation in this component. Supported metrics are F1 and Exact Match. 
+ :param regexes_to_ignore: A list of regular expressions. If provided, it removes substrings + matching these regular expressions from both predictions and labels before comparison. Defaults to None. + :param ignore_case: If True, performs case-insensitive comparison. Defaults to False. + :param ignore_punctuation: If True, removes punctuation from both predictions and labels before + comparison. Defaults to False. + :param ignore_numbers: If True, removes numerical digits from both predictions and labels + before comparison. Defaults to False. + """ + self._labels = labels + self._metric = metric + self._regexes_to_ignore = regexes_to_ignore + self._ignore_case = ignore_case + self._ignore_punctuation = ignore_punctuation + self._ignore_numbers = ignore_numbers + + self._metric_function = { + StatisticalEvaluator.Metric.F1: self._f1, + StatisticalEvaluator.Metric.EM: self._exact_match, + }[self._metric] + + def to_dict(self) -> Dict[str, Any]: + return default_to_dict( + self, + labels=self._labels, + metric=self._metric.value, + regexes_to_ignore=self._regexes_to_ignore, + ignore_case=self._ignore_case, + ignore_punctuation=self._ignore_punctuation, + ignore_numbers=self._ignore_numbers, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "StatisticalEvaluator": + data["init_parameters"]["metric"] = StatisticalEvaluator.Metric(data["init_parameters"]["metric"]) + return default_from_dict(cls, data) + + @component.output_types(result=float) + def run(self, predictions: List[str]) -> Dict[str, Any]: + if len(predictions) != len(self._labels): + raise ValueError("The number of predictions and labels must be the same.") + + predictions = _preprocess_text( + predictions, self._regexes_to_ignore, self._ignore_case, self._ignore_punctuation, self._ignore_numbers + ) + labels = _preprocess_text( + self._labels, self._regexes_to_ignore, self._ignore_case, self._ignore_punctuation, self._ignore_numbers + ) + + return {"result": self._metric_function(labels, 
predictions)}
+
+    def _f1(self, labels: List[str], predictions: List[str]) -> float:
+        """
+        Measure word overlap between predictions and labels as the mean token-level F1 over all pairs.
+        """
+        if len(predictions) == 0:
+            # We expect callers of this function already checked if predictions and labels are equal length
+            return 0.0
+
+        scores: List[float] = []
+        tokenized_predictions = [pred.split() for pred in predictions]
+        tokenized_labels = [label.split() for label in labels]
+        for label_tokens, prediction_tokens in zip(tokenized_labels, tokenized_predictions):
+            common = collections.Counter(label_tokens) & collections.Counter(prediction_tokens)
+            num_same = sum(common.values())
+            if len(label_tokens) == 0 or len(prediction_tokens) == 0:
+                # If either is no-answer, F1 is 1 if they agree, 0 otherwise; score this pair and continue
+                scores.append(float(label_tokens == prediction_tokens))
+            elif num_same == 0:
+                scores.append(0.0)
+            else:
+                precision = 1.0 * num_same / len(prediction_tokens)
+                recall = 1.0 * num_same / len(label_tokens)
+                scores.append((2 * precision * recall) / (precision + recall))
+
+        return np_mean(scores)
+
+    def _exact_match(self, labels: List[str], predictions: List[str]) -> float:
+        """
+        Measure the proportion of cases where prediction is identical to the expected label.
+ """ + if len(predictions) == 0: + # We expect callers of this function already checked if predictions and labels are equal length + return 0.0 + score_list = np_array(predictions) == np_array(labels) + return np_mean(score_list) diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index 500b423e86..2c5c8be657 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -1,11 +1,7 @@ -import collections -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np +from typing import Any, Callable, Dict, List, Union from haystack import Pipeline from haystack.core.component import Component -from haystack.evaluation.eval_utils import get_answers_from_output, preprocess_text from haystack.evaluation.metrics import Metric, MetricsResult @@ -45,8 +41,6 @@ def __init__( Metric.RECALL: self._calculate_recall, Metric.MRR: self._calculate_mrr, Metric.MAP: self._calculate_map, - Metric.F1: self._calculate_f1, - Metric.EM: self._calculate_em, } def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], **kwargs) -> MetricsResult: @@ -71,119 +65,6 @@ def _calculate_map(self): def _calculate_mrr(self): return MetricsResult({"mean_reciprocal_rank": None}) - def _compute_f1_single(self, label_toks: List[str], pred_toks: List[str]) -> float: - """ - Compute F1 score for a single sample. 
- """ - common: collections.Counter = collections.Counter(label_toks) & collections.Counter(pred_toks) - num_same = sum(common.values()) - if len(label_toks) == 0 or len(pred_toks) == 0: - # If either is no-answer, then F1 is 1 if they agree, 0 otherwise - return int(label_toks == pred_toks) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(pred_toks) - recall = 1.0 * num_same / len(label_toks) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - - def _calculate_f1( - self, - output_key: str, - regexes_to_ignore: Optional[List[str]] = None, - ignore_case: bool = False, - ignore_punctuation: bool = False, - ignore_numbers: bool = False, - ) -> MetricsResult: - """ - Calculates the F1 score between two lists of predictions and labels. - F1 score measures the word overlap between the predicted text and the corresponding ground truth label. - - :param output_key: The key of the output to use for comparison. - :param regexes_to_ignore (list, optional): A list of regular expressions. If provided, it removes substrings - matching these regular expressions from both predictions and labels before comparison. Defaults to None. - :param ignore_case (bool, optional): If True, performs case-insensitive comparison. Defaults to False. - :param ignore_punctuation (bool, optional): If True, removes punctuation from both predictions and labels before - comparison. Defaults to False. - :param ignore_numbers (bool, optional): If True, removes numerical digits from both predictions and labels - before comparison. Defaults to False. - :return: A MetricsResult object containing the calculated F1 score. 
- """ - - predictions = get_answers_from_output( - outputs=self.outputs, output_key=output_key, runnable_type=self.runnable_type - ) - labels = get_answers_from_output( - outputs=self.expected_outputs, output_key=output_key, runnable_type=self.runnable_type - ) - - if len(predictions) != len(labels): - raise ValueError("The number of predictions and labels must be the same.") - if len(predictions) == len(labels) == 0: - # Return F1 as 0 for no inputs - return MetricsResult({"f1": 0.0}) - - predictions = preprocess_text(predictions, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers) - labels = preprocess_text(labels, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers) - - # Tokenize by splitting on spaces - tokenized_predictions = [pred.split() for pred in predictions] - tokenized_labels = [label.split() for label in labels] - - f1_scores = [ - self._compute_f1_single(label_toks, pred_toks) - for label_toks, pred_toks in zip(tokenized_labels, tokenized_predictions) - ] - - f1 = np.mean(f1_scores) - - return MetricsResult({"f1": f1}) - - def _calculate_em( - self, - output_key: str, - regexes_to_ignore: Optional[List[str]] = None, - ignore_case: bool = False, - ignore_punctuation: bool = False, - ignore_numbers: bool = False, - ) -> MetricsResult: - """ - Calculates the Exact Match (EM) score between two lists of predictions and labels. - Exact Match (EM) score measures the percentage of samples where the predicted text exactly matches the - corresponding ground truth label. - - :param output_key: The key of the output to use for comparison. - :param regexes_to_ignore (list, optional): A list of regular expressions. If provided, it removes substrings - matching these regular expressions from both predictions and labels before comparison. Defaults to None. - :param ignore_case (bool, optional): If True, performs case-insensitive comparison. Defaults to False. 
- :param ignore_punctuation (bool, optional): If True, removes punctuation from both predictions and labels before - comparison. Defaults to False. - :param ignore_numbers (bool, optional): If True, removes numerical digits from both predictions and labels - before comparison. Defaults to False. - :return: A MetricsResult object containing the calculated Exact Match (EM) score. - """ - - predictions = get_answers_from_output( - outputs=self.outputs, output_key=output_key, runnable_type=self.runnable_type - ) - labels = get_answers_from_output( - outputs=self.expected_outputs, output_key=output_key, runnable_type=self.runnable_type - ) - - if len(predictions) != len(labels): - raise ValueError("The number of predictions and labels must be the same.") - if len(predictions) == len(labels) == 0: - # Return Exact Match as 0 for no inputs - return MetricsResult({"exact_match": 0.0}) - - predictions = preprocess_text(predictions, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers) - labels = preprocess_text(labels, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers) - - score_list = np.array(predictions) == np.array(labels) - exact_match_score = np.mean(score_list) - - return MetricsResult({"exact_match": exact_match_score}) - def eval( runnable: Union[Pipeline, Component], inputs: List[Dict[str, Any]], expected_outputs: List[Dict[str, Any]] diff --git a/releasenotes/notes/statistical-evaluator-d65b80e3ac24778a.yaml b/releasenotes/notes/statistical-evaluator-d65b80e3ac24778a.yaml new file mode 100644 index 0000000000..7da31e250e --- /dev/null +++ b/releasenotes/notes/statistical-evaluator-d65b80e3ac24778a.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Add `StatisticalEvaluator`, this Component can be used to calculate the different statistic metrics from answers returned by LLMs. 
diff --git a/test/components/eval/test_statistical_evaluator.py b/test/components/eval/test_statistical_evaluator.py new file mode 100644 index 0000000000..3abbeb7220 --- /dev/null +++ b/test/components/eval/test_statistical_evaluator.py @@ -0,0 +1,259 @@ +import pytest + +from haystack.components.eval import StatisticalEvaluator + + +class TestStatisticalEvaluator: + def test_init_default(self): + labels = ["label1", "label2", "label3"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1) + assert evaluator._labels == labels + assert evaluator._metric == StatisticalEvaluator.Metric.F1 + assert evaluator._regexes_to_ignore is None + assert evaluator._ignore_case is False + assert evaluator._ignore_punctuation is False + assert evaluator._ignore_numbers is False + + def test_to_dict(self): + labels = ["label1", "label2", "label3"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1) + + expected_dict = { + "type": "haystack.components.eval.statistical_evaluator.StatisticalEvaluator", + "init_parameters": { + "labels": labels, + "metric": "F1", + "regexes_to_ignore": None, + "ignore_case": False, + "ignore_punctuation": False, + "ignore_numbers": False, + }, + } + assert evaluator.to_dict() == expected_dict + + def test_from_dict(self): + evaluator = StatisticalEvaluator.from_dict( + { + "type": "haystack.components.eval.statistical_evaluator.StatisticalEvaluator", + "init_parameters": { + "labels": ["label1", "label2", "label3"], + "metric": "F1", + "regexes_to_ignore": None, + "ignore_case": False, + "ignore_punctuation": False, + "ignore_numbers": False, + }, + } + ) + + assert evaluator._labels == ["label1", "label2", "label3"] + assert evaluator._metric == StatisticalEvaluator.Metric.F1 + assert evaluator._regexes_to_ignore is None + assert evaluator._ignore_case is False + assert evaluator._ignore_punctuation is False + assert evaluator._ignore_numbers is False + + +class 
TestStatisticalEvaluatorF1: + def test_run_with_empty_inputs(self): + evaluator = StatisticalEvaluator(labels=[], metric=StatisticalEvaluator.Metric.F1) + result = evaluator.run(predictions=[]) + assert len(result) == 1 + assert result["result"] == 0.0 + + def test_run_with_different_lengths(self): + labels = [ + "A construction budget of US $2.3 billion", + "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", + ] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1) + + predictions = [ + "A construction budget of US $2.3 billion", + "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", + "The Meiji Restoration in 1868 transformed Japan into a modernized world power.", + ] + with pytest.raises(ValueError): + evaluator.run(predictions) + + def test_run_with_matching_predictions(self): + labels = ["OpenSource", "HaystackAI", "LLMs"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1) + predictions = ["OpenSource", "HaystackAI", "LLMs"] + result = evaluator.run(predictions=predictions) + + assert len(result) == 1 + assert result["result"] == 1.0 + + def test_run_with_single_prediction(self): + labels = ["Source"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1) + + result = evaluator.run(predictions=["Open Source"]) + assert len(result) == 1 + assert result["result"] == pytest.approx(2 / 3) + + def test_run_with_mismatched_predictions(self): + labels = ["Source", "HaystackAI"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1) + predictions = ["Open Source", "HaystackAI"] + result = evaluator.run(predictions=predictions) + assert len(result) == 1 + assert result["result"] == pytest.approx(5 / 6) + + def test_run_with_ignore_case(self): + labels = ["source", "HAYSTACKAI"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1, 
ignore_case=True) + predictions = ["Open Source", "HaystackAI"] + result = evaluator.run(predictions=predictions) + assert len(result) == 1 + assert result["result"] == pytest.approx(5 / 6) + + def test_run_with_ignore_punctuation(self): + labels = ["Source", "HaystackAI"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1, ignore_punctuation=True) + predictions = ["Open Source!", "Haystack.AI"] + result = evaluator.run(predictions=predictions) + + assert result["result"] == pytest.approx(5 / 6) + + def test_run_with_ignore_numbers(self): + labels = ["Source", "HaystackAI"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.F1, ignore_numbers=True) + predictions = ["Open Source123", "HaystackAI"] + result = evaluator.run(predictions=predictions) + assert result["result"] == pytest.approx(5 / 6) + + def test_run_with_regex_to_ignore(self): + labels = ["Source", "HaystackAI"] + evaluator = StatisticalEvaluator( + labels=labels, metric=StatisticalEvaluator.Metric.F1, regexes_to_ignore=[r"\d+"] + ) + predictions = ["Open123 Source", "HaystackAI"] + result = evaluator.run(predictions=predictions) + assert result["result"] == pytest.approx(5 / 6) + + def test_run_with_multiple_regex_to_ignore(self): + labels = ["Source", "HaystackAI"] + evaluator = StatisticalEvaluator( + labels=labels, metric=StatisticalEvaluator.Metric.F1, regexes_to_ignore=[r"\d+", r"[^\w\s]"] + ) + predictions = ["Open123! Source", "Haystack.AI"] + result = evaluator.run(predictions=predictions) + assert result["result"] == pytest.approx(5 / 6) + + def test_run_with_multiple_ignore_parameters(self): + labels = ["Source", "HaystackAI"] + evaluator = StatisticalEvaluator( + labels=labels, + metric=StatisticalEvaluator.Metric.F1, + ignore_numbers=True, + ignore_punctuation=True, + ignore_case=True, + regexes_to_ignore=[r"[^\w\s\d]+"], + ) + predictions = ["Open%123. 
!$Source", "Haystack.AI##"] + result = evaluator.run(predictions=predictions) + assert result["result"] == pytest.approx(5 / 6) + + +class TestStatisticalEvaluatorExactMatch: + def test_run_with_empty_inputs(self): + evaluator = StatisticalEvaluator(labels=[], metric=StatisticalEvaluator.Metric.EM) + result = evaluator.run(predictions=[]) + assert len(result) == 1 + assert result["result"] == 0.0 + + def test_run_with_different_lengths(self): + labels = [ + "A construction budget of US $2.3 billion", + "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", + ] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM) + + predictions = [ + "A construction budget of US $2.3 billion", + "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", + "The Meiji Restoration in 1868 transformed Japan into a modernized world power.", + ] + with pytest.raises(ValueError): + evaluator.run(predictions) + + def test_run_with_matching_predictions(self): + labels = ["OpenSource", "HaystackAI", "LLMs"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM) + predictions = ["OpenSource", "HaystackAI", "LLMs"] + result = evaluator.run(predictions=predictions) + + assert len(result) == 1 + assert result["result"] == 1.0 + + def test_run_with_single_prediction(self): + labels = ["OpenSource"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM) + + result = evaluator.run(predictions=["OpenSource"]) + assert len(result) == 1 + assert result["result"] == 1.0 + + def test_run_with_mismatched_predictions(self): + labels = ["Source", "HaystackAI", "LLMs"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM) + predictions = ["OpenSource", "HaystackAI", "LLMs"] + result = evaluator.run(predictions=predictions) + assert len(result) == 1 + assert result["result"] == 2 / 3 + + def 
test_run_with_ignore_case(self): + labels = ["opensource", "HAYSTACKAI", "llMs"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM, ignore_case=True) + predictions = ["OpenSource", "HaystackAI", "LLMs"] + result = evaluator.run(predictions=predictions) + assert len(result) == 1 + assert result["result"] == 1.0 + + def test_run_with_ignore_punctuation(self): + labels = ["OpenSource", "HaystackAI", "LLMs"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM, ignore_punctuation=True) + predictions = ["OpenSource!", "Haystack.AI", "LLMs,"] + result = evaluator.run(predictions=predictions) + assert result["result"] == 1.0 + + def test_run_with_ignore_numbers(self): + labels = ["OpenSource", "HaystackAI", "LLMs"] + evaluator = StatisticalEvaluator(labels=labels, metric=StatisticalEvaluator.Metric.EM, ignore_numbers=True) + predictions = ["OpenSource123", "HaystackAI", "LLMs456"] + result = evaluator.run(predictions=predictions) + assert result["result"] == 1.0 + + def test_run_with_regex_to_ignore(self): + labels = ["OpenSource", "HaystackAI", "LLMs"] + evaluator = StatisticalEvaluator( + labels=labels, metric=StatisticalEvaluator.Metric.EM, regexes_to_ignore=[r"\d+"] + ) + predictions = ["Open123Source", "HaystackAI", "LLMs456"] + result = evaluator.run(predictions=predictions) + assert result["result"] == 1.0 + + def test_run_with_multiple_regex_to_ignore(self): + labels = ["OpenSource", "HaystackAI", "LLMs"] + evaluator = StatisticalEvaluator( + labels=labels, metric=StatisticalEvaluator.Metric.EM, regexes_to_ignore=[r"\d+", r"\W+"] + ) + predictions = ["Open123!Source", "Haystack.AI", "LLMs456,"] + result = evaluator.run(predictions=predictions) + assert result["result"] == 1.0 + + def test_run_with_multiple_ignore_parameters(self): + labels = ["OpenSource", "HaystackAI", "LLMs"] + evaluator = StatisticalEvaluator( + labels=labels, + metric=StatisticalEvaluator.Metric.EM, + ignore_numbers=True, 
+ ignore_punctuation=True, + ignore_case=True, + regexes_to_ignore=[r"[^\w\s\d]+"], + ) + predictions = ["Open%123!$Source", "Haystack.AI##", "^^LLMs456,"] + result = evaluator.run(predictions=predictions) + assert result["result"] == 1.0 diff --git a/test/evaluation/test_eval.py b/test/evaluation/test_eval.py index 095c51d4f5..f46395db40 100644 --- a/test/evaluation/test_eval.py +++ b/test/evaluation/test_eval.py @@ -15,14 +15,6 @@ def test_init(self): assert result.outputs == [] assert result.expected_outputs == [] - def test_supported_metrics_contains_all_metrics(self): - runnable = Pipeline() - result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[]) - - supported_metrics = [m.name for m in result._supported_metrics.keys()] - all_metric_names = [m.name for m in Metric] - assert supported_metrics == all_metric_names - def test_calculate_metrics_with_supported_metric(self): runnable = Pipeline() result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[]) diff --git a/test/evaluation/test_eval_exact_match.py b/test/evaluation/test_eval_exact_match.py deleted file mode 100644 index ad0b8930d1..0000000000 --- a/test/evaluation/test_eval_exact_match.py +++ /dev/null @@ -1,167 +0,0 @@ -from haystack import Pipeline -from haystack.dataclasses import GeneratedAnswer -from haystack.evaluation.eval import EvaluationResult - - -class TestExactMatch: - def create_evaluation_result(self, predictions, labels): - """ - Creates an evaluation result of a RAG pipeline using the list of predictions and labels for testing the exact match. 
- """ - runnable = Pipeline() - inputs = [] - outputs = [ - {"answer_builder": {"answers": [GeneratedAnswer(data=pred, query="", documents=[], meta={})]}} - for pred in predictions - ] - expected_outputs = [ - {"answer_builder": {"answers": [GeneratedAnswer(data=label, query="", documents=[], meta={})]}} - for label in labels - ] - evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs) - return evaluation_result - - def test_exact_match_empty_inputs(self): - """ - Test exact match with empty inputs - """ - runnable = Pipeline() - inputs = [] - outputs = [ - {"answer_builder": {"answers": []}}, - {"answer_builder": {"answers": []}}, - {"answer_builder": {"answers": []}}, - ] - expected_outputs = [ - {"answer_builder": {"answers": []}}, - {"answer_builder": {"answers": []}}, - {"answer_builder": {"answers": []}}, - ] - evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs) - # Expecting 0% exact match for empty inputs - em_result = evaluation_result._calculate_em(output_key="answers") - - assert em_result["exact_match"] == 0.0 - - def test_exact_match_same_inputs(self): - """ - Test exact match with default parameters - """ - predictions = ["OpenSource", "HaystackAI", "LLMs"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - evaluation_result = self.create_evaluation_result(predictions, labels) - em_result = evaluation_result._calculate_em(output_key="answers") - - assert em_result["exact_match"] == 1.0 - - def test_exact_match_single_word(self): - """ - Test exact match with single-word inputs - """ - predictions = ["OpenSource"] - labels = ["OpenSource"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - em_result = evaluation_result._calculate_em(output_key="answers") - - assert em_result["exact_match"] == 1.0 - - def test_exact_match_negative_case(self): - """ - Test exact match with deliberately mismatched predictions and labels - """ - predictions = ["OpenSource", "HaystackAI", 
"LLMs"] - labels = ["Source", "HaystackAI", "LLMs"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - # Expecting EM to be 2/3 as 2 out of 3 items match - expected_em = 2 / 3 - em_result = evaluation_result._calculate_em(output_key="answers") - - assert em_result["exact_match"] == expected_em - - def test_exact_match_ignore_case(self): - """ - Test exact match with ignoring case sensitivity - """ - predictions = ["OpenSource", "HaystackAI", "LLMs"] - labels = ["opensource", "HAYSTACKAI", "llMs"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - # Exact match after case ignoring - em_result = evaluation_result._calculate_em(output_key="answers", ignore_case=True) - - assert em_result["exact_match"] == 1.0 - - def test_exact_match_ignore_punctuation(self): - """ - Test exact match with ignoring punctuation - """ - predictions = ["OpenSource!", "Haystack.AI", "LLMs,"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - # Exact match after ignoring punctuation - em_result = evaluation_result._calculate_em(output_key="answers", ignore_punctuation=True) - - assert em_result["exact_match"] == 1.0 - - def test_exact_match_ignore_numbers(self): - """ - Test exact match with ignoring numbers - """ - predictions = ["OpenSource123", "HaystackAI", "LLMs456"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - # Exact match after ignoring numbers - em_result = evaluation_result._calculate_em(output_key="answers", ignore_numbers=True) - assert em_result["exact_match"] == 1.0 - - def test_exact_match_regex_ignore(self): - """ - Test exact match with ignoring specific regex patterns - """ - predictions = ["Open123Source", "HaystackAI", "LLMs456"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - # Ignore numeric 
patterns - regex_to_ignore = [r"\d+"] - em_result = evaluation_result._calculate_em(output_key="answers", regexes_to_ignore=regex_to_ignore) - - assert em_result["exact_match"] == 1.0 - - def test_exact_match_multiple_ignore_regex(self): - """ - Test exact match with multiple ignoring parameters - """ - predictions = ["Open123!Source", "Haystack.AI", "LLMs456,"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - # Ignore numeric patterns and punctuation using regex - regex_to_ignore = [r"\d+", r"\W+"] - em_result = evaluation_result._calculate_em(output_key="answers", regexes_to_ignore=regex_to_ignore) - - assert em_result["exact_match"] == 1.0 - - def test_exact_match_multiple_ignore_combination(self): - """ - Test exact match with multiple ignoring parameters combined - """ - predictions = ["Open%123!$Source", "Haystack.AI##", "^^LLMs456,"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - # Ignore only special characters using regex - regex_to_ignore = [r"[^\w\s\d]+"] - em_result = evaluation_result._calculate_em( - output_key="answers", - ignore_numbers=True, - ignore_punctuation=True, - ignore_case=True, - regexes_to_ignore=regex_to_ignore, - ) - - assert em_result["exact_match"] == 1.0 diff --git a/test/evaluation/test_eval_f1.py b/test/evaluation/test_eval_f1.py deleted file mode 100644 index cedcda466a..0000000000 --- a/test/evaluation/test_eval_f1.py +++ /dev/null @@ -1,178 +0,0 @@ -import pytest - -from haystack import Pipeline -from haystack.dataclasses import GeneratedAnswer -from haystack.evaluation.eval import EvaluationResult - - -class TestF1: - def create_evaluation_result(self, predictions, labels): - """ - Creates an evaluation result of a RAG pipeline using the list of predictions and labels for testing the f1. 
- """ - runnable = Pipeline() - inputs = [] - outputs = [ - {"answer_builder": {"answers": [GeneratedAnswer(data=pred, query="", documents=[], meta={})]}} - for pred in predictions - ] - expected_outputs = [ - {"answer_builder": {"answers": [GeneratedAnswer(data=label, query="", documents=[], meta={})]}} - for label in labels - ] - evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs) - return evaluation_result - - def test_f1_empty_inputs(self): - """ - Test f1 with empty inputs - """ - runnable = Pipeline() - inputs = [] - outputs = [ - {"answer_builder": {"answers": []}}, - {"answer_builder": {"answers": []}}, - {"answer_builder": {"answers": []}}, - ] - expected_outputs = [ - {"answer_builder": {"answers": []}}, - {"answer_builder": {"answers": []}}, - {"answer_builder": {"answers": []}}, - ] - evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs) - # Expecting 0% f1 for empty inputs - f1_result = evaluation_result._calculate_f1(output_key="answers") - - assert f1_result["f1"] == 0.0 - - def test_calculate_f1_with_different_lengths(self): - """ - Test f1 with default parameters - """ - predictions = ["OpenSource", "HaystackAI", "LLMs"] - labels = ["OpenSource", "HaystackAI"] - evaluation_result = self.create_evaluation_result(predictions, labels) - - with pytest.raises(ValueError, match="The number of predictions and labels must be the same."): - evaluation_result._calculate_f1(output_key="answers") - - def test_f1_same_inputs(self): - """ - Test f1 with default parameters - """ - predictions = ["OpenSource", "HaystackAI", "LLMs"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - evaluation_result = self.create_evaluation_result(predictions, labels) - f1_result = evaluation_result._calculate_f1(output_key="answers") - - assert f1_result["f1"] == 1.0 - - def test_f1_single_word(self): - """ - Test f1 with single-word inputs - """ - predictions = ["Open Source"] - labels = ["Source"] - - evaluation_result = 
self.create_evaluation_result(predictions, labels) - f1_result = evaluation_result._calculate_f1(output_key="answers") - - assert f1_result["f1"] == pytest.approx(2 / 3) - - def test_f1_negative_case(self): - """ - Test f1 with deliberately mismatched predictions and labels - """ - predictions = ["Open Source", "HaystackAI"] - labels = ["Source", "HaystackAI"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - f1_result = evaluation_result._calculate_f1(output_key="answers") - - assert f1_result["f1"] == pytest.approx(5 / 6) - - def test_f1_ignore_case(self): - """ - Test f1 with ignoring case sensitivity - """ - predictions = ["Open Source", "HaystackAI"] - labels = ["source", "HAYSTACKAI"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - # F1 after case ignoring - f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_case=True) - - assert f1_result["f1"] == pytest.approx(5 / 6) - - def test_f1_ignore_punctuation(self): - """ - Test f1 with ignoring punctuation - """ - predictions = ["Open Source!", "Haystack.AI"] - labels = ["Source", "HaystackAI"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - # F1 after ignoring punctuation - f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_punctuation=True) - - assert f1_result["f1"] == pytest.approx(5 / 6) - - def test_f1_ignore_numbers(self): - """ - Test f1 with ignoring numbers - """ - predictions = ["Open Source123", "HaystackAI"] - labels = ["Source", "HaystackAI"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - # F1 after ignoring numbers - f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_numbers=True) - assert f1_result["f1"] == pytest.approx(5 / 6) - - def test_f1_regex_ignore(self): - """ - Test f1 with ignoring specific regex patterns - """ - predictions = ["Open123 Source", "HaystackAI"] - labels = ["Source", "HaystackAI"] - - 
evaluation_result = self.create_evaluation_result(predictions, labels) - # Ignore numeric patterns - regex_to_ignore = [r"\d+"] - f1_result = evaluation_result._calculate_f1(output_key="answers", regexes_to_ignore=regex_to_ignore) - - assert f1_result["f1"] == pytest.approx(5 / 6) - - def test_f1_multiple_ignore_regex(self): - """ - Test f1 with multiple ignoring parameters - """ - predictions = ["Open123! Source", "Haystack.AI"] - labels = ["Source", "HaystackAI"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - # Ignore numeric patterns and punctuation excluding whitespaces - regex_to_ignore = [r"\d+", r"[^\w\s]"] - f1_result = evaluation_result._calculate_f1(output_key="answers", regexes_to_ignore=regex_to_ignore) - - assert f1_result["f1"] == pytest.approx(5 / 6) - - def test_f1_multiple_ignore_combination(self): - """ - Test f1 with multiple ignoring parameters combined - """ - predictions = ["Open%123. !$Source", "Haystack.AI##"] - labels = ["Source", "HaystackAI"] - - evaluation_result = self.create_evaluation_result(predictions, labels) - # Ignore only special characters using regex - regex_to_ignore = [r"[^\w\s\d]+"] - f1_result = evaluation_result._calculate_f1( - output_key="answers", - ignore_numbers=True, - ignore_punctuation=True, - ignore_case=True, - regexes_to_ignore=regex_to_ignore, - ) - - assert f1_result["f1"] == pytest.approx(5 / 6)