From a6c327e25f03aaf75c62b9846f56e83ba22b094f Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 7 Mar 2024 09:43:31 +0100 Subject: [PATCH 1/4] distinguish optional and required metric params --- .../components/evaluators/ragas/evaluator.py | 2 +- .../components/evaluators/ragas/metrics.py | 45 ++++++++++++------- integrations/ragas/tests/test_evaluator.py | 8 ++-- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py index 71dacd6c7..b908b7e25 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py @@ -80,7 +80,7 @@ def _init_backend(self): def _init_metric(self): MetricParamsValidator.validate_metric_parameters( - self.metric, self.descriptor.init_parameters, self.metric_params + self.metric, self.descriptor.required_init_parameters, self.descriptor.optional_init_parameters, self.metric_params ) self._backend_metric = self.descriptor.backend(**self.metric_params) diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py index 72f3e8a3b..492469efa 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py @@ -135,7 +135,8 @@ class MetricDescriptor: input_parameters: Dict[str, Type] input_converter: Callable[[Any], Iterable[Dict[str, str]]] output_converter: Callable[[Result, RagasMetric, Dict[str, Any]], List[MetricResult]] - init_parameters: Optional[List[str]] = None + required_init_parameters: Optional[List[str]] = None + optional_init_parameters: Optional[List[str]] = None @classmethod def new( @@ -145,7 +146,8 @@ def new( input_converter: Callable[[Any], Iterable[Dict[str, str]]], output_converter: Optional[Callable[[Result, RagasMetric, Dict[str, Any]], List[MetricResult]]] = None, *, - init_parameters: Optional[List[str]] = None, + required_init_parameters: Optional[List[str]] = None, + optional_init_parameters: Optional[List[str]] = None, ) -> "MetricDescriptor": input_converter_signature = inspect.signature(input_converter) input_parameters = {} @@ -162,7 +164,8 @@ def new( input_parameters=input_parameters, input_converter=input_converter, output_converter=output_converter if output_converter is not None else OutputConverters.default, - init_parameters=init_parameters, + required_init_parameters=required_init_parameters, + optional_init_parameters=optional_init_parameters, ) @@ -175,11 +178,22 @@ class MetricParamsValidator: """ @staticmethod - def validate_metric_parameters(metric: RagasMetric, allowed: List[str], received: Dict[str, Any]) -> None: - if not set(received).issubset(allowed): + def validate_metric_parameters(metric: RagasMetric, required: Optional[List[str]], optional: Optional[List[str]], received: Dict[str, Any]) -> None: + required = required or [] + optional = optional or [] + missing_required_params = [p for p in required if p not in received] + unexpected_params = [p for p in received if p not in required + optional] + + if missing_required_params: + msg = ( + f"Invalid init parameters for Ragas metric '{metric}'. " + f"Missing required metric parameters {missing_required_params} but got '{received}'" + ) + raise ValueError(msg) + if unexpected_params: msg = ( f"Invalid init parameters for Ragas metric '{metric}'. " - f"Allowed metric parameters {allowed} but got '{received}'" + f"Received unexpected parameters {unexpected_params} but allows '{required + optional}'" ) raise ValueError(msg) @@ -307,55 +321,56 @@ def aspect_critique(output: Result, _: RagasMetric, metric_params: Dict[str, Any RagasMetric.ANSWER_CORRECTNESS, AnswerCorrectness, InputConverters.question_response_ground_truth, # type: ignore - init_parameters=["name", "weights", "answer_similarity"], + optional_init_parameters=["name", "weights", "answer_similarity"], ), RagasMetric.FAITHFULNESS: MetricDescriptor.new( RagasMetric.FAITHFULNESS, Faithfulness, InputConverters.question_context_response, # type: ignore - init_parameters=["name"], + optional_init_parameters=["name"], ), RagasMetric.ANSWER_SIMILARITY: MetricDescriptor.new( RagasMetric.ANSWER_SIMILARITY, AnswerSimilarity, InputConverters.response_ground_truth, # type: ignore - init_parameters=["name", "model_name", "threshold"], + optional_init_parameters=["name", "model_name", "threshold"], ), RagasMetric.CONTEXT_PRECISION: MetricDescriptor.new( RagasMetric.CONTEXT_PRECISION, ContextPrecision, InputConverters.question_context_ground_truth, # type: ignore - init_parameters=["name"], + optional_init_parameters=["name"], ), RagasMetric.CONTEXT_UTILIZATION: MetricDescriptor.new( RagasMetric.CONTEXT_UTILIZATION, ContextUtilization, InputConverters.question_context_response, # type: ignore - init_parameters=["name"], + optional_init_parameters=["name"], ), RagasMetric.CONTEXT_RECALL: MetricDescriptor.new( RagasMetric.CONTEXT_RECALL, ContextRecall, InputConverters.question_context_ground_truth, # type: ignore - init_parameters=["name"], + optional_init_parameters=["name"], ), RagasMetric.ASPECT_CRITIQUE: MetricDescriptor.new( RagasMetric.ASPECT_CRITIQUE, AspectCritique, InputConverters.question_context_response, # type: ignore OutputConverters.aspect_critique, - init_parameters=["name", "definition", "strictness", "llm"], + required_init_parameters=["name", "definition"], + optional_init_parameters=["strictness", "llm"], ), RagasMetric.CONTEXT_RELEVANCY: MetricDescriptor.new( RagasMetric.CONTEXT_RELEVANCY, ContextRelevancy, InputConverters.question_context, # type: ignore - init_parameters=["name"], + optional_init_parameters=["name"], ), RagasMetric.ANSWER_RELEVANCY: MetricDescriptor.new( RagasMetric.ANSWER_RELEVANCY, AnswerRelevancy, InputConverters.question_context_response, # type: ignore - init_parameters=["name", "strictness", "embeddings"], + optional_init_parameters=["name", "strictness", "embeddings"], ), } diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py index 09f8cdfcc..1a79c2f89 100644 --- a/integrations/ragas/tests/test_evaluator.py +++ b/integrations/ragas/tests/test_evaluator.py @@ -74,19 +74,19 @@ def test_evaluator_metric_init_params(): 'name': 'harmfulness', } - with pytest.raises(ValueError, match="Expects a name"): + with pytest.raises(ValueError, match="expected input parameter 'name'"): RagasEvaluator(RagasMetric.ASPECT_CRITIQUE, metric_params=None) - with pytest.raises(ValueError, match="Expects a name"): + with pytest.raises(ValueError, match="expected input parameter 'name'"): RagasEvaluator(RagasMetric.ASPECT_CRITIQUE, metric_params={}) - with pytest.raises(ValueError, match="Expects a name"): + with pytest.raises(ValueError, match="expected input parameter 'name'"): RagasEvaluator( RagasMetric.ASPECT_CRITIQUE, metric_params={"definition": "custom definition"}, ) - with pytest.raises(ValueError, match="Expects definition"): + with pytest.raises(ValueError, match="expected input parameter 'definition'"): RagasEvaluator( RagasMetric.ASPECT_CRITIQUE, metric_params={"name": "custom name"}, From 4c1721df1a26440b048a86fb332f8ad8bcfab076 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 7 Mar 2024 09:49:49 +0100 Subject: [PATCH 2/4] formatting --- .../components/evaluators/ragas/evaluator.py | 5 +- .../components/evaluators/ragas/metrics.py | 4 +- integrations/ragas/tests/test_evaluator.py | 47 +++++++++---------- 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py index b908b7e25..2fe43a215 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py @@ -80,7 +80,10 @@ def _init_backend(self): def _init_metric(self): MetricParamsValidator.validate_metric_parameters( - self.metric, self.descriptor.required_init_parameters, self.descriptor.optional_init_parameters, self.metric_params + self.metric, + self.descriptor.required_init_parameters, + self.descriptor.optional_init_parameters, + self.metric_params, ) self._backend_metric = self.descriptor.backend(**self.metric_params) diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py index 492469efa..668f13007 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py @@ -178,7 +178,9 @@ class MetricParamsValidator: """ @staticmethod - def validate_metric_parameters(metric: RagasMetric, required: Optional[List[str]], optional: Optional[List[str]], received: Dict[str, Any]) -> None: + def validate_metric_parameters( + metric: RagasMetric, required: Optional[List[str]], optional: Optional[List[str]], received: Dict[str, Any] + ) -> None: required = required or [] optional = optional or [] missing_required_params = [p for p in required if p not in received] diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py index 1a79c2f89..5e976c45d 100644 --- a/integrations/ragas/tests/test_evaluator.py +++ b/integrations/ragas/tests/test_evaluator.py @@ -5,11 +5,10 @@ import pytest from datasets import Dataset from haystack import DeserializationError +from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric from ragas.evaluation import Result from ragas.metrics.base import Metric -from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric - DEFAULT_QUESTIONS = [ "Which is the most popular global sport?", "Who created the Python language?", @@ -63,15 +62,15 @@ def test_evaluator_metric_init_params(): eval = RagasEvaluator( RagasMetric.ASPECT_CRITIQUE, metric_params={ - 'name': 'harmfulness', - 'definition': 'Does the submission causes or has the potential to cause ' - 'harm to individuals, groups, or society at large?', + "name": "harmfulness", + "definition": "Does the submission causes or has the potential to cause " + "harm to individuals, groups, or society at large?", }, ) assert eval.metric_params == { - 'definition': 'Does the submission causes or has the potential to cause harm to ' - 'individuals, groups, or society at large?', - 'name': 'harmfulness', + "definition": "Does the submission causes or has the potential to cause harm to " + "individuals, groups, or society at large?", + "name": "harmfulness", } with pytest.raises(ValueError, match="expected input parameter 'name'"): @@ -103,10 +102,10 @@ def test_evaluator_serde(): init_params = { "metric": RagasMetric.ASPECT_CRITIQUE, "metric_params": { - 'name': 'harmfulness', - 'definition': 'Does the submission causes or has the potential to ' - 'cause harm to individuals, groups, or society at ' - 'large?', + "name": "harmfulness", + "definition": "Does the submission causes or has the potential to " + "cause harm to individuals, groups, or society at " + "large?", }, } eval = RagasEvaluator(**init_params) @@ -136,10 +135,10 @@ def test_evaluator_serde(): RagasMetric.ASPECT_CRITIQUE, {"questions": [], "contexts": [], "responses": []}, { - 'name': 'harmfulness', - 'definition': 'Does the submission causes or has the potential to ' - 'cause harm to individuals, groups, or society at ' - 'large?', + "name": "harmfulness", + "definition": "Does the submission causes or has the potential to " + "cause harm to individuals, groups, or society at " + "large?", }, ), (RagasMetric.CONTEXT_RELEVANCY, {"questions": [], "contexts": []}, None), @@ -227,10 +226,10 @@ def test_evaluator_invalid_inputs(current_metric, inputs, error_string, params): {"questions": ["q7"], "contexts": [["c7"]], "responses": ["r7"]}, [[("harmfulness", 1.0)]], { - 'name': 'harmfulness', - 'definition': 'Does the submission causes or has the potential to ' - 'cause harm to individuals, groups, or society at ' - 'large?', + "name": "harmfulness", + "definition": "Does the submission causes or has the potential to " + "cause harm to individuals, groups, or society at " + "large?", }, ), ( @@ -304,10 +303,10 @@ def test_evaluator_outputs(current_metric, inputs, expected_outputs, metric_para RagasMetric.ASPECT_CRITIQUE, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, { - 'name': 'harmfulness', - 'definition': 'Does the submission causes or has the potential to ' - 'cause harm to individuals, groups, or society at ' - 'large?', + "name": "harmfulness", + "definition": "Does the submission causes or has the potential to " + "cause harm to individuals, groups, or society at " + "large?", }, ), (RagasMetric.CONTEXT_RELEVANCY, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS}, None), From 71d6463884a18555dafe23716083ad7bdbf5a245 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 7 Mar 2024 10:37:21 +0100 Subject: [PATCH 3/4] fix metric init params tests --- integrations/ragas/tests/test_evaluator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py index 5e976c45d..441ec7753 100644 --- a/integrations/ragas/tests/test_evaluator.py +++ b/integrations/ragas/tests/test_evaluator.py @@ -73,25 +73,25 @@ def test_evaluator_metric_init_params(): "name": "harmfulness", } - with pytest.raises(ValueError, match="expected input parameter 'name'"): + with pytest.raises(ValueError, match="Missing required metric parameters \['name', 'definition'\] but got '{}'"): RagasEvaluator(RagasMetric.ASPECT_CRITIQUE, metric_params=None) - with pytest.raises(ValueError, match="expected input parameter 'name'"): + with pytest.raises(ValueError, match="Missing required metric parameters \['name', 'definition'\]"): RagasEvaluator(RagasMetric.ASPECT_CRITIQUE, metric_params={}) - with pytest.raises(ValueError, match="expected input parameter 'name'"): + with pytest.raises(ValueError, match="Missing required metric parameters \['name'\]"): RagasEvaluator( RagasMetric.ASPECT_CRITIQUE, metric_params={"definition": "custom definition"}, ) - with pytest.raises(ValueError, match="expected input parameter 'definition'"): + with pytest.raises(ValueError, match="Missing required metric parameters \['definition'\]"): RagasEvaluator( RagasMetric.ASPECT_CRITIQUE, metric_params={"name": "custom name"}, ) - with pytest.raises(ValueError, match="Invalid init parameters"): + with pytest.raises(ValueError, match="Received unexpected parameters \['check_numbers'\]"): RagasEvaluator( RagasMetric.FAITHFULNESS, metric_params={"check_numbers": True}, @@ -325,7 +325,7 @@ def test_integration_run(metric, inputs, metric_params): eval = RagasEvaluator(**init_params) output = eval.run(**inputs) - assert type(output) == dict + assert isinstance(output, dict) assert len(output) == 1 assert "results" in output assert len(output["results"]) == len(next(iter(inputs.values()))) From 20739ddda61b2a88b78c9297b6aff5c354ef8197 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 8 Mar 2024 00:40:43 +0100 Subject: [PATCH 4/4] do not distinguish between required and optional params --- .../components/evaluators/ragas/evaluator.py | 28 +++-- .../components/evaluators/ragas/metrics.py | 67 +++-------- integrations/ragas/tests/test_evaluator.py | 108 +++++++++++------- 3 files changed, 103 insertions(+), 100 deletions(-) diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py index 2fe43a215..5c8613553 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py @@ -11,7 +11,6 @@ from .metrics import ( METRIC_DESCRIPTORS, InputConverters, - MetricParamsValidator, OutputConverters, RagasMetric, ) @@ -66,7 +65,7 @@ def __init__( on required parameters. """ self.metric = metric if isinstance(metric, RagasMetric) else RagasMetric.from_str(metric) - self.metric_params = metric_params or {} + self.metric_params = metric_params self.descriptor = METRIC_DESCRIPTORS[self.metric] self._init_backend() @@ -79,13 +78,24 @@ def _init_backend(self): self._backend_callable = RagasEvaluator._invoke_evaluate def _init_metric(self): - MetricParamsValidator.validate_metric_parameters( - self.metric, - self.descriptor.required_init_parameters, - self.descriptor.optional_init_parameters, - self.metric_params, - ) - self._backend_metric = self.descriptor.backend(**self.metric_params) + if self.descriptor.init_parameters is not None: + if self.metric_params is None: + msg = f"Ragas metric '{self.metric}' expected init parameters but got none" + raise ValueError(msg) + elif not all(k in self.descriptor.init_parameters for k in self.metric_params.keys()): + msg = ( + f"Invalid init parameters for Ragas metric '{self.metric}'. " + f"Expected: {self.descriptor.init_parameters}" + ) + raise ValueError(msg) + elif self.metric_params is not None: + msg = ( + f"Invalid init parameters for Ragas metric '{self.metric}'. " + f"None expected but {self.metric_params} given" + ) + raise ValueError(msg) + metric_params = self.metric_params or {} + self._backend_metric = self.descriptor.backend(**metric_params) @staticmethod def _invoke_evaluate(dataset: Dataset, metric: Metric) -> Result: diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py index 668f13007..ed807aa81 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py @@ -134,9 +134,8 @@ class MetricDescriptor: backend: Type[Metric] input_parameters: Dict[str, Type] input_converter: Callable[[Any], Iterable[Dict[str, str]]] - output_converter: Callable[[Result, RagasMetric, Dict[str, Any]], List[MetricResult]] - required_init_parameters: Optional[List[str]] = None - optional_init_parameters: Optional[List[str]] = None + output_converter: Callable[[Result, RagasMetric, Optional[Dict[str, Any]]], List[MetricResult]] + init_parameters: Optional[List[str]] = None @classmethod def new( @@ -144,10 +143,11 @@ def new( metric: RagasMetric, backend: Type[Metric], input_converter: Callable[[Any], Iterable[Dict[str, str]]], - output_converter: Optional[Callable[[Result, RagasMetric, Dict[str, Any]], List[MetricResult]]] = None, + output_converter: Optional[ + Callable[[Result, RagasMetric, Optional[Dict[str, Any]]], List[MetricResult]] + ] = None, *, - required_init_parameters: Optional[List[str]] = None, - optional_init_parameters: Optional[List[str]] = None, + init_parameters: Optional[List[str]] = None, ) -> "MetricDescriptor": input_converter_signature = inspect.signature(input_converter) input_parameters = {} @@ -164,42 +164,10 @@ def new( input_parameters=input_parameters, input_converter=input_converter, output_converter=output_converter if output_converter is not None else OutputConverters.default, - required_init_parameters=required_init_parameters, - optional_init_parameters=optional_init_parameters, + init_parameters=init_parameters, ) -class MetricParamsValidator: - """ - Validates metric parameters. - - Depending on the metric type, different metric parameters are allowed. - The validator functions are responsible for validating the parameters and raising an error if they are invalid. - """ - - @staticmethod - def validate_metric_parameters( - metric: RagasMetric, required: Optional[List[str]], optional: Optional[List[str]], received: Dict[str, Any] - ) -> None: - required = required or [] - optional = optional or [] - missing_required_params = [p for p in required if p not in received] - unexpected_params = [p for p in received if p not in required + optional] - - if missing_required_params: - msg = ( - f"Invalid init parameters for Ragas metric '{metric}'. " - f"Missing required metric parameters {missing_required_params} but got '{received}'" - ) - raise ValueError(msg) - if unexpected_params: - msg = ( - f"Invalid init parameters for Ragas metric '{metric}'. " - f"Received unexpected parameters {unexpected_params} but allows '{required + optional}'" - ) - raise ValueError(msg) - - class InputConverters: """ Converters for input parameters. @@ -308,12 +276,15 @@ def _extract_default_results(output: Result, metric_name: str) -> List[MetricRes raise ValueError(msg) from e @staticmethod - def default(output: Result, metric: RagasMetric, _: Dict) -> List[MetricResult]: + def default(output: Result, metric: RagasMetric, _: Optional[Dict]) -> List[MetricResult]: metric_name = metric.value return OutputConverters._extract_default_results(output, metric_name) @staticmethod - def aspect_critique(output: Result, _: RagasMetric, metric_params: Dict[str, Any]) -> List[MetricResult]: + def aspect_critique(output: Result, _: RagasMetric, metric_params: Optional[Dict[str, Any]]) -> List[MetricResult]: + if metric_params is None: + msg = "Aspect critique metric requires metric parameters" + raise ValueError(msg) metric_name = metric_params["name"] return OutputConverters._extract_default_results(output, metric_name) @@ -323,56 +294,50 @@ def aspect_critique(output: Result, _: RagasMetric, metric_params: Dict[str, Any RagasMetric.ANSWER_CORRECTNESS, AnswerCorrectness, InputConverters.question_response_ground_truth, # type: ignore - optional_init_parameters=["name", "weights", "answer_similarity"], + init_parameters=["weights"], ), RagasMetric.FAITHFULNESS: MetricDescriptor.new( RagasMetric.FAITHFULNESS, Faithfulness, InputConverters.question_context_response, # type: ignore - optional_init_parameters=["name"], ), RagasMetric.ANSWER_SIMILARITY: MetricDescriptor.new( RagasMetric.ANSWER_SIMILARITY, AnswerSimilarity, InputConverters.response_ground_truth, # type: ignore - optional_init_parameters=["name", "model_name", "threshold"], + init_parameters=["threshold"], ), RagasMetric.CONTEXT_PRECISION: MetricDescriptor.new( RagasMetric.CONTEXT_PRECISION, ContextPrecision, InputConverters.question_context_ground_truth, # type: ignore - optional_init_parameters=["name"], ), RagasMetric.CONTEXT_UTILIZATION: MetricDescriptor.new( RagasMetric.CONTEXT_UTILIZATION, ContextUtilization, InputConverters.question_context_response, # type: ignore - optional_init_parameters=["name"], ), RagasMetric.CONTEXT_RECALL: MetricDescriptor.new( RagasMetric.CONTEXT_RECALL, ContextRecall, InputConverters.question_context_ground_truth, # type: ignore - optional_init_parameters=["name"], ), RagasMetric.ASPECT_CRITIQUE: MetricDescriptor.new( RagasMetric.ASPECT_CRITIQUE, AspectCritique, InputConverters.question_context_response, # type: ignore OutputConverters.aspect_critique, - required_init_parameters=["name", "definition"], - optional_init_parameters=["strictness", "llm"], + init_parameters=["name", "definition", "strictness"], ), RagasMetric.CONTEXT_RELEVANCY: MetricDescriptor.new( RagasMetric.CONTEXT_RELEVANCY, ContextRelevancy, InputConverters.question_context, # type: ignore - optional_init_parameters=["name"], ), RagasMetric.ANSWER_RELEVANCY: MetricDescriptor.new( RagasMetric.ANSWER_RELEVANCY, AnswerRelevancy, InputConverters.question_context_response, # type: ignore - optional_init_parameters=["name", "strictness", "embeddings"], + init_parameters=["strictness"], ), } diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py index 441ec7753..0decc96cd 100644 --- a/integrations/ragas/tests/test_evaluator.py +++ b/integrations/ragas/tests/test_evaluator.py @@ -58,43 +58,58 @@ def evaluate(self, _, metric: Metric, **kwargs): return output_map[self.metric] -def test_evaluator_metric_init_params(): +@pytest.mark.parametrize( + "metric, init_params", + [ + (RagasMetric.ANSWER_CORRECTNESS, {"weights": [0.5, 0.5]}), + (RagasMetric.FAITHFULNESS, None), + (RagasMetric.ANSWER_SIMILARITY, {"threshold": 0.5}), + (RagasMetric.CONTEXT_PRECISION, None), + (RagasMetric.CONTEXT_UTILIZATION, None), + (RagasMetric.CONTEXT_RECALL, None), + ( + RagasMetric.ASPECT_CRITIQUE, + { + "name": "harmfulness", + "definition": "Does the submission causes or has the potential to " + "cause harm to individuals, groups, or society at " + "large?", + }, + ), + (RagasMetric.CONTEXT_RELEVANCY, None), + (RagasMetric.ANSWER_RELEVANCY, {"strictness": 2}), + ], +) +def test_evaluator_valid_metric_init_params(metric, init_params): eval = RagasEvaluator( - RagasMetric.ASPECT_CRITIQUE, - metric_params={ - "name": "harmfulness", - "definition": "Does the submission causes or has the potential to cause " - "harm to individuals, groups, or society at large?", - }, + metric, + metric_params=init_params, ) - assert eval.metric_params == { - "definition": "Does the submission causes or has the potential to cause harm to " - "individuals, groups, or society at large?", - "name": "harmfulness", - } - - with pytest.raises(ValueError, match="Missing required metric parameters \['name', 'definition'\] but got '{}'"): - RagasEvaluator(RagasMetric.ASPECT_CRITIQUE, metric_params=None) - - with pytest.raises(ValueError, match="Missing required metric parameters \['name', 'definition'\]"): - RagasEvaluator(RagasMetric.ASPECT_CRITIQUE, metric_params={}) + assert eval.metric_params == init_params - with pytest.raises(ValueError, match="Missing required metric parameters \['name'\]"): + msg = f"Invalid init parameters for Ragas metric '{metric}'. " + with pytest.raises(ValueError, match=msg): RagasEvaluator( - RagasMetric.ASPECT_CRITIQUE, - metric_params={"definition": "custom definition"}, + metric, + metric_params={"invalid_param": "invalid_value"}, ) - with pytest.raises(ValueError, match="Missing required metric parameters \['definition'\]"): - RagasEvaluator( - RagasMetric.ASPECT_CRITIQUE, - metric_params={"name": "custom name"}, - ) - with pytest.raises(ValueError, match="Received unexpected parameters \['check_numbers'\]"): +@pytest.mark.parametrize( + "metric", + [ + RagasMetric.ANSWER_CORRECTNESS, + RagasMetric.ANSWER_SIMILARITY, + RagasMetric.ASPECT_CRITIQUE, + RagasMetric.ANSWER_RELEVANCY, + ], +) +def test_evaluator_fails_with_no_metric_init_params(metric): + msg = f"Ragas metric '{metric}' expected init parameters but got none" + with pytest.raises(ValueError, match=msg): RagasEvaluator( - RagasMetric.FAITHFULNESS, - metric_params={"check_numbers": True}, + metric, + metric_params=None, ) @@ -125,9 +140,13 @@ def test_evaluator_serde(): @pytest.mark.parametrize( "current_metric, inputs, params", [ - (RagasMetric.ANSWER_CORRECTNESS, {"questions": [], "responses": [], "ground_truths": []}, None), + ( + RagasMetric.ANSWER_CORRECTNESS, + {"questions": [], "responses": [], "ground_truths": []}, + {"weights": [0.5, 0.5]}, + ), (RagasMetric.FAITHFULNESS, {"questions": [], "contexts": [], "responses": []}, None), - (RagasMetric.ANSWER_SIMILARITY, {"responses": [], "ground_truths": []}, None), + (RagasMetric.ANSWER_SIMILARITY, {"responses": [], "ground_truths": []}, {"threshold": 0.5}), (RagasMetric.CONTEXT_PRECISION, {"questions": [], "contexts": [], "ground_truths": []}, None), (RagasMetric.CONTEXT_UTILIZATION, {"questions": [], "contexts": [], "responses": []}, None), (RagasMetric.CONTEXT_RECALL, {"questions": [], "contexts": [], "ground_truths": []}, None), @@ -142,7 +161,7 @@ def test_evaluator_serde(): }, ), (RagasMetric.CONTEXT_RELEVANCY, {"questions": [], "contexts": []}, None), - (RagasMetric.ANSWER_RELEVANCY, {"questions": [], "contexts": [], "responses": []}, None), + (RagasMetric.ANSWER_RELEVANCY, {"questions": [], "contexts": [], "responses": []}, {"strictness": 2}), ], ) def test_evaluator_valid_inputs(current_metric, inputs, params): @@ -169,9 +188,9 @@ def test_evaluator_valid_inputs(current_metric, inputs, params): RagasMetric.ANSWER_RELEVANCY, {"questions": [""], "responses": [], "contexts": []}, "Mismatching counts ", - None, + {"strictness": 2}, ), - (RagasMetric.ANSWER_RELEVANCY, {"responses": []}, "expected input parameter ", None), + (RagasMetric.ANSWER_RELEVANCY, {"responses": []}, "expected input parameter ", {"strictness": 2}), ], ) def test_evaluator_invalid_inputs(current_metric, inputs, error_string, params): @@ -194,7 +213,7 @@ def test_evaluator_invalid_inputs(current_metric, inputs, error_string, params): RagasMetric.ANSWER_CORRECTNESS, {"questions": ["q1"], "responses": ["r1"], "ground_truths": ["gt1"]}, [[(None, 0.5)]], - None, + {"weights": [0.5, 0.5]}, ), ( RagasMetric.FAITHFULNESS, @@ -202,7 +221,12 @@ def test_evaluator_invalid_inputs(current_metric, inputs, error_string, params): [[(None, 1.0)]], None, ), - (RagasMetric.ANSWER_SIMILARITY, {"responses": ["r3"], "ground_truths": ["gt3"]}, [[(None, 1.0)]], None), + ( + RagasMetric.ANSWER_SIMILARITY, + {"responses": ["r3"], "ground_truths": ["gt3"]}, + [[(None, 1.0)]], + {"threshold": 0.5}, + ), ( RagasMetric.CONTEXT_PRECISION, {"questions": ["q4"], "contexts": [["c4"]], "ground_truths": ["gt44"]}, @@ -242,7 +266,7 @@ def test_evaluator_invalid_inputs(current_metric, inputs, error_string, params): RagasMetric.ANSWER_RELEVANCY, {"questions": ["q9"], "contexts": [["c9"]], "responses": ["r9"]}, [[(None, 0.4)]], - None, + {"strictness": 2}, ), ], ) @@ -276,14 +300,18 @@ def test_evaluator_outputs(current_metric, inputs, expected_outputs, metric_para ( RagasMetric.ANSWER_CORRECTNESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES, "ground_truths": DEFAULT_GROUND_TRUTHS}, - None, + {"weights": [0.5, 0.5]}, ), ( RagasMetric.FAITHFULNESS, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, None, ), - (RagasMetric.ANSWER_SIMILARITY, {"responses": DEFAULT_QUESTIONS, "ground_truths": DEFAULT_GROUND_TRUTHS}, None), + ( + RagasMetric.ANSWER_SIMILARITY, + {"responses": DEFAULT_QUESTIONS, "ground_truths": DEFAULT_GROUND_TRUTHS}, + {"threshold": 0.5}, + ), ( RagasMetric.CONTEXT_PRECISION, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "ground_truths": DEFAULT_GROUND_TRUTHS}, @@ -313,7 +341,7 @@ def test_evaluator_outputs(current_metric, inputs, expected_outputs, metric_para ( RagasMetric.ANSWER_RELEVANCY, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, - None, + {"strictness": 2}, ), ], )