diff --git a/integrations/ragas/pyproject.toml b/integrations/ragas/pyproject.toml
index 8a2348587..3a32d27a7 100644
--- a/integrations/ragas/pyproject.toml
+++ b/integrations/ragas/pyproject.toml
@@ -22,7 +22,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = ["haystack-ai", "ragas"]
+dependencies = ["haystack-ai", "ragas>=0.1.11"]

 [project.urls]
 Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/ragas"
@@ -41,7 +41,7 @@ root = "../.."
 git_describe_command = 'git describe --tags --match="integrations/ragas-v[0-9]*"'

 [tool.hatch.envs.default]
-dependencies = ["coverage[toml]>=6.5", "pytest", "pytest-rerunfailures", "haystack-pydoc-tools"]
+dependencies = ["coverage[toml]>=6.5", "pytest", "pytest-rerunfailures", "haystack-pydoc-tools", "pytest-asyncio"]
 [tool.hatch.envs.default.scripts]
 test = "pytest --reruns 3 --reruns-delay 30 -x {args:tests}"
 test-cov = "coverage run -m pytest --reruns 3 --reruns-delay 30 -x {args:tests}"
diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py
index 06b29bedf..5d6ed16bc 100644
--- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py
+++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py
@@ -12,7 +12,6 @@
     AspectCritique,  # type: ignore
     ContextPrecision,  # type: ignore
     ContextRecall,  # type: ignore
-    ContextRelevancy,  # type: ignore
     ContextUtilization,  # type: ignore
     Faithfulness,  # type: ignore
 )
@@ -81,10 +80,6 @@ class RagasMetric(RagasBaseEnum):
     #: Parameters - `name: str, definition: str, strictness: int`
     ASPECT_CRITIQUE = "aspect_critique"

-    #: Context relevancy.\
-    #: Inputs - `questions: List[str], contexts: List[List[str]]`
-    CONTEXT_RELEVANCY = "context_relevancy"
-
     #: Answer relevancy.\
     #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`\
     #: Parameters - `strictness: int`
@@ -329,11 +324,6 @@ def aspect_critique(output: Result, _: RagasMetric, metric_params: Optional[Dict
         OutputConverters.aspect_critique,
         init_parameters=["name", "definition", "strictness"],
     ),
-    RagasMetric.CONTEXT_RELEVANCY: MetricDescriptor.new(
-        RagasMetric.CONTEXT_RELEVANCY,
-        ContextRelevancy,
-        InputConverters.question_context,  # type: ignore
-    ),
     RagasMetric.ANSWER_RELEVANCY: MetricDescriptor.new(
         RagasMetric.ANSWER_RELEVANCY,
         AnswerRelevancy,
diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py
index 0decc96cd..fc8901c32 100644
--- a/integrations/ragas/tests/test_evaluator.py
+++ b/integrations/ragas/tests/test_evaluator.py
@@ -51,7 +51,6 @@ def evaluate(self, _, metric: Metric, **kwargs):
             RagasMetric.CONTEXT_UTILIZATION: Result(scores=Dataset.from_list([{"context_utilization": 1.0}])),
             RagasMetric.CONTEXT_RECALL: Result(scores=Dataset.from_list([{"context_recall": 0.9}])),
             RagasMetric.ASPECT_CRITIQUE: Result(scores=Dataset.from_list([{"harmfulness": 1.0}])),
-            RagasMetric.CONTEXT_RELEVANCY: Result(scores=Dataset.from_list([{"context_relevancy": 1.0}])),
             RagasMetric.ANSWER_RELEVANCY: Result(scores=Dataset.from_list([{"answer_relevancy": 0.4}])),
         }
         assert isinstance(metric, Metric)
@@ -76,7 +75,6 @@ def evaluate(self, _, metric: Metric, **kwargs):
                 "large?",
             },
         ),
-        (RagasMetric.CONTEXT_RELEVANCY, None),
         (RagasMetric.ANSWER_RELEVANCY, {"strictness": 2}),
     ],
 )
@@ -160,7 +158,6 @@ def test_evaluator_serde():
                 "large?",
             },
         ),
-        (RagasMetric.CONTEXT_RELEVANCY, {"questions": [], "contexts": []}, None),
         (RagasMetric.ANSWER_RELEVANCY, {"questions": [], "contexts": [], "responses": []}, {"strictness": 2}),
     ],
 )
@@ -177,7 +174,6 @@ def test_evaluator_valid_inputs(current_metric, inputs, params):
 @pytest.mark.parametrize(
     "current_metric, inputs, error_string, params",
     [
-        (RagasMetric.CONTEXT_RELEVANCY, {"questions": {}, "contexts": []}, "to be a collection of type 'list'", None),
         (
             RagasMetric.FAITHFULNESS,
             {"questions": [1], "contexts": [2], "responses": [3]},
@@ -256,12 +252,6 @@ def test_evaluator_invalid_inputs(current_metric, inputs, error_string, params):
                 "large?",
             },
         ),
-        (
-            RagasMetric.CONTEXT_RELEVANCY,
-            {"questions": ["q8"], "contexts": [["c8"]]},
-            [[(None, 1.0)]],
-            None,
-        ),
         (
             RagasMetric.ANSWER_RELEVANCY,
             {"questions": ["q9"], "contexts": [["c9"]], "responses": ["r9"]},
@@ -293,6 +283,7 @@ def test_evaluator_outputs(current_metric, inputs, expected_outputs, metric_para
 # This integration test validates the evaluator by running it against the
 # OpenAI API. It is parameterized by the metric, the inputs to the evaluator
 # and the metric parameters.
+@pytest.mark.asyncio
 @pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="OPENAI_API_KEY not set")
 @pytest.mark.parametrize(
     "metric, inputs, metric_params",
@@ -337,7 +328,6 @@ def test_evaluator_outputs(current_metric, inputs, expected_outputs, metric_para
                 "large?",
             },
         ),
-        (RagasMetric.CONTEXT_RELEVANCY, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS}, None),
         (
             RagasMetric.ANSWER_RELEVANCY,
             {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
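
Note (not part of the patch): the pyproject.toml hunk adds pytest-asyncio to the Hatch default environment, and the test hunk marks the OpenAI integration test with @pytest.mark.asyncio. A minimal sketch of the kind of test that combination enables is shown below; the test name and coroutine are illustrative only and do not come from the repository.

# Minimal sketch, assuming pytest and pytest-asyncio are installed.
# The marker tells pytest-asyncio to provide the event loop that runs
# this coroutine; the function name below is hypothetical.
import asyncio

import pytest


@pytest.mark.asyncio
async def test_async_example():
    # The test body can await async code under test directly.
    result = await asyncio.sleep(0)
    assert result is None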