chore: Ragas - remove context relevancy metric (#917)
* ragas: remove context relevancy

* try removing rerun-failures

* add rerun-failures back, introduce pytest-asyncio

* add asyncio marker

* lower-bound pin
anakin87 authored Jul 24, 2024
1 parent 282ccc4 commit cd521cf
Showing 3 changed files with 3 additions and 23 deletions.
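For users of the integration, code that referenced `RagasMetric.CONTEXT_RELEVANCY` must switch to one of the remaining metrics (for example `CONTEXT_UTILIZATION` or `ANSWER_RELEVANCY`). Below is a minimal, hedged sketch of evaluator usage after this change; the import path, constructor, and `run` keyword names are assumptions inferred from the test parametrization in this diff rather than a verified API reference, and the sample data is invented for illustration.

```python
from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric

# Sketch only: signatures are inferred from the test inputs in this commit,
# not confirmed against the component's documentation.
evaluator = RagasEvaluator(
    metric=RagasMetric.ANSWER_RELEVANCY,   # CONTEXT_RELEVANCY is no longer available
    metric_params={"strictness": 2},
)
result = evaluator.run(
    questions=["Which is the most popular global sport?"],
    contexts=[["Football is undoubtedly the world's most popular sport."]],
    responses=["Football is the most popular sport."],
)
print(result)
```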
integrations/ragas/pyproject.toml — 4 changes: 2 additions & 2 deletions
@@ -22,7 +22,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = ["haystack-ai", "ragas"]
+dependencies = ["haystack-ai", "ragas>=0.1.11"]
 
 [project.urls]
 Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/ragas"
@@ -41,7 +41,7 @@ root = "../.."
 git_describe_command = 'git describe --tags --match="integrations/ragas-v[0-9]*"'
 
 [tool.hatch.envs.default]
-dependencies = ["coverage[toml]>=6.5", "pytest", "pytest-rerunfailures", "haystack-pydoc-tools"]
+dependencies = ["coverage[toml]>=6.5", "pytest", "pytest-rerunfailures", "haystack-pydoc-tools", "pytest-asyncio"]
 [tool.hatch.envs.default.scripts]
 test = "pytest --reruns 3 --reruns-delay 30 -x {args:tests}"
 test-cov = "coverage run -m pytest --reruns 3 --reruns-delay 30 -x {args:tests}"
Expand Up @@ -12,7 +12,6 @@
AspectCritique, # type: ignore
ContextPrecision, # type: ignore
ContextRecall, # type: ignore
ContextRelevancy, # type: ignore
ContextUtilization, # type: ignore
Faithfulness, # type: ignore
)
@@ -81,10 +80,6 @@ class RagasMetric(RagasBaseEnum):
     #: Parameters - `name: str, definition: str, strictness: int`
     ASPECT_CRITIQUE = "aspect_critique"
 
-    #: Context relevancy.\
-    #: Inputs - `questions: List[str], contexts: List[List[str]]`
-    CONTEXT_RELEVANCY = "context_relevancy"
-
     #: Answer relevancy.\
     #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`\
     #: Parameters - `strictness: int`
@@ -329,11 +324,6 @@ def aspect_critique(output: Result, _: RagasMetric, metric_params: Optional[Dict
         OutputConverters.aspect_critique,
         init_parameters=["name", "definition", "strictness"],
     ),
-    RagasMetric.CONTEXT_RELEVANCY: MetricDescriptor.new(
-        RagasMetric.CONTEXT_RELEVANCY,
-        ContextRelevancy,
-        InputConverters.question_context,  # type: ignore
-    ),
     RagasMetric.ANSWER_RELEVANCY: MetricDescriptor.new(
         RagasMetric.ANSWER_RELEVANCY,
         AnswerRelevancy,
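The block removed above is an entry in a metric-descriptor registry: each `RagasMetric` member maps to a descriptor that bundles the underlying Ragas metric class with its input/output converters, so removing a metric means deleting its import, its enum member, and its registry entry. A simplified, hypothetical sketch of that pattern follows; `MetricDescriptor` and `InputConverters` appear in the diff, but the constructor and fields shown here are invented for illustration.

```python
from dataclasses import dataclass
from enum import Enum
from typing import Any, Callable, Dict, List, Optional


class RagasMetric(Enum):
    # Subset of the members that remain after this commit.
    CONTEXT_PRECISION = "context_precision"
    ANSWER_RELEVANCY = "answer_relevancy"


@dataclass
class MetricDescriptor:
    metric: RagasMetric
    backend_cls: type
    input_converter: Callable[..., Any]
    init_parameters: Optional[List[str]] = None


# Registry pattern: the evaluator looks up the descriptor for the selected
# metric; a removed metric simply has no entry here.
METRIC_DESCRIPTORS: Dict[RagasMetric, MetricDescriptor] = {
    RagasMetric.ANSWER_RELEVANCY: MetricDescriptor(
        RagasMetric.ANSWER_RELEVANCY,
        backend_cls=object,            # placeholder for ragas.metrics.AnswerRelevancy
        input_converter=lambda **kw: kw,
        init_parameters=["strictness"],
    ),
}
```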
integrations/ragas/tests/test_evaluator.py — 12 changes: 1 addition & 11 deletions
@@ -51,7 +51,6 @@ def evaluate(self, _, metric: Metric, **kwargs):
             RagasMetric.CONTEXT_UTILIZATION: Result(scores=Dataset.from_list([{"context_utilization": 1.0}])),
             RagasMetric.CONTEXT_RECALL: Result(scores=Dataset.from_list([{"context_recall": 0.9}])),
             RagasMetric.ASPECT_CRITIQUE: Result(scores=Dataset.from_list([{"harmfulness": 1.0}])),
-            RagasMetric.CONTEXT_RELEVANCY: Result(scores=Dataset.from_list([{"context_relevancy": 1.0}])),
             RagasMetric.ANSWER_RELEVANCY: Result(scores=Dataset.from_list([{"answer_relevancy": 0.4}])),
         }
         assert isinstance(metric, Metric)
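The mocked backend above returns canned `Result` objects whose scores are Hugging Face `datasets.Dataset` tables with one column per metric. A small sketch of how such a scores table can be built (the Ragas `Result` wrapper is omitted and the value is hypothetical):

```python
from datasets import Dataset

# One-row scores table shaped like the mocked evaluator output above.
scores = Dataset.from_list([{"answer_relevancy": 0.4}])
print(scores.column_names)  # ['answer_relevancy']
print(scores[0])            # {'answer_relevancy': 0.4}
```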
@@ -76,7 +75,6 @@ def evaluate(self, _, metric: Metric, **kwargs):
                 "large?",
             },
         ),
-        (RagasMetric.CONTEXT_RELEVANCY, None),
         (RagasMetric.ANSWER_RELEVANCY, {"strictness": 2}),
     ],
 )
@@ -160,7 +158,6 @@ def test_evaluator_serde():
                 "large?",
             },
         ),
-        (RagasMetric.CONTEXT_RELEVANCY, {"questions": [], "contexts": []}, None),
         (RagasMetric.ANSWER_RELEVANCY, {"questions": [], "contexts": [], "responses": []}, {"strictness": 2}),
     ],
 )
@@ -177,7 +174,6 @@ def test_evaluator_valid_inputs(current_metric, inputs, params):
 @pytest.mark.parametrize(
     "current_metric, inputs, error_string, params",
     [
-        (RagasMetric.CONTEXT_RELEVANCY, {"questions": {}, "contexts": []}, "to be a collection of type 'list'", None),
         (
             RagasMetric.FAITHFULNESS,
             {"questions": [1], "contexts": [2], "responses": [3]},
@@ -256,12 +252,6 @@ def test_evaluator_invalid_inputs(current_metric, inputs, error_string, params):
                 "large?",
             },
         ),
-        (
-            RagasMetric.CONTEXT_RELEVANCY,
-            {"questions": ["q8"], "contexts": [["c8"]]},
-            [[(None, 1.0)]],
-            None,
-        ),
         (
             RagasMetric.ANSWER_RELEVANCY,
             {"questions": ["q9"], "contexts": [["c9"]], "responses": ["r9"]},
Expand Down Expand Up @@ -293,6 +283,7 @@ def test_evaluator_outputs(current_metric, inputs, expected_outputs, metric_para
# This integration test validates the evaluator by running it against the
# OpenAI API. It is parameterized by the metric, the inputs to the evaluator
# and the metric parameters.
@pytest.mark.asyncio
@pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="OPENAI_API_KEY not set")
@pytest.mark.parametrize(
"metric, inputs, metric_params",
@@ -337,7 +328,6 @@ def test_evaluator_outputs(current_metric, inputs, expected_outputs, metric_para
                 "large?",
             },
         ),
-        (RagasMetric.CONTEXT_RELEVANCY, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS}, None),
         (
             RagasMetric.ANSWER_RELEVANCY,
             {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
