Commit

test: update test
MaksymAI committed Feb 3, 2025
1 parent 40805bc commit 9cb2fc4
Showing 1 changed file with 63 additions and 65 deletions.
tests/integration/evaluations/metrics/test_answer_correctness.py
@@ -1,5 +1,4 @@
from unittest.mock import MagicMock

from dynamiq.evaluations.metrics import AnswerCorrectnessEvaluator


@@ -20,19 +19,20 @@ def test_answer_correctness_evaluator(openai_node):
" heat and light, which are essential for life on Earth."
),
(
"The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at"
" sea level. The boiling point can change with altitude."
"The boiling point of water is 100 degrees Celsius (212°F) at sea level. "
"The boiling point can change with altitude."
),
]

    # Initialize evaluator with the provided openai_node
    evaluator = AnswerCorrectnessEvaluator(llm=openai_node)

-    # Mock the LLMEvaluator's run methods
-    # The first two calls to _statement_extractor.run are for extracting from answers and ground truths
+    # Mock the LLMEvaluator's run method for statement extraction.
+    # The first call returns extracted statements from the answers;
+    # the second call returns extracted statements from the ground truth answers.
    evaluator._statement_extractor.run = MagicMock(
        side_effect=[
-            # First call - extract statements from answers
+            # Extraction for answers (for two questions)
            {
                "results": [
                    {
@@ -44,89 +44,87 @@
                    {"statements": ["The boiling point of water is 100 degrees Celsius at sea level."]},
                ]
            },
-            # Second call - extract statements from ground truths
+            # Extraction for ground truth answers (for two questions)
            {
                "results": [
                    {
                        "statements": [
                            "The sun is powered by nuclear fusion.",
-                            "This fusion process releases a tremendous amount of energy.",
-                            "The sun provides heat and light, which are essential for life on Earth.",
+                            "Fusion releases energy.",
+                            "The sun provides heat and light.",
                        ]
                    },
                    {
                        "statements": [
                            "The boiling point of water is 100 degrees Celsius at sea level.",
-                            "The boiling point can change with altitude.",
+                            "Boiling point can change with altitude.",
                        ]
                    },
                ]
            },
        ]
    )
-    # Mock the _statement_classifier.run method to return predefined classifications
+    # For each candidate statement, the classification evaluator is called.
+    # In our new implementation, each call returns a dict with a "results" list
+    # containing 'match' and 'reasoning' keys.
+    # For question 1, there are 2 answer statements and 3 ground truth statements.
+    # For question 2, there is 1 answer statement and 2 ground truth statements.
    evaluator._statement_classifier.run = MagicMock(
-        return_value={
-            "results": [
-                {
-                    "classifications": {
-                        "TP": ["Its primary function is to provide light to the solar system."],
-                        "FP": ["The sun is powered by nuclear fission."],
-                        "FN": [
-                            "The sun is powered by nuclear fusion.",
-                            "This fusion process releases a tremendous amount of energy.",
-                            "The sun provides heat and light, which are essential for life on Earth.",
-                        ],
-                    }
-                },
-                {
-                    "classifications": {
-                        "TP": ["The boiling point of water is 100 degrees Celsius at sea level."],
-                        "FP": [],
-                        "FN": ["The boiling point can change with altitude."],
-                    }
-                },
-            ]
-        }
-    )
-
-    # Mock the _similarity_evaluator.run method to return predefined similarity scores
-    evaluator._similarity_evaluator.run = MagicMock(
-        return_value={"results": [{"similarity_score": "0.7"}, {"similarity_score": "1.0"}]}
+        side_effect=[
+            # Q1, answer statement call 1:
+            {"results": [{"match": False, "reasoning": "Mismatch: expected fusion but found fission."}]},
+            # Q1, answer statement call 2:
+            {"results": [{"match": True, "reasoning": "The statement supports the core fact of providing light."}]},
+            # Q1, ground truth statement call 1:
+            {"results": [{"match": False, "reasoning": "Mismatch: the answer does not mention nuclear fusion."}]},
+            # Q1, ground truth statement call 2:
+            {"results": [{"match": False, "reasoning": "The answer does not mention energy release."}]},
+            # Q1, ground truth statement call 3:
+            {"results": [{"match": True, "reasoning": "Matches: the answer supports the provision of light."}]},
+            # Q2, answer statement call:
+            {"results": [{"match": True, "reasoning": "The statement matches the core fact."}]},
+            # Q2, ground truth statement call 1:
+            {"results": [{"match": True, "reasoning": "Matches: the statement is exactly present in the answer."}]},
+            # Q2, ground truth statement call 2:
+            {"results": [{"match": False, "reasoning": "This detail is not mentioned in the answer."}]},
+        ]
    )

    # Run the evaluator
    correctness_scores = evaluator.run(
        questions=questions, answers=answers, ground_truth_answers=ground_truth_answers, verbose=False
    )

-    # Expected scores based on the mocked data and computations
-    # Calculation:
-    # For the first item:
-    #   F1 Score:
-    #     TP = 1
-    #     FP = 1
-    #     FN = 3
-    #     Precision = 1 / (1 + 1) = 0.5
-    #     Recall = 1 / (1 + 3) = 0.25
-    #     F1 = 2 * (0.5 * 0.25) / (0.5 + 0.25) = 0.333...
-    #   Similarity Score = 0.7
-    #   Final Score = 0.75 * 0.333... + 0.25 * 0.7 ≈ 0.25 + 0.175 = 0.425
-
-    # For the second item:
-    #   F1 Score:
-    #     TP = 1
-    #     FP = 0
-    #     FN = 1
-    #     Precision = 1 / (1 + 0) = 1.0
-    #     Recall = 1 / (1 + 1) = 0.5
-    #     F1 = 2 * (1.0 * 0.5) / (1.0 + 0.5) = 0.666...
-    #   Similarity Score = 1.0
-    #   Final Score = 0.75 * 0.666... + 0.25 * 1.0 ≈ 0.5 + 0.25 = 0.75
-
-    expected_scores = [0.425, 0.75]  # Pre-calculated expected scores
+    # Explanation of expected metrics:
+    #
+    # For Question 1:
+    #   Answer candidate classifications:
+    #     - "The sun is powered by nuclear fission." => match: False
+    #     - "Its primary function is to provide light to the solar system." => match: True
+    #     => TP (supported) = 1, FP = 1, Precision = 1/(1+1) = 0.5.
+    #
+    #   Ground truth candidate classifications (compared against answer text):
+    #     - "The sun is powered by nuclear fusion." => match: False
+    #     - "Fusion releases energy." => match: False
+    #     - "The sun provides heat and light." => match: True
+    #     => TP (present) = 1, FN = 2, Recall = 1/(1+2) ≈ 0.33.
+    #
+    #   F1 Score = 2*(0.5*0.33)/(0.5+0.33) ≈ 0.4.
+    #
+    # For Question 2:
+    #   Answer candidate: "The boiling point of water is 100 degrees Celsius at sea level."
+    #     => match: True, so TP = 1, FP = 0, Precision = 1.0.
+    #
+    #   Ground truth candidates:
+    #     - "The boiling point of water is 100 degrees Celsius at sea level." => match: True
+    #     - "Boiling point can change with altitude." => match: False
+    #     => TP (present) = 1, FN = 1, Recall = 1/(1+1) = 0.5.
+    #
+    #   F1 Score = 2*(1*0.5)/(1+0.5) ≈ 0.67.
+    #
+    # Therefore, the expected final scores are approximately [0.4, 0.67].
+    expected_scores = [0.4, 0.67]

    # Assert that the correctness scores are as expected
-    for computed, expected in zip(correctness_scores, expected_scores):
+    for computed, expected in zip([result.score for result in correctness_scores.results], expected_scores):
        assert abs(computed - expected) < 0.01, f"Expected {expected}, got {computed}"
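
The heart of this change is swapping the classifier mock from return_value to side_effect. When side_effect is set to a list, unittest.mock.MagicMock returns the next element on each successive call and raises StopIteration once the list is exhausted, which is what lets the test script one classification response per candidate statement. A minimal standalone sketch of that behavior (plain unittest.mock, independent of dynamiq; the example payloads are illustrative):

from unittest.mock import MagicMock

# side_effect with a list yields one element per call, in order;
# return_value would hand back the same object on every call.
classifier = MagicMock(
    side_effect=[
        {"results": [{"match": False, "reasoning": "fission is not fusion"}]},
        {"results": [{"match": True, "reasoning": "providing light is supported"}]},
    ]
)

first = classifier(statement="The sun is powered by nuclear fission.")
second = classifier(statement="Its primary function is to provide light to the solar system.")
assert first["results"][0]["match"] is False
assert second["results"][0]["match"] is True
# A third call would raise StopIteration because the list is exhausted.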

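The expected scores are also easy to verify by hand. Below is a quick sanity check of the arithmetic spelled out in the comments above; the f1 helper is just for illustration. The new expectations are pure F1 over the TP/FP/FN counts, while the old ones blended F1 with a similarity score at 0.75/0.25 weights:

def f1(tp: int, fp: int, fn: int) -> float:
    """F1 from true-positive, false-positive, and false-negative counts."""
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

# New expectations: F1 only.
assert abs(f1(tp=1, fp=1, fn=2) - 0.4) < 0.01   # Question 1
assert abs(f1(tp=1, fp=0, fn=1) - 0.67) < 0.01  # Question 2

# Old expectations: 0.75 * F1 + 0.25 * similarity.
assert abs(0.75 * f1(1, 1, 3) + 0.25 * 0.7 - 0.425) < 0.001  # old Question 1 score
assert abs(0.75 * f1(1, 0, 1) + 0.25 * 1.0 - 0.75) < 0.001   # old Question 2 score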