Commit

test: update test
MaksymAI committed Feb 3, 2025
1 parent 40805bc commit 9cb2fc4
Showing 1 changed file with 63 additions and 65 deletions.
tests/integration/evaluations/metrics/test_answer_correctness.py
@@ -1,5 +1,4 @@
from unittest.mock import MagicMock

from dynamiq.evaluations.metrics import AnswerCorrectnessEvaluator


@@ -20,19 +19,20 @@ def test_answer_correctness_evaluator(openai_node):
" heat and light, which are essential for life on Earth."
),
(
"The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at"
" sea level. The boiling point can change with altitude."
"The boiling point of water is 100 degrees Celsius (212°F) at sea level. "
"The boiling point can change with altitude."
),
]

    # Initialize evaluator with the provided openai_node
    evaluator = AnswerCorrectnessEvaluator(llm=openai_node)

-    # Mock the LLMEvaluator's run methods
-    # The first two calls to _statement_extractor.run are for extracting from answers and ground truths
+    # Mock the LLMEvaluator's run method for statement extraction.
+    # The first call returns extracted statements from the answers;
+    # the second call returns extracted statements from the ground truth answers.
    evaluator._statement_extractor.run = MagicMock(
        side_effect=[
-            # First call - extract statements from answers
+            # Extraction for answers (for two questions)
            {
                "results": [
                    {
@@ -44,89 +44,87 @@
                    {"statements": ["The boiling point of water is 100 degrees Celsius at sea level."]},
                ]
            },
-            # Second call - extract statements from ground truths
+            # Extraction for ground truth answers (for two questions)
            {
                "results": [
                    {
                        "statements": [
                            "The sun is powered by nuclear fusion.",
-                            "This fusion process releases a tremendous amount of energy.",
-                            "The sun provides heat and light, which are essential for life on Earth.",
+                            "Fusion releases energy.",
+                            "The sun provides heat and light.",
                        ]
                    },
                    {
                        "statements": [
                            "The boiling point of water is 100 degrees Celsius at sea level.",
-                            "The boiling point can change with altitude.",
+                            "Boiling point can change with altitude.",
                        ]
                    },
                ]
            },
        ]
    )
-    # Mock the _statement_classifier.run method to return predefined classifications
+    # For each candidate statement, the classification evaluator is called.
+    # In our new implementation, each call returns a dict with a "results" list
+    # containing 'match' and 'reasoning' keys.
+    # For question 1, there are 2 answer statements and 3 ground truth statements.
+    # For question 2, there is 1 answer statement and 2 ground truth statements.
    evaluator._statement_classifier.run = MagicMock(
-        return_value={
-            "results": [
-                {
-                    "classifications": {
-                        "TP": ["Its primary function is to provide light to the solar system."],
-                        "FP": ["The sun is powered by nuclear fission."],
-                        "FN": [
-                            "The sun is powered by nuclear fusion.",
-                            "This fusion process releases a tremendous amount of energy.",
-                            "The sun provides heat and light, which are essential for life on Earth.",
-                        ],
-                    }
-                },
-                {
-                    "classifications": {
-                        "TP": ["The boiling point of water is 100 degrees Celsius at sea level."],
-                        "FP": [],
-                        "FN": ["The boiling point can change with altitude."],
-                    }
-                },
-            ]
-        }
-    )
-
-    # Mock the _similarity_evaluator.run method to return predefined similarity scores
-    evaluator._similarity_evaluator.run = MagicMock(
-        return_value={"results": [{"similarity_score": "0.7"}, {"similarity_score": "1.0"}]}
+        side_effect=[
+            # Q1, answer statement call 1:
+            {"results": [{"match": False, "reasoning": "Mismatch: expected fusion but found fission."}]},
+            # Q1, answer statement call 2:
+            {"results": [{"match": True, "reasoning": "The statement supports the core fact of providing light."}]},
+            # Q1, ground truth statement call 1:
+            {"results": [{"match": False, "reasoning": "Mismatch: the answer does not mention nuclear fusion."}]},
+            # Q1, ground truth statement call 2:
+            {"results": [{"match": False, "reasoning": "The answer does not mention energy release."}]},
+            # Q1, ground truth statement call 3:
+            {"results": [{"match": True, "reasoning": "Matches: the answer supports the provision of light."}]},
+            # Q2, answer statement call:
+            {"results": [{"match": True, "reasoning": "The statement matches the core fact."}]},
+            # Q2, ground truth statement call 1:
+            {"results": [{"match": True, "reasoning": "Matches: the statement is exactly present in the answer."}]},
+            # Q2, ground truth statement call 2:
+            {"results": [{"match": False, "reasoning": "This detail is not mentioned in the answer."}]},
+        ]
    )

    # Run the evaluator
    correctness_scores = evaluator.run(
        questions=questions, answers=answers, ground_truth_answers=ground_truth_answers, verbose=False
    )

-    # Expected scores based on the mocked data and computations
-    # Calculation:
-    # For the first item:
-    #   F1 Score:
-    #     TP = 1
-    #     FP = 1
-    #     FN = 3
-    #     Precision = 1 / (1 + 1) = 0.5
-    #     Recall = 1 / (1 + 3) = 0.25
-    #     F1 = 2 * (0.5 * 0.25) / (0.5 + 0.25) = 0.333...
-    #   Similarity Score = 0.7
-    #   Final Score = 0.75 * 0.333... + 0.25 * 0.7 ≈ 0.25 + 0.175 = 0.425
-
-    # For the second item:
-    #   F1 Score:
-    #     TP = 1
-    #     FP = 0
-    #     FN = 1
-    #     Precision = 1 / (1 + 0) = 1.0
-    #     Recall = 1 / (1 + 1) = 0.5
-    #     F1 = 2 * (1.0 * 0.5) / (1.0 + 0.5) = 0.666...
-    #   Similarity Score = 1.0
-    #   Final Score = 0.75 * 0.666... + 0.25 * 1.0 ≈ 0.5 + 0.25 = 0.75
-
-    expected_scores = [0.425, 0.75]  # Pre-calculated expected scores
+    # Explanation of expected metrics:
+    #
+    # For Question 1:
+    #   Answer candidate classifications:
+    #     - "The sun is powered by nuclear fission." => match: False
+    #     - "Its primary function is to provide light to the solar system." => match: True
+    #     => TP (supported) = 1, FP = 1, Precision = 1/(1+1) = 0.5.
+    #
+    #   Ground truth candidate classifications (compared against answer text):
+    #     - "The sun is powered by nuclear fusion." => match: False
+    #     - "Fusion releases energy." => match: False
+    #     - "The sun provides heat and light." => match: True
+    #     => TP (present) = 1, FN = 2, Recall = 1/(1+2) ≈ 0.33.
+    #
+    #   F1 Score = 2*(0.5*0.33)/(0.5+0.33) ≈ 0.4.
+    #
+    # For Question 2:
+    #   Answer candidate: "The boiling point of water is 100 degrees Celsius at sea level."
+    #     => match: True, so TP = 1, FP = 0, Precision = 1.0.
+    #
+    #   Ground truth candidates:
+    #     - "The boiling point of water is 100 degrees Celsius at sea level." => match: True
+    #     - "Boiling point can change with altitude." => match: False
+    #     => TP (present) = 1, FN = 1, Recall = 1/(1+1) = 0.5.
+    #
+    #   F1 Score = 2*(1*0.5)/(1+0.5) ≈ 0.67.
+    #
+    # Therefore, the expected final scores are approximately [0.4, 0.67].
+    expected_scores = [0.4, 0.67]

    # Assert that the correctness scores are as expected
-    for computed, expected in zip(correctness_scores, expected_scores):
+    for computed, expected in zip([result.score for result in correctness_scores.results], expected_scores):
        assert abs(computed - expected) < 0.01, f"Expected {expected}, got {computed}"
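
The heart of this change is swapping the classifier mock from return_value to side_effect. When side_effect is set to a list, unittest.mock.MagicMock returns the next element on each successive call and raises StopIteration once the list is exhausted, which is what lets the test script one classification response per candidate statement. A minimal standalone sketch of that behavior (plain unittest.mock, independent of dynamiq; the example payloads are illustrative):

from unittest.mock import MagicMock

# side_effect with a list yields one element per call, in order;
# return_value would hand back the same object on every call.
classifier = MagicMock(
    side_effect=[
        {"results": [{"match": False, "reasoning": "fission is not fusion"}]},
        {"results": [{"match": True, "reasoning": "providing light is supported"}]},
    ]
)

first = classifier(statement="The sun is powered by nuclear fission.")
second = classifier(statement="Its primary function is to provide light to the solar system.")
assert first["results"][0]["match"] is False
assert second["results"][0]["match"] is True
# A third call would raise StopIteration because the list is exhausted.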

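The expected scores are also easy to verify by hand. Below is a quick sanity check of the arithmetic spelled out in the comments above; the f1 helper is just for illustration. The new expectations are pure F1 over the TP/FP/FN counts, while the old ones blended F1 with a similarity score at 0.75/0.25 weights:

def f1(tp: int, fp: int, fn: int) -> float:
    """F1 from true-positive, false-positive, and false-negative counts."""
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

# New expectations: F1 only.
assert abs(f1(tp=1, fp=1, fn=2) - 0.4) < 0.01   # Question 1
assert abs(f1(tp=1, fp=0, fn=1) - 0.67) < 0.01  # Question 2

# Old expectations: 0.75 * F1 + 0.25 * similarity.
assert abs(0.75 * f1(1, 1, 3) + 0.25 * 0.7 - 0.425) < 0.001  # old Question 1 score
assert abs(0.75 * f1(1, 0, 1) + 0.25 * 1.0 - 0.75) < 0.001   # old Question 2 score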