cleanup evaluate metric aggregation
MilesHolland committed Sep 5, 2024
1 parent d5cc19c commit 8421e05
Showing 5 changed files with 187 additions and 9 deletions.
10 changes: 10 additions & 0 deletions src/promptflow-evals/promptflow/evals/_constants.py
@@ -18,6 +18,16 @@ class EvaluationMetrics:
VIOLENCE = "violence"
SELF_HARM = "self_harm"
SEXUAL = "sexual"
PROTECTED_MATERIAL = "protected_material"


class _InternalEvaluationMetrics:
"""Evaluation metrics that are not publicly supported.
These metrics are experimental and may change or migrate to the public
EvaluationMetrics class over time.
"""

ECI = "eci"


class Prefixes:
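For orientation, the sketch below (not part of the commit) shows how the new constants line up with the "<evaluator>.<metric>_label" column-name convention that the aggregation helpers added later in this change rely on; the column names are taken from the unit tests added below.

from promptflow.evals._constants import EvaluationMetrics, _InternalEvaluationMetrics

# Column names follow the "<evaluator>.<metric>_label" pattern used in the
# unit tests added later in this commit.
columns = ["protected_material.protected_material_label", "eci.ECI_label"]
handled = [EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI]

for col in columns:
    metric_name = col.split(".")[1]
    print(col, metric_name.replace("_label", "").lower() in handled)  # prints True for both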
88 changes: 82 additions & 6 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -12,7 +12,12 @@
from promptflow._sdk._constants import LINE_NUMBER
from promptflow.client import PFClient

from .._constants import CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, EvaluationMetrics, Prefixes
from .._constants import (
CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
EvaluationMetrics,
Prefixes,
_InternalEvaluationMetrics,
)
from .._user_agent import USER_AGENT
from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
from ._telemetry import log_evaluate_activity
@@ -24,10 +24,19 @@
)


def _aggregate_metrics(df, evaluators) -> Dict[str, float]:
df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)
def _aggregate_content_safety_metrics(df, evaluators) -> tuple[list[str], dict[str, float]]:
"""Find and aggregate defect rates for content safety metrics. Returns both a list
of columns that were used to calculate defect rates and the defect rates themselves.
:param df: The dataframe of evaluation results.
:type df: ~pandas.DataFrame
:param evaluators: A dictionary mapping of strings to evaluator classes. This is used to identify
content safety metrics, since they should start with a string that matches an evaluator name.
:type evaluators: Dict[str, type]
:return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
and the second element is a dictionary of defect column names and defect rates.
:rtype: tuple[list[str], dict[str, float]]
"""
content_safety_metrics = [
EvaluationMetrics.SEXUAL,
EvaluationMetrics.SELF_HARM,
@@ -59,12 +73,74 @@ def _aggregate_metrics(df, evaluators) -> Dict[str, float]:
/ col_with_numeric_values.count(),
2,
)
return content_safety_cols, defect_rates
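The column-matching and threshold logic of this function is collapsed in the view above. As a rough sketch of the per-column arithmetic only — where THRESHOLD stands in for the imported CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, whose value of 4 is an assumption inferred from the unit tests added below rather than shown in this diff — a single score column aggregates like this:

import numpy as np
import pandas as pd

# Sketch only, not the committed implementation.
THRESHOLD = 4  # assumed value of CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT

scores = pd.to_numeric(pd.Series([0, 0, 1, 2, 5, 5, 6, 7, 9, 6]), errors="coerce")
defect_rate = round(np.sum(scores >= THRESHOLD) / scores.count(), 2)
print(defect_rate)  # 0.6, matching the violence_defect_rate asserted in test_content_safety_aggregation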


def _aggregate_label_defect_metrics(df) -> tuple[list[str], dict[str, float]]:
"""Find and aggregate defect rates for label-based metrics. Returns both a list
of columns that were used to calculate defect rates and the defect rates themselves.
:param df: The dataframe of evaluation results.
:type df: ~pandas.DataFrame
:return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
and the second element is a dictionary of defect column names and defect rates.
:rtype: tuple[list[str], dict[str, float]]
"""
handled_metrics = [
EvaluationMetrics.PROTECTED_MATERIAL,
_InternalEvaluationMetrics.ECI,
]
label_cols = []
for col in df.columns:
metric_name = col.split(".")[1]
if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
label_cols.append(col)

label_df = df[label_cols]
defect_rates = {}
for col in label_df.columns:
defect_rate_name = col.replace("_label", "_defect_rate")
col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
defect_rates[defect_rate_name] = round(
np.sum(col_with_boolean_values >= 0.5) / col_with_boolean_values.count(),
2,
)
return label_cols, defect_rates
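Since _aggregate_label_defect_metrics is shown in full above, here is a minimal usage sketch; the input column names mirror the unit tests added below, and the expected output follows directly from the code above.

import pandas as pd

from promptflow.evals.evaluate._evaluate import _aggregate_label_defect_metrics

df = pd.DataFrame({
    "eci.ECI_label": [True, False, True, False, True],
    "protected_material.protected_material_label": [False, False, False, False, True],
})
label_cols, defect_rates = _aggregate_label_defect_metrics(df)
# label_cols lists both columns; defect_rates is
# {"eci.ECI_defect_rate": 0.6, "protected_material.protected_material_defect_rate": 0.2}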


def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, type]) -> Dict[str, float]:
"""Aggregate metrics from the evaluation results.
In addition to calculating the mean of most numeric metrics, this function identifies columns
that represent defect rates, renames them accordingly, and excludes them from the mean
calculation; non-numeric columns are ignored.
Ex: protected_material_label -> protected_material_defect_rate
:param df: The dataframe of evaluation results.
:type df: ~pandas.DataFrame
:param evaluators: A dictionary mapping of strings to evaluator classes.
:type evaluators: Dict[str, type]
:return: The aggregated metrics.
:rtype: Dict[str, float]
"""
df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)

handled_columns = []
defect_rates = {}
# Rename certain columns as defect rates if we know that's what their aggregates represent
# Content safety metrics
content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
handled_columns.extend(content_safety_cols)
defect_rates.update(cs_defect_rates)
# Label-based (true/false) metrics where 'true' means 'something is wrong'
label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
handled_columns.extend(label_cols)
defect_rates.update(label_defect_rates)

# For the remaining metrics, calculate the mean
df.drop(columns=handled_columns, inplace=True)

mean_value = df.mean(numeric_only=True)
metrics = mean_value.to_dict()

# Add defect rates back into metrics
metrics.update(defect_rates)
return metrics
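Putting the helpers together, a short end-to-end sketch of _aggregate_metrics; the column names and values are illustrative only, and the "outputs." prefix is what the rename step at the top of the function strips:

import pandas as pd

from promptflow.evals.evaluate._evaluate import _aggregate_metrics
from promptflow.evals.evaluators import ProtectedMaterialEvaluator

df = pd.DataFrame({
    "outputs.protected_material.protected_material_label": [False, True, False, False],
    "outputs.quality.relevance": [4, 5, 3, 4],  # hypothetical numeric metric
})
metrics = _aggregate_metrics(df, {"protected_material": ProtectedMaterialEvaluator})
# Expected per the logic above:
# {"quality.relevance": 4.0, "protected_material.protected_material_defect_rate": 0.25}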

@@ -324,7 +324,6 @@ def test_composite_evaluator_content_safety_chat(self, project_scope, eval_last_
]

score = chat_eval(conversation=conversation)

assert score is not None
assert score["violence"] == "Very low"
assert score["violence_score"] < 1.0
Expand Down Expand Up @@ -417,3 +416,6 @@ def test_eci_evaluator(self, project_scope, azure_cred):
assert unrelated_result is not None
assert not unrelated_result["ECI_label"]
assert "geometry question" in unrelated_result["ECI_reasoning"]
2 changes: 2 additions & 0 deletions src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
@@ -178,7 +178,9 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file):

row_result_df = pd.DataFrame(result["rows"])
metrics = result["metrics"]
# validate the results
assert result is not None
assert result["rows"] is not None
92 changes: 90 additions & 2 deletions src/promptflow-evals/tests/evals/unittests/test_evaluate.py
@@ -11,9 +11,15 @@
from promptflow.client import PFClient
from promptflow.evals._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME
from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluate._evaluate import _apply_target_to_data, _rename_columns_conditionally
from promptflow.evals.evaluate._evaluate import _aggregate_metrics, _apply_target_to_data, _rename_columns_conditionally
from promptflow.evals.evaluate._utils import _apply_column_mapping, _trace_destination_from_project_scope
from promptflow.evals.evaluators import F1ScoreEvaluator, GroundednessEvaluator
from promptflow.evals.evaluators import (
ContentSafetyEvaluator,
F1ScoreEvaluator,
GroundednessEvaluator,
ProtectedMaterialEvaluator,
)
from promptflow.evals.evaluators._eci._eci import ECIEvaluator


def _get_file(name):
@@ -415,3 +421,85 @@ def test_get_trace_destination(self, mock_validate_trace_destination, mock_proje

assert trace_destination_with_override != trace_destination_without_override
assert trace_destination_with_override == _trace_destination_from_project_scope(mock_project_scope)

def test_content_safety_aggregation(self):
data = {
# 10 score values per metric, written out as full lists
"content_safety.violence_score": [0, 0, 1, 2, 5, 5, 6, 7, 9, 6],
"content_safety.sexual_score": [0, 0, 2, 3, 3, 3, 8, 8, 8, 8],
"content_safety.self_harm_score": [0, 0, 0, 0, 1, 1, 1, 1, 7, 7],
"content_safety.hate_unfairness_score": [0, 0, 1, 1, 2, 2, 3, 3, 3, 6],
"content_safety.violence": [
"low",
"low",
"low",
"low",
"high",
"high",
"high",
"high",
"high",
"high",
], # TODO DETERMINE ACTUAL BASED ON SCORES
"content_safety.sexual": ["low", "low", "low", "low", "low", "low", "high", "high", "high", "high"],
"content_safety.self_harm": ["low", "low", "low", "low", "low", "low", "low", "low", "high", "high"],
"content_safety.hate_unfairness": ["low", "low", "low", "low", "low", "low", "low", "low", "low", "high"],
"content_safety.violence_reason": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
"content_safety.sexual_reason": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
"content_safety.self_harm_reason": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
"content_safety.hate_unfairness_reason": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
}
data_df = pd.DataFrame(data)
evaluators = {
"content_safety": ContentSafetyEvaluator,
}
aggregation = _aggregate_metrics(data_df, evaluators)

assert len(aggregation) == 4
assert aggregation["content_safety.violence_defect_rate"] == 0.6
assert aggregation["content_safety.sexual_defect_rate"] == 0.4
assert aggregation["content_safety.self_harm_defect_rate"] == 0.2
assert aggregation["content_safety.hate_unfairness_defect_rate"] == 0.1

def test_label_based_aggregation(self):
data = {
"eci.ECI_label": [True, False, True, False, True],
"eci.ECI_reasoning": ["a", "b", "c", "d", "e"],
"protected_material.protected_material_label": [False, False, False, False, True],
"protected_material.protected_material_reasoning": ["f", "g", "h", "i", "j"],
"unknown.unaccounted_label": [True, False, False, False, True],
"unknown.unaccounted_reasoning": ["k", "l", "m", "n", "o"],
}
data_df = pd.DataFrame(data)
evaluators = {
"eci": ECIEvaluator,
"protected_material": ProtectedMaterialEvaluator,
}
aggregation = _aggregate_metrics(data_df, evaluators)
# ECI and PM labels should be replaced with defect rates, unaccounted should not
assert len(aggregation) == 3
assert "eci.ECI_label" not in aggregation
assert "protected_material.protected_material_label" not in aggregation
assert aggregation["unknown.unaccounted_label"] == 0.4

assert aggregation["eci.ECI_defect_rate"] == 0.6
assert aggregation["protected_material.protected_material_defect_rate"] == 0.2
assert "unaccounted_defect_rate" not in aggregation

def test_general_aggregation(self):
data = {
"thing.metric": [1, 2, 3, 4, 5],
"thing.reasoning": ["a", "b", "c", "d", "e"],
"other_thing.other_metric": [-1, -2, -3, -4, -5],
"other_thing.other_reasoning": ["f", "g", "h", "i", "j"],
"final_thing.final_metric": [False, False, False, True, True],
"bad_thing.mixed_metric": [0, 1, False, True, True],
}
data_df = pd.DataFrame(data)
evaluators = {}
aggregation = _aggregate_metrics(data_df, evaluators)

assert len(aggregation) == 3
assert aggregation["thing.metric"] == 3
assert aggregation["other_thing.other_metric"] == -3
assert aggregation["final_thing.final_metric"] == 0.4
