From 8421e050ef0e61b7da9e415055ddecfb0cadd02d Mon Sep 17 00:00:00 2001
From: Miles Holland
Date: Thu, 5 Sep 2024 12:59:01 -0400
Subject: [PATCH] cleanup evaluate metric aggregation

---
 .../promptflow/evals/_constants.py            | 10 ++
 .../promptflow/evals/evaluate/_evaluate.py    | 88 ++++++++++++++++--
 .../evals/e2etests/test_builtin_evaluators.py |  1 -
 .../tests/evals/unittests/test_evaluate.py    | 92 ++++++++++++++++++-
 4 files changed, 182 insertions(+), 9 deletions(-)

diff --git a/src/promptflow-evals/promptflow/evals/_constants.py b/src/promptflow-evals/promptflow/evals/_constants.py
index 5572f8d65ff..0c7bb8ad5e4 100644
--- a/src/promptflow-evals/promptflow/evals/_constants.py
+++ b/src/promptflow-evals/promptflow/evals/_constants.py
@@ -18,6 +18,16 @@ class EvaluationMetrics:
     VIOLENCE = "violence"
     SELF_HARM = "self_harm"
     SEXUAL = "sexual"
+    PROTECTED_MATERIAL = "protected_material"
+
+
+class _InternalEvaluationMetrics:
+    """Evaluation metrics that are not publicly supported.
+    These metrics are experimental and subject to potential change or migration to the main
+    enum over time.
+    """
+
+    ECI = "eci"
 
 
 class Prefixes:
diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
index f6f40c73376..d4125c11892 100644
--- a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
+++ b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -12,7 +12,12 @@
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
 
-from .._constants import CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, EvaluationMetrics, Prefixes
+from .._constants import (
+    CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+    EvaluationMetrics,
+    Prefixes,
+    _InternalEvaluationMetrics,
+)
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._telemetry import log_evaluate_activity
@@ -24,10 +29,19 @@
 )
 
 
-def _aggregate_metrics(df, evaluators) -> Dict[str, float]:
-    df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)
+def _aggregate_content_safety_metrics(df, evaluators) -> tuple[list[str], dict[str, float]]:
+    """Find and aggregate defect rates for content safety metrics. Returns both a list
+    of columns that were used to calculate defect rates and the defect rates themselves.
 
-    # Calculate defect rates for content safety metrics
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :param evaluators: A dictionary mapping of strings to evaluator classes. This is used to identify
+        content safety metrics, since they should start with a string that matches an evaluator name.
+    :type evaluators: Dict[str, type]
+    :return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
+        and the second element is a dictionary of defect column names and defect rates.
+    :rtype: tuple[list[str], dict[str, float]]
+    """
     content_safety_metrics = [
         EvaluationMetrics.SEXUAL,
         EvaluationMetrics.SELF_HARM,
@@ -59,12 +73,74 @@ def _aggregate_metrics(df, evaluators) -> Dict[str, float]:
             / col_with_numeric_values.count(),
             2,
         )
+    return content_safety_cols, defect_rates
+
+
+def _aggregate_label_defect_metrics(df) -> tuple[list[str], dict[str, float]]:
+    """Find and aggregate defect rates for label-based metrics. Returns both a list
+    of columns that were used to calculate defect rates and the defect rates themselves.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
+        and the second element is a dictionary of defect column names and defect rates.
+    :rtype: tuple[list[str], dict[str, float]]
+    """
+    handled_metrics = [
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+    ]
+    label_cols = []
+    for col in df.columns:
+        metric_name = col.split(".")[1]
+        if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
+            label_cols.append(col)
+
+    label_df = df[label_cols]
+    defect_rates = {}
+    for col in label_df.columns:
+        defect_rate_name = col.replace("_label", "_defect_rate")
+        col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+        defect_rates[defect_rate_name] = round(
+            np.sum(col_with_boolean_values >= 0.5) / col_with_boolean_values.count(),
+            2,
+        )
+    return label_cols, defect_rates
+
+
+def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, type]) -> Dict[str, float]:
+    """Aggregate metrics from the evaluation results.
+    On top of naively calculating the mean of most metrics, this function also identifies certain columns
+    that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
+    EX: protected_material_label -> protected_material_defect_rate
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :param evaluators: A dictionary mapping of strings to evaluator classes.
+    :type evaluators: Dict[str, type]
+    :return: The aggregated metrics.
+    :rtype: Dict[str, float]
+    """
+    df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)
+
+    handled_columns = []
+    defect_rates = {}
+    # Rename certain columns as defect rates if we know that's what their aggregates represent
+    # Content safety metrics
+    content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
+    handled_columns.extend(content_safety_cols)
+    defect_rates.update(cs_defect_rates)
+    # Label-based (true/false) metrics where 'true' means 'something is wrong'
+    label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
+    handled_columns.extend(label_cols)
+    defect_rates.update(label_defect_rates)
 
     # For rest of metrics, we will calculate mean
-    df.drop(columns=content_safety_cols, inplace=True)
+    df.drop(columns=handled_columns, inplace=True)
+
     mean_value = df.mean(numeric_only=True)
     metrics = mean_value.to_dict()
-
+    # Add defect rates back into metrics
     metrics.update(defect_rates)
     return metrics
 
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
index 0e005e5f851..12838ca1a64 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
@@ -324,7 +324,6 @@ def test_composite_evaluator_content_safety_chat(self, project_scope, eval_last_
         ]
 
         score = chat_eval(conversation=conversation)
-
         assert score is not None
         assert score["violence"] == "Very low"
         assert score["violence_score"] < 1.0
diff --git a/src/promptflow-evals/tests/evals/unittests/test_evaluate.py b/src/promptflow-evals/tests/evals/unittests/test_evaluate.py
index d7f02431581..6627e4db156 100644
--- a/src/promptflow-evals/tests/evals/unittests/test_evaluate.py
+++ b/src/promptflow-evals/tests/evals/unittests/test_evaluate.py
@@ -11,9 +11,15 @@
 from promptflow.client import PFClient
 from promptflow.evals._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME
 from promptflow.evals.evaluate import evaluate
-from promptflow.evals.evaluate._evaluate import _apply_target_to_data, _rename_columns_conditionally
+from promptflow.evals.evaluate._evaluate import _aggregate_metrics, _apply_target_to_data, _rename_columns_conditionally
 from promptflow.evals.evaluate._utils import _apply_column_mapping, _trace_destination_from_project_scope
-from promptflow.evals.evaluators import F1ScoreEvaluator, GroundednessEvaluator
+from promptflow.evals.evaluators import (
+    ContentSafetyEvaluator,
+    F1ScoreEvaluator,
+    GroundednessEvaluator,
+    ProtectedMaterialEvaluator,
+)
+from promptflow.evals.evaluators._eci._eci import ECIEvaluator
 
 
 def _get_file(name):
@@ -415,3 +421,85 @@ def test_get_trace_destination(self, mock_validate_trace_destination, mock_proje
         assert trace_destination_with_override != trace_destination_without_override
 
         assert trace_destination_with_override == _trace_destination_from_project_scope(mock_project_scope)
+
+    def test_content_safety_aggregation(self):
+        data = {
+            # 10 rows of scores for each content safety metric
+            "content_safety.violence_score": [0, 0, 1, 2, 5, 5, 6, 7, 9, 6],
+            "content_safety.sexual_score": [0, 0, 2, 3, 3, 3, 8, 8, 8, 8],
+            "content_safety.self_harm_score": [0, 0, 0, 0, 1, 1, 1, 1, 7, 7],
+            "content_safety.hate_unfairness_score": [0, 0, 1, 1, 2, 2, 3, 3, 3, 6],
+            "content_safety.violence": [
+                "low",
+                "low",
+                "low",
+                "low",
+                "high",
+                "high",
+                "high",
+                "high",
+                "high",
+                "high",
+            ],  # TODO DETERMINE ACTUAL BASED ON SCORES
+            "content_safety.sexual": ["low", "low", "low", "low", "low", "low", "high", "high", "high", "high"],
+            "content_safety.self_harm": ["low", "low", "low", "low", "low", "low", "low", "low", "high", "high"],
+            "content_safety.hate_unfairness": ["low", "low", "low", "low", "low", "low", "low", "low", "low", "high"],
+            "content_safety.violence_reason": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
+            "content_safety.sexual_reason": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
+            "content_safety.self_harm_reason": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
+            "content_safety.hate_unfairness_reason": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
+        }
+        data_df = pd.DataFrame(data)
+        evaluators = {
+            "content_safety": ContentSafetyEvaluator,
+        }
+        aggregation = _aggregate_metrics(data_df, evaluators)
+
+        assert len(aggregation) == 4
+        assert aggregation["content_safety.violence_defect_rate"] == 0.6
+        assert aggregation["content_safety.sexual_defect_rate"] == 0.4
+        assert aggregation["content_safety.self_harm_defect_rate"] == 0.2
+        assert aggregation["content_safety.hate_unfairness_defect_rate"] == 0.1
+
+    def test_label_based_aggregation(self):
+        data = {
+            "eci.ECI_label": [True, False, True, False, True],
+            "eci.ECI_reasoning": ["a", "b", "c", "d", "e"],
+            "protected_material.protected_material_label": [False, False, False, False, True],
+            "protected_material.protected_material_reasoning": ["f", "g", "h", "i", "j"],
+            "unknown.unaccounted_label": [True, False, False, False, True],
+            "unknown.unaccounted_reasoning": ["k", "l", "m", "n", "o"],
+        }
+        data_df = pd.DataFrame(data)
+        evaluators = {
+            "eci": ECIEvaluator,
+            "protected_material": ProtectedMaterialEvaluator,
+        }
+        aggregation = _aggregate_metrics(data_df, evaluators)
+        # ECI and protected material labels should become defect rates; the unaccounted label should not
+        assert len(aggregation) == 3
+        assert "eci.ECI_label" not in aggregation
+        assert "protected_material.protected_material_label" not in aggregation
+        assert aggregation["unknown.unaccounted_label"] == 0.4
+
+        assert aggregation["eci.ECI_defect_rate"] == 0.6
+        assert aggregation["protected_material.protected_material_defect_rate"] == 0.2
+        assert "unaccounted_defect_rate" not in aggregation
+
+    def test_general_aggregation(self):
+        data = {
+            "thing.metric": [1, 2, 3, 4, 5],
+            "thing.reasoning": ["a", "b", "c", "d", "e"],
+            "other_thing.other_metric": [-1, -2, -3, -4, -5],
+            "other_thing.other_reasoning": ["f", "g", "h", "i", "j"],
+            "final_thing.final_metric": [False, False, False, True, True],
+            "bad_thing.mixed_metric": [0, 1, False, True, True],
+        }
+        data_df = pd.DataFrame(data)
+        evaluators = {}
+        aggregation = _aggregate_metrics(data_df, evaluators)
+
+        assert len(aggregation) == 3
+        assert aggregation["thing.metric"] == 3
+        assert aggregation["other_thing.other_metric"] == -3
+        assert aggregation["final_thing.final_metric"] == 0.4
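
Note: the standalone sketch below illustrates the aggregation behavior this patch introduces; it is not part of the diff. It assumes a content safety defect threshold of 4 (the value of CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT is not shown here), and the column names are made up, following the "outputs.<evaluator>.<metric>" shape the real evaluation results use.

import numpy as np
import pandas as pd

# Toy evaluation output: one score-based metric, one label-based metric, one plain metric.
df = pd.DataFrame(
    {
        "outputs.content_safety.violence_score": [0, 1, 5, 7],
        "outputs.protected_material.protected_material_label": [False, False, True, True],
        "outputs.f1_score.f1_score": [0.25, 0.5, 0.75, 1.0],
    }
)
df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)

# Content safety scores become defect rates: the fraction of rows at or above the threshold (assumed 4).
scores = pd.to_numeric(df["content_safety.violence_score"], errors="coerce")
violence_defect_rate = round(np.sum(scores >= 4) / scores.count(), 2)  # 0.5

# Label-based metrics become defect rates: the fraction of rows where the label is True.
labels = pd.to_numeric(df["protected_material.protected_material_label"], errors="coerce")
pm_defect_rate = round(np.sum(labels >= 0.5) / labels.count(), 2)  # 0.5

# Every other numeric column keeps a plain mean; the defect rates are then merged back in.
metrics = df.drop(
    columns=["content_safety.violence_score", "protected_material.protected_material_label"]
).mean(numeric_only=True).to_dict()
metrics["content_safety.violence_defect_rate"] = violence_defect_rate
metrics["protected_material.protected_material_defect_rate"] = pm_defect_rate
# -> {'f1_score.f1_score': 0.625, 'content_safety.violence_defect_rate': 0.5,
#     'protected_material.protected_material_defect_rate': 0.5}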