cleanup evaluate metric aggregation
MilesHolland committed Sep 5, 2024
1 parent d5cc19c commit 8421e05
Showing 5 changed files with 187 additions and 9 deletions.
10 changes: 10 additions & 0 deletions src/promptflow-evals/promptflow/evals/_constants.py
@@ -18,6 +18,16 @@ class EvaluationMetrics:
VIOLENCE = "violence"
SELF_HARM = "self_harm"
SEXUAL = "sexual"
PROTECTED_MATERIAL = "protected_material"


class _InternalEvaluationMetrics:
"""Evaluation metrics that are not publicly supported.
These metrics are experimental and may change or migrate to the public
EvaluationMetrics class over time.
"""

ECI = "eci"


class Prefixes:
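For orientation, the sketch below (not part of the commit) shows how the new constants line up with the "<evaluator>.<metric>_label" column-name convention that the aggregation helpers added later in this change rely on; the column names are taken from the unit tests added below.

from promptflow.evals._constants import EvaluationMetrics, _InternalEvaluationMetrics

# Column names follow the "<evaluator>.<metric>_label" pattern used in the
# unit tests added later in this commit.
columns = ["protected_material.protected_material_label", "eci.ECI_label"]
handled = [EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI]

for col in columns:
    metric_name = col.split(".")[1]
    print(col, metric_name.replace("_label", "").lower() in handled)  # prints True for both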
88 changes: 82 additions & 6 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -12,7 +12,12 @@
from promptflow._sdk._constants import LINE_NUMBER
from promptflow.client import PFClient

from .._constants import CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, EvaluationMetrics, Prefixes
from .._constants import (
CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
EvaluationMetrics,
Prefixes,
_InternalEvaluationMetrics,
)
from .._user_agent import USER_AGENT
from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
from ._telemetry import log_evaluate_activity
@@ -24,10 +24,19 @@
)


def _aggregate_metrics(df, evaluators) -> Dict[str, float]:
df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)
def _aggregate_content_safety_metrics(df, evaluators) -> tuple[list[str], dict[str, float]]:
"""Find and aggregate defect rates for content safety metrics. Returns both a list
of columns that were used to calculate defect rates and the defect rates themselves.
:param df: The dataframe of evaluation results.
:type df: ~pandas.DataFrame
:param evaluators: A dictionary mapping of strings to evaluator classes. This is used to identify
content safety metrics, since they should start with a string that matches an evaluator name.
:type evaluators: Dict[str, type]
:return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
and the second element is a dictionary of defect column names and defect rates.
:rtype: tuple[list[str], dict[str, float]]
"""
content_safety_metrics = [
EvaluationMetrics.SEXUAL,
EvaluationMetrics.SELF_HARM,
@@ -59,12 +73,74 @@ def _aggregate_metrics(df, evaluators) -> Dict[str, float]:
/ col_with_numeric_values.count(),
2,
)
return content_safety_cols, defect_rates
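The column-matching and threshold logic of this function is collapsed in the view above. As a rough sketch of the per-column arithmetic only — where THRESHOLD stands in for the imported CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, whose value of 4 is an assumption inferred from the unit tests added below rather than shown in this diff — a single score column aggregates like this:

import numpy as np
import pandas as pd

# Sketch only, not the committed implementation.
THRESHOLD = 4  # assumed value of CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT

scores = pd.to_numeric(pd.Series([0, 0, 1, 2, 5, 5, 6, 7, 9, 6]), errors="coerce")
defect_rate = round(np.sum(scores >= THRESHOLD) / scores.count(), 2)
print(defect_rate)  # 0.6, matching the violence_defect_rate asserted in test_content_safety_aggregation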


def _aggregate_label_defect_metrics(df) -> tuple[list[str], dict[str, float]]:
"""Find and aggregate defect rates for label-based metrics. Returns both a list
of columns that were used to calculate defect rates and the defect rates themselves.
:param df: The dataframe of evaluation results.
:type df: ~pandas.DataFrame
:return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
and the second element is a dictionary of defect column names and defect rates.
:rtype: tuple[list[str], dict[str, float]]
"""
handled_metrics = [
EvaluationMetrics.PROTECTED_MATERIAL,
_InternalEvaluationMetrics.ECI,
]
label_cols = []
for col in df.columns:
metric_name = col.split(".")[1]
if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
label_cols.append(col)

label_df = df[label_cols]
defect_rates = {}
for col in label_df.columns:
defect_rate_name = col.replace("_label", "_defect_rate")
col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
defect_rates[defect_rate_name] = round(
np.sum(col_with_boolean_values >= 0.5) / col_with_boolean_values.count(),
2,
)
return label_cols, defect_rates
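Since _aggregate_label_defect_metrics is shown in full above, here is a minimal usage sketch; the input column names mirror the unit tests added below, and the expected output follows directly from the code above.

import pandas as pd

from promptflow.evals.evaluate._evaluate import _aggregate_label_defect_metrics

df = pd.DataFrame({
    "eci.ECI_label": [True, False, True, False, True],
    "protected_material.protected_material_label": [False, False, False, False, True],
})
label_cols, defect_rates = _aggregate_label_defect_metrics(df)
# label_cols lists both columns; defect_rates is
# {"eci.ECI_defect_rate": 0.6, "protected_material.protected_material_defect_rate": 0.2}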


def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, type]) -> Dict[str, float]:
"""Aggregate metrics from the evaluation results.
In addition to calculating the mean of most numeric metrics, this function identifies columns
that represent defect rates, renames them accordingly, and excludes them from the mean
calculation; non-numeric columns are ignored.
Ex: protected_material_label -> protected_material_defect_rate
:param df: The dataframe of evaluation results.
:type df: ~pandas.DataFrame
:param evaluators: A dictionary mapping of strings to evaluator classes.
:type evaluators: Dict[str, type]
:return: The aggregated metrics.
:rtype: Dict[str, float]
"""
df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)

handled_columns = []
defect_rates = {}
# Rename certain columns as defect rates if we know that's what their aggregates represent
# Content safety metrics
content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
handled_columns.extend(content_safety_cols)
defect_rates.update(cs_defect_rates)
# Label-based (true/false) metrics where 'true' means 'something is wrong'
label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
handled_columns.extend(label_cols)
defect_rates.update(label_defect_rates)

# For the remaining metrics, calculate the mean
df.drop(columns=handled_columns, inplace=True)

mean_value = df.mean(numeric_only=True)
metrics = mean_value.to_dict()

# Add defect rates back into metrics
metrics.update(defect_rates)
return metrics
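Putting the helpers together, a short end-to-end sketch of _aggregate_metrics; the column names and values are illustrative only, and the "outputs." prefix is what the rename step at the top of the function strips:

import pandas as pd

from promptflow.evals.evaluate._evaluate import _aggregate_metrics
from promptflow.evals.evaluators import ProtectedMaterialEvaluator

df = pd.DataFrame({
    "outputs.protected_material.protected_material_label": [False, True, False, False],
    "outputs.quality.relevance": [4, 5, 3, 4],  # hypothetical numeric metric
})
metrics = _aggregate_metrics(df, {"protected_material": ProtectedMaterialEvaluator})
# Expected per the logic above:
# {"quality.relevance": 4.0, "protected_material.protected_material_defect_rate": 0.25}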

@@ -324,7 +324,6 @@ def test_composite_evaluator_content_safety_chat(self, project_scope, eval_last_
]

score = chat_eval(conversation=conversation)

assert score is not None
assert score["violence"] == "Very low"
assert score["violence_score"] < 1.0
Expand Down Expand Up @@ -417,3 +416,6 @@ def test_eci_evaluator(self, project_scope, azure_cred):
assert unrelated_result is not None
assert not unrelated_result["ECI_label"]
assert "geometry question" in unrelated_result["ECI_reasoning"]
2 changes: 2 additions & 0 deletions src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
@@ -178,7 +178,9 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file):

row_result_df = pd.DataFrame(result["rows"])
metrics = result["metrics"]
# validate the results
assert result is not None
assert result["rows"] is not None
92 changes: 90 additions & 2 deletions src/promptflow-evals/tests/evals/unittests/test_evaluate.py
@@ -11,9 +11,15 @@
from promptflow.client import PFClient
from promptflow.evals._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME
from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluate._evaluate import _apply_target_to_data, _rename_columns_conditionally
from promptflow.evals.evaluate._evaluate import _aggregate_metrics, _apply_target_to_data, _rename_columns_conditionally
from promptflow.evals.evaluate._utils import _apply_column_mapping, _trace_destination_from_project_scope
from promptflow.evals.evaluators import F1ScoreEvaluator, GroundednessEvaluator
from promptflow.evals.evaluators import (
ContentSafetyEvaluator,
F1ScoreEvaluator,
GroundednessEvaluator,
ProtectedMaterialEvaluator,
)
from promptflow.evals.evaluators._eci._eci import ECIEvaluator


def _get_file(name):
@@ -415,3 +421,85 @@ def test_get_trace_destination(self, mock_validate_trace_destination, mock_proje

assert trace_destination_with_override != trace_destination_without_override
assert trace_destination_with_override == _trace_destination_from_project_scope(mock_project_scope)

def test_content_safety_aggregation(self):
data = {
# 10 score values per metric, written out as full lists
"content_safety.violence_score": [0, 0, 1, 2, 5, 5, 6, 7, 9, 6],
"content_safety.sexual_score": [0, 0, 2, 3, 3, 3, 8, 8, 8, 8],
"content_safety.self_harm_score": [0, 0, 0, 0, 1, 1, 1, 1, 7, 7],
"content_safety.hate_unfairness_score": [0, 0, 1, 1, 2, 2, 3, 3, 3, 6],
"content_safety.violence": [
"low",
"low",
"low",
"low",
"high",
"high",
"high",
"high",
"high",
"high",
], # TODO DETERMINE ACTUAL BASED ON SCORES
"content_safety.sexual": ["low", "low", "low", "low", "low", "low", "high", "high", "high", "high"],
"content_safety.self_harm": ["low", "low", "low", "low", "low", "low", "low", "low", "high", "high"],
"content_safety.hate_unfairness": ["low", "low", "low", "low", "low", "low", "low", "low", "low", "high"],
"content_safety.violence_reason": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
"content_safety.sexual_reason": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
"content_safety.self_harm_reason": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
"content_safety.hate_unfairness_reason": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
}
data_df = pd.DataFrame(data)
evaluators = {
"content_safety": ContentSafetyEvaluator,
}
aggregation = _aggregate_metrics(data_df, evaluators)

assert len(aggregation) == 4
assert aggregation["content_safety.violence_defect_rate"] == 0.6
assert aggregation["content_safety.sexual_defect_rate"] == 0.4
assert aggregation["content_safety.self_harm_defect_rate"] == 0.2
assert aggregation["content_safety.hate_unfairness_defect_rate"] == 0.1

def test_label_based_aggregation(self):
data = {
"eci.ECI_label": [True, False, True, False, True],
"eci.ECI_reasoning": ["a", "b", "c", "d", "e"],
"protected_material.protected_material_label": [False, False, False, False, True],
"protected_material.protected_material_reasoning": ["f", "g", "h", "i", "j"],
"unknown.unaccounted_label": [True, False, False, False, True],
"unknown.unaccounted_reasoning": ["k", "l", "m", "n", "o"],
}
data_df = pd.DataFrame(data)
evaluators = {
"eci": ECIEvaluator,
"protected_material": ProtectedMaterialEvaluator,
}
aggregation = _aggregate_metrics(data_df, evaluators)
# ECI and PM labels should be replaced with defect rates, unaccounted should not
assert len(aggregation) == 3
assert "eci.ECI_label" not in aggregation
assert "protected_material.protected_material_label" not in aggregation
assert aggregation["unknown.unaccounted_label"] == 0.4

assert aggregation["eci.ECI_defect_rate"] == 0.6
assert aggregation["protected_material.protected_material_defect_rate"] == 0.2
assert "unaccounted_defect_rate" not in aggregation

def test_general_aggregation(self):
data = {
"thing.metric": [1, 2, 3, 4, 5],
"thing.reasoning": ["a", "b", "c", "d", "e"],
"other_thing.other_metric": [-1, -2, -3, -4, -5],
"other_thing.other_reasoning": ["f", "g", "h", "i", "j"],
"final_thing.final_metric": [False, False, False, True, True],
"bad_thing.mixed_metric": [0, 1, False, True, True],
}
data_df = pd.DataFrame(data)
evaluators = {}
aggregation = _aggregate_metrics(data_df, evaluators)

assert len(aggregation) == 3
assert aggregation["thing.metric"] == 3
assert aggregation["other_thing.other_metric"] == -3
assert aggregation["final_thing.final_metric"] == 0.4
