-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathutils.py
142 lines (112 loc) · 4.05 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Utility functions to keep the example notebooks uncluttered with boilerplate.
"""
import re
from collections import OrderedDict
from pathlib import Path
from typing import Tuple
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
DATA_DIR = Path(__file__).parent / "data"
UCI_ADULT_TARGET_COL = "target"
def load_uci_adult() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Downloads and pre-processes the UCI Adult dataset.

    The raw files are fetched into ``DATA_DIR`` (created if missing) and
    parsed using the column metadata from the accompanying ``adult.names``
    file.

    Returns
    -------
    train_set, test_set : tuple[pd.DataFrame, pd.DataFrame]
        The pre-processed train and test datasets.

    Raises
    ------
    ImportError
        If the optional `wget` package is not installed.
    """
    try:
        import wget
    except ModuleNotFoundError as err:
        # Fail fast: without `wget` the download calls below would die
        # with a confusing NameError instead of a clear error message.
        raise ImportError(
            f"Downloading this dataset requires the `wget` python package; got \"{err}\""
        ) from err

    # URLs for downloading dataset
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/"
    train_url = base_url + "adult.data"
    test_url = base_url + "adult.test"
    names_url = base_url + "adult.names"

    # Make local data directory
    DATA_DIR.mkdir(exist_ok=True)

    # Download data files next to this module
    train_path = wget.download(train_url, str(DATA_DIR))
    test_path = wget.download(test_url, str(DATA_DIR))
    names_path = wget.download(names_url, str(DATA_DIR))

    # The test split carries an extra leading line that must be skipped
    return (
        _preprocess_uci_adult(train_path, names_path),
        _preprocess_uci_adult(test_path, names_path, skiprows=1),
    )
def _preprocess_uci_adult(data_path, names_path, **read_kwargs) -> pd.DataFrame:
    """Parses a UCI Adult data file using the column metadata in `names_path`.

    Parameters
    ----------
    data_path : str
        Path to the CSV data file (no header row).
    names_path : str
        Path to the ``adult.names`` metadata file. Each column is described
        by a line of the form ``name: value1, value2, ..., valueN.`` (or
        ``name: continuous.`` for numeric columns); other lines are ignored.
    **read_kwargs
        Extra keyword arguments forwarded to :func:`pandas.read_csv`
        (e.g. ``skiprows=1`` for the test split).

    Returns
    -------
    pd.DataFrame
        The parsed data, with ``float`` dtype for continuous columns and
        ``category`` dtype (whitespace-stripped values) for the rest.
    """
    # Load column names from the metadata file
    column_map = OrderedDict()
    line_regexp = re.compile(r"^([-\w]+): (.*)[.]$")
    with open(names_path, "r") as f_in:
        for line in f_in:
            match = line_regexp.match(line)
            if not match:
                continue  # prose / blank / non-metadata line
            col_name = match.group(1)
            col_values = match.group(2).split(", ")
            if len(col_values) == 1:
                # Single token, e.g. "continuous" for numeric columns
                col_values = col_values[0]
            column_map[col_name] = col_values

    # Last column is the target (not described in the names file)
    column_map[UCI_ADULT_TARGET_COL] = ["<=50K", ">50K"]

    # Load data
    data = pd.read_csv(
        data_path,
        header=None,
        names=list(column_map.keys()),
        index_col=None,
        **read_kwargs)

    # Set correct dtypes: "continuous" columns become float, all others category
    data = data.astype({
        col_name: (
            float if col_value == "continuous" else "category"
        ) for col_name, col_value in column_map.items()
    })

    # Strip whitespace from categorical values.
    # NOTE: `pd.api.types.is_categorical_dtype` is deprecated (pandas >= 2.1);
    # checking the dtype instance directly is the supported replacement.
    for col in data.columns:
        if isinstance(data[col].dtype, pd.CategoricalDtype):
            data[col] = data[col].map(lambda val: val.strip())
    return data
def compute_fairness_ratio(y_true: np.ndarray, y_pred: np.ndarray, s_true, metric: str) -> float:
    """Compute fairness metric as the disparity (group-wise ratio)
    of a given performance metric.

    Parameters
    ----------
    y_true : np.ndarray
        The true labels.
    y_pred : np.ndarray
        The binarized predictions.
    s_true : np.ndarray
        The sensitive attribute column.
    metric : str
        The performance metric used to compute disparity; one of
        "fpr", "fnr", "tpr", "tnr" (case-insensitive).

    Returns
    -------
    value : float
        The fairness metric value (between 0 and 1).

    Raises
    ------
    ValueError
        If `metric` is not one of the supported performance metrics.
    """
    metric = metric.lower()
    valid_perf_metrics = ("fpr", "fnr", "tpr", "tnr")
    # Validate up front: the original check lived inside the per-group helper,
    # so an invalid metric only failed after computing a confusion matrix.
    if metric not in valid_perf_metrics:
        raise ValueError(f"Invalid metric chosen; must be one of {valid_perf_metrics}; got '{metric}'")

    def compute_metric(y_true, y_pred):
        # NOTE(review): assumes both classes appear in each group's slice;
        # otherwise `confusion_matrix(...).ravel()` yields fewer than four
        # values and this unpacking fails — confirm upstream guarantees.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        if metric == "fpr":
            return fp / (fp + tn)
        elif metric == "tnr":
            return tn / (fp + tn)
        elif metric == "fnr":
            return fn / (fn + tp)
        else:  # "tpr" — the only remaining value after validation above
            return tp / (fn + tp)

    # Evaluate the chosen metric separately for each sensitive group
    groupwise_metrics = []
    for group in pd.Series(s_true).unique():
        group_filter = (s_true == group)
        groupwise_metrics.append(compute_metric(
            y_true[group_filter],
            y_pred[group_filter],
        ))

    # Ratio of worst to best group performance (1.0 == perfect parity)
    return min(groupwise_metrics) / max(groupwise_metrics)