-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
153 lines (116 loc) · 4.28 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import random
import itertools
import datetime
import dateutil.tz
import numpy as np
def split_data(x, y, split_frac=0.2, seed=None):
""" Split data into train and test sets
Parameters:
x : np.ndarray(N, D) : data matrix
y : np.ndarray(N) : labels
split_frac : float : fraction of data to use for testing
seed : int : random seed
Returns:
x_train : np.ndarray : training data
x_test : np.ndarray : testing data
y_train : np.ndarray : training labels
y_test : np.ndarray : testing labels
"""
### set seed for reproducibility
if seed is not None:
np.random.seed(seed)
### shuffle data and split
idxs = np.random.permutation(x.shape[0])
n_test = int(x.shape[0] * split_frac)
test_idxs = idxs[:n_test]
train_idxs = idxs[n_test:]
return x[train_idxs], x[test_idxs], y[train_idxs], y[test_idxs]
def accuracy(y_true, y_pred):
""" Calculate the accuracy of the predictions
Parameters:
y_true : np.ndarray(N) : true labels
y_pred : np.ndarray(N) : predicted labels
Returns:
float : accuracy
"""
return np.mean(y_true == y_pred)
def f1(y_true, y_pred):
""" Calculate the F1 score of the predictions
Parameters:
y_true : np.ndarray(N) : true labels
y_pred : np.ndarray(N) : predicted labels
Returns:
float : F1 score
"""
uniq_vals = np.unique(y_true) # sorted
assert len(uniq_vals) == 2, "F1 score is only implemented for binary classification"
assert set(y_pred).issubset(uniq_vals), "Predictions contain values not present in true labels"
### calculate true positives, false positives, and false negatives
tp = ((y_true == uniq_vals[1]) & (y_pred == uniq_vals[1])).sum()
fp = ((y_true == uniq_vals[0]) & (y_pred == uniq_vals[1])).sum()
fn = ((y_true == uniq_vals[1]) & (y_pred == uniq_vals[0])).sum()
### calculate precision, recall, and F1
recall = tp / (tp + fn) if tp + fn > 0 else 0
precision = tp / (tp + fp) if tp + fp > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
return f1
def get_cross_val_scores(model, X, y, k_folds=5, scoring_fn=accuracy, shuffle=True):
""" Perform k-fold cross-validation and return the scores
Parameters:
model : object : model to evaluate
X : np.ndarray(N, D) : features
y : np.ndarray(N) : labels
k_folds : int : number of folds
scoring_fn : function : scoring function
shuffle : bool : shuffle data before splitting
Returns:
scores : np.ndarray(k_folds) : scores for each fold
"""
scores = []
for fold_idx in range(k_folds):
### create current fold mask and split data
if shuffle:
mask = np.random.permutation(X.shape[0]) % k_folds == fold_idx
else:
mask = np.arange(X.shape[0]) % k_folds == fold_idx
X_train, y_train = X[~mask], y[~mask]
X_test, y_test = X[mask], y[mask]
### fit and predict
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = scoring_fn(y_test, y_pred)
scores.append(score)
return np.array(scores)
def prep_hyperparam_search(hyperparam_search):
""" Prepare hyperparameter search space
Parameters:
hyperparam_search : dict : hyperparameter search space (each key is a hyperparameter name and value is a list of values)
Returns:
list : list of hyperparameter combinations
"""
hp_names, hp_vals = zip(*hyperparam_search.items())
return [dict(zip(hp_names, v)) for v in itertools.product(*hp_vals)]
def now_str():
""" Get the current date and time as a string
Returns:
str : current date and time as a string
"""
now = datetime.datetime.now(dateutil.tz.tzlocal())
return now.strftime("%Y-%m-%d_%H-%M-%S")
def seed_all(seed):
""" Seed all random number generators
Parameters:
seed : int : seed
"""
random.seed(seed)
np.random.seed(seed)
class NegFn:
"""
Wrapper to negate the output of a function
Parameters:
fn : function : function to negate
"""
def __init__(self, fn):
self.fn = fn
def __call__(self, *args, **kwargs):
return -self.fn(*args, **kwargs)