# score.py
# The AUC[R_NX] and AUC[G_NN] implementations are taken from cat-SNE (not ours).
# See the license at https://github.com/cdebodt/cat-SNE.
import numpy as np
import numba
from scipy.spatial.distance import pdist, squareform
from sklearn.neighbors import KNeighborsClassifier


def evaluate_scores(X_train, y_train, X_test, y_test, Z_train, Z_test, method, logger):
    print("[DEBUG] Calculate KNN score")
    if Z_train is not None:
        KNN_score(Z_train, y_train, f"{method}_train", logger)
    if Z_test is not None:
        KNN_score(Z_test, y_test, f"{method}_test", logger)
    print("[DEBUG] Calculate AUC[RNX] and AUC[KNN] scores")
    if all([X_train is not None, y_train is not None, Z_train is not None]):
        knngain_and_rnx(X_train, y_train, Z_train, f"{method}_train", logger)
    if all([X_test is not None, y_test is not None, Z_test is not None]):
        knngain_and_rnx(X_test, y_test, Z_test, f"{method}_test", logger)
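

# Hedged usage sketch (our addition): evaluate_scores only needs a logger object
# exposing a log(key, value, method) method. The _PrintLogger below is a hypothetical
# stand-in, not the logger class used by the original project.
class _PrintLogger:
    def log(self, key, value, method):
        print(f"[{method}] {key} = {value}")


def _example_evaluate_scores(X_train, y_train, X_test, y_test, Z_train, Z_test):
    # "umap" is just an arbitrary label used to tag the logged scores.
    evaluate_scores(
        X_train, y_train, X_test, y_test, Z_train, Z_test,
        method="umap", logger=_PrintLogger(),
    )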


def KNN_score(Z, labels, method, logger, K=10):
    knn = KNeighborsClassifier(n_neighbors=K, n_jobs=-1, algorithm="auto")
    # Note: the classifier is fitted and scored on the same data, so each point's
    # own embedding counts toward its K-neighborhood.
    score = knn.fit(X=Z, y=labels).score(X=Z, y=labels)
    logger.log(f"knn{K}", score, method)
    del knn


def knngain_and_rnx(X, labels, Z, method, logger):
    # Pairwise distances in HD and LD
    d_hd = squareform(pdist(X, metric="euclidean"), force="tomatrix")
    d_ld = squareform(pdist(Z, metric="euclidean"), force="tomatrix")
    # Compute the KNN gain
    gain, auc_knn = knngain(d_hd, d_ld, labels)
    logger.log("auc_knn", auc_knn, method)
    # logger.log("knn_gain", gain.astype(np.float32).tolist(), method)
    print(method, f"AUC[KNN]: {auc_knn:.3f}")
    # Compute R_NX
    rnx, auc_rnx = eval_dr_quality(d_hd, d_ld)
    logger.log("auc_rnx", auc_rnx, method)
    # logger.log("rnx", rnx.astype(np.float32).tolist(), method)
    print(method, f"AUC[RNX]: {auc_rnx:.3f}")
    del d_hd, d_ld


@numba.jit(nopython=True)
def eval_auc(arr):
    """
    Evaluates the AUC, as defined in [5].
    [5] Lee, J. A., Peluffo-Ordóñez, D. H., & Verleysen, M. (2015). Multi-scale similarities in stochastic neighbour embedding: Reducing dimensionality while preserving both local and global structure. Neurocomputing, 169, 246-261.
    In:
    - arr: 1-D numpy array storing the values of a curve from K=1 to arr.size.
    Out:
    The AUC under arr, as defined in [5], with a log scale for K=1 to arr.size.
    """
    i_all_k = 1.0 / (np.arange(arr.size) + 1.0)
    return np.float64(arr.dot(i_all_k)) / (i_all_k.sum())
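

# Worked example (our addition, not from cat-SNE): the log-scale AUC is a weighted
# mean of arr with weights 1/K, so a curve that is constantly 1 has an AUC of exactly 1.
# Call manually; nothing runs at import time.
def _example_eval_auc():
    assert abs(eval_auc(np.ones(10)) - 1.0) < 1e-12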


@numba.jit(nopython=True)
def knngain(d_hd, d_ld, labels):
    """
    Compute the KNN gain curve and its AUC, as defined in [1].
    If c_i refers to the class label of data point i, v_i^K (resp. n_i^K) to the set of the K nearest neighbors of data point i in the HDS (resp. LDS), and N to the number of data points, the KNN gain develops as
    G_{NN}(K) = (1/N) * \sum_{i=1}^{N} (|{j \in n_i^K such that c_i=c_j}| - |{j \in v_i^K such that c_i=c_j}|)/K.
    It averages the gain (or loss, if negative) of same-class neighbors around each point after DR.
    Hence, a positive value correlates with likely improved KNN classification performance.
    Like the R_{NX}(K) curve from the unsupervised DR quality assessment, the KNN gain G_{NN}(K) can be displayed with respect to K, with a log scale for K.
    A global score summarizing the resulting curve is provided by its area (AUC).
    In:
    - d_hd: 2-D numpy array of floats with shape (N, N), representing the redundant matrix of pairwise distances in the HDS.
    - d_ld: 2-D numpy array of floats with shape (N, N), representing the redundant matrix of pairwise distances in the LDS.
    - labels: 1-D numpy array with N elements, containing integers indicating the class labels of the data points.
    Out:
    A tuple with:
    - a 1-D numpy array of floats with N-1 elements, storing the KNN gain for K=1 to N-1.
    - the AUC of the KNN gain curve, with a log scale for K.
    """
    # Number of data points
    N = d_hd.shape[0]
    N_1 = N - 1
    k_hd = np.zeros(shape=N_1, dtype=np.int64)
    k_ld = np.zeros(shape=N_1, dtype=np.int64)
    # For each data point
    for i in range(N):
        c_i = labels[i]
        di_hd = d_hd[i, :].argsort(kind="mergesort")
        di_ld = d_ld[i, :].argsort(kind="mergesort")
        # Make sure that i is first in di_hd and di_ld, so that a point is never
        # counted among its own K nearest neighbors below.
        for arr in (di_hd, di_ld):
            for idj in range(N):
                if arr[idj] == i:
                    if idj != 0:
                        arr[idj] = arr[0]
                        arr[0] = i
                    break
        # Count, for each neighborhood size, the neighbors sharing the class of i.
        for k in range(N_1):
            if c_i == labels[di_hd[k + 1]]:
                k_hd[k] += 1
            if c_i == labels[di_ld[k + 1]]:
                k_ld[k] += 1
    # Computing the KNN gain
    gn = (k_ld.cumsum() - k_hd.cumsum()).astype(np.float64) / (
        (1.0 + np.arange(N_1)) * N
    )
    # Returning the KNN gain and its AUC
    return gn, eval_auc(gn)
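

# Sanity-check sketch (our addition, not part of the original cat-SNE code): when the
# LD distances equal the HD distances, the same-class neighbor counts coincide, so the
# gain is 0 for every K and its AUC is 0. Call manually; nothing runs at import time.
def _check_knngain_identity(d, labels):
    gn, auc = knngain(d, d.copy(), labels)
    assert np.allclose(gn, 0.0) and abs(auc) < 1e-12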


# @numba.jit(nopython=True)
def coranking(d_hd, d_ld):
    """
    Computation of the co-ranking matrix, as described in [3].
    The time complexity of this function is O(N**2 log(N)), where N is the number of data points.
    In:
    - d_hd: 2-D numpy array representing the redundant matrix of pairwise distances in the HDS.
    - d_ld: 2-D numpy array representing the redundant matrix of pairwise distances in the LDS.
    Out:
    The (N-1)x(N-1) co-ranking matrix, where N = d_hd.shape[0].
    """
    # Computing the permutations to sort the rows of the distance matrices in HDS and LDS.
    perm_hd = d_hd.argsort(axis=-1, kind="mergesort")
    perm_ld = d_ld.argsort(axis=-1, kind="mergesort")
    N = d_hd.shape[0]
    i = np.arange(N, dtype=np.int64)
    # Computing the ranks in the LDS
    R = np.empty(shape=(N, N), dtype=np.int64)
    for j in range(N):
        R[perm_ld[j, i], j] = i
    # Computing the co-ranking matrix
    Q = np.zeros(shape=(N, N), dtype=np.int64)
    for j in range(N):
        Q[i, R[perm_hd[j, i], j]] += 1
    # Returning
    return Q[1:, 1:]
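

# Property sketch (our addition): if the LD distances are identical to the HD
# distances, every point keeps its rank, so the co-ranking matrix restricted to
# ranks 1..N-1 is diagonal with value N on the diagonal. Call manually.
def _check_coranking_identity(d):
    N = d.shape[0]
    Q = coranking(d, d.copy())
    assert np.array_equal(Q, N * np.eye(N - 1, dtype=np.int64))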


@numba.jit(nopython=True)
def eval_rnx(Q):
    """
    Evaluate R_NX(K) for K = 1 to N-2, as defined in [4]. N is the number of data points in the data set.
    The time complexity of this function is O(N^2).
    In:
    - Q: a 2-D numpy array representing the (N-1)x(N-1) co-ranking matrix of the embedding.
    Out:
    A 1-D numpy array with N-2 elements. Element i contains R_NX(i+1).
    """
    N_1 = Q.shape[0]
    N = N_1 + 1
    # Computing Q_NX
    qnxk = np.empty(shape=N_1, dtype=np.float64)
    acc_q = 0.0
    for K in range(N_1):
        acc_q += Q[K, K] + np.sum(Q[K, :K]) + np.sum(Q[:K, K])
        qnxk[K] = acc_q / ((K + 1) * N)
    # Computing R_NX
    arr_K = np.arange(N_1)[1:].astype(np.float64)
    rnxk = (N_1 * qnxk[: N_1 - 1] - arr_K) / (N_1 - arr_K)
    # Returning
    return rnxk


def eval_dr_quality(d_hd, d_ld):
    """
    Compute the DR quality assessment criteria R_{NX}(K) and AUC, as defined in [2, 3, 4, 5].
    These criteria measure the neighborhood preservation around the data points from the HDS to the LDS.
    Based on the HD and LD distances, the sets v_i^K (resp. n_i^K) of the K nearest neighbors of data point i in the HDS (resp. LDS) can first be computed.
    Their average normalized agreement develops as Q_{NX}(K) = (1/N) * \sum_{i=1}^{N} |v_i^K \cap n_i^K|/K, where N refers to the number of data points and \cap to the set intersection operator.
    Q_{NX}(K) ranges between 0 and 1; the closer to 1, the better.
    As the expectation of Q_{NX}(K) with random LD coordinates is equal to K/(N-1), which increases with K, R_{NX}(K) = ((N-1)*Q_{NX}(K)-K)/(N-1-K) makes it easier to compare different neighborhood sizes K.
    R_{NX}(K) ranges between -1 and 1, but a negative value indicates that the embedding performs worse than random; R_{NX}(K) hence typically lies between 0 and 1.
    The R_{NX}(K) values for K=1 to N-2 can be displayed as a curve with a log scale for K, as closer neighbors typically prevail.
    The area under the resulting curve (AUC) is a scalar score that grows with DR quality, quantified at all scales with an emphasis on small ones.
    The AUC lies between -1 and 1, but a negative value implies performance worse than random.
    In:
    - d_hd: 2-D numpy array of floats with shape (N, N), representing the redundant matrix of pairwise distances in the HDS.
    - d_ld: 2-D numpy array of floats with shape (N, N), representing the redundant matrix of pairwise distances in the LDS.
    Out: a tuple with
    - a 1-D numpy array with N-2 elements. Element i contains R_{NX}(i+1).
    - the AUC of the R_{NX}(K) curve with a log scale for K, as defined in [5].
    Remark:
    - The time complexity to evaluate the quality criteria is O(N**2 log(N)). It is the time complexity to compute the co-ranking matrix. R_{NX}(K) can then be evaluated for all K=1, ..., N-2 in O(N**2).
    """
    # Computing the co-ranking matrix of the embedding, and the R_{NX}(K) curve.
    rnxk = eval_rnx(Q=coranking(d_hd=d_hd, d_ld=d_ld))
    # Computing the AUC, and returning.
    return rnxk, eval_auc(rnxk)
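

if __name__ == "__main__":
    # Minimal self-check sketch (our addition, not part of the original cat-SNE code).
    # For an embedding whose pairwise distances are identical to the input's, every
    # neighborhood is perfectly preserved, so AUC[R_NX] should be 1 while the KNN gain
    # (and thus AUC[G_NN]) should be 0.
    rng = np.random.default_rng(0)
    X = rng.normal(size=(50, 5))
    labels = rng.integers(0, 3, size=50)
    d = squareform(pdist(X, metric="euclidean"), force="tomatrix")
    rnx, auc_rnx = eval_dr_quality(d_hd=d, d_ld=d.copy())
    gn, auc_knn = knngain(d, d.copy(), labels)
    print(f"AUC[R_NX] for a perfect embedding: {auc_rnx:.3f}")  # expected: 1.000
    print(f"AUC[G_NN] for a perfect embedding: {auc_knn:.3f}")  # expected: 0.000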