/* ----------------------------------------------------------------------
* Project: Autonomous Edge Pipeline
* Title: kmeans.c
* Description: kmeans clustering algorithm
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdbool.h>
#include <string.h>
#include <stdlib.h>
#include "main.h"
#include "kmeans.h"
#include "dataset.h"
#include "pipeline.h"
/**
* @brief kmeans
* @param[in] max_samples pointer to data samples
* @param[in] centroids pointer to centroids
* @param[in, out] y_train pointer to clustering labels
* @param[in] weights pointer to the confidence weights
* @param[in] n_samples number of samples
* The function fills y_train with the labels resulting from clustering
 * and returns the updated number of samples (reduced when the confidence step removes low-confidence samples)
*/
/**
* @brief initial_centroids
* @param[in] max_samples pointer to data samples
* @param[in] centroids pointer to centroids
* @param[in] n_samples number of samples
 * The function copies one randomly chosen sample into the first centroid and fills the remaining centroids via kmeans_plus_plus
*/
/**
* @brief kmeans_plus_plus
* @param[in] max_samples pointer to data samples
* @param[in] centroids pointer to centroids
* @param[in] n_samples number of samples
 * @param[in] next_centroid index of the next centroid to be chosen
 * The function returns the index of the sample to use as the next initial centroid (kmeans++-style selection)
*/
/**
* @brief clustering
 * @param[in] X pointer to a single input sample
* @param[in] centroids pointer to centroids
* @param[in] weights pointer to the confidence weights
 * @param[in, out] samples_per_cluster array that holds the number of samples belonging to each cluster
* @param[in] index index of input sample: used to write its corresponding weight
* @return The function returns the clustering label
*/
/**
* @brief update_cluster_assignment
 * @param[in] max_samples pointer to a single data sample (N_FEATURE values)
 * @param[in] index cluster the sample was assigned to
 * The function adds the sample's features to cluster_assignment[index], accumulating the per-cluster feature sums
*/
/**
* @brief update_centroids
* @param[in] centroids pointer to centroids
* @param[in] samples_per_cluster array that holds the number of samples belonging to each cluster
* The function updates centroids by calculating the mean of the samples inside the same cluster
*/
float cluster_assignment[K][N_FEATURE]; /* per-cluster sums of the assigned samples' features */
float prev_centroids[K][N_FEATURE]; /* centroids from the previous iteration, used for the stopping check */
int is_equal;
int stop = 0; /* counts iterations in which the centroids stopped changing */
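/* Usage sketch (illustrative only; the real call site lives elsewhere in the pipeline and the
buffer sizes come from the project headers):

    float samples[MEMORY_SIZE+UPDATE_THR][N_FEATURE];
    float centroids[K][N_FEATURE];
    float weights[MEMORY_SIZE][K];
    int y_train[MEMORY_SIZE+UPDATE_THR];
    int n = kmeans(samples, centroids, weights, y_train, n_samples);

kmeans() clusters the first n_samples rows of the sample buffer, writes one label per sample into
y_train, and returns the possibly reduced sample count after the confidence-based filtering. */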
int kmeans(float max_samples[MEMORY_SIZE+UPDATE_THR][N_FEATURE], float centroids[K][N_FEATURE], float weights[MEMORY_SIZE][K], int y_train[MEMORY_SIZE+UPDATE_THR], int n_samples)
{
int cluster;
float weight;
FILE *fptr;
int counter = 0;
stop = 0; /* reset the early-stop counter for this run */
initial_centroids(max_samples, centroids, n_samples);
/* RUN K-MEANS FOR AT MOST ITERATION PASSES, OR UNTIL THE CENTROIDS STOP CHANGING */
for (int iteration = 0; iteration < ITERATION; iteration++)
{
int samples_per_cluster[K] = {0};
for (int j = 0; j < n_samples; j++)
{
cluster = clustering(max_samples[j], centroids, weights, samples_per_cluster, j);
y_train[j] = cluster;
update_cluster_assignment(max_samples[j], cluster);
}
/* Keep a copy of the current centroids so they can be compared after the update */
for(int i = 0; i < K; i++)
{
for(int k = 0; k < N_FEATURE; k++)
{
prev_centroids[i][k] = centroids[i][k];
}
}
update_centroids(centroids, samples_per_cluster);
/* Stop the algorithm when the centroids of the newly formed clusters stop changing (centroids == prev_centroids).
As a safeguard against a spurious match, the loop only breaks after the centroids have matched prev_centroids twice. */
is_equal = memcmp(centroids, prev_centroids, sizeof(prev_centroids));
if(is_equal == 0)
{
stop++;
if(stop == 2)
{
fptr = fopen("log.txt", "a");
fprintf(fptr, "^ k-means: \n\n");
fprintf(fptr, "\t- Centroids stopped changing at iteration: %d\n", iteration);
fclose(fptr);
break;
}
}
/* AFTER EACH ITERATION RESET cluster_assignment TO ZERO */
memset(cluster_assignment, 0, sizeof(cluster_assignment));
}
#ifdef CONFIDENCE
/* weight calculation */
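/* clustering() stored the squared distance d2[i][j] between sample i and centroid j in weights[i][j].
The two loops below convert those distances into normalized confidence weights:
    weights[i][k] = (1/d2[i][k]^2) / sum_j (1/d2[i][j]^2)
so each sample's weights sum to 1 and the closest centroid receives the largest weight. */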
for(int i = 0; i < n_samples; i++)
{
weight = 0;
for(int j = 0; j < K; j++)
{
weight += pow((1/weights[i][j]), 2);
}
for(int k = 0; k < K; k++)
{
weights[i][k] = 1 / (weight * pow(weights[i][k], 2));
}
}
for(int n = 0; n < n_samples; n++)
{
if(weights[n][y_train[n]] > CONFIDENCE_THR)
{
for(int l = 0; l < N_FEATURE; l++)
{
max_samples[n-counter][l] = max_samples[n][l];
}
y_train[n-counter] = y_train[n];
}
else
{
counter++;
}
}
#endif
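/* counter samples fell below CONFIDENCE_THR and were compacted out of max_samples and y_train above,
so the sample count shrinks accordingly (counter stays 0 when CONFIDENCE is not defined). */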
n_samples = n_samples - counter;
// for(int n = 0; n < n_samples; n++)
// {
// printf("samples[%d][0]: %f\n", n, samples[n][0]);
// }
fptr = fopen("log.txt", "a");
fprintf(fptr, "^ k-means: \n\n");
fprintf(fptr, "\t- Final Centroids are: \n");
for(int i = 0; i < K; i++)
{
fprintf(fptr, "\t[\t");
for(int j = 0; j < N_FEATURE; j++)
{
fprintf(fptr, "%f\t", centroids[i][j]);
}
fprintf(fptr, "]\n");
}
fprintf(fptr, "\n");
fclose(fptr);
return n_samples;
}
/* RANDOMLY CHOOSE ONE SAMPLE AS THE FIRST CENTROID, THEN PICK THE REST VIA KMEANS_PLUS_PLUS */
void initial_centroids(float max_samples[MEMORY_SIZE+UPDATE_THR][N_FEATURE], float centroids[K][N_FEATURE], int n_samples)
{
time_t t;
srand((unsigned) time(&t));
int random = rand() % n_samples;
for(int i = 0; i < K; i++)
{
for(int j = 0; j < N_FEATURE; j++)
{
centroids[i][j] = max_samples[random][j];
}
if(i+1 == K)
{
break;
}
/* TO CHOOSE THE OTHER CENTROIDS USE KMEANS++ */
random = kmeans_plus_plus(max_samples, centroids, n_samples, i+1);
}
}
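/* CHOOSE THE NEXT INITIAL CENTROID: return the index of the sample with the largest squared distance
to one of the already-chosen centroids (a deterministic, farthest-point take on the kmeans++ seeding idea). */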
int kmeans_plus_plus(float max_samples[MEMORY_SIZE+UPDATE_THR][N_FEATURE], float centroids[K][N_FEATURE], int n_samples, int next_centroid)
{
float max = -1000;
int random = 0;
float dist = 0; /* must start at zero: it accumulates the squared distance below */
for (int i = 0; i < n_samples; i++)
{
for (int k = 0; k < next_centroid; k++)
{
for (int j = 0; j < N_FEATURE; j++)
{
dist += pow((max_samples[i][j] - centroids[k][j]), 2.0);
}
if (dist > max)
{
max = dist;
random = i;
}
dist = 0;
}
}
return random;
}
/* EACH DATA POINT IS ALLOCATED TO THE NEAREST CENTROID */
int clustering(float X[], float centroids[K][N_FEATURE], float weights[MEMORY_SIZE][K], int samples_per_cluster[], int index)
{
float y = 0;
float min_distance = 1000000;
int cluster = 0;
for (int k = 0; k < K; k++)
{
for (int j = 0; j < N_FEATURE; j++)
{
y += pow((X[j] - centroids[k][j]), 2.0);
}
/* FUZZY CLUSTERING: store the squared distance to centroid k; kmeans() later turns these into confidence weights */
weights[index][k] = y;
y = sqrt(y);
if (y < min_distance)
{
min_distance = y;
cluster = k;
}
y = 0;
}
samples_per_cluster[cluster] += 1;
return cluster;
}
/* UPDATE cluster_assignment --> SUM OF THE SAMPLES ASSIGNED TO THE SAME CLUSTER */
void update_cluster_assignment(float max_samples[N_FEATURE], int index)
{
for (int i = 0; i < N_FEATURE; i++)
{
cluster_assignment[index][i] = cluster_assignment[index][i] + max_samples[i];
}
}
/* UPDATE centroids --> MEAN OF THE SAMPLES INSIDE THE SAME CLUSTER */
void update_centroids(float centroids[K][N_FEATURE], int samples_per_cluster[])
{
for (int i = 0; i < K; i++)
{
/* Leave the centroid of an empty cluster unchanged to avoid dividing by zero */
if (samples_per_cluster[i] == 0)
{
continue;
}
for (int j = 0; j < N_FEATURE; j++)
{
centroids[i][j] = cluster_assignment[i][j] / samples_per_cluster[i];
}
}
}