-
Notifications
You must be signed in to change notification settings - Fork 14
/
main_mmul.cpp
executable file
·361 lines (333 loc) · 10.3 KB
/
main_mmul.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
//Precision selection: flt is float (32-bit) when DT32 is defined, otherwise double (64-bit).
//All kernels below are written against flt so one source serves both precisions.
#undef DT32
//#define DT32 //<- This should be the ONLY difference between core32 and core64!
#ifdef DT32
#define flt float
#else
#define flt double
#endif
/*
This program tests matrix multiplication for a naive C implementation versus a blas optimized for the architecture.
The test simulates the inner loop for General Linear Model t-tests, which are computed thousands of times for permutation threshold testing.
m = 485776; //<- voxels
n = 16; //statistical contrast, e.g "1 0 0"
p = 120; //<- shared: participants
A[m][p] A[(m*P+p)*IA]
B[p][n] B[(p*N+n)*IB]
C[m][n] C[(m*N+n)*IC]
c = a * b
To compile on MacOS: M1 MacBook Air
g++ -O3 -o tst main_mmul.cpp -DDSP -framework Accelerate -target arm64-apple-macos11 -mmacosx-version-min=11.0 -I/Library/Developer/CommandLineTools/SDKs/MacOSX11.1.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers
export VECLIB_MAXIMUM_THREADS=1
./tst
matrix multiplication 10 repetitions 64-bit
mmul: min/mean 530 531 ms
mmulBLAS: min/mean 16 18 ms
mmulDSP: min/mean 16 20 ms
differences 36.8881%, max difference 2.13163e-14
matrix multiplication 10 repetitions 32-bit
mmul: min/mean 530 530 ms
mmulBLAS: min/mean 11 11 ms
mmulDSP: min/mean 11 11 ms
differences 36.8648%, max difference 1.14441e-05
One may consider to use openblas
Be aware that NumPy notes "Accelerate is buggy, disallow it"
https://github.com/numpy/numpy/blob/3dbf5ce5fdbc996e8abb6929a219689cf8c18e69/numpy/core/setup.py#L408
export ARCHFLAGS='-arch arm64'
brew install -s openblas
c++ -O3 main_mmul.cpp -lblas -o tst -L/opt/homebrew/opt/openblas/lib -I/opt/homebrew/opt/openblas/include
./tst
matrix multiplication 10 repetitions 64-bit
mmul: min/mean 529 531 ms
mmulBLAS: min/mean 83 90 ms
differences 36.8881%, max difference 2.13163e-14
matrix multiplication 10 repetitions 32-bit
mmul: min/mean 530 530 ms
mmulBLAS: min/mean 50 57 ms
differences 36.8648%, max difference 1.14441e-05
To compile on Ubuntu: Ryzen 3900X
sudo apt-get install libblas-dev
matrix multiplication 10 repetitions 64-bit
c++ -O3 main_mmul.cpp -lblas -o tst; ./tst
mmul: min/mean 279 281 ms
mmulBLAS: min/mean 86 86 ms
differences 36.9339%, max difference 2.13163e-14
matrix multiplication 10 repetitions 32-bit
mmul: min/mean 178 179 ms
mmulBLAS: min/mean 61 63 ms
differences 36.9856%, max difference 1.14441e-05
This test can be modified to evaluate gsl_blas_dgemm by setting
c++ -O3 main_mmul.cpp -lgslcblas -DGSL -o tst; ./tst
matrix multiplication 10 repetitions 64-bit
mmul: min/mean 282 284 ms
mmulBLAS: min/mean 820 821 ms
differences 0%, max difference 0
matrix multiplication 10 repetitions 32-bit
mmul: min/mean 143 144 ms
mmulBLAS: min/mean 921 923 ms
differences 0%, max difference 0
This test can be modified to evaluate Intel MKL by setting
https://software.intel.com/content/www/us/en/develop/documentation/mkl-tutorial-c/top/multiplying-matrices-using-dgemm.html
https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html
sudo apt install libmkl-full-dev
g++ -O3 main_mmul.cpp -o tstMKL -DMKL -I/usr/include/mkl -L/usr/lib/x86_64-linux-gnu/ -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl
./tstMKL
matrix multiplication 10 repetitions 64-bit
mmul: min/mean 306 308 ms
mmulBLAS: min/mean 14 18 ms
differences 48.683%, max difference 6.39488e-14
matrix multiplication 10 repetitions 32-bit
mmul: min/mean 157 159 ms
mmulBLAS: min/mean 7 9 ms
differences 36.9856%, max difference 1.14441e-05
export MKL_NUM_THREADS=1
./tstMKL
matrix multiplication 10 repetitions 64-bit
mmul: min/mean 293 297 ms
mmulBLAS: min/mean 55 58 ms
differences 36.9339%, max difference 2.13163e-14
matrix multiplication 10 repetitions 32-bit
mmul: min/mean 152 153 ms
mmulBLAS: min/mean 28 31 ms
differences 36.9856%, max difference 1.14441e-05
*/
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#ifdef DSP //Apple's accelerate
#include <vDSP.h>
#endif
#ifdef GSL
#include <gsl/gsl_blas.h>
#else
#ifdef MKL
#include <mkl.h>
#else
#include <cblas.h>
#endif
#endif
#include <time.h>
#include <sys/time.h>
#include <climits>
#include <cstring>
#if !defined (HAVE_POSIX_MEMALIGN) && !defined (_WIN32) && !defined (__APPLE__)
#include <malloc.h>
#endif
#if defined(_OPENMP)
#include <omp.h>
#endif
#ifdef __x86_64__
#include <immintrin.h>
#else
void *_mm_malloc(size_t size, size_t align) {
//Aligned-allocation fallback for non-x86 targets (mirrors _mm_malloc from immintrin.h).
//Returns NULL on failure; release the result with _mm_free().
 if (align == 1)
  return malloc(size); //no alignment beyond malloc's default is needed
 //posix_memalign requires align to be a multiple of sizeof(void*): promote tiny alignments
 if ((align == 2) || ((sizeof(void *) == 8) && (align == 4)))
  align = sizeof(void *);
 void *ptr = NULL;
 if (posix_memalign(&ptr, align, size) != 0)
  return NULL;
 return ptr;
}
void _mm_free(void *addr) {
 //Release memory obtained from _mm_malloc; free(NULL) is a harmless no-op.
 free(addr);
}
#endif
#ifndef MAX
//NB: classic two-argument macros; each argument is evaluated twice,
//so do not pass expressions with side effects (e.g. MAX(i++, j)).
#define MAX(A,B) ((A) > (B) ? (A) : (B))
#endif
#ifndef MIN
#define MIN(A,B) ((A) > (B) ? (B) : (A))
#endif
double clockMsec() { //milliseconds from the monotonic clock (arbitrary epoch, never goes backward)
 struct timespec now;
 clock_gettime(CLOCK_MONOTONIC, &now);
 return ((double) now.tv_sec) * 1000.0 + ((double) now.tv_nsec) / 1.0e6;
}
long timediff(double startTimeMsec, double endTimeMsec) {
 //Elapsed time between two clockMsec() samples, rounded to whole milliseconds.
 double elapsedMsec = endTimeMsec - startTimeMsec;
 return (long) round(elapsedMsec);
}
//naive matrix multiplication, for optimization see http://apfel.mathematik.uni-ulm.de/~lehn/sghpc/gemm/
void mmul(const flt * A, size_t IA, const flt * B, size_t IB, flt * C, size_t IC, size_t M, size_t N, size_t P) {
 /* Computes C = A * B (row-major).
    A is regarded as a two-dimensional matrix with dimensions [M][P]
    and stride IA. B is regarded as a two-dimensional matrix with
    dimensions [P][N] and stride IB. C is regarded as a
    two-dimensional matrix with dimensions [M][N] and stride IC.
    Pseudocode:   Memory:
     A[m][p]      A[(m*P+p)*IA]
     B[p][n]      B[(p*N+n)*IB]
     C[m][n]      C[(m*N+n)*IC]
    These compute:
     for (m = 0; m < M; ++m)
      for (n = 0; n < N; ++n)
       C[m][n] = sum(A[m][p] * B[p][n], 0 <= p < P);
 */
 if ((IA != 1) || (IB != 1) || (IC != 1)) { //general (strided) case
  //fix: the outer index was 'int', which mixed signed/unsigned in 'm < M'
  //and would overflow for M > INT_MAX; size_t matches the fast path below
  for (size_t m = 0; m < M; ++m) {
   size_t mP = m * P;
   for (size_t n = 0; n < N; ++n) {
    flt ret = 0.0;
    for (size_t p = 0; p < P; ++p)
     ret += A[(mP + p) * IA] * B[(p*N + n)*IB];
    C[(m*N + n) * IC] = ret;
   } //for n
  } //for m
  return;
 }
 //unit-stride fast path: drops the per-element stride multiplications
 //(a previously disabled pointer-walking variant was removed; the author's own
 // note said it does not help modern compilers at high optimization levels)
 for (size_t m = 0; m < M; ++m) {
  size_t mP = m * P;
  for (size_t n = 0; n < N; ++n) {
   flt ret = 0.0;
   for (size_t p = 0; p < P; ++p)
    ret += A[mP + p] * B[p*N + n];
   C[m*N + n] = ret;
  } //for n
 } //for m
}
void svesq(const flt *A, size_t IA, flt *C, size_t N) {
 // Sum of vector elements' squares: C[0] = sum(A[n*IA]^2, 0 <= n < N).
 // Fix: the stride parameter IA was accepted but ignored (the loop read A[n]),
 // so any caller passing IA != 1 summed the wrong elements; the indexing now
 // honors the stride, mirroring the vDSP_svesq convention used elsewhere here.
 flt sumSq = 0.0;
 for (size_t n = 0; n < N; ++n)
  sumSq += A[n * IA] * A[n * IA];
 C[0] = sumSq;
}
/*
//Matlab equivalence
maxNumCompThreads(1)
m = 485776; % voxels
n = 16; %statistical contrast, e.g "1 0 0"
p = 120; % shared: participants
reps = 10;
a = rand(m, p);
b = rand(p, n);
r = zeros(m,n);
sm = 0.0; %sum time for all iterations
mn = inf; %minimum time for fastest iteration
for i = 1:reps
startTime = tic;
r = a * b;
s = sum(r(:) .* r(:)); % sum squared error
tm = toc(startTime) * 1000.0; %elapsed in ms
sm = sm + tm;
mn = min(tm, mn);
end
fprintf('mmul: min/mean\t%g\t%g\tms\n', mn, sm/reps);
*/
void tst_mmul(int reps) {
//Benchmark driver: times the naive mmul() against the linked BLAS (and, when built
//with -DDSP, Apple's vDSP) for 'reps' repetitions, then prints min/mean times and
//how much the optimized result differs from the naive one.
printf("matrix multiplication %d repetitions %llu-bit\n", reps, (unsigned long long) sizeof(flt)*8);
//problem size models the GLM inner loop described in the header comment: C[m][n] = A[m][p]*B[p][n]
size_t m = 485776; //<- voxels
size_t n = 16; //statistical contrast, e.g "1 0 0"
size_t p = 120; //<- shared: participants
//#define dbug
#ifdef dbug
//tiny 3x3 problem so every element can be printed and checked by eye
m = 3;
n = 3;
p = 3;
#endif
//64-byte aligned buffers (cache-line size on common CPUs); freed with _mm_free below
//NOTE(review): allocation results are not checked; a NULL return would crash the fill loops
flt *a = (flt *) _mm_malloc(m * p * sizeof(flt),64);
flt *b = (flt *) _mm_malloc(p * n * sizeof(flt),64);
flt *c = (flt *) _mm_malloc(m * n * sizeof(flt),64);
flt *cBLAS = (flt *) _mm_malloc(m * n * sizeof(flt),64);
flt *cDSP = (flt *) _mm_malloc(m * n * sizeof(flt),64);
//fill inputs with uniform [0,1] values; '(flt)' binds to rand(), so this is real division
for (size_t i = 0; i < (m * p); i++)
a[i] = (flt) rand()/RAND_MAX;
for (size_t i = 0; i < (p * n); i++)
b[i] = (flt) rand()/RAND_MAX;
#ifdef dbug
//deterministic ramp values make hand-checking the 3x3 product easy
for (size_t i = 0; i < (m * p); i++)
a[i] = i;
for (size_t i = 0; i < (p * n); i++)
b[i] = i;
#endif
//per-method timing: minimum over all reps and running sum (for the mean)
long mn = INT_MAX;
long sum = 0.0; //NOTE(review): 0.0 converts to long 0 — harmless, but '0' would be clearer
long mnDSP = INT_MAX;
long sumDSP = 0.0;
long mnBLAS = INT_MAX;
long sumBLAS = 0.0;
for (int64_t i = 0; i < reps; i++) {
double startTime = clockMsec();
#ifdef DSP
//Apple Accelerate vDSP path: _mmul is the float kernel, _mmulD the double kernel
#ifdef DT32
vDSP_mmul(a, 1, b, 1, cDSP, 1, m, n, p);
//vDSP_svesq(a, 1, &ssDSP, m * n);
#else
vDSP_mmulD(a, 1, b, 1, cDSP, 1, m, n, p);
//vDSP_svesqD(a, 1, &ssDSP, m * n);
#endif
//NOTE(review): clockMsec() is sampled separately for min and sum, so the two
//readings can differ by the cost of one clock call — negligible at ms resolution
mnDSP = MIN(mnDSP, timediff(startTime, clockMsec()));
sumDSP += timediff(startTime, clockMsec());
startTime = clockMsec();
#endif
//blas
flt alpha = 1.0;
flt beta = 0.0; //beta = 0 overwrites cBLAS rather than accumulating into it
#ifdef DT32
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, p, alpha, a, p, b, n, beta, cBLAS, n);
#else
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, p, alpha, a, p, b, n, beta, cBLAS, n);
#endif
mnBLAS = MIN(mnBLAS, timediff(startTime, clockMsec()));
sumBLAS += timediff(startTime, clockMsec());
startTime = clockMsec();
//naive reference implementation, timed last each iteration
mmul(a, 1, b, 1, c, 1, m, n, p);
//svesq(a, 1, &ss, m * n);
mn = MIN(mn, timediff(startTime, clockMsec()));
sum += timediff(startTime, clockMsec());
}
printf("mmul: min/mean\t%ld\t%ld\tms\n", mn, sum/reps);
printf("mmulBLAS: min/mean\t%ld\t%ld\tms\n", mnBLAS, sumBLAS/reps);
#ifdef DSP
printf("mmulDSP: min/mean\t%ld\t%ld\tms\n", mnDSP, sumDSP/reps);
#endif
//check results
//exact (bitwise) inequality is intentional: count ANY element where the optimized
//result differs from the naive one, and track the largest absolute deviation
size_t nDiff = 0;
flt mxDiff = 0.0;
for (size_t i = 0; i < (m * n); i++) {
#ifdef dbug
#ifdef DSP
printf("%lu %g %g %g\n", i,c[i], cBLAS[i], cDSP[i]);
#else
printf("%lu %g %g\n", i,c[i], cBLAS[i]);
#endif
#endif
if (c[i] != cBLAS[i]) {
nDiff ++;
mxDiff = MAX(mxDiff, fabs(c[i] - cBLAS[i]) );
}
}
printf("differences %g%%, max difference %g\n", ((double) nDiff) / ((double) (m*n)) * 100.0, mxDiff);
_mm_free (a);
_mm_free (b);
_mm_free (c);
_mm_free (cBLAS);
_mm_free (cDSP);
} //tst_mmul()
int main(int argc, char * argv[]) {
 //Entry point: run the matrix-multiplication benchmark for a fixed 10 repetitions.
 (void) argc; //command-line arguments are not used
 (void) argv;
 tst_mmul(10);
 return 0;
}