forked from utcs378-pfcp/cs378pfcp-loopordering
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdriver.c
186 lines (144 loc) · 5.61 KB
/
driver.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#define dabs( x ) ( (x) < 0 ? -(x) : x )
double FLA_Clock(); // This is a routine for extracting elapsed
// time borrowed from the libflame library
/* MaxAbsDiff computes the maximum absolute difference over all
corresponding elements of two matrices */
double MaxAbsDiff( int, int, double *, int, double *, int );
/* RandomMatrix overwrites a matrix with random values */
void RandomMatrix( int, int, double *, int );
void dgemm_( char *, char *, // transA, transB
int *, int *, int *, // m, n, k
double *, double *, int *, // alpha, A, ldA
double *, int *, // B, ldB
double *, double *, int * ); // beta, C, ldC
/* Various constants that control what gets timed */
/* My_Gemm is a common interface to all the implementations we will
develop so we don't have to keep rewriting this driver routine. */
void MyGemm( int, int, int, double *, int, double *, int, double *, int );
int main(int argc, char *argv[])
{
int
m, n, k,
ldA, ldB, ldC,
size, first, last, inc,
i, irep,
nrepeats,
err;
double
d_one = 1.0,
dtime, dtime_best,
diff, maxdiff = 0.0, gflops;
double
*A, *B, *C, *Cold, *Cref;
/* Every time trial is repeated "repeat" times and the fastest run is recorded */
printf( "%% number of repeats:" );
err = scanf( "%d", &nrepeats );
printf( "%% %d\n", nrepeats );
/* Timing trials for matrix sizes m=n=k=first to last in increments
of inc will be performed. (Actually, we are going to go from
largest to smallest since this seems to give more reliable
timings. */
printf( "%% enter first, last, inc:" );
err = scanf( "%d%d%d", &first, &last, &inc );
/* Adjust first and last so that they are multiples of inc */
last = ( last / inc ) * inc;
first = ( first / inc ) * inc;
first = ( first == 0 ? inc : first );
printf( "%% %d %d %d \n", first, last, inc );
printf( "data = [\n" );
printf( "%% n reference | current implementation \n" );
printf( "%% time GFLOPS | time GFLOPS diff \n" );
i = 1;
for ( size=last; size>= first; size-=inc ){
/* we will only time cases where all three matrices are square */
m = n = k = size;
ldA = ldB = ldC = size;
/* Gflops performed */
gflops = 2.0 * m * n * k * 1e-09;
/* Allocate space for the matrices. We will use five arrays:
A will be the address where A is stored. Addressed with alpha(i,j).
B will be the address where B is stored. Addressed with beta(i,j).
C will be the address where C is stored. Addressed with gamma(i,j).
Now, we will compute C = A B + C with via routine MyGemm
and also with a reference implementation. Therefore, we will
utilize two more arrays:
Cold will be the address where the original matrix C is
stored.
Cref will be the address where the result of computing C = A B
+ C computed with the reference implementation will be stored.
*/
A = ( double * ) malloc( ldA * k * sizeof( double ) );
B = ( double * ) malloc( ldB * n * sizeof( double ) );
C = ( double * ) malloc( ldC * n * sizeof( double ) );
Cold = ( double * ) malloc( ldC * n * sizeof( double ) );
Cref = ( double * ) malloc( ldC * n * sizeof( double ) );
/* Generate random matrix A */
RandomMatrix( m, k, A, ldA );
/* Generate random matrix B */
RandomMatrix( k, n, B, ldB );
/* Generate random matrix Cold */
RandomMatrix( m, n, Cold, ldC );
/* Time reference implementation provided by the BLAS library
routine dgemm (double precision general matrix-matrix
multiplicationn */
for ( irep=0; irep<nrepeats; irep++ ){
/* Copy matrix Cold to Cref */
memcpy( Cref, Cold, ldC * n * sizeof( double ) );
/* start clock */
dtime = FLA_Clock();
/* Compute Cref = A B + Cref */
dgemm_( "No transpose", "No transpose",
&m, &n, &k,
&d_one, A, &ldA,
B, &ldB,
&d_one, Cref, &ldC );
/* stop clock */
dtime = FLA_Clock() - dtime;
/* record the best time so far */
if ( irep == 0 )
dtime_best = dtime;
else
dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
}
printf( " %5d %8.4le %8.4le ", n, dtime_best, gflops/dtime_best );
fflush( stdout ); // We flush the output buffer because otherwise
// it may throw the timings of a next
// experiment.
/* Time MyGemm */
for ( irep=0; irep<nrepeats; irep++ ){
/* Copy vector Cold to C */
memcpy( C, Cold, ldC * n * sizeof( double ) );
/* start clock */
dtime = FLA_Clock();
/* Compute C = A B + C */
MyGemm( m, n, k, A, ldA, B, ldB, C, ldC );
/* stop clock */
dtime = FLA_Clock() - dtime;
if ( irep == 0 )
dtime_best = dtime;
else
dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
}
diff = MaxAbsDiff( m, n, C, ldC, Cref, ldC );
maxdiff = ( diff > maxdiff ? diff : maxdiff );
printf( " %8.4le %8.4le %8.4le\n", dtime_best, gflops/dtime_best, diff );
fflush( stdout ); // We flush the output buffer because otherwise
// it may throw the timings of a next
// experiment.
/* Free the buffers */
free( A );
free( B );
free( C );
free( Cold );
free( Cref );
i++;
}
printf( "];\n\n" );
printf( "%% Maximum difference between reference and your implementation: %le.\n", maxdiff );
exit( 0 );
}