-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy path01-matrix-multiply-2d.cu
84 lines (68 loc) · 1.97 KB
/
01-matrix-multiply-2d.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#include <stdio.h>
#define N 64
__global__ void matrixMulGPU( int * a, int * b, int * c )
{
/*
* Build out this kernel.
*/
}
/*
* This CPU function already works, and will run to create a solution matrix
* against which to verify your work building out the matrixMulGPU kernel.
*/
void matrixMulCPU( int * a, int * b, int * c )
{
int val = 0;
for( int row = 0; row < N; ++row )
for( int col = 0; col < N; ++col )
{
val = 0;
for ( int k = 0; k < N; ++k )
val += a[row * N + k] * b[k * N + col];
c[row * N + col] = val;
}
}
int main()
{
int *a, *b, *c_cpu, *c_gpu; // Allocate a solution matrix for both the CPU and the GPU operations
int size = N * N * sizeof (int); // Number of bytes of an N x N matrix
// Allocate memory
cudaMallocManaged (&a, size);
cudaMallocManaged (&b, size);
cudaMallocManaged (&c_cpu, size);
cudaMallocManaged (&c_gpu, size);
// Initialize memory; create 2D matrices
for( int row = 0; row < N; ++row )
for( int col = 0; col < N; ++col )
{
a[row*N + col] = row;
b[row*N + col] = col+2;
c_cpu[row*N + col] = 0;
c_gpu[row*N + col] = 0;
}
/*
* Assign `threads_per_block` and `number_of_blocks` 2D values
* that can be used in matrixMulGPU above.
*/
dim3 threads_per_block;
dim3 number_of_blocks;
matrixMulGPU <<< number_of_blocks, threads_per_block >>> ( a, b, c_gpu );
cudaDeviceSynchronize();
// Call the CPU version to check our work
matrixMulCPU( a, b, c_cpu );
// Compare the two answers to make sure they are equal
bool error = false;
for( int row = 0; row < N && !error; ++row )
for( int col = 0; col < N && !error; ++col )
if (c_cpu[row * N + col] != c_gpu[row * N + col])
{
printf("FOUND ERROR at c[%d][%d]\n", row, col);
error = true;
break;
}
if (!error)
printf("Success!\n");
// Free all our allocated memory
cudaFree(a); cudaFree(b);
cudaFree( c_cpu ); cudaFree( c_gpu );
}