daxpy_1.hip (from amd/HPCTrainingExamples)
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
This training example is released under the MIT license as listed
in the top-level directory. If this example is separated from the
main directory, include the LICENSE file with it.
Author: Gina Sitaraman
*/
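
// A possible way to build and run this example (assuming hipcc from a ROCm
// installation is on the PATH; the repository may also provide its own Makefile):
//   hipcc daxpy_1.hip -o daxpy_1
//   ./daxpy_1 <N>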
#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#include <sys/time.h>

#define ELAPSED(t1,t2) (t2.tv_sec-t1.tv_sec + (t2.tv_usec-t1.tv_usec)*1E-6)
#define NITER 25

#define HIP_CHECK(cmd) \
{ \
    hipError_t error = cmd; \
    if (error != hipSuccess) { \
        fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error, __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
}
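
// Helper to print the contents of an array (kept for debugging; not called below)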
void print_arr (double *mat, int nelem)
{
    int i;
    for (i=0; i<nelem; i++) {
        printf ("%g ", mat[i]);
    }
    printf ("\n");
}

// Test results with host reference
int check_results (double *h_ref, double *h_z, size_t nelem)
{
    size_t i;
    for (i=0; i<nelem; i++) {
        if (h_ref[i] != h_z[i]) {
            printf ("ERROR at i=%zu, h_ref[i]=%f, h_z[i]=%f\n", i, h_ref[i], h_z[i]);
            return -1;
        }
    }
    printf ("PASSED\n");
    return 0;
}
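
// kernel_1: naive daxpy kernel. It is launched below with a single 64-thread
// block (one wavefront), so each thread strides through the arrays in steps
// of 64 until all N elements of z = a*x + y have been computed.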
__global__ void kernel_1 (double *d_x, double *d_y, double *d_z, double a, size_t N)
{
    size_t idx = threadIdx.x;
    while (idx < N) {
        d_z[idx] = d_x[idx] * a + d_y[idx];
        idx += 64;
    }
}

int main (int argc, char *argv[])
{
    hipDeviceProp_t props;
    HIP_CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
    printf ("info: running on device %s\n", props.name);

    struct timeval t1, t2;

    if (argc < 2) {
        printf ("Usage: %s N where N is the length of arrays X, Y and Z\n", argv[0]);
        return -1;
    }
    size_t N = atol(argv[1]);
    size_t i, szbytes = N*sizeof(double);

    double *h_x = (double *) malloc (szbytes);
    double *h_y = (double *) malloc (szbytes);
    double *h_z = (double *) malloc (szbytes);
    double *h_ref = (double *) malloc (szbytes);
    double *d_x, *d_y, *d_z;
    double a = 1.0;

    // amount of data moved by the timed kernel launches, in GiB:
    // read x, read y, write z, repeated NITER times
    float gbs = (float)(3*szbytes*NITER)/(1024.f*1024.f*1024.f);

    // populate input arrays
    for (i=0; i<N; i++) {
        h_x[i] = (double)i;
        h_y[i] = (double)(i+1);
    }

    // populate h_ref - perform daxpy on CPU
    for (i=0; i<N; i++) {
        h_ref[i] = h_x[i] * a + h_y[i];
    }

    // allocate GPU buffers
    HIP_CHECK(hipMalloc (&d_x, szbytes));
    HIP_CHECK(hipMalloc (&d_y, szbytes));
    HIP_CHECK(hipMalloc (&d_z, szbytes));

    // copy input arrays to GPU memory
    HIP_CHECK(hipMemcpy (d_x, h_x, szbytes, hipMemcpyHostToDevice));
    HIP_CHECK(hipMemcpy (d_y, h_y, szbytes, hipMemcpyHostToDevice));

    //-----------------------------------------------------------------------------
    // Launch naive kernel to do the work using a single wavefront
    //-----------------------------------------------------------------------------

    // Set up grid and block dimensions
    dim3 grid_1(1, 1, 1);
    dim3 block_1(64, 1, 1);

    // Launch kernel once first to warm up GPU and check correctness
    kernel_1 <<< grid_1, block_1 >>> (d_x, d_y, d_z, a, N);

    // copy result back to host buffer
    HIP_CHECK(hipMemcpy (h_z, d_z, szbytes, hipMemcpyDeviceToHost));
    check_results (h_ref, h_z, N);

    // Now measure bandwidth achieved
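    // (the kernel launches are asynchronous, so hipDeviceSynchronize() below
    // ensures all NITER launches have completed before t2 is taken)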
    gettimeofday (&t1, NULL);
    for (int iter=0; iter<NITER; iter++) {
        kernel_1 <<< grid_1, block_1 >>> (d_x, d_y, d_z, a, N);
    }
    HIP_CHECK(hipDeviceSynchronize ());
    gettimeofday (&t2, NULL);

    printf ("daxpy_1: Bandwidth achieved = %.2f GB/s\n", gbs/ELAPSED(t1,t2));

    return 0;
}