forked from amd/HPCTrainingExamples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdaxpy_4.hip
130 lines (110 loc) · 3.85 KB
/
daxpy_4.hip
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
This training example is released under the MIT license as listed
in the top-level directory. If this example is separated from the
main directory, include the LICENSE file with it.
Author: Gina Sitaraman
*/
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <hip/hip_runtime.h>
// Wall-clock seconds between two struct timeval samples, t1 taken before t2.
// Each argument is parenthesized so that any lvalue expression (e.g. *ptr or
// arr[i]) can be passed, not only a plain identifier.
#define ELAPSED(t1,t2) (((t2).tv_sec-(t1).tv_sec) + ((t2).tv_usec-(t1).tv_usec)*1E-6)
// Number of timed kernel launches in the bandwidth measurement
#define NITER 25
// Threads per block used for every kernel launch
#define BLOCK_SIZE 256
// Evaluate a HIP runtime call exactly once; on any status other than
// hipSuccess, report the error text, numeric code and call site on stderr and
// terminate the process.
// The body is wrapped in do { ... } while (0) so that "HIP_CHECK(x);" is a
// single statement and stays valid inside an unbraced if/else.
#define HIP_CHECK(cmd) \
do {\
    hipError_t hip_check_err_ = (cmd);\
    if (hip_check_err_ != hipSuccess) { \
        fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(hip_check_err_), (int)hip_check_err_,__FILE__, __LINE__); \
        exit(EXIT_FAILURE);\
    }\
} while (0)
// Debug helper: write the first nelem values of mat to stdout on one line,
// each formatted with %g and followed by a space, then end the line.
// Nothing but the newline is printed when nelem is zero or negative.
void print_arr (double *mat, int nelem)
{
    int k = 0;
    while (k < nelem) {
        printf ("%g ", mat[k]);
        ++k;
    }
    printf ("\n");
}
// Test results with host reference.
//
// Compares the first nelem entries of h_z against h_ref using exact
// floating-point equality.
//   h_ref : expected values
//   h_z   : values to verify
//   nelem : number of elements to compare
// Returns 0 and prints "PASSED" when every element matches. On the first
// mismatch, prints the offending index with both values and returns -1.
int check_results (double *h_ref, double *h_z, size_t nelem)
{
    size_t i;
    for (i=0; i<nelem; i++) {
        if (h_ref[i] != h_z[i]) {
            // %zu is the conversion specifier for size_t; %lu is only
            // correct where size_t happens to be unsigned long.
            printf ("ERROR at i=%zu, h_ref[i]=%f, h_z[i]=%f\n", i, h_ref[i], h_z[i]);
            return -1;
        }
    }
    printf ("PASSED\n");
    return 0;
}
// DAXPY kernel: d_z[i] = d_x[i] * a + d_y[i], one array element per thread.
//
// Launch layout: 1-D grid of 1-D blocks; the global thread index is used
// directly as the element index. Threads whose index is >= N do nothing, so
// the grid may cover more than N threads.
//   d_x, d_y : input arrays, at least N elements each
//   d_z      : output array, at least N elements
//   a        : scalar multiplier applied to d_x
//   N        : number of elements to process
__global__ void kernel_4 (double *d_x, double *d_y, double *d_z, double a, size_t N)
{
    // Widen to size_t before multiplying: the built-in index values are
    // 32-bit, so their product would wrap once a launch covers 2^32 threads.
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the tail so a grid that overshoots N never touches memory past
    // the end of the arrays.
    if (idx < N) {
        d_z[idx] = d_x[idx] * a + d_y[idx];
    }
}
int main (int argc, char *argv[])
{
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
printf ("info: running on device %s\n", props.name);
struct timeval t1, t2;
if (argc < 2) {
printf ("Usage: %s N where N is the length of arrays X, Y and Z\n", argv[0]);
return -1;
}
size_t N = atol(argv[1]);
// Reset N if not a multiple of BLOCK_SIZE
if (N%BLOCK_SIZE) {
N = ((N+(BLOCK_SIZE-1))/BLOCK_SIZE)*BLOCK_SIZE;
printf ("Resetting N to be a multiple of %d, new N = %lld\n", BLOCK_SIZE, (long long)N);
}
size_t i, j, szbytes = N*sizeof(double);
double *h_x = (double *) malloc (szbytes);
double *h_y = (double *) malloc (szbytes);
double *h_z = (double *) malloc (szbytes);
double *h_ref = (double *) malloc (szbytes);
double *d_x, *d_y, *d_z;
double a = 1.0;
// amt of data moved in GBytes
float gbs = (float)(3*szbytes*NITER)/(1024.f*1024.f*1024.f);
// populate input arrays
for (i=0; i<N; i++) {
h_x[i] = (double)i;
h_y[i] = (double)(i+1);
}
// populate h_ref
for (i=0; i<N; i++) {
h_ref[i] = h_x[i] * a + h_y[i];
}
//print_arr (h_ref, 10);
// allocate GPU buffers
HIP_CHECK(hipMalloc (&d_x, szbytes));
HIP_CHECK(hipMalloc (&d_y, szbytes));
HIP_CHECK(hipMalloc (&d_z, szbytes));
// copy input arrays to GPU memory
HIP_CHECK(hipMemcpy (d_x, h_x, szbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy (d_y, h_y, szbytes, hipMemcpyHostToDevice));
//-----------------------------------------------------------------------------
// Launch kernel that takes multiple blocks, mapping each thread to 1 array element
//-----------------------------------------------------------------------------
// Set up grid and block dimensions
dim3 grid_4((N+BLOCK_SIZE-1)/BLOCK_SIZE, 1, 1);
dim3 block_4(BLOCK_SIZE, 1, 1);
// Launch kernel once first to warm up GPU and check correctness
kernel_4 <<< grid_4, block_4 >>> (d_x, d_y, d_z, a, N);
// copy result back to host buffer
HIP_CHECK(hipMemcpy (h_z, d_z, szbytes, hipMemcpyDeviceToHost));
check_results (h_ref, h_z, N);
// Now measure bandwidth achieved
gettimeofday (&t1, NULL);
for (int iter=0; iter<NITER; iter++) {
kernel_4 <<< grid_4, block_4 >>> (d_x, d_y, d_z, a, N);
}
hipDeviceSynchronize ();
gettimeofday (&t2, NULL);
printf ("daxpy_4: Bandwidth achieved = %.2f GB/s\n", gbs/ELAPSED(t1,t2));
return 0;
}