-
Notifications
You must be signed in to change notification settings - Fork 0
/
test02_global_and_local_ids.py
231 lines (187 loc) · 8.77 KB
/
test02_global_and_local_ids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
import pyopencl as pycl
import pyopencl as cl
import numpy as np
import unittest
import time
import pyopencl.cltypes
from pyopencl.array import Array as clArray
import pyopencl.clmath
import matplotlib.pyplot as plt
class TestOpenCLIds(unittest.TestCase):
"""
I am now starting to write my own kernels. I found it complicated to understand
global_id, local_id, etc.... So this series of tests is to understand these workunit functions
(that's what they are called). It is necessary to access the proper data in the computation.
"""
kernel_source_file = """
// ----------------- TEST KERNELS -----------------
__kernel void test_global_id(__global int *buffer){
int id = get_global_id(0);
buffer[id] = id;
}
__kernel void test_local_id(__global int *buffer){
int gid = get_global_id(0);
int lid = get_local_id(0);
buffer[gid] = lid;
}
__kernel void test_extract_many_ids(__global int *buffer){
int gid = get_global_id(0);
int lid = get_local_id(0);
int lgid = get_group_id(0);
int ls = get_local_size(0);
int offset = get_global_offset(0);
buffer[gid] = gid+lid*100+ls*10000+lgid*1000000+offset*100000000;
}
__kernel void test_compute_global_id_from_local_id(__global int *buffer){
int gid = get_global_id(0);
int lid = get_local_id(0);
int lgid = get_group_id(0);
int ls = get_local_size(0);
int offset = get_global_offset(0);
buffer[gid] = ls * lgid + lid ;
}
__kernel void test_compute_global_id_from_local_id_nonuniform(__global int *buffer, __global int *local_sizes){
int gid = get_global_id(0);
int lid = get_local_id(0);
int lgid = get_group_id(0);
int ls = get_local_size(0);
local_sizes[lgid] = ls;
int i = 0;
int total_previous = 0;
for (i = 0; i < lgid; i++) {
total_previous += local_sizes[i];
}
int offset = get_global_offset(0);
buffer[gid] = total_previous + lid + offset;
}
"""
def setUp(self):
self.gpuDevices = pycl.get_platforms()[0].get_devices(pycl.device_type.GPU)
self.context = pycl.Context(devices=self.gpuDevices)
self.queue = pycl.CommandQueue(self.context)
self.program_source = self.kernel_source_file
self.program = pycl.Program(self.context, self.program_source).build()
def test_01_get_kernel_info(self):
"""
Can I get the kernel from my source file above? See setUp()
"""
kernel = self.program.test_global_id
self.assertIsNotNone(kernel)
def test_02_trivial_kernel_global_id(self):
"""
global_id is a number from 0 to nWorkUnits-1 that is associated with each "kernel run"
A kernel run is applied on a "work unit".
My first kernel called 'test_global_id' is the variable kernel_source_file above
will simply copy the global_id into the array that I am passing to it.
I will overwrite the default kernel file by overwriting it.
"""
queue = pycl.CommandQueue(self.context)
nWorkUnits = 128
valueBuffer = clArray(cq=queue, shape=(nWorkUnits,), dtype=cl.cltypes.uint)
self.program_source = """
__kernel void test_global_id(__global uint *buffer){
int id = get_global_id(0);
buffer[id] = id;
}
"""
self.program = pycl.Program(self.context, self.program_source).build()
knl = self.program.test_global_id # copy global_id into array
"""
Looking at knl below and the kernel source file first line:
__kernel void test_global_id(__global uint *buffer)
queue: is the opencl queue
nWorkUnits: is the total number of computations. We have kernel_source_file elements.
The second parameter (None) is the group size because the whole computation is
split into group_size "local" computations. We let OpenCL (i.e. it should be 32)
valueBuffer.data : is the array passed to the kernel.
"""
knl(queue, (nWorkUnits,), None, valueBuffer.data)
for i, value in enumerate(valueBuffer):
self.assertEqual(value, i)
def test_03_trivial_kernel_local_id_default_workgroup_size(self):
"""
local_id is a number that I don't really know what it does.
Not sure when to use it, but all workunits are split into groups, of get_group_size()
I will force the group_size to see if this works (knl third parameter)
"""
queue = pycl.CommandQueue(self.context)
nWorkUnits = 128
valueBuffer = clArray(cq=queue, shape=(nWorkUnits,), dtype=cl.cltypes.uint)
self.program_source = """
__kernel void test_local_id(__global int *buffer){
int gid = get_global_id(0);
int lid = get_local_id(0);
buffer[gid] = lid;
}
"""
self.program = pycl.Program(self.context, self.program_source).build()
knl = self.program.test_local_id # copy local_id into array
local_group_size = 32
knl(queue, (nWorkUnits,), (local_group_size,), valueBuffer.data)
for i, value in enumerate(valueBuffer):
self.assertEqual(i%local_group_size, value)
def test_04_trivial_kernel_local_id_set_workgroup_size(self):
"""
Having attempted to set the workgroup size, I noticed not all values are possible.
The number of groups must divide nWorkUnits evenly
"""
queue = pycl.CommandQueue(self.context)
nWorkUnits = 100
valueBuffer = clArray(cq=queue, shape=(nWorkUnits,), dtype=cl.cltypes.uint)
knl = self.program.test_local_id # copy local_id into array
accepted_values = set()
for i in range(100):
try:
knl(queue, (nWorkUnits,), (i,), valueBuffer.data)
self.assertEqual(nWorkUnits%i, 0)
accepted_values.add(i)
for j, value in enumerate(valueBuffer):
self.assertEqual(j%nWorkUnits, value)
except:
pass
self.assertTrue(accepted_values == set([1,2,4,5,10,20,25,50]))
# @unittest.skip('For information only')
# def test_05_extract_many_ids(self):
# nWorkUnits = 100
# valueBuffer = Buffer(nWorkUnits, value=0, dtype=cl.cltypes.uint)
# source_path = os.path.join(OPENCL_SOURCE_DIR, "test_opencl.c")
# program = CLProgram(source_path)
# program.launchKernel(kernelName='test_extract_many_ids', N=nWorkUnits, arguments = [valueBuffer])
# for i, value in enumerate(valueBuffer.hostBuffer):
# print(i,value)
def test_06_compute_global_id_from_local_id_uniform_sizes(self):
"""
When we force a uniform group size, it is possible to compute the global_id from the local_id.
"""
queue = pycl.CommandQueue(self.context)
nWorkUnits = 128 # Multiplie of 32
valueBuffer = clArray(cq=queue, shape=(nWorkUnits,), dtype=cl.cltypes.uint)
knl = self.program.test_compute_global_id_from_local_id # copy local_id into array
knl(queue, (nWorkUnits,), None, valueBuffer.data)
for i, value in enumerate(valueBuffer):
self.assertEqual(value, i)
def test_07_compute_global_id_from_local_id_NON_uniform_sizes(self):
"""
Computing the global_id from the local_id in a non-uniform computation
is problematic, because we need to know the size of all groups before the present
group. This is not guaranteed (as this tests demonstrate).
My assumption was that workunits would be fed in order but that is not true.
I don't know how to compute the global_id from the local parameters then
It does not matter: use get_global_id() but this is a basic issue in
understanding OpenCL for me.
"""
queue = pycl.CommandQueue(self.context)
nWorkUnits = 100 # NOT A MULTIPLE of 32
valueBuffer = clArray(cq=queue, shape=(nWorkUnits,), dtype=cl.cltypes.uint)
local_sizes = clArray(cq=queue, shape=(nWorkUnits,), dtype=cl.cltypes.uint)
knl = self.program.test_compute_global_id_from_local_id_nonuniform # copy local_id into array
knl(queue, (nWorkUnits,), None, valueBuffer.data, local_sizes.data)
# This is fine: IN THE END, local_sizes is correct
self.assertEqual(np.sum(local_sizes), nWorkUnits)
# But this is not: the calculated global_id is incorrect
# because during the computation, local_sizes is not completed in order.
with self.assertRaises(Exception):
for i, value in enumerate(valueBuffer):
self.assertEqual(value, i)
if __name__ == "__main__":
unittest.main()