@@ -55,12 +55,12 @@ static void printResult(cl_ulong4 seed, cl_ulong round, result r, cl_uchar score
55
55
std::cout << " : 0x" << strPublic << std::endl;
56
56
}
57
57
58
- unsigned int getKernelExecutionTimeMillis (cl_event & e) {
58
+ unsigned int getKernelExecutionTimeMicros (cl_event & e) {
59
59
cl_ulong timeStart = 0 , timeEnd = 0 ;
60
60
clWaitForEvents (1 , &e);
61
61
clGetEventProfilingInfo (e, CL_PROFILING_COMMAND_START, sizeof (timeStart), &timeStart, NULL );
62
62
clGetEventProfilingInfo (e, CL_PROFILING_COMMAND_END, sizeof (timeEnd), &timeEnd, NULL );
63
- return (timeEnd - timeStart) / 1000000 ;
63
+ return (timeEnd - timeStart) / 1000 ;
64
64
}
65
65
66
66
Dispatcher::OpenCLException::OpenCLException (const std::string s, const cl_int res) :
@@ -127,16 +127,15 @@ Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_progr
127
127
m_worksizeLocal(worksizeLocal),
128
128
m_clScoreMax(0 ),
129
129
m_clQueue(createQueue(clContext, clDeviceId) ),
130
- m_kernelBegin( createKernel(clProgram, " profanity_begin" ) ),
131
- m_kernelInverse(createKernel(clProgram, " profanity_inverse_multiple" )),
132
- m_kernelInversePost(createKernel(clProgram, " profanity_inverse_post" )),
133
- m_kernelEnd(createKernel(clProgram, " profanity_end" )),
130
+ m_kernelInit( createKernel(clProgram, " profanity_init" ) ),
131
+ m_kernelInverse(createKernel(clProgram, " profanity_inverse" )),
132
+ m_kernelIterate(createKernel(clProgram, " profanity_iterate" )),
134
133
m_kernelTransform( mode.transformKernel() == "" ? NULL : createKernel(clProgram, mode.transformKernel())),
135
134
m_kernelScore(createKernel(clProgram, mode.kernel)),
136
135
m_memPrecomp(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, sizeof (g_precomp), g_precomp),
137
- m_memPointsX (clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true ),
138
- m_memPointsY (clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true ),
139
- m_memInverse (clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true ),
136
+ m_memPointsDeltaX (clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true ),
137
+ m_memInversedNegativeDoubleGy (clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true ),
138
+ m_memPrevLambda (clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true ),
140
139
m_memResult(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, PROFANITY_MAX_SCORE + 1 ),
141
140
m_memData1(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20 ),
142
141
m_memData2(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20 ),
@@ -205,6 +204,8 @@ void Dispatcher::init() {
205
204
std::cout << std::endl;
206
205
207
206
const auto deviceCount = m_vDevices.size ();
207
+ m_sizeInitTotal = m_size * deviceCount;
208
+ m_sizeInitDone = 0 ;
208
209
209
210
cl_event * const pInitEvents = new cl_event[deviceCount];
210
211
@@ -238,33 +239,28 @@ void Dispatcher::initBegin(Device & d) {
238
239
d.m_memData2 .write (true );
239
240
240
241
// Kernel arguments - profanity_init
241
- d.m_memPrecomp .setKernelArg (d.m_kernelBegin , 0 );
242
- d.m_memPointsX .setKernelArg (d.m_kernelBegin , 1 );
243
- d.m_memPointsY .setKernelArg (d.m_kernelBegin , 2 );
244
- d.m_memResult .setKernelArg (d.m_kernelBegin , 3 );
245
- CLMemory<cl_ulong4>::setKernelArg (d.m_kernelBegin , 4 , d.m_clSeed );
242
+ d.m_memPrecomp .setKernelArg (d.m_kernelInit , 0 );
243
+ d.m_memPointsDeltaX .setKernelArg (d.m_kernelInit , 1 );
244
+ d.m_memPrevLambda .setKernelArg (d.m_kernelInit , 2 );
245
+ d.m_memResult .setKernelArg (d.m_kernelInit , 3 );
246
+ CLMemory<cl_ulong4>::setKernelArg (d.m_kernelInit , 4 , d.m_clSeed );
246
247
247
248
// Kernel arguments - profanity_inverse
248
- d.m_memPointsX .setKernelArg (d.m_kernelInverse , 0 );
249
- d.m_memInverse .setKernelArg (d.m_kernelInverse , 1 );
249
+ d.m_memPointsDeltaX .setKernelArg (d.m_kernelInverse , 0 );
250
+ d.m_memInversedNegativeDoubleGy .setKernelArg (d.m_kernelInverse , 1 );
250
251
251
- // Kernel arguments - profanity_inverse_post
252
- d.m_memPointsX .setKernelArg (d.m_kernelInversePost , 0 );
253
- d.m_memPointsY .setKernelArg (d.m_kernelInversePost , 1 );
254
- d.m_memInverse .setKernelArg (d.m_kernelInversePost , 2 );
255
-
256
- // Kernel arguments - profanity_end
257
- d.m_memPointsX .setKernelArg (d.m_kernelEnd , 0 );
258
- d.m_memPointsY .setKernelArg (d.m_kernelEnd , 1 );
259
- d.m_memInverse .setKernelArg (d.m_kernelEnd , 2 );
252
+ // Kernel arguments - profanity_iterate
253
+ d.m_memPointsDeltaX .setKernelArg (d.m_kernelIterate , 0 );
254
+ d.m_memInversedNegativeDoubleGy .setKernelArg (d.m_kernelIterate , 1 );
255
+ d.m_memPrevLambda .setKernelArg (d.m_kernelIterate , 2 );
260
256
261
257
// Kernel arguments - profanity_transform_*
262
258
if (d.m_kernelTransform ) {
263
- d.m_memInverse .setKernelArg (d.m_kernelTransform , 0 );
259
+ d.m_memInversedNegativeDoubleGy .setKernelArg (d.m_kernelTransform , 0 );
264
260
}
265
261
266
262
// Kernel arguments - profanity_score_*
267
- d.m_memInverse .setKernelArg (d.m_kernelScore , 0 );
263
+ d.m_memInversedNegativeDoubleGy .setKernelArg (d.m_kernelScore , 0 );
268
264
d.m_memResult .setKernelArg (d.m_kernelScore , 1 );
269
265
d.m_memData1 .setKernelArg (d.m_kernelScore , 2 );
270
266
d.m_memData2 .setKernelArg (d.m_kernelScore , 3 );
@@ -277,11 +273,16 @@ void Dispatcher::initBegin(Device & d) {
277
273
278
274
void Dispatcher::initContinue (Device & d) {
279
275
size_t sizeLeft = m_size - d.m_sizeInitialized ;
276
+ const size_t sizeInitLimit = m_size / 20 ;
277
+
278
+ // Print progress
279
+ const size_t percentDone = m_sizeInitDone * 100 / m_sizeInitTotal;
280
+ std::cout << " " << percentDone << " %\r " << std::flush;
280
281
281
282
if (sizeLeft) {
282
283
cl_event event;
283
- const size_t sizeRun = std::min (sizeLeft, m_worksizeMax);
284
- const auto resEnqueue = clEnqueueNDRangeKernel (d.m_clQueue , d.m_kernelBegin , 1 , &d.m_sizeInitialized , &sizeRun, NULL , 0 , NULL , &event);
284
+ const size_t sizeRun = std::min (sizeInitLimit, std::min ( sizeLeft, m_worksizeMax) );
285
+ const auto resEnqueue = clEnqueueNDRangeKernel (d.m_clQueue , d.m_kernelInit , 1 , &d.m_sizeInitialized , &sizeRun, NULL , 0 , NULL , &event);
285
286
OpenCLException::throwIfError (" kernel queueing failed during initilization" , resEnqueue);
286
287
287
288
// See: https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clSetEventCallback.html
@@ -292,7 +293,9 @@ void Dispatcher::initContinue(Device & d) {
292
293
// clFlush on the queue before returning or arrange for clFlush to be called later on another thread.
293
294
clFlush (d.m_clQueue );
294
295
296
+ std::lock_guard<std::mutex> lock (m_mutex);
295
297
d.m_sizeInitialized += sizeRun;
298
+ m_sizeInitDone += sizeRun;
296
299
297
300
const auto resCallback = clSetEventCallback (event, CL_COMPLETE, staticCallback, &d);
298
301
OpenCLException::throwIfError (" failed to set custom callback during initialization" , resCallback);
@@ -340,16 +343,13 @@ void Dispatcher::dispatch(Device & d) {
340
343
341
344
#ifdef PROFANITY_DEBUG
342
345
cl_event eventInverse;
343
- cl_event eventInversePost;
344
- cl_event eventEnd;
346
+ cl_event eventIterate;
345
347
346
348
enqueueKernelDevice (d, d.m_kernelInverse , m_size / m_inverseSize, &eventInverse);
347
- enqueueKernelDevice (d, d.m_kernelInversePost , m_size, &eventInversePost);
348
- enqueueKernelDevice (d, d.m_kernelEnd , m_size, &eventEnd);
349
+ enqueueKernelDevice (d, d.m_kernelIterate , m_size, &eventIterate);
349
350
#else
350
351
enqueueKernelDevice (d, d.m_kernelInverse , m_size / m_inverseSize);
351
- enqueueKernelDevice (d, d.m_kernelInversePost , m_size);
352
- enqueueKernelDevice (d, d.m_kernelEnd , m_size);
352
+ enqueueKernelDevice (d, d.m_kernelIterate , m_size);
353
353
#endif
354
354
355
355
if (d.m_kernelTransform ) {
@@ -360,8 +360,11 @@ void Dispatcher::dispatch(Device & d) {
360
360
clFlush (d.m_clQueue );
361
361
362
362
#ifdef PROFANITY_DEBUG
363
- clFinish (d.m_clQueue );
364
- std::cout << getKernelExecutionTimeMillis (eventInverse) << " , " << getKernelExecutionTimeMillis (eventInversePost) << " , " << getKernelExecutionTimeMillis (eventEnd) << std::endl;
363
+ // We're actually not allowed to call clFinish here because this function is ultimately asynchronously called by OpenCL.
364
+ // However, this happens to work on my computer and it's not really intended for release, just something to aid me in
365
+ // optimizations.
366
+ clFinish (d.m_clQueue );
367
+ std::cout << " Timing: profanity_inverse = " << getKernelExecutionTimeMicros (eventInverse) << " us, profanity_iterate = " << getKernelExecutionTimeMicros (eventIterate) << " us" << std::endl;
365
368
#endif
366
369
367
370
const auto res = clSetEventCallback (event, CL_COMPLETE, staticCallback, &d);
@@ -399,8 +402,8 @@ void Dispatcher::onEvent(cl_event event, cl_int status, Device & d) {
399
402
else if (d.m_eventFinished != NULL ) {
400
403
initContinue (d);
401
404
} else {
402
- handleResult (d);
403
405
++d.m_round ;
406
+ handleResult (d);
404
407
405
408
bool bDispatch = true ;
406
409
{
0 commit comments