Skip to content
This repository was archived by the owner on Sep 15, 2022. It is now read-only.

Commit 90f2c21

Browse files
committed
New major mathematical optimization giving a boost of up to 14% on my NVIDIA GTX 1070. The number of multiplications during the point addition phase has been reduced from 3 to 2. In addition, a visible progress counter has been added during initialization.
1 parent 7730859 commit 90f2c21

File tree

3 files changed

+172
-171
lines changed

3 files changed

+172
-171
lines changed

Dispatcher.cpp

+41-38
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,12 @@ static void printResult(cl_ulong4 seed, cl_ulong round, result r, cl_uchar score
5555
std::cout << ": 0x" << strPublic << std::endl;
5656
}
5757

58-
unsigned int getKernelExecutionTimeMillis(cl_event & e) {
58+
unsigned int getKernelExecutionTimeMicros(cl_event & e) {
5959
cl_ulong timeStart = 0, timeEnd = 0;
6060
clWaitForEvents(1, &e);
6161
clGetEventProfilingInfo(e, CL_PROFILING_COMMAND_START, sizeof(timeStart), &timeStart, NULL);
6262
clGetEventProfilingInfo(e, CL_PROFILING_COMMAND_END, sizeof(timeEnd), &timeEnd, NULL);
63-
return (timeEnd - timeStart) / 1000000;
63+
return (timeEnd - timeStart) / 1000;
6464
}
6565

6666
Dispatcher::OpenCLException::OpenCLException(const std::string s, const cl_int res) :
@@ -127,16 +127,15 @@ Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_progr
127127
m_worksizeLocal(worksizeLocal),
128128
m_clScoreMax(0),
129129
m_clQueue(createQueue(clContext, clDeviceId) ),
130-
m_kernelBegin( createKernel(clProgram, "profanity_begin") ),
131-
m_kernelInverse(createKernel(clProgram, "profanity_inverse_multiple")),
132-
m_kernelInversePost(createKernel(clProgram, "profanity_inverse_post")),
133-
m_kernelEnd(createKernel(clProgram, "profanity_end")),
130+
m_kernelInit( createKernel(clProgram, "profanity_init") ),
131+
m_kernelInverse(createKernel(clProgram, "profanity_inverse")),
132+
m_kernelIterate(createKernel(clProgram, "profanity_iterate")),
134133
m_kernelTransform( mode.transformKernel() == "" ? NULL : createKernel(clProgram, mode.transformKernel())),
135134
m_kernelScore(createKernel(clProgram, mode.kernel)),
136135
m_memPrecomp(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, sizeof(g_precomp), g_precomp),
137-
m_memPointsX(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
138-
m_memPointsY(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
139-
m_memInverse(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
136+
m_memPointsDeltaX(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
137+
m_memInversedNegativeDoubleGy(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
138+
m_memPrevLambda(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
140139
m_memResult(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, PROFANITY_MAX_SCORE + 1),
141140
m_memData1(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20),
142141
m_memData2(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20),
@@ -205,6 +204,8 @@ void Dispatcher::init() {
205204
std::cout << std::endl;
206205

207206
const auto deviceCount = m_vDevices.size();
207+
m_sizeInitTotal = m_size * deviceCount;
208+
m_sizeInitDone = 0;
208209

209210
cl_event * const pInitEvents = new cl_event[deviceCount];
210211

@@ -238,33 +239,28 @@ void Dispatcher::initBegin(Device & d) {
238239
d.m_memData2.write(true);
239240

240241
// Kernel arguments - profanity_init
241-
d.m_memPrecomp.setKernelArg(d.m_kernelBegin, 0);
242-
d.m_memPointsX.setKernelArg(d.m_kernelBegin, 1);
243-
d.m_memPointsY.setKernelArg(d.m_kernelBegin, 2);
244-
d.m_memResult.setKernelArg(d.m_kernelBegin, 3);
245-
CLMemory<cl_ulong4>::setKernelArg(d.m_kernelBegin, 4, d.m_clSeed);
242+
d.m_memPrecomp.setKernelArg(d.m_kernelInit, 0);
243+
d.m_memPointsDeltaX.setKernelArg(d.m_kernelInit, 1);
244+
d.m_memPrevLambda.setKernelArg(d.m_kernelInit, 2);
245+
d.m_memResult.setKernelArg(d.m_kernelInit, 3);
246+
CLMemory<cl_ulong4>::setKernelArg(d.m_kernelInit, 4, d.m_clSeed);
246247

247248
// Kernel arguments - profanity_inverse
248-
d.m_memPointsX.setKernelArg(d.m_kernelInverse, 0);
249-
d.m_memInverse.setKernelArg(d.m_kernelInverse, 1);
249+
d.m_memPointsDeltaX.setKernelArg(d.m_kernelInverse, 0);
250+
d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelInverse, 1);
250251

251-
// Kernel arguments - profanity_inverse_post
252-
d.m_memPointsX.setKernelArg(d.m_kernelInversePost, 0);
253-
d.m_memPointsY.setKernelArg(d.m_kernelInversePost, 1);
254-
d.m_memInverse.setKernelArg(d.m_kernelInversePost, 2);
255-
256-
// Kernel arguments - profanity_end
257-
d.m_memPointsX.setKernelArg(d.m_kernelEnd, 0);
258-
d.m_memPointsY.setKernelArg(d.m_kernelEnd, 1);
259-
d.m_memInverse.setKernelArg(d.m_kernelEnd, 2);
252+
// Kernel arguments - profanity_iterate
253+
d.m_memPointsDeltaX.setKernelArg(d.m_kernelIterate, 0);
254+
d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelIterate, 1);
255+
d.m_memPrevLambda.setKernelArg(d.m_kernelIterate, 2);
260256

261257
// Kernel arguments - profanity_transform_*
262258
if(d.m_kernelTransform) {
263-
d.m_memInverse.setKernelArg(d.m_kernelTransform, 0);
259+
d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelTransform, 0);
264260
}
265261

266262
// Kernel arguments - profanity_score_*
267-
d.m_memInverse.setKernelArg(d.m_kernelScore, 0);
263+
d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelScore, 0);
268264
d.m_memResult.setKernelArg(d.m_kernelScore, 1);
269265
d.m_memData1.setKernelArg(d.m_kernelScore, 2);
270266
d.m_memData2.setKernelArg(d.m_kernelScore, 3);
@@ -277,11 +273,16 @@ void Dispatcher::initBegin(Device & d) {
277273

278274
void Dispatcher::initContinue(Device & d) {
279275
size_t sizeLeft = m_size - d.m_sizeInitialized;
276+
const size_t sizeInitLimit = m_size / 20;
277+
278+
// Print progress
279+
const size_t percentDone = m_sizeInitDone * 100 / m_sizeInitTotal;
280+
std::cout << " " << percentDone << "%\r" << std::flush;
280281

281282
if (sizeLeft) {
282283
cl_event event;
283-
const size_t sizeRun = std::min(sizeLeft, m_worksizeMax);
284-
const auto resEnqueue = clEnqueueNDRangeKernel(d.m_clQueue, d.m_kernelBegin, 1, &d.m_sizeInitialized, &sizeRun, NULL, 0, NULL, &event);
284+
const size_t sizeRun = std::min(sizeInitLimit, std::min(sizeLeft, m_worksizeMax));
285+
const auto resEnqueue = clEnqueueNDRangeKernel(d.m_clQueue, d.m_kernelInit, 1, &d.m_sizeInitialized, &sizeRun, NULL, 0, NULL, &event);
285286
OpenCLException::throwIfError("kernel queueing failed during initilization", resEnqueue);
286287

287288
// See: https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clSetEventCallback.html
@@ -292,7 +293,9 @@ void Dispatcher::initContinue(Device & d) {
292293
// clFlush on the queue before returning or arrange for clFlush to be called later on another thread.
293294
clFlush(d.m_clQueue);
294295

296+
std::lock_guard<std::mutex> lock(m_mutex);
295297
d.m_sizeInitialized += sizeRun;
298+
m_sizeInitDone += sizeRun;
296299

297300
const auto resCallback = clSetEventCallback(event, CL_COMPLETE, staticCallback, &d);
298301
OpenCLException::throwIfError("failed to set custom callback during initialization", resCallback);
@@ -340,16 +343,13 @@ void Dispatcher::dispatch(Device & d) {
340343

341344
#ifdef PROFANITY_DEBUG
342345
cl_event eventInverse;
343-
cl_event eventInversePost;
344-
cl_event eventEnd;
346+
cl_event eventIterate;
345347

346348
enqueueKernelDevice(d, d.m_kernelInverse, m_size / m_inverseSize, &eventInverse);
347-
enqueueKernelDevice(d, d.m_kernelInversePost, m_size, &eventInversePost);
348-
enqueueKernelDevice(d, d.m_kernelEnd, m_size, &eventEnd);
349+
enqueueKernelDevice(d, d.m_kernelIterate, m_size, &eventIterate);
349350
#else
350351
enqueueKernelDevice(d, d.m_kernelInverse, m_size / m_inverseSize);
351-
enqueueKernelDevice(d, d.m_kernelInversePost, m_size);
352-
enqueueKernelDevice(d, d.m_kernelEnd, m_size);
352+
enqueueKernelDevice(d, d.m_kernelIterate, m_size);
353353
#endif
354354

355355
if (d.m_kernelTransform) {
@@ -360,8 +360,11 @@ void Dispatcher::dispatch(Device & d) {
360360
clFlush(d.m_clQueue);
361361

362362
#ifdef PROFANITY_DEBUG
363-
clFinish(d.m_clQueue);
364-
std::cout << getKernelExecutionTimeMillis(eventInverse) << ", " << getKernelExecutionTimeMillis(eventInversePost) << ", " << getKernelExecutionTimeMillis(eventEnd) << std::endl;
363+
// We're actually not allowed to call clFinish here because this function is ultimately asynchronously called by OpenCL.
364+
// However, this happens to work on my computer and it's not really intended for release, just something to aid me in
365+
// optimizations.
366+
clFinish(d.m_clQueue);
367+
std::cout << "Timing: profanity_inverse = " << getKernelExecutionTimeMicros(eventInverse) << "us, profanity_iterate = " << getKernelExecutionTimeMicros(eventIterate) << "us" << std::endl;
365368
#endif
366369

367370
const auto res = clSetEventCallback(event, CL_COMPLETE, staticCallback, &d);
@@ -399,8 +402,8 @@ void Dispatcher::onEvent(cl_event event, cl_int status, Device & d) {
399402
else if (d.m_eventFinished != NULL) {
400403
initContinue(d);
401404
} else {
402-
handleResult(d);
403405
++d.m_round;
406+
handleResult(d);
404407

405408
bool bDispatch = true;
406409
{

Dispatcher.hpp

+7-6
Original file line numberDiff line numberDiff line change
@@ -49,17 +49,16 @@ class Dispatcher {
4949
cl_uchar m_clScoreMax;
5050
cl_command_queue m_clQueue;
5151

52-
cl_kernel m_kernelBegin;
52+
cl_kernel m_kernelInit;
5353
cl_kernel m_kernelInverse;
54-
cl_kernel m_kernelInversePost;
55-
cl_kernel m_kernelEnd;
54+
cl_kernel m_kernelIterate;
5655
cl_kernel m_kernelTransform;
5756
cl_kernel m_kernelScore;
5857

5958
CLMemory<point> m_memPrecomp;
60-
CLMemory<mp_number> m_memPointsX;
61-
CLMemory<mp_number> m_memPointsY;
62-
CLMemory<mp_number> m_memInverse;
59+
CLMemory<mp_number> m_memPointsDeltaX;
60+
CLMemory<mp_number> m_memInversedNegativeDoubleGy;
61+
CLMemory<mp_number> m_memPrevLambda;
6362
CLMemory<result> m_memResult;
6463

6564
// Data parameters used in some modes
@@ -125,6 +124,8 @@ class Dispatcher {
125124
std::chrono::time_point<std::chrono::steady_clock> timeStart;
126125
unsigned int m_countPrint;
127126
unsigned int m_countRunning;
127+
size_t m_sizeInitTotal;
128+
size_t m_sizeInitDone;
128129
bool m_quit;
129130
};
130131

0 commit comments

Comments
 (0)