Skip to content

Commit 8535786

Browse files
fancyIXfancyIX
fancyIX
authored and
fancyIX
committed
Reduce instruction length to reduce cache miss
1 parent e9ac263 commit 8535786

File tree

2 files changed

+57
-67
lines changed

2 files changed

+57
-67
lines changed

Makefile.am

-1
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,6 @@ nvcc_ARCH :=
116116
nvcc_ARCH += -gencode=arch=compute_86,code=\"sm_86,compute_86\"
117117
nvcc_ARCH += -gencode=arch=compute_75,code=\"sm_75,compute_75\"
118118
nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
119-
nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
120119
#nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\"
121120
#nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
122121
#nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\"

lyra2/cuda_lyra2.cu

+57-66
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/**
2+
* fancyIX
23
* Lyra2 (v1) cuda implementation based on djm34 work
34
* tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2)
45
*/
@@ -39,41 +40,33 @@
3940
extern __shared__ uint2 shared_mem[];
4041
const int s0 = (Ncol / 2 * (row - BUF_COUNT) + col / 2) * memshift;
4142

42-
res[0] = shared_mem[((s0 + 0) * 8 + threadIdx.y) * 4 + threadIdx.x];
43-
res[1] = shared_mem[((s0 + 1) * 8 + threadIdx.y) * 4 + threadIdx.x];
44-
res[2] = shared_mem[((s0 + 2) * 8 + threadIdx.y) * 4 + threadIdx.x];
43+
res[0] = shared_mem[((s0 + 0) * 16 + threadIdx.y) * 4 + threadIdx.x];
44+
res[1] = shared_mem[((s0 + 1) * 16 + threadIdx.y) * 4 + threadIdx.x];
45+
res[2] = shared_mem[((s0 + 2) * 16 + threadIdx.y) * 4 + threadIdx.x];
4546
}
4647

4748
__device__ __forceinline__ void ST4SS(const int row, const int col, const uint2 data[3], const int thread, const int threads)
4849
{
4950
extern __shared__ uint2 shared_mem[];
5051
const int s0 = (Ncol / 2 * (row - BUF_COUNT) + col / 2) * memshift;
5152

52-
shared_mem[((s0 + 0) *8 + threadIdx.y) * 4 + threadIdx.x] = data[0];
53-
shared_mem[((s0 + 1) *8 + threadIdx.y) * 4 + threadIdx.x] = data[1];
54-
shared_mem[((s0 + 2) *8 + threadIdx.y) * 4 + threadIdx.x] = data[2];
53+
shared_mem[((s0 + 0) * 16 + threadIdx.y) * 4 + threadIdx.x] = data[0];
54+
shared_mem[((s0 + 1) * 16 + threadIdx.y) * 4 + threadIdx.x] = data[1];
55+
shared_mem[((s0 + 2) * 16 + threadIdx.y) * 4 + threadIdx.x] = data[2];
5556
}
5657

57-
__device__ __forceinline__ void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads, uint2 pad[Ncol / 2][Nrow][3])
58+
__device__ __forceinline__ void LD4SL(uint2 res[3], const int row, const int col, uint2 pad[Ncol / 2][Nrow][3])
5859
{
59-
if ((col & 1) == 0) {
60-
LD4SS(res, row, col, thread, threads);
61-
} else {
6260
res[0] = pad[col / 2][row][0];
6361
res[1] = pad[col / 2][row][1];
6462
res[2] = pad[col / 2][row][2];
65-
}
6663
}
6764

68-
__device__ __forceinline__ void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads, uint2 pad[Ncol / 2][Nrow][3])
65+
__device__ __forceinline__ void ST4SL(const int row, const int col, const uint2 data[3], uint2 pad[Ncol / 2][Nrow][3])
6966
{
70-
if ((col & 1) == 0) {
71-
ST4SS(row, col, data, thread, threads);
72-
} else {
7367
pad[col / 2][row][0] = data[0];
7468
pad[col / 2][row][1] = data[1];
7569
pad[col / 2][row][2] = data[2];
76-
}
7770
}
7871

7972

@@ -206,15 +199,18 @@
206199

207200
for (int i = 0; i < Nrow; i++)
208201
{
209-
ST4S(0, Ncol - i - 1, state, thread, threads, pad);
202+
if ((i & 1) == 1)
203+
ST4SS(0, Ncol - i - 1, state, thread, threads);
204+
else
205+
ST4SL(0, Ncol - i - 1, state, pad);
210206

211207
round_lyra(state);
212208
}
213209

214210
for (int i = 0; i < Nrow; i+=2)
215211
{
216-
LD4S(state1, 0, i, thread, threads, pad);
217-
LD4S(state2, 0, i + 1, thread, threads, pad);
212+
LD4SS(state1, 0, i, thread, threads);
213+
LD4SL(state2, 0, i + 1, pad);
218214
#pragma unroll
219215
for (int j = 0; j < 3; j++)
220216
state[j] ^= state1[j];
@@ -234,8 +230,8 @@
234230
#pragma unroll
235231
for (int j = 0; j < 3; j++)
236232
state2[j] ^= state[j];
237-
ST4S(1, Ncol - i - 1, state1, thread, threads, pad);
238-
ST4S(1, Ncol - (i + 1) - 1, state2, thread, threads, pad);
233+
ST4SL(1, Ncol - i - 1, state1, pad);
234+
ST4SS(1, Ncol - (i + 1) - 1, state2, thread, threads);
239235
}
240236
}
241237

@@ -246,10 +242,10 @@
246242

247243
for (int i = 0; i < Nrow; i+=2)
248244
{
249-
LD4S(state1, rowIn, i, thread, threads, pad);
250-
LD4S(state2, rowInOut, i, thread, threads, pad);
251-
LD4S(state3, rowIn, i + 1, thread, threads, pad);
252-
LD4S(state4, rowInOut, i + 1, thread, threads, pad);
245+
LD4SS(state1, rowIn, i, thread, threads);
246+
LD4SS(state2, rowInOut, i, thread, threads);
247+
LD4SL(state3, rowIn, i + 1, pad);
248+
LD4SL(state4, rowInOut, i + 1, pad);
253249
#pragma unroll
254250
for (int j = 0; j < 3; j++)
255251
state[j] ^= state1[j] + state2[j];
@@ -260,7 +256,7 @@
260256
for (int j = 0; j < 3; j++)
261257
state1[j] ^= state[j];
262258

263-
ST4S(rowOut, Ncol - i - 1, state1, thread, threads, pad);
259+
ST4SL(rowOut, Ncol - i - 1, state1, pad);
264260

265261
// simultaneously receive data from preceding thread and send data to following thread
266262
uint2 Data0 = state[0];
@@ -279,7 +275,7 @@
279275
state2[2] ^= Data2;
280276
}
281277

282-
ST4S(rowInOut, i, state2, thread, threads, pad);
278+
ST4SS(rowInOut, i, state2, thread, threads);
283279

284280
//=====================================
285281
#pragma unroll
@@ -292,7 +288,7 @@
292288
for (int j = 0; j < 3; j++)
293289
state3[j] ^= state[j];
294290

295-
ST4S(rowOut, Ncol - (i + 1) - 1, state3, thread, threads, pad);
291+
ST4SS(rowOut, Ncol - (i + 1) - 1, state3, thread, threads);
296292

297293
// simultaneously receive data from preceding thread and send data to following thread
298294
uint2 Data01 = state[0];
@@ -311,7 +307,7 @@
311307
state4[2] ^= Data21;
312308
}
313309

314-
ST4S(rowInOut, (i + 1), state4, thread, threads, pad);
310+
ST4SL(rowInOut, (i + 1), state4, pad);
315311
}
316312
}
317313

@@ -322,16 +318,16 @@
322318
{
323319
uint2 state1[3], state2[3], state3[3], state4[3];
324320

325-
LD4S(state1, rowIn, i, thread, threads, pad);
326-
LD4S(state2, rowInOut, i, thread, threads, pad);
327-
LD4S(state3, rowIn, i + 1, thread, threads, pad);
328-
LD4S(state4, rowInOut, i + 1, thread, threads, pad);
321+
LD4SS(state1, rowIn, i, thread, threads);
322+
LD4SS(state2, rowInOut, i, thread, threads);
323+
LD4SL(state3, rowIn, i + 1, pad);
324+
LD4SL(state4, rowInOut, i + 1, pad);
329325

330326
#pragma unroll
331327
for (int j = 0; j < 3; j++)
332328
state[j] ^= state1[j] + state2[j];
333329

334-
LD4S(state1, rowOut, i, thread, threads, pad);
330+
LD4SS(state1, rowOut, i, thread, threads);
335331

336332
round_lyra(state);
337333

@@ -355,7 +351,7 @@
355351
}
356352

357353
if (rowInOut != rowOut) {
358-
ST4S(rowInOut, i, state2, thread, threads, pad);
354+
ST4SS(rowInOut, i, state2, thread, threads);
359355
#pragma unroll
360356
for (int j = 0; j < 3; j++)
361357
state2[j] = state1[j];
@@ -365,7 +361,7 @@
365361
for (int j = 0; j < 3; j++)
366362
state2[j] ^= state[j];
367363

368-
ST4S(rowOut, i, state2, thread, threads, pad);
364+
ST4SS(rowOut, i, state2, thread, threads);
369365

370366
//======================================
371367

@@ -374,7 +370,7 @@
374370
for (int j = 0; j < 3; j++)
375371
state[j] ^= state3[j] + state4[j];
376372

377-
LD4S(state3, rowOut, i + 1, thread, threads, pad);
373+
LD4SL(state3, rowOut, i + 1, pad);
378374

379375
round_lyra(state);
380376

@@ -398,7 +394,7 @@
398394
}
399395

400396
if (rowInOut != rowOut) {
401-
ST4S(rowInOut, i + 1, state4, thread, threads, pad);
397+
ST4SL(rowInOut, i + 1, state4, pad);
402398
#pragma unroll
403399
for (int j = 0; j < 3; j++)
404400
state4[j] = state3[j];
@@ -408,7 +404,7 @@
408404
for (int j = 0; j < 3; j++)
409405
state4[j] ^= state[j];
410406

411-
ST4S(rowOut, i + 1, state4, thread, threads, pad);
407+
ST4SL(rowOut, i + 1, state4, pad);
412408
}
413409
}
414410

@@ -417,8 +413,8 @@
417413
{
418414
uint2 state1[3], state2[3], state3[3], state4[3], last[3];
419415

420-
LD4S(state1, 2, 0, thread, threads, pad);
421-
LD4S(last, rowInOut, 0, thread, threads, pad);
416+
LD4SS(state1, 2, 0, thread, threads);
417+
LD4SS(last, rowInOut, 0, thread, threads);
422418

423419
#pragma unroll
424420
for (int j = 0; j < 3; j++)
@@ -450,8 +446,8 @@
450446
last[j] ^= state[j];
451447
}
452448

453-
LD4S(state1, 2, 1, thread, threads, pad);
454-
LD4S(state2, rowInOut, 1, thread, threads, pad);
449+
LD4SL(state1, 2, 1, pad);
450+
LD4SL(state2, rowInOut, 1, pad);
455451

456452
#pragma unroll
457453
for (int j = 0; j < 3; j++)
@@ -461,10 +457,10 @@
461457

462458
for (int i = 2; i < Nrow; i+=2)
463459
{
464-
LD4S(state1, 2, i, thread, threads, pad);
465-
LD4S(state2, rowInOut, i, thread, threads, pad);
466-
LD4S(state3, 2, i + 1, thread, threads, pad);
467-
LD4S(state4, rowInOut, i + 1, thread, threads, pad);
460+
LD4SS(state1, 2, i, thread, threads);
461+
LD4SS(state2, rowInOut, i, thread, threads);
462+
LD4SL(state3, 2, i + 1, pad);
463+
LD4SL(state4, rowInOut, i + 1, pad);
468464

469465
#pragma unroll
470466
for (int j = 0; j < 3; j++)
@@ -524,7 +520,7 @@
524520
}
525521

526522
__global__
527-
__launch_bounds__(TPB52, 1)
523+
__launch_bounds__(64, 1)
528524
void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
529525
{
530526
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
@@ -546,21 +542,16 @@
546542
reduceDuplexRowSetup(4, 3, 5, state, thread, threads, pad);
547543
reduceDuplexRowSetup(5, 2, 6, state, thread, threads, pad);
548544
reduceDuplexRowSetup(6, 1, 7, state, thread, threads, pad);
549-
550-
uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7;
551-
reduceDuplexRowt(7, rowa, 0, state, thread, threads, pad);
552-
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
553-
reduceDuplexRowt(0, rowa, 3, state, thread, threads, pad);
554-
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
555-
reduceDuplexRowt(3, rowa, 6, state, thread, threads, pad);
556-
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
557-
reduceDuplexRowt(6, rowa, 1, state, thread, threads, pad);
558-
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
559-
reduceDuplexRowt(1, rowa, 4, state, thread, threads, pad);
560-
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
561-
reduceDuplexRowt(4, rowa, 7, state, thread, threads, pad);
562-
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
563-
reduceDuplexRowt(7, rowa, 2, state, thread, threads, pad);
545+
546+
uint32_t rowa;
547+
uint32_t row = 0;
548+
uint32_t pre = 7;
549+
for (int i = 0; i < 7; i++) {
550+
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
551+
reduceDuplexRowt(pre, rowa, row, state, thread, threads, pad);
552+
pre = row;
553+
row = (row + 3) % 8;
554+
}
564555
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
565556
reduceDuplexRowt_8(rowa, state, thread, threads, pad);
566557

@@ -623,8 +614,8 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6
623614
else if (cuda_arch[dev_id] >= 500) tpb = TPB50;
624615
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
625616

626-
dim3 grid1((threads * 4 + tpb - 1) / tpb);
627-
dim3 block1(4, tpb >> 2);
617+
dim3 grid1((threads * 4 + 64 - 1) / 64);
618+
dim3 block1(4, 64 >> 2);
628619

629620
dim3 grid2((threads + 64 - 1) / 64);
630621
dim3 block2(64);
@@ -636,7 +627,7 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6
636627
{
637628
lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
638629

639-
lyra2_gpu_hash_32_2 <<< grid1, block1, 12 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, startNounce, d_hash);
630+
lyra2_gpu_hash_32_2 <<< grid1, block1, 12 * (8 - 0) * sizeof(uint2) * 64 >>> (threads, startNounce, d_hash);
640631

641632
lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
642633
}

0 commit comments

Comments
 (0)