1
1
/* *
2
+ * fancyIX
2
3
* Lyra2 (v1) cuda implementation based on djm34 work
3
4
* tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2)
4
5
*/
39
40
extern __shared__ uint2 shared_mem[];
40
41
const int s0 = (Ncol / 2 * (row - BUF_COUNT) + col / 2 ) * memshift;
41
42
42
- res[0 ] = shared_mem[((s0 + 0 ) * 8 + threadIdx .y ) * 4 + threadIdx .x ];
43
- res[1 ] = shared_mem[((s0 + 1 ) * 8 + threadIdx .y ) * 4 + threadIdx .x ];
44
- res[2 ] = shared_mem[((s0 + 2 ) * 8 + threadIdx .y ) * 4 + threadIdx .x ];
43
+ res[0 ] = shared_mem[((s0 + 0 ) * 16 + threadIdx .y ) * 4 + threadIdx .x ];
44
+ res[1 ] = shared_mem[((s0 + 1 ) * 16 + threadIdx .y ) * 4 + threadIdx .x ];
45
+ res[2 ] = shared_mem[((s0 + 2 ) * 16 + threadIdx .y ) * 4 + threadIdx .x ];
45
46
}
46
47
47
48
// ST4SS: stores the three uint2 words data[0..2] into the `extern __shared__`
// array `shared_mem`.
//   row, col        - select the base slot s0 below; col is halved (col / 2),
//                     so columns 2k and 2k+1 resolve to the same slot.
//   data            - the three values to store.
//   thread, threads - not referenced anywhere in the body shown here.
// NOTE(review): this span is diff output. Lines starting with "-" are the
// removed stores (factor 8 applied to the slot index), lines starting with "+"
// are their replacements (factor 16); bare numbers are viewer line numbers.
// The factor 16 presumably matches the y-extent of `dim3 block1 (4, 64 >> 2)`
// used for the launch further down - confirm against the launch site.
__device__ __forceinline__ void ST4SS (const int row, const int col, const uint2 data[3 ], const int thread, const int threads)
48
49
{
49
50
extern __shared__ uint2 shared_mem[];
50
51
// s0: base slot for (row, col); the row is offset by BUF_COUNT and the whole
// slot number is scaled by memshift.
const int s0 = (Ncol / 2 * (row - BUF_COUNT) + col / 2 ) * memshift;
51
52
52
// Word k of the slot is written at ((s0 + k) * F + threadIdx.y) * 4 + threadIdx.x,
// where F is 8 on the removed lines and 16 on the added lines.
- shared_mem[((s0 + 0 ) *8 + threadIdx .y ) * 4 + threadIdx .x ] = data[0 ];
53
- shared_mem[((s0 + 1 ) *8 + threadIdx .y ) * 4 + threadIdx .x ] = data[1 ];
54
- shared_mem[((s0 + 2 ) *8 + threadIdx .y ) * 4 + threadIdx .x ] = data[2 ];
53
+ shared_mem[((s0 + 0 ) * 16 + threadIdx .y ) * 4 + threadIdx .x ] = data[0 ];
54
+ shared_mem[((s0 + 1 ) * 16 + threadIdx .y ) * 4 + threadIdx .x ] = data[1 ];
55
+ shared_mem[((s0 + 2 ) * 16 + threadIdx .y ) * 4 + threadIdx .x ] = data[2 ];
55
56
}
56
57
57
// LD4SL (added signature) / LD4S (removed signature): loads three uint2 words
// into res[0..2].
//   res  - output, receives the three words.
//   row  - second index into `pad`.
//   col  - column; halved (col / 2) to form the first index into `pad`.
//   pad  - caller-supplied array of shape [Ncol / 2][Nrow][3].
// NOTE(review): this span is diff output. The removed LD4S took extra
// `thread`/`threads` parameters and branched on column parity: an even col was
// forwarded to LD4SS, an odd col read from `pad`. The added LD4SL keeps only
// the `pad` read, so the parity decision is left to the caller (the call sites
// in this diff pair LD4SS with even columns and LD4SL with odd ones).
- __device__ __forceinline__ void LD4S (uint2 res[3 ], const int row, const int col, const int thread, const int threads , uint2 pad[Ncol / 2 ][Nrow][3 ])
58
+ __device__ __forceinline__ void LD4SL (uint2 res[3 ], const int row, const int col, uint2 pad[Ncol / 2 ][Nrow][3 ])
58
59
{
59
- if ((col & 1 ) == 0 ) {
60
- LD4SS (res, row, col, thread, threads);
61
- } else {
62
60
// Copy the three words stored for (col / 2, row) out of pad.
res[0 ] = pad[col / 2 ][row][0 ];
63
61
res[1 ] = pad[col / 2 ][row][1 ];
64
62
res[2 ] = pad[col / 2 ][row][2 ];
65
- }
66
63
}
67
64
68
// ST4SL (added signature) / ST4S (removed signature): stores the three uint2
// words data[0..2].
//   row  - second index into `pad`.
//   col  - column; halved (col / 2) to form the first index into `pad`.
//   data - the three values to store.
//   pad  - caller-supplied array of shape [Ncol / 2][Nrow][3], written in place.
// NOTE(review): this span is diff output. The removed ST4S took extra
// `thread`/`threads` parameters and branched on column parity: an even col was
// forwarded to ST4SS, an odd col wrote to `pad`. The added ST4SL keeps only
// the `pad` write, so the parity decision is left to the caller.
- __device__ __forceinline__ void ST4S (const int row, const int col, const uint2 data[3 ], const int thread, const int threads , uint2 pad[Ncol / 2 ][Nrow][3 ])
65
+ __device__ __forceinline__ void ST4SL (const int row, const int col, const uint2 data[3 ], uint2 pad[Ncol / 2 ][Nrow][3 ])
69
66
{
70
- if ((col & 1 ) == 0 ) {
71
- ST4SS (row, col, data, thread, threads);
72
- } else {
73
67
// Write the three words into the pad entry for (col / 2, row).
pad[col / 2 ][row][0 ] = data[0 ];
74
68
pad[col / 2 ][row][1 ] = data[1 ];
75
69
pad[col / 2 ][row][2 ] = data[2 ];
76
- }
77
70
}
78
71
79
72
206
199
207
200
for (int i = 0 ; i < Nrow; i++)
208
201
{
209
- ST4S (0 , Ncol - i - 1 , state, thread, threads, pad);
202
+ if ((i & 1 ) == 1 )
203
+ ST4SS (0 , Ncol - i - 1 , state, thread, threads);
204
+ else
205
+ ST4SL (0 , Ncol - i - 1 , state, pad);
210
206
211
207
round_lyra (state);
212
208
}
213
209
214
210
for (int i = 0 ; i < Nrow; i+=2 )
215
211
{
216
- LD4S (state1, 0 , i, thread, threads, pad );
217
- LD4S (state2, 0 , i + 1 , thread, threads , pad);
212
+ LD4SS (state1, 0 , i, thread, threads);
213
+ LD4SL (state2, 0 , i + 1 , pad);
218
214
#pragma unroll
219
215
for (int j = 0 ; j < 3 ; j++)
220
216
state[j] ^= state1[j];
234
230
#pragma unroll
235
231
for (int j = 0 ; j < 3 ; j++)
236
232
state2[j] ^= state[j];
237
- ST4S (1 , Ncol - i - 1 , state1, thread, threads , pad);
238
- ST4S (1 , Ncol - (i + 1 ) - 1 , state2, thread, threads, pad );
233
+ ST4SL (1 , Ncol - i - 1 , state1, pad);
234
+ ST4SS (1 , Ncol - (i + 1 ) - 1 , state2, thread, threads);
239
235
}
240
236
}
241
237
246
242
247
243
for (int i = 0 ; i < Nrow; i+=2 )
248
244
{
249
- LD4S (state1, rowIn, i, thread, threads, pad );
250
- LD4S (state2, rowInOut, i, thread, threads, pad );
251
- LD4S (state3, rowIn, i + 1 , thread, threads , pad);
252
- LD4S (state4, rowInOut, i + 1 , thread, threads , pad);
245
+ LD4SS (state1, rowIn, i, thread, threads);
246
+ LD4SS (state2, rowInOut, i, thread, threads);
247
+ LD4SL (state3, rowIn, i + 1 , pad);
248
+ LD4SL (state4, rowInOut, i + 1 , pad);
253
249
#pragma unroll
254
250
for (int j = 0 ; j < 3 ; j++)
255
251
state[j] ^= state1[j] + state2[j];
260
256
for (int j = 0 ; j < 3 ; j++)
261
257
state1[j] ^= state[j];
262
258
263
- ST4S (rowOut, Ncol - i - 1 , state1, thread, threads , pad);
259
+ ST4SL (rowOut, Ncol - i - 1 , state1, pad);
264
260
265
261
// simultaneously receive data from preceding thread and send data to following thread
266
262
uint2 Data0 = state[0 ];
279
275
state2[2 ] ^= Data2;
280
276
}
281
277
282
- ST4S (rowInOut, i, state2, thread, threads, pad );
278
+ ST4SS (rowInOut, i, state2, thread, threads);
283
279
284
280
// =====================================
285
281
#pragma unroll
292
288
for (int j = 0 ; j < 3 ; j++)
293
289
state3[j] ^= state[j];
294
290
295
- ST4S (rowOut, Ncol - (i + 1 ) - 1 , state3, thread, threads, pad );
291
+ ST4SS (rowOut, Ncol - (i + 1 ) - 1 , state3, thread, threads);
296
292
297
293
// simultaneously receive data from preceding thread and send data to following thread
298
294
uint2 Data01 = state[0 ];
311
307
state4[2 ] ^= Data21;
312
308
}
313
309
314
- ST4S (rowInOut, (i + 1 ), state4, thread, threads , pad);
310
+ ST4SL (rowInOut, (i + 1 ), state4, pad);
315
311
}
316
312
}
317
313
322
318
{
323
319
uint2 state1[3 ], state2[3 ], state3[3 ], state4[3 ];
324
320
325
- LD4S (state1, rowIn, i, thread, threads, pad );
326
- LD4S (state2, rowInOut, i, thread, threads, pad );
327
- LD4S (state3, rowIn, i + 1 , thread, threads , pad);
328
- LD4S (state4, rowInOut, i + 1 , thread, threads , pad);
321
+ LD4SS (state1, rowIn, i, thread, threads);
322
+ LD4SS (state2, rowInOut, i, thread, threads);
323
+ LD4SL (state3, rowIn, i + 1 , pad);
324
+ LD4SL (state4, rowInOut, i + 1 , pad);
329
325
330
326
#pragma unroll
331
327
for (int j = 0 ; j < 3 ; j++)
332
328
state[j] ^= state1[j] + state2[j];
333
329
334
- LD4S (state1, rowOut, i, thread, threads, pad );
330
+ LD4SS (state1, rowOut, i, thread, threads);
335
331
336
332
round_lyra (state);
337
333
355
351
}
356
352
357
353
if (rowInOut != rowOut) {
358
- ST4S (rowInOut, i, state2, thread, threads, pad );
354
+ ST4SS (rowInOut, i, state2, thread, threads);
359
355
#pragma unroll
360
356
for (int j = 0 ; j < 3 ; j++)
361
357
state2[j] = state1[j];
365
361
for (int j = 0 ; j < 3 ; j++)
366
362
state2[j] ^= state[j];
367
363
368
- ST4S (rowOut, i, state2, thread, threads, pad );
364
+ ST4SS (rowOut, i, state2, thread, threads);
369
365
370
366
// ======================================
371
367
374
370
for (int j = 0 ; j < 3 ; j++)
375
371
state[j] ^= state3[j] + state4[j];
376
372
377
- LD4S (state3, rowOut, i + 1 , thread, threads , pad);
373
+ LD4SL (state3, rowOut, i + 1 , pad);
378
374
379
375
round_lyra (state);
380
376
398
394
}
399
395
400
396
if (rowInOut != rowOut) {
401
- ST4S (rowInOut, i + 1 , state4, thread, threads , pad);
397
+ ST4SL (rowInOut, i + 1 , state4, pad);
402
398
#pragma unroll
403
399
for (int j = 0 ; j < 3 ; j++)
404
400
state4[j] = state3[j];
408
404
for (int j = 0 ; j < 3 ; j++)
409
405
state4[j] ^= state[j];
410
406
411
- ST4S (rowOut, i + 1 , state4, thread, threads , pad);
407
+ ST4SL (rowOut, i + 1 , state4, pad);
412
408
}
413
409
}
414
410
417
413
{
418
414
uint2 state1[3 ], state2[3 ], state3[3 ], state4[3 ], last[3 ];
419
415
420
- LD4S (state1, 2 , 0 , thread, threads, pad );
421
- LD4S (last, rowInOut, 0 , thread, threads, pad );
416
+ LD4SS (state1, 2 , 0 , thread, threads);
417
+ LD4SS (last, rowInOut, 0 , thread, threads);
422
418
423
419
#pragma unroll
424
420
for (int j = 0 ; j < 3 ; j++)
450
446
last[j] ^= state[j];
451
447
}
452
448
453
- LD4S (state1, 2 , 1 , thread, threads , pad);
454
- LD4S (state2, rowInOut, 1 , thread, threads , pad);
449
+ LD4SL (state1, 2 , 1 , pad);
450
+ LD4SL (state2, rowInOut, 1 , pad);
455
451
456
452
#pragma unroll
457
453
for (int j = 0 ; j < 3 ; j++)
461
457
462
458
for (int i = 2 ; i < Nrow; i+=2 )
463
459
{
464
- LD4S (state1, 2 , i, thread, threads, pad );
465
- LD4S (state2, rowInOut, i, thread, threads, pad );
466
- LD4S (state3, 2 , i + 1 , thread, threads , pad);
467
- LD4S (state4, rowInOut, i + 1 , thread, threads , pad);
460
+ LD4SS (state1, 2 , i, thread, threads);
461
+ LD4SS (state2, rowInOut, i, thread, threads);
462
+ LD4SL (state3, 2 , i + 1 , pad);
463
+ LD4SL (state4, rowInOut, i + 1 , pad);
468
464
469
465
#pragma unroll
470
466
for (int j = 0 ; j < 3 ; j++)
524
520
}
525
521
526
522
__global__
527
- __launch_bounds__ (TPB52 , 1 )
523
+ __launch_bounds__ (64 , 1 )
528
524
void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
529
525
{
530
526
const uint32_t thread = blockDim .y * blockIdx .x + threadIdx .y ;
546
542
reduceDuplexRowSetup (4 , 3 , 5 , state, thread, threads, pad);
547
543
reduceDuplexRowSetup (5 , 2 , 6 , state, thread, threads, pad);
548
544
reduceDuplexRowSetup (6 , 1 , 7 , state, thread, threads, pad);
549
-
550
- uint32_t rowa = WarpShuffle (state[0 ].x , 0 , 4 ) & 7 ;
551
- reduceDuplexRowt (7 , rowa, 0 , state, thread, threads, pad);
552
- rowa = WarpShuffle (state[0 ].x , 0 , 4 ) & 7 ;
553
- reduceDuplexRowt (0 , rowa, 3 , state, thread, threads, pad);
554
- rowa = WarpShuffle (state[0 ].x , 0 , 4 ) & 7 ;
555
- reduceDuplexRowt (3 , rowa, 6 , state, thread, threads, pad);
556
- rowa = WarpShuffle (state[0 ].x , 0 , 4 ) & 7 ;
557
- reduceDuplexRowt (6 , rowa, 1 , state, thread, threads, pad);
558
- rowa = WarpShuffle (state[0 ].x , 0 , 4 ) & 7 ;
559
- reduceDuplexRowt (1 , rowa, 4 , state, thread, threads, pad);
560
- rowa = WarpShuffle (state[0 ].x , 0 , 4 ) & 7 ;
561
- reduceDuplexRowt (4 , rowa, 7 , state, thread, threads, pad);
562
- rowa = WarpShuffle (state[0 ].x , 0 , 4 ) & 7 ;
563
- reduceDuplexRowt (7 , rowa, 2 , state, thread, threads, pad);
545
+
546
+ uint32_t rowa;
547
+ uint32_t row = 0 ;
548
+ uint32_t pre = 7 ;
549
+ for (int i = 0 ; i < 7 ; i++) {
550
+ rowa = WarpShuffle (state[0 ].x , 0 , 4 ) & 7 ;
551
+ reduceDuplexRowt (pre, rowa, row, state, thread, threads, pad);
552
+ pre = row;
553
+ row = (row + 3 ) % 8 ;
554
+ }
564
555
rowa = WarpShuffle (state[0 ].x , 0 , 4 ) & 7 ;
565
556
reduceDuplexRowt_8 (rowa, state, thread, threads, pad);
566
557
@@ -623,8 +614,8 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6
623
614
else if (cuda_arch[dev_id] >= 500 ) tpb = TPB50;
624
615
else if (cuda_arch[dev_id] >= 200 ) tpb = TPB20;
625
616
626
- dim3 grid1 ((threads * 4 + tpb - 1 ) / tpb );
627
- dim3 block1 (4 , tpb >> 2 );
617
+ dim3 grid1 ((threads * 4 + 64 - 1 ) / 64 );
618
+ dim3 block1 (4 , 64 >> 2 );
628
619
629
620
dim3 grid2 ((threads + 64 - 1 ) / 64 );
630
621
dim3 block2 (64 );
@@ -636,7 +627,7 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6
636
627
{
637
628
lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2 *)d_hash);
638
629
639
- lyra2_gpu_hash_32_2 <<< grid1, block1, 12 * (8 - 0 ) * sizeof (uint2 ) * tpb >>> (threads, startNounce, d_hash);
630
+ lyra2_gpu_hash_32_2 <<< grid1, block1, 12 * (8 - 0 ) * sizeof (uint2 ) * 64 >>> (threads, startNounce, d_hash);
640
631
641
632
lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2 *)d_hash);
642
633
}
0 commit comments