Skip to content
This repository was archived by the owner on Sep 15, 2022. It is now read-only.

Commit a268e49

Browse files
committed
Major rework of the Keccak implementation which, together with some other minor changes, gives up to a 4x performance boost on NVIDIA cards
1 parent ef33b01 commit a268e49

10 files changed

+877
-770
lines changed

Dispatcher.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ static void printResult(cl_ulong4 seed, cl_ulong round, result r, cl_uchar score
4848
const std::string strPublic = toHex(r.foundHash, 20);
4949

5050
// Print
51-
std::cout << " Time: " << std::setw(5) << seconds << "s Score: " << std::setw(2) << (int) score << " Private: 0x" << strPrivate << ' ';
51+
const std::string strVT100ClearLine = "\33[2K\r";
52+
std::cout << strVT100ClearLine << " Time: " << std::setw(5) << seconds << "s Score: " << std::setw(2) << (int) score << " Private: 0x" << strPrivate << ' ';
5253

5354
std::cout << mode.transformName();
5455
std::cout << ": 0x" << strPublic << std::endl;
@@ -116,8 +117,8 @@ Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_progr
116117
m_kernelInverse(createKernel(clProgram, "profanity_inverse_multiple")),
117118
m_kernelInversePost(createKernel(clProgram, "profanity_inverse_post")),
118119
m_kernelEnd(createKernel(clProgram, "profanity_end")),
119-
m_kernelScore(createKernel(clProgram, mode.kernel)),
120120
m_kernelTransform(createKernel(clProgram, mode.transformKernel())),
121+
m_kernelScore(createKernel(clProgram, mode.kernel)),
121122
m_memPrecomp(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, sizeof(g_precomp), g_precomp),
122123
m_memPoints(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
123124
m_memInverse(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
@@ -181,7 +182,7 @@ void Dispatcher::run() {
181182

182183
void Dispatcher::init() {
183184
std::cout << "Initializing devices..." << std::endl;
184-
std::cout << " This can take a minute or two. The number of objects initialized on each" << std::endl;
185+
std::cout << " This should take less than a minute. The number of objects initialized on each" << std::endl;
185186
std::cout << " device is equal to inverse-size * inverse-multiple. To lower" << std::endl;
186187
std::cout << " initialization time (and memory footprint) I suggest lowering the" << std::endl;
187188
std::cout << " inverse-multiple first. You can do this via the -I switch. Do note that" << std::endl;
@@ -308,8 +309,7 @@ void Dispatcher::enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel,
308309
void Dispatcher::enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal, const bool bOneAtATime = false) {
309310
try {
310311
enqueueKernel(d.m_clQueue, clKernel, worksizeGlobal, d.m_worksizeLocal, bOneAtATime);
311-
}
312-
catch ( OpenCLException & e ) {
312+
} catch ( OpenCLException & e ) {
313313
// If local work size is invalid, abandon it and let implementation decide
314314
if ((e.m_res == CL_INVALID_WORK_GROUP_SIZE || e.m_res == CL_INVALID_WORK_ITEM_SIZE) && d.m_worksizeLocal != 0) {
315315
std::cout << std::endl << "warning: local work size abandoned on GPU" << d.m_index << std::endl;

Mode.cpp

+7
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,10 @@ Mode Mode::mirror() {
133133
r.kernel = "profanity_score_mirror";
134134
return r;
135135
}
136+
137+
// Factory for the "doubles" mode: a default-constructed Mode whose display
// name is "doubles" and whose scoring kernel is "profanity_score_doubles".
Mode Mode::doubles() {
	Mode doublesMode;
	doublesMode.kernel = "profanity_score_doubles";
	doublesMode.name = "doubles";
	return doublesMode;
}

Mode.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ class Mode {
3030
static Mode zeros();
3131
static Mode letters();
3232
static Mode numbers();
33+
static Mode doubles();
3334

3435
std::string name;
3536

README.md

+8-5
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ usage: ./profanity [OPTIONS]
1919
--letters Score on letters anywhere in hash.
2020
--numbers Score on numbers anywhere in hash.
2121
--mirror Score on mirroring from center.
22+
--leading-doubles Score on hashes leading with hexadecimal pairs.
2223
2324
Modes with arguments:
2425
--leading <single hex> Score on hashes leading with given hex character.
@@ -38,15 +39,15 @@ usage: ./profanity [OPTIONS]
3839
3940
Device control:
4041
-s, --skip <index> Skip device given by index.
41-
-n, --no-cache Don't load cached pre-compiled version of kernel.
42+
-n, --no-cache Don't load cached pre-compiled version of kernel.
4243
4344
Tweaking:
4445
-w, --work <size> Set OpenCL local work size. [default = 64]
45-
-W, --work-max <size> Set OpenCL maximum work size. [default = 1048576]
46+
-W, --work-max <size> Set OpenCL maximum work size. [default = -i * -I]
4647
-i, --inverse-size Set size of modular inverses to calculate in one
47-
work item. [default = 256]
48+
work item. [default = 255]
4849
-I, --inverse-multiple Set how many above work items will run in
49-
parallell. [default = 65536]
50+
parallel. [default = 16384]
5051
5152
Examples:
5253
./profanity --leading f
@@ -69,10 +70,12 @@ usage: ./profanity [OPTIONS]
6970
|Model|Clock Speed|Memory Speed|Modified straps|Speed|Time to match eight characters|Version
7071
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
7172
|RX VEGA56|1408|1100|YES|146 MH/s| ~29 s | 1.1x
73+
|GTX 1070 OC|1950|4450|NO|120.0 MH/s| ~36s | 1.3x
74+
|GTX 1070|1750|4000|NO|106.0 MH/s| ~41s | 1.3x
7275
|R9 290|1150|1400|NO|100 MH/s| ~43 s | 1.1x
7376
|RX 480|1328|2000|YES|97 MH/s| ~45 s| 1.2x
7477
|RX 480|1266|2000|YES|92 MH/s| ~47 s| 1.2x
7578
|RX 580|1366|1750|YES|92 MH/s| ~47 s| 1.2x
7679
|R9 290|1040|1300|NO|91 MH/s| ~47 s | 1.1x
7780
|RX 470|1216|1750|YES|73 MH/s| ~59s | 1.2x
78-
|GTX 1070| - | - | NO | 26.0 MH/s | ~166s | 1.2x
81+

SpeedSample.cpp

+10-5
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,17 @@ SpeedSample::~SpeedSample() {
1212
}
1313

1414
double SpeedSample::getSpeed() const {
15-
double speed = 0;
16-
for( auto & v : m_lSpeeds) {
17-
speed += v / m_lSpeeds.size();
15+
auto delta = std::chrono::duration_cast<std::chrono::milliseconds>(now() - m_lastTime).count();
16+
if (delta > 5000) {
17+
return 0;
18+
} else {
19+
double speed = 0;
20+
for (auto & v : m_lSpeeds) {
21+
speed += v / m_lSpeeds.size();
22+
}
23+
24+
return speed;
1825
}
19-
20-
return speed;
2126
}
2227

2328
void SpeedSample::sample(const double V) {

constants.hpp

-16
This file was deleted.

help.hpp

+4-3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ usage: ./profanity [OPTIONS]
1212
--letters Score on letters anywhere in hash.
1313
--numbers Score on numbers anywhere in hash.
1414
--mirror Score on mirroring from center.
15+
--leading-doubles Score on hashes leading with hexadecimal pairs
1516
1617
Modes with arguments:
1718
--leading <single hex> Score on hashes leading with given hex character.
@@ -35,11 +36,11 @@ usage: ./profanity [OPTIONS]
3536
3637
Tweaking:
3738
-w, --work <size> Set OpenCL local work size. [default = 64]
38-
-W, --work-max <size> Set OpenCL maximum work size. [default = 1048576]
39+
-W, --work-max <size> Set OpenCL maximum work size. [default = -i * -I]
3940
-i, --inverse-size Set size of modular inverses to calculate in one
40-
work item. [default = 256]
41+
work item. [default = 255]
4142
-I, --inverse-multiple Set how many above work items will run in
42-
parallell. [default = 65536]
43+
parallell. [default = 16384]
4344
4445
Examples:
4546
./profanity --leading f

keccak.cl

+139-94
Original file line numberDiff line numberDiff line change
@@ -1,94 +1,139 @@
1-
/* Original: https://github.com/mjosaarinen/tiny_sha3
2-
* Below is a very slimmed down version of the already tiny SHA3
3-
* implementation by Markku-Juhani O. Saarinen.
4-
*
5-
* The implementation below is for inputs of exactly 64 bytes
6-
* that's pre-filled in ethhash.b[0] - ethhash.b[63].
7-
*/
8-
9-
typedef union {
10-
uchar b[200];
11-
ulong q[25];
12-
uint d[50];
13-
} ethhash;
14-
15-
// constants
16-
__constant ulong keccakf_rndc[24] = {
17-
0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
18-
0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
19-
0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
20-
0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
21-
0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
22-
0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
23-
0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
24-
0x8000000000008080, 0x0000000080000001, 0x8000000080008008
25-
};
26-
27-
__constant ulong keccakf_rotc[24] = {
28-
1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
29-
27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
30-
};
31-
32-
__constant int keccakf_piln[24] = {
33-
10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
34-
15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
35-
};
36-
37-
/* Barely a bottleneck. No need to tinker more. */
38-
void sha3_keccakf(ethhash * const pHash)
39-
{
40-
ulong * const st = &pHash->q;
41-
pHash->d[33] ^= 0x80000000;
42-
43-
// variables
44-
int i, j, r;
45-
ulong t, bc[5];
46-
47-
// actual iteration
48-
for (r = 0; r < 24; r++) {
49-
// Theta - unrolled
50-
bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
51-
bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
52-
bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
53-
bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
54-
bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
55-
56-
for (i = 0; i < 5; i++) {
57-
t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], (ulong) 1);
58-
59-
st[i] ^= t;
60-
st[i + 5] ^= t;
61-
st[i + 10] ^= t;
62-
st[i + 15] ^= t;
63-
st[i + 20] ^= t;
64-
}
65-
66-
// Rho Pi
67-
t = st[1];
68-
for (i = 0; i < 24; i++) {
69-
j = keccakf_piln[i];
70-
bc[0] = st[j];
71-
72-
st[j] = rotate(t, keccakf_rotc[i]);
73-
t = bc[0];
74-
}
75-
76-
// Chi
77-
for (j = 0; j < 25; j += 5) {
78-
bc[0] = st[j + 0];
79-
bc[1] = st[j + 1];
80-
bc[2] = st[j + 2];
81-
bc[3] = st[j + 3];
82-
bc[4] = st[j + 4];
83-
84-
st[j + 0] ^= (~bc[1]) & bc[2];
85-
st[j + 1] ^= (~bc[2]) & bc[3];
86-
st[j + 2] ^= (~bc[3]) & bc[4];
87-
st[j + 3] ^= (~bc[4]) & bc[0];
88-
st[j + 4] ^= (~bc[0]) & bc[1];
89-
}
90-
91-
// Iota
92-
st[0] ^= keccakf_rndc[r];
93-
}
94-
}
1+
/* This Keccak implementation is an amalgamation of:
2+
* Tiny SHA3 implementation by Markku-Juhani O. Saarinen:
3+
* https://github.com/mjosaarinen/tiny_sha3
4+
* Keccak implementation found in xptMiner-gpu @ Github:
5+
* https://github.com/llamasoft/xptMiner-gpu/blob/master/opencl/keccak.cl
6+
*/
7+
8+
/* Keccak state buffer. The three members overlay the same 200 bytes of
 * storage: b views it as 200 uchar, q as 25 ulong (64-bit) lanes and d as
 * 50 uint (32-bit) words. sha3_keccakf() permutes the state through q and
 * flips one bit through d. */
typedef union {
9+
uchar b[200];
10+
ulong q[25];
11+
uint d[50];
12+
} ethhash;
13+
14+
/* Helper for THETA: sets t to (XOR of d0..d4 rotated left by one bit)
 * XORed with (XOR of c0..c4). t must be an assignable ulong lvalue. */
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) \
15+
{ \
16+
t = rotate((ulong)(d0 ^ d1 ^ d2 ^ d3 ^ d4), (ulong)1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4); \
17+
}
18+
19+
/* Theta step over 25 lanes passed as lvalues, named sXY with X selecting a
 * group of five (s0*, s1*, ... s4*).
 * First computes t0..t4 with TH_ELT, each from the XOR of one neighbouring
 * group and the one-bit rotation of the XOR of the other neighbouring group,
 * then XORs tX into every lane of group X.
 * All five temporaries are computed before any lane is modified, so the
 * order of the two halves must be kept.
 * Requires ulong variables t0, t1, t2, t3, t4 to be declared in the scope
 * where the macro is expanded. */
#define THETA(s00, s01, s02, s03, s04, \
20+
s10, s11, s12, s13, s14, \
21+
s20, s21, s22, s23, s24, \
22+
s30, s31, s32, s33, s34, \
23+
s40, s41, s42, s43, s44) \
24+
{ \
25+
TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14); \
26+
TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24); \
27+
TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34); \
28+
TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44); \
29+
TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04); \
30+
s00 ^= t0; s01 ^= t0; s02 ^= t0; s03 ^= t0; s04 ^= t0; \
31+
s10 ^= t1; s11 ^= t1; s12 ^= t1; s13 ^= t1; s14 ^= t1; \
32+
s20 ^= t2; s21 ^= t2; s22 ^= t2; s23 ^= t2; s24 ^= t2; \
33+
s30 ^= t3; s31 ^= t3; s32 ^= t3; s33 ^= t3; s34 ^= t3; \
34+
s40 ^= t4; s41 ^= t4; s42 ^= t4; s43 ^= t4; s44 ^= t4; \
35+
}
36+
37+
/* Combined rotate-and-relocate step (Rho and Pi) over the same 25 lanes.
 * The rotated value of s10 is parked in t0; every following statement then
 * overwrites the lane that the previous statement just read, storing a
 * rotated copy of another lane, and the chain is closed by writing t0 into
 * s02. s00 is neither read nor written.
 * The statements form a single dependency chain: reordering them would
 * overwrite a lane before its old value has been consumed.
 * Requires a ulong variable t0 in the expanding scope. */
#define RHOPI(s00, s01, s02, s03, s04, \
38+
s10, s11, s12, s13, s14, \
39+
s20, s21, s22, s23, s24, \
40+
s30, s31, s32, s33, s34, \
41+
s40, s41, s42, s43, s44) \
42+
{ \
43+
t0 = rotate(s10, (ulong) 1); \
44+
s10 = rotate(s11, (ulong)44); \
45+
s11 = rotate(s41, (ulong)20); \
46+
s41 = rotate(s24, (ulong)61); \
47+
s24 = rotate(s42, (ulong)39); \
48+
s42 = rotate(s04, (ulong)18); \
49+
s04 = rotate(s20, (ulong)62); \
50+
s20 = rotate(s22, (ulong)43); \
51+
s22 = rotate(s32, (ulong)25); \
52+
s32 = rotate(s43, (ulong) 8); \
53+
s43 = rotate(s34, (ulong)56); \
54+
s34 = rotate(s03, (ulong)41); \
55+
s03 = rotate(s40, (ulong)27); \
56+
s40 = rotate(s44, (ulong)14); \
57+
s44 = rotate(s14, (ulong) 2); \
58+
s14 = rotate(s31, (ulong)55); \
59+
s31 = rotate(s13, (ulong)45); \
60+
s13 = rotate(s01, (ulong)36); \
61+
s01 = rotate(s30, (ulong)28); \
62+
s30 = rotate(s33, (ulong)21); \
63+
s33 = rotate(s23, (ulong)15); \
64+
s23 = rotate(s12, (ulong)10); \
65+
s12 = rotate(s21, (ulong) 6); \
66+
s21 = rotate(s02, (ulong) 3); \
67+
s02 = t0; \
68+
}
69+
70+
/* Chi step over the same 25 lanes. For each second index j (0..4) the five
 * lanes s0j..s4j are replaced by
 *     sXj ^ (~s(X+1)j & s(X+2)j)      (first index taken modulo 5).
 * The five results are first collected in t0..t4 and only then written
 * back, so every result is computed from the unmodified input lanes.
 * Requires ulong variables t0, t1, t2, t3, t4 in the expanding scope. */
#define KHI(s00, s01, s02, s03, s04, \
71+
s10, s11, s12, s13, s14, \
72+
s20, s21, s22, s23, s24, \
73+
s30, s31, s32, s33, s34, \
74+
s40, s41, s42, s43, s44) \
75+
{ \
76+
t0 = s00 ^ (~s10 & s20); \
77+
t1 = s10 ^ (~s20 & s30); \
78+
t2 = s20 ^ (~s30 & s40); \
79+
t3 = s30 ^ (~s40 & s00); \
80+
t4 = s40 ^ (~s00 & s10); \
81+
s00 = t0; s10 = t1; s20 = t2; s30 = t3; s40 = t4; \
82+
\
83+
t0 = s01 ^ (~s11 & s21); \
84+
t1 = s11 ^ (~s21 & s31); \
85+
t2 = s21 ^ (~s31 & s41); \
86+
t3 = s31 ^ (~s41 & s01); \
87+
t4 = s41 ^ (~s01 & s11); \
88+
s01 = t0; s11 = t1; s21 = t2; s31 = t3; s41 = t4; \
89+
\
90+
t0 = s02 ^ (~s12 & s22); \
91+
t1 = s12 ^ (~s22 & s32); \
92+
t2 = s22 ^ (~s32 & s42); \
93+
t3 = s32 ^ (~s42 & s02); \
94+
t4 = s42 ^ (~s02 & s12); \
95+
s02 = t0; s12 = t1; s22 = t2; s32 = t3; s42 = t4; \
96+
\
97+
t0 = s03 ^ (~s13 & s23); \
98+
t1 = s13 ^ (~s23 & s33); \
99+
t2 = s23 ^ (~s33 & s43); \
100+
t3 = s33 ^ (~s43 & s03); \
101+
t4 = s43 ^ (~s03 & s13); \
102+
s03 = t0; s13 = t1; s23 = t2; s33 = t3; s43 = t4; \
103+
\
104+
t0 = s04 ^ (~s14 & s24); \
105+
t1 = s14 ^ (~s24 & s34); \
106+
t2 = s24 ^ (~s34 & s44); \
107+
t3 = s34 ^ (~s44 & s04); \
108+
t4 = s44 ^ (~s04 & s14); \
109+
s04 = t0; s14 = t1; s24 = t2; s34 = t3; s44 = t4; \
110+
}
111+
112+
/* Iota step: XORs the round constant r into lane s00 (s00 must be an lvalue). */
#define IOTA(s00, r) { s00 ^= r; }
113+
114+
/* Per-round constants for the 24 rounds of the permutation. sha3_keccakf()
 * XORs keccakf_rndc[i] into st[0] via IOTA at the end of round i. */
__constant ulong keccakf_rndc[24] = {
115+
0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
116+
0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
117+
0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
118+
0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
119+
0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
120+
0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
121+
0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
122+
0x8000000000008080, 0x0000000080000001, 0x8000000080008008
123+
};
124+
125+
// Barely a bottleneck. No need to tinker more.
126+
/* Runs the 24-round Keccak permutation in place on the state held in *h.
 *
 * Before permuting, the top bit of 32-bit word 33 of the state is flipped.
 * NOTE(review): this looks like the closing padding bit for a 136-byte
 * rate, with the caller expected to have written the message and the
 * leading padding byte already - confirm against the callers.
 *
 * h: state buffer; read and overwritten through its 64-bit lane view.
 */
void sha3_keccakf(ethhash * const h)
{
	/* h->q decays to a pointer to its first lane. The previous `&h->q`
	 * produced a pointer to the whole array (ulong (*)[25]), which is not
	 * compatible with ulong * even though the address is the same. */
	ulong * const st = h->q;
	h->d[33] ^= 0x80000000;

	/* Scratch lanes used by name inside the THETA, RHOPI and KHI macros. */
	ulong t0, t1, t2, t3, t4;

	// Unrolling and removing the PI stage gave a negligible performance gain on GTX 1070.
	for (int i = 0; i < 24; ++i) {
		THETA(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]);
		RHOPI(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]);
		KHI(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]);
		IOTA(st[0], keccakf_rndc[i]);
	}
}

0 commit comments

Comments
 (0)