|
1 |
| -/* Original: https://github.com/mjosaarinen/tiny_sha3 |
2 |
| - * Below is a very slimmed down version of the already tiny SHA3 |
3 |
| - * implementation by Markku-Juhani O. Saarinen. |
4 |
| - * |
5 |
| - * The implementation below is for inputs of exactly 64 bytes |
6 |
| - * that's pre-filled in ethhash.b[0] - ethhash.b[63]. |
7 |
| - */ |
8 |
| - |
9 |
| -typedef union { |
10 |
| - uchar b[200]; |
11 |
| - ulong q[25]; |
12 |
| - uint d[50]; |
13 |
| -} ethhash; |
14 |
| - |
15 |
| -// constants |
16 |
| -__constant ulong keccakf_rndc[24] = { |
17 |
| - 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, |
18 |
| - 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, |
19 |
| - 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, |
20 |
| - 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, |
21 |
| - 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, |
22 |
| - 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, |
23 |
| - 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, |
24 |
| - 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 |
25 |
| -}; |
26 |
| - |
27 |
| -__constant ulong keccakf_rotc[24] = { |
28 |
| - 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, |
29 |
| - 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 |
30 |
| -}; |
31 |
| - |
32 |
| -__constant int keccakf_piln[24] = { |
33 |
| - 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, |
34 |
| - 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 |
35 |
| -}; |
36 |
| - |
37 |
| -/* Barely a bottleneck. No need to tinker more. */ |
38 |
| -void sha3_keccakf(ethhash * const pHash) |
39 |
| -{ |
40 |
| - ulong * const st = &pHash->q; |
41 |
| - pHash->d[33] ^= 0x80000000; |
42 |
| - |
43 |
| - // variables |
44 |
| - int i, j, r; |
45 |
| - ulong t, bc[5]; |
46 |
| - |
47 |
| - // actual iteration |
48 |
| - for (r = 0; r < 24; r++) { |
49 |
| - // Theta - unrolled |
50 |
| - bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; |
51 |
| - bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; |
52 |
| - bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; |
53 |
| - bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; |
54 |
| - bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; |
55 |
| - |
56 |
| - for (i = 0; i < 5; i++) { |
57 |
| - t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], (ulong) 1); |
58 |
| - |
59 |
| - st[i] ^= t; |
60 |
| - st[i + 5] ^= t; |
61 |
| - st[i + 10] ^= t; |
62 |
| - st[i + 15] ^= t; |
63 |
| - st[i + 20] ^= t; |
64 |
| - } |
65 |
| - |
66 |
| - // Rho Pi |
67 |
| - t = st[1]; |
68 |
| - for (i = 0; i < 24; i++) { |
69 |
| - j = keccakf_piln[i]; |
70 |
| - bc[0] = st[j]; |
71 |
| - |
72 |
| - st[j] = rotate(t, keccakf_rotc[i]); |
73 |
| - t = bc[0]; |
74 |
| - } |
75 |
| - |
76 |
| - // Chi |
77 |
| - for (j = 0; j < 25; j += 5) { |
78 |
| - bc[0] = st[j + 0]; |
79 |
| - bc[1] = st[j + 1]; |
80 |
| - bc[2] = st[j + 2]; |
81 |
| - bc[3] = st[j + 3]; |
82 |
| - bc[4] = st[j + 4]; |
83 |
| - |
84 |
| - st[j + 0] ^= (~bc[1]) & bc[2]; |
85 |
| - st[j + 1] ^= (~bc[2]) & bc[3]; |
86 |
| - st[j + 2] ^= (~bc[3]) & bc[4]; |
87 |
| - st[j + 3] ^= (~bc[4]) & bc[0]; |
88 |
| - st[j + 4] ^= (~bc[0]) & bc[1]; |
89 |
| - } |
90 |
| - |
91 |
| - // Iota |
92 |
| - st[0] ^= keccakf_rndc[r]; |
93 |
| - } |
94 |
| -} |
| 1 | +/* This Keccak implementation is an amalgamation of: |
| 2 | + * Tiny SHA3 implementation by Markku-Juhani O. Saarinen: |
| 3 | + * https://github.com/mjosaarinen/tiny_sha3 |
| 4 | + * Keccak implementation found in xptMiner-gpu @ Github: |
| 5 | + * https://github.com/llamasoft/xptMiner-gpu/blob/master/opencl/keccak.cl |
| 6 | + */ |
| 7 | + |
| 8 | +typedef union { |
| 9 | + uchar b[200]; |
| 10 | + ulong q[25]; |
| 11 | + uint d[50]; |
| 12 | +} ethhash; |
| 13 | + |
/*
 * Theta helper. Stores into t:
 *
 *     rotate(d0 ^ d1 ^ d2 ^ d3 ^ d4, 1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4)
 *
 * t must be an assignable ulong. Every argument is parenthesised so that an
 * argument containing a lower-precedence operator (for example `a | b`) is
 * combined as a unit, and the body is wrapped in do { } while (0) so the
 * macro behaves as a single statement, including in an unbraced if/else.
 * Arguments are evaluated exactly once each.
 */
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) \
do { \
    (t) = rotate((ulong)((d0) ^ (d1) ^ (d2) ^ (d3) ^ (d4)), (ulong)1) ^ \
          ((c0) ^ (c1) ^ (c2) ^ (c3) ^ (c4)); \
} while (0)
| 18 | + |
/*
 * Theta step over 25 lanes. Parameters are named sXY and grouped five at a
 * time: s0*, s1*, s2*, s3*, s4*. For each group X the macro XORs all five
 * lanes of that group with
 *
 *     (XOR of group X-1) ^ rotate(XOR of group X+1, 1)      (indices mod 5)
 *
 * Requirements on the caller:
 *   - every argument must be an assignable ulong lvalue;
 *   - ulong variables t0, t1, t2, t3 and t4 must be in scope; they are
 *     overwritten as scratch space;
 *   - TH_ELT must be defined.
 *
 * The body is wrapped in do { } while (0) so the expansion is a single
 * statement and stays valid inside an unbraced if/else.
 */
#define THETA(s00, s01, s02, s03, s04, \
              s10, s11, s12, s13, s14, \
              s20, s21, s22, s23, s24, \
              s30, s31, s32, s33, s34, \
              s40, s41, s42, s43, s44) \
do { \
    TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14); \
    TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24); \
    TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34); \
    TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44); \
    TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04); \
    (s00) ^= t0; (s01) ^= t0; (s02) ^= t0; (s03) ^= t0; (s04) ^= t0; \
    (s10) ^= t1; (s11) ^= t1; (s12) ^= t1; (s13) ^= t1; (s14) ^= t1; \
    (s20) ^= t2; (s21) ^= t2; (s22) ^= t2; (s23) ^= t2; (s24) ^= t2; \
    (s30) ^= t3; (s31) ^= t3; (s32) ^= t3; (s33) ^= t3; (s34) ^= t3; \
    (s40) ^= t4; (s41) ^= t4; (s42) ^= t4; (s43) ^= t4; (s44) ^= t4; \
} while (0)
| 36 | + |
/*
 * Combined Rho and Pi steps: walks a fixed chain of 24 lanes, writing into
 * each lane the next lane in the chain rotated left by a fixed amount. The
 * first lane read (s10) is saved in t0 and written last (into s02), so the
 * statement order below must not be changed.
 *
 * s00 is accepted for a uniform parameter list but is neither read nor
 * written.
 *
 * Requirements on the caller:
 *   - every argument except s00 must be an assignable ulong lvalue;
 *   - a ulong variable t0 must be in scope; it is overwritten.
 *
 * The body is wrapped in do { } while (0) so the expansion is a single
 * statement and stays valid inside an unbraced if/else.
 */
#define RHOPI(s00, s01, s02, s03, s04, \
              s10, s11, s12, s13, s14, \
              s20, s21, s22, s23, s24, \
              s30, s31, s32, s33, s34, \
              s40, s41, s42, s43, s44) \
do { \
    t0    = rotate((s10), (ulong) 1); \
    (s10) = rotate((s11), (ulong)44); \
    (s11) = rotate((s41), (ulong)20); \
    (s41) = rotate((s24), (ulong)61); \
    (s24) = rotate((s42), (ulong)39); \
    (s42) = rotate((s04), (ulong)18); \
    (s04) = rotate((s20), (ulong)62); \
    (s20) = rotate((s22), (ulong)43); \
    (s22) = rotate((s32), (ulong)25); \
    (s32) = rotate((s43), (ulong) 8); \
    (s43) = rotate((s34), (ulong)56); \
    (s34) = rotate((s03), (ulong)41); \
    (s03) = rotate((s40), (ulong)27); \
    (s40) = rotate((s44), (ulong)14); \
    (s44) = rotate((s14), (ulong) 2); \
    (s14) = rotate((s31), (ulong)55); \
    (s31) = rotate((s13), (ulong)45); \
    (s13) = rotate((s01), (ulong)36); \
    (s01) = rotate((s30), (ulong)28); \
    (s30) = rotate((s33), (ulong)21); \
    (s33) = rotate((s23), (ulong)15); \
    (s23) = rotate((s12), (ulong)10); \
    (s12) = rotate((s21), (ulong) 6); \
    (s21) = rotate((s02), (ulong) 3); \
    (s02) = t0; \
} while (0)
| 69 | + |
/*
 * Chi step. Parameters are named sXY; for each fixed Y the five lanes
 * s0Y..s4Y are replaced by
 *
 *     sXY ^ (~s(X+1)Y & s(X+2)Y)                            (indices mod 5)
 *
 * All five new values of a row are computed into t0..t4 before any lane of
 * that row is written, so each result is based on the row's old contents.
 *
 * Requirements on the caller:
 *   - every argument must be an assignable ulong lvalue;
 *   - ulong variables t0, t1, t2, t3 and t4 must be in scope; they are
 *     overwritten as scratch space.
 *
 * The body is wrapped in do { } while (0) so the expansion is a single
 * statement and stays valid inside an unbraced if/else.
 */
#define KHI(s00, s01, s02, s03, s04, \
            s10, s11, s12, s13, s14, \
            s20, s21, s22, s23, s24, \
            s30, s31, s32, s33, s34, \
            s40, s41, s42, s43, s44) \
do { \
    t0 = (s00) ^ (~(s10) & (s20)); \
    t1 = (s10) ^ (~(s20) & (s30)); \
    t2 = (s20) ^ (~(s30) & (s40)); \
    t3 = (s30) ^ (~(s40) & (s00)); \
    t4 = (s40) ^ (~(s00) & (s10)); \
    (s00) = t0; (s10) = t1; (s20) = t2; (s30) = t3; (s40) = t4; \
    \
    t0 = (s01) ^ (~(s11) & (s21)); \
    t1 = (s11) ^ (~(s21) & (s31)); \
    t2 = (s21) ^ (~(s31) & (s41)); \
    t3 = (s31) ^ (~(s41) & (s01)); \
    t4 = (s41) ^ (~(s01) & (s11)); \
    (s01) = t0; (s11) = t1; (s21) = t2; (s31) = t3; (s41) = t4; \
    \
    t0 = (s02) ^ (~(s12) & (s22)); \
    t1 = (s12) ^ (~(s22) & (s32)); \
    t2 = (s22) ^ (~(s32) & (s42)); \
    t3 = (s32) ^ (~(s42) & (s02)); \
    t4 = (s42) ^ (~(s02) & (s12)); \
    (s02) = t0; (s12) = t1; (s22) = t2; (s32) = t3; (s42) = t4; \
    \
    t0 = (s03) ^ (~(s13) & (s23)); \
    t1 = (s13) ^ (~(s23) & (s33)); \
    t2 = (s23) ^ (~(s33) & (s43)); \
    t3 = (s33) ^ (~(s43) & (s03)); \
    t4 = (s43) ^ (~(s03) & (s13)); \
    (s03) = t0; (s13) = t1; (s23) = t2; (s33) = t3; (s43) = t4; \
    \
    t0 = (s04) ^ (~(s14) & (s24)); \
    t1 = (s14) ^ (~(s24) & (s34)); \
    t2 = (s24) ^ (~(s34) & (s44)); \
    t3 = (s34) ^ (~(s44) & (s04)); \
    t4 = (s44) ^ (~(s04) & (s14)); \
    (s04) = t0; (s14) = t1; (s24) = t2; (s34) = t3; (s44) = t4; \
} while (0)
| 111 | + |
/*
 * Iota step: XORs the value r into the lane s00.
 * s00 must be an assignable lvalue. Wrapped in do { } while (0) so the
 * expansion is a single statement and stays valid inside an unbraced
 * if/else.
 */
#define IOTA(s00, r) do { (s00) ^= (r); } while (0)
| 113 | + |
| 114 | +__constant ulong keccakf_rndc[24] = { |
| 115 | + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, |
| 116 | + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, |
| 117 | + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, |
| 118 | + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, |
| 119 | + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, |
| 120 | + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, |
| 121 | + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, |
| 122 | + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 |
| 123 | +}; |
| 124 | + |
| 125 | +// Barely a bottleneck. No need to tinker more. |
| 126 | +void sha3_keccakf(ethhash * const h) |
| 127 | +{ |
| 128 | + ulong * const st = &h->q; |
| 129 | + h->d[33] ^= 0x80000000; |
| 130 | + ulong t0, t1, t2, t3, t4; |
| 131 | + |
| 132 | + // Unrolling and removing PI stage gave negligable performance on GTX 1070. |
| 133 | + for (int i = 0; i < 24; ++i) { |
| 134 | + THETA(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]); |
| 135 | + RHOPI(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]); |
| 136 | + KHI(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]); |
| 137 | + IOTA(st[0], keccakf_rndc[i]); |
| 138 | + } |
| 139 | +} |
0 commit comments