diff --git a/src/cuckaroom/Makefile b/src/cuckaroom/Makefile
index 301876a..92e07aa 100644
--- a/src/cuckaroom/Makefile
+++ b/src/cuckaroom/Makefile
@@ -60,3 +60,6 @@ cuda19:	../crypto/siphash.cuh compress.hpp graph.hpp mean.cu Makefile
 cuda29:	../crypto/siphash.cuh compress.hpp graph.hpp mean.cu kernel.cuh Makefile
 	$(NVCC) -o $@ -DEDGEBITS=29 -arch sm_35 mean.cu $(BLAKE_2B_SRC)
+
+cd29:	../crypto/siphash.cuh compress.hpp graph.hpp meaner.cu Makefile
+	$(NVCC) -o $@ -DEDGEBITS=29 -arch sm_35 meaner.cu $(BLAKE_2B_SRC)
diff --git a/src/cuckatoo/mean.cu b/src/cuckatoo/mean.cu
index 43deec0..150798d 100644
--- a/src/cuckatoo/mean.cu
+++ b/src/cuckatoo/mean.cu
@@ -14,10 +14,21 @@ typedef uint8_t u8;
 typedef uint16_t u16;
 
-#ifndef NA
-#define NA 4
+// number of invocations of seeding kernel
+// larger values reduce pressure on TLB
+// which appears to be an issue on GTX cards
+#ifndef NTLB
+#define NTLB 1
 #endif
-#define NA2 (NA * NA)
+// number of slices in edge splitting kernel
+// larger values reduce memory use
+// which is proportional to 1+1/NMEM
+#ifndef NMEM
+#define NMEM 4
+#endif
+// after seeding, main memory buffer is a NMEM x NTLB matrix
+// of blocks of buckets
+#define NMEMTLB (NMEM * NTLB)
 
 #ifndef MAXSOLS
 #define MAXSOLS 4
@@ -27,6 +38,7 @@ typedef uint16_t u16;
 // number of bits of compression of surviving edge endpoints
 // reduces space used in cycle finding, but too high a value
 // results in NODE OVERFLOW warnings and fake cycles
+// tag relay achieves a 2^12 reduction in edges
 #define IDXSHIFT 12
 #endif
 
@@ -34,64 +46,44 @@ const u32 MAXEDGES = NEDGES >> IDXSHIFT;
 
 typedef uint64_t u64; // save some typing
 
-#ifndef XBITS
+#ifndef BUCKBITS
 // assumes at least 2^18 bits of shared mem (32 KB) on thread block
-// #define XBITS ((EDGEBITS-18+1)/2)
-// scrap that; too few buckets inhibits parallellism
-#define XBITS 6
+// #define BUCKBITS (EDGEBITS-18)
+#define BUCKBITS 12
 #endif
 
-const u32 NX = 1 << XBITS;
-const u32 NX2 = NX * NX;
-const u32 NX2_NA = NX2 / NA;
-const u32 YBITS = XBITS;
-const u32 NY = 1 << YBITS;
-const u32 YZBITS = EDGEBITS - XBITS;
-const u32 ZBITS = YZBITS - YBITS;
+const u32 NB = 1 << BUCKBITS;
+const u32 NB_NTLB = NB / NTLB;
+const u32 NB_NMEM = NB / NMEM;
+const u32 ZBITS = EDGEBITS - BUCKBITS;
 const u32 NZ = 1 << ZBITS;
 const u32 ZMASK = NZ - 1;
 
+// buckets are a little oversized to allow for variance in distribution
+#define NEPS 128
 #ifndef NEPS_A
 #define NEPS_A 133
 #endif
+// max number of edges per bucket after seeding
+const u32 EDGES_A = NZ * NEPS_A / NEPS;
 #ifndef NEPS_B
 #define NEPS_B 85
 #endif
-#define NEPS 128
-
-const u32 EDGES_A = NZ * NEPS_A / NEPS;
+// max number of edges per bucket after one trim
 const u32 EDGES_B = NZ * NEPS_B / NEPS;
 
-const u32 ROW_EDGES_A = EDGES_A * NY;
-const u32 ROW_EDGES_B = EDGES_B * NY;
-
-// Number of rows in bufferB not overlapping bufferA
-#ifndef NRB1
-#define NRB1 (NX / 2)
-#endif
-#define NRB2 (NX - NRB1)
-#define NB 2
-
 __constant__ uint2 recoveredges[PROOFSIZE];
 __constant__ siphash_keys dipkeys;
 
-#ifndef FLUSHA // should perhaps be in trimparams and passed as template parameter
-#define FLUSHA 4
-#endif
-
-#ifndef NFLUSH
-#define NFLUSH (8 / FLUSHA)
-#endif
-
-__device__ __forceinline__ void bitmapset(u32 *ebitmap, const int bucket) {
-  int word = bucket >> 5;
+__device__ __forceinline__ void bitmapset(u32 *ebitmap, const u32 bucket) {
+  u32 word = bucket >> 5;
   unsigned char bit = bucket & 0x1F;
   const u32 mask = 1 << bit;
   atomicOr(ebitmap + word, mask);
 }
 
-__device__ __forceinline__ bool bitmaptest(u32 *ebitmap, const int bucket) {
-  int word = bucket >> 5;
+__device__ __forceinline__ bool bitmaptest(u32 *ebitmap, const u32 bucket) {
+  u32 word = bucket >> 5;
   unsigned char bit = bucket & 0x1F;
   return (ebitmap[word] >> bit) & 1;
 }
@@ -104,41 +96,65 @@ __device__ __forceinline__ u32 endpoint(uint2 nodes, bool uorv) {
   return uorv ? nodes.y : nodes.x;
 }
 
-template
-__global__ void Seed(u32 * __restrict__ buffer, u32 * __restrict__ indexes, const u64 offset) {
-  const int group = blockIdx.x;
-  const int dim = blockDim.x;
-  const int lid = threadIdx.x;
-  const int gid = group * dim + lid;
-  const int nthreads = gridDim.x * dim;
+#ifndef SEED_TPB
+#define SEED_TPB 256
+#endif
+#ifndef TRIM0_TPB
+#define TRIM0_TPB 1024
+#endif
+#ifndef TRIM1_TPB
+#define TRIM1_TPB 512
+#endif
+#ifndef TRIM_TPB
+#define TRIM_TPB 512
+#endif
+#ifndef RELAY_TPB
+#define RELAY_TPB 512
+#endif
+#ifndef TAIL_TPB
+#define TAIL_TPB SEED_TPB
+#endif
 
-  extern __shared__ u32 tmp[][FLUSHA-1];
-  int *counters = (int *)(tmp[NX2]);
+#ifndef FLUSHA // should perhaps be in trimparams and passed as template parameter
+#define FLUSHA 4
+#endif
 
-#if tpb && NX2 % tpb == 0
-  for (int i = 0; i < NX2/tpb; i++)
-#else
-  for (int i = 0; i < (NX2 - lid + tpb-1) / tpb; i++)
+#ifndef NFLUSH
+#define NFLUSH (8 / FLUSHA)
 #endif
-    counters[lid + tpb * i] = 0;
+
+static_assert(NB % SEED_TPB == 0, "SEED_TPB must divide NB");
+
+template
+__global__ void Seed(u32 * __restrict__ buffer, u32 * __restrict__ indexes, const u32 offset) {
+  const u32 group = blockIdx.x;
+  const u32 lid = threadIdx.x;
+  const u32 gid = group * SEED_TPB + lid;
+  const u32 nthreads = gridDim.x * SEED_TPB;
+
+  extern __shared__ u32 tmp[][FLUSHA-1];
+  u32 *counters = (u32 *)(tmp[NB]);
+
+  for (u32 i = 0; i < NB/SEED_TPB; i++)
+    counters[lid + SEED_TPB * i] = 0;
   __syncthreads();
 
-  const int nloops = (NEDGES / NA - gid + nthreads-1) / nthreads;
-  for (int i = 0; i < nloops; i++) {
+  const u32 nloops = (NEDGES / NTLB - gid + nthreads-1) / nthreads;
+  for (u32 i = 0; i < nloops; i++) {
     const u32 nonce = offset + gid * nloops + i;
     const u32 node0 = endpoint(nonce, 0);
-    const int bucket = node0 >> ZBITS;
-    u32 counter = (int)atomicAdd(counters + bucket, 1);
-    for (int nf=0; ; nf++) {
-      if (counter < FLUSHA-1)
+    const u32 bucket = node0 >> ZBITS;
+    u32 counter = atomicAdd(counters + bucket, 1);
+    for (u32 nf=0; ; nf++) {
+      if (counter < FLUSHA-1) // int counter would need additional test counter >= 0
        tmp[bucket][counter] = nonce;
      __syncthreads();
      if (nf == NFLUSH) break;
      if (counter == FLUSHA-1) {
-        const u64 pos = min((int)atomicAdd(indexes + bucket, FLUSHA), (int)(maxOut - FLUSHA));
-        const u64 idx = (bucket + (bucket / NX2_NA) * (NX2 - NX2_NA)) * maxOut + pos;
+        const u32 pos = min((u32)atomicAdd(indexes + bucket, FLUSHA), (u32)(maxOut - FLUSHA));
+        const u64 idx = (u64)(bucket + (bucket / NB_NMEM) * (NTLB-1) * NB_NMEM) * maxOut + pos;
 #if FLUSHA==4
        ((uint4 *)buffer)[idx/4] = make_uint4(tmp[bucket][0], tmp[bucket][1], tmp[bucket][2], nonce);
 #elif FLUSHA==2
@@ -149,14 +165,15 @@ __global__ void Seed(u32 * __restrict__ buffer, u32 * __restrict__ indexes, cons
      __syncthreads();
      counter -= FLUSHA;
    }
+    // negative counters are fine
    if ((int)counter >= FLUSHA-1) printf("WHOOPS!\n");
  }
-  for (int i = 0; i < NX2 / tpb; i++) {
-    const int bucket = lid + i * tpb;
-    const int cnt = counters[bucket];
+  for (u32 i = 0; i < NB / SEED_TPB; i++) {
+    const u32 bucket = lid + i * SEED_TPB;
+    const u32 cnt = counters[bucket];
    if (cnt) {
-      const u64 pos = min((int)atomicAdd(indexes + bucket, FLUSHA), (int)(maxOut - FLUSHA));
-      const u64 idx = (bucket + (bucket / NX2_NA) * (NX2 - NX2_NA)) * maxOut + pos;
+      const u32 pos = min((u32)atomicAdd(indexes + bucket, FLUSHA), (u32)(maxOut - FLUSHA));
+      const u64 idx = (u64)(bucket + (bucket / NB_NMEM) * (NTLB-1) * NB_NMEM) * maxOut + pos;
 #if FLUSHA==4
      ((uint4 *)buffer)[idx/4] = make_uint4(tmp[bucket][0], cnt >= 2 ? tmp[bucket][1] : 0, cnt >= 3 ? tmp[bucket][2] : 0, 0);
 #elif FLUSHA==2
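
The new comments describe the seeded buffer as a NMEM x NTLB matrix of blocks of buckets. The following host-side sketch (illustrative only, not part of the patch) spells out the index arithmetic Seed uses for that layout; it assumes the Seed template parameter maxOut is the per-invocation bucket capacity EDGES_A/NTLB, since the actual template arguments are not visible in this hunk. The resulting slot order is exactly what Round0 later reads back via (offset*NTLB + group)*maxIn plus a per-NTLB delta.

// Illustrative sketch of the seeded-buffer layout (assumes maxOut == EDGES_A/NTLB).
#include <cassert>
#include <cstdint>
typedef uint32_t u32;
typedef uint64_t u64;

const u32 BUCKBITS = 12, EDGEBITS = 29, NMEM = 4, NTLB = 4;   // NTLB=4 chosen for illustration
const u32 NB = 1 << BUCKBITS, NB_NMEM = NB / NMEM;
const u32 NZ = 1 << (EDGEBITS - BUCKBITS);
const u32 EDGES_A = NZ * 133 / 128;
const u32 maxOut = EDGES_A / NTLB;            // assumed per-invocation bucket capacity

// u32 slot where Seed invocation `tlb` stores edge number `pos` of `bucket`
u64 seedSlot(u32 tlb, u32 bucket, u32 pos) {
  const u64 base = (u64)tlb * NB_NMEM * maxOut;  // bufferA + tlb*(sizeA/NMEMTLB)
  const u64 idx  = (u64)(bucket + (bucket / NB_NMEM) * (NTLB-1) * NB_NMEM) * maxOut + pos;
  return base + idx;
}

int main() {
  // NMEM slices of NTLB*NB_NMEM bucket blocks: slice m = bucket/NB_NMEM owns a
  // contiguous region, and within it invocation tlb owns blocks [tlb*NB_NMEM, (tlb+1)*NB_NMEM)
  for (u32 tlb = 0; tlb < NTLB; tlb++)
    for (u32 bucket = 0; bucket < NB; bucket++) {
      u32 m = bucket / NB_NMEM, r = bucket % NB_NMEM;
      assert(seedSlot(tlb, bucket, 0) == ((u64)m * NTLB * NB_NMEM + (u64)tlb * NB_NMEM + r) * maxOut);
    }
}
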
@@ -177,91 +194,87 @@ __global__ void Seed(u32 * __restrict__ buffer, u32 * __restrict__ indexes, cons
 const u32 PART_MASK = (1 << PART_BITS) - 1;
 const u32 NONPART_BITS = ZBITS - PART_BITS;
 const word_t NONPART_MASK = (1 << NONPART_BITS) - 1;
-const int BITMAPBYTES = (NZ >> PART_BITS) / 8;
+const u32 BITMAPBYTES = (NZ >> PART_BITS) / 8;
 
-template
-__global__ void Round0(u32 * src, u32 * dst, u32 * srcIdx, u32 * dstIdx, const int offset) {
-  const int part = blockIdx.x & PART_MASK;
-  const int group = blockIdx.x >> PART_BITS;
-  const int lid = threadIdx.x;
-  const int BITMAPWORDS = BITMAPBYTES / sizeof(u32);
-  int nloops[NA];
+template
+__global__ void Round0(u32 * src, u32 * dst, u32 * srcIdx, u32 * dstIdx, const u32 offset) {
+  const u32 part = blockIdx.x & PART_MASK;
+  const u32 group = blockIdx.x >> PART_BITS;
+  const u32 lid = threadIdx.x;
+  const u32 BITMAPWORDS = BITMAPBYTES / sizeof(u32);
+  u32 nloops[NTLB];
   extern __shared__ u32 ebitmap[];
 
-#if tpb && BITMAPWORDS% tpb == 0
-  for (int i = 0; i < BITMAPWORDS/tpb; i++)
-#else
-  for (int i = 0; i < (BITMAPWORDS- lid + tpb-1) / tpb; i++)
-#endif
-    ebitmap[lid + tpb * i] = 0;
+  static_assert(BITMAPWORDS % TRIM0_TPB == 0, "TRIM0_TPB must divide BITMAPWORDS");
 
-  for (int a = 0; a < NA; a++)
-    nloops[a] = (min(srcIdx[a * NX2 + offset + group], maxIn) - lid + tpb-1) / tpb;
+  for (u32 i = 0; i < BITMAPWORDS/TRIM0_TPB; i++)
+    ebitmap[lid + TRIM0_TPB * i] = 0;
 
-  const int rowOffset = offset * NA;
-  src += maxIn * (rowOffset + group) + lid;
+  for (u32 a = 0; a < NTLB; a++)
+    nloops[a] = (min(srcIdx[a * NB + offset + group], maxIn) - lid + TRIM0_TPB-1) / TRIM0_TPB;
+
+  src += (u64)(offset * NTLB + group) * maxIn + lid;
 
   __syncthreads();
 
-  for (int a = 0; a < NA; a++) {
-    const int delta = a * NX2_NA * maxIn;
-    for (int i = 0; i < nloops[a]; i++) {
-      u32 edge = src[delta + i * tpb];
+  for (u32 a = 0; a < NTLB; a++) {
+    const u64 delta = a * NB_NMEM * (u64)maxIn;
+    for (u32 i = 0; i < nloops[a]; i++) {
+      u32 edge = src[delta + i * TRIM0_TPB];
      if (!edge) continue;
      u32 z = endpoint(edge, 0) & ZMASK;
-      if ((z >> NONPART_BITS) == part)
+      if (!PART_BITS || (z >> NONPART_BITS) == part)
        bitmapset(ebitmap, z & NONPART_MASK);
    }
  }
 
  __syncthreads();
 
-  for (int a = 0; a < NA; a++) {
-    const int delta = a * NX2_NA * maxIn;
-    for (int i = 0; i < nloops[a]; i++) {
-      const u32 edge = src[delta + i * tpb];
+  const u32 rowOffset = offset * NMEM;
+  for (u32 a = 0; a < NTLB; a++) {
+    const u64 delta = a * NB_NMEM * (u64)maxIn;
+    for (u32 i = 0; i < nloops[a]; i++) {
+      const u32 edge = src[delta + i * TRIM0_TPB];
      if (!edge) continue;
      const u32 node0 = endpoint(edge, 0);
      const u32 z = node0 & ZMASK;
-      if ((z >> NONPART_BITS) == part && bitmaptest(ebitmap, (z & NONPART_MASK) ^ 1)) {
-        const int bucket = endpoint(edge, 1) >> ZBITS;
-        const int bktIdx = min(atomicAdd(dstIdx + bucket + rowOffset, 1), maxOut - 1);
+      if ((!PART_BITS || (z >> NONPART_BITS) == part) && bitmaptest(ebitmap, (z & NONPART_MASK) ^ 1)) {
+        const u32 bucket = endpoint(edge, 1) >> ZBITS;
+        const u32 bktIdx = min(atomicAdd(dstIdx + rowOffset + bucket, 1), maxOut - 1);
        dst[bucket * maxOut + bktIdx] = edge;
      }
    }
  }
 }
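
Round0 (and the later Round kernels) zero a per-bucket node bitmap in dynamic shared memory, sized by BITMAPBYTES. A compile-time sketch of how those sizes work out for the defaults EDGEBITS=29 and BUCKBITS=12 follows; it assumes PART_BITS = 0, which is an assumption here since PART_BITS is defined outside this diff.

// Illustrative shared-memory sizing check (not part of the patch; assumes PART_BITS = 0).
#include <cstdint>
typedef uint32_t u32;

constexpr u32 EDGEBITS  = 29;
constexpr u32 BUCKBITS  = 12;
constexpr u32 PART_BITS = 0;                            // assumed default
constexpr u32 ZBITS     = EDGEBITS - BUCKBITS;          // 17
constexpr u32 NZ        = 1 << ZBITS;                   // 131072 node values per bucket
constexpr u32 BITMAPBYTES = (NZ >> PART_BITS) / 8;      // one bit per node = 16 KB
constexpr u32 BITMAPWORDS = BITMAPBYTES / sizeof(u32);  // 4096 words
constexpr u32 TRIM0_TPB = 1024;

static_assert(BITMAPBYTES == 16384, "16 KB of dynamic shared memory per thread block");
static_assert(BITMAPWORDS % TRIM0_TPB == 0, "each thread clears BITMAPWORDS/TRIM0_TPB words");
static_assert(BITMAPBYTES <= 0x10000, "within the 64 KB opt-in set via cudaFuncSetAttribute");

int main() { return 0; }
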
-template
+template
 __global__ void Round1(u32 * src, u32 * dst, u32 * srcIdx, u32 * dstIdx) {
-  const int part = blockIdx.x & PART_MASK;
-  const int group = blockIdx.x >> PART_BITS;
-  const int lid = threadIdx.x;
+  const u32 part = blockIdx.x & PART_MASK;
+  const u32 group = blockIdx.x >> PART_BITS;
+  const u32 lid = threadIdx.x;
 
-  const int BITMAPWORDS = BITMAPBYTES / sizeof(u32);
-  int nloops[NA];
+  const u32 BITMAPWORDS = BITMAPBYTES / sizeof(u32);
+  u32 nloops[NMEM];
   extern __shared__ u32 ebitmap[];
 
-#if tpb && BITMAPWORDS% tpb == 0
-  for (int i = 0; i < BITMAPWORDS/tpb; i++)
-#else
-  for (int i = 0; i < (BITMAPWORDS- lid + tpb-1) / tpb; i++)
-#endif
-    ebitmap[lid + tpb * i] = 0;
+  static_assert(BITMAPWORDS % TRIM1_TPB == 0, "TRIM1_TPB must divide BITMAPWORDS");
 
-  for (int a = 0; a < NA; a++)
-    nloops[a] = (min(srcIdx[a * NX2 + group], maxIn) - lid + tpb-1) / tpb;
+  for (u32 i = 0; i < BITMAPWORDS/TRIM1_TPB; i++)
+    ebitmap[lid + TRIM1_TPB * i] = 0;
 
-  src += maxIn * group + lid;
+  for (u32 a = 0; a < NMEM; a++)
+    nloops[a] = (min(srcIdx[a * NB + group], maxIn) - lid + TRIM1_TPB-1) / TRIM1_TPB;
+
+  src += (u64)maxIn * group + lid;
 
  __syncthreads();
 
-  for (int a = 0; a < NA; a++) {
-    const int delta = a * maxIn * NX2;
-    for (int i = 0; i < nloops[a]; i++) {
-      u32 edge = src[delta + i * tpb];
+  for (u32 a = 0; a < NMEM; a++) {
+    const u64 delta = a * (u64)maxIn * NB;
+    for (u32 i = 0; i < nloops[a]; i++) {
+      u32 edge = src[delta + i * TRIM1_TPB];
      u32 z = endpoint(edge, 1) & ZMASK;
      if ((z >> NONPART_BITS) == part)
        bitmapset(ebitmap, z & NONPART_MASK);
@@ -270,46 +283,44 @@ __global__ void Round1(u32 * src, u32 * dst, u32 * srcIdx, u32 * dstIdx) {
 
  __syncthreads();
 
-  for (int a = 0; a < NA; a++) {
-    const int delta = a * maxIn * NX2;
-    for (int i = 0; i < nloops[a]; i++) {
-      const u32 edge = src[delta + i * tpb];
+  for (u32 a = 0; a < NMEM; a++) {
+    const u64 delta = a * (u64)maxIn * NB;
+    for (u32 i = 0; i < nloops[a]; i++) {
+      const u32 edge = src[delta + i * TRIM1_TPB];
      const u32 node1 = endpoint(edge, 1);
      const u32 z = node1 & ZMASK;
      if ((z >> NONPART_BITS) == part && bitmaptest(ebitmap, (z & NONPART_MASK) ^ 1)) {
-        const int bucket = endpoint(edge, 0) >> ZBITS;
-        const int bktIdx = min(atomicAdd(dstIdx + bucket, 1), maxOut - 1);
+        const u32 bucket = endpoint(edge, 0) >> ZBITS;
+        const u32 bktIdx = min(atomicAdd(dstIdx + bucket, 1), maxOut - 1);
        dst[bucket * maxOut + bktIdx] = edge;
      }
    }
  }
 }
 
-template
-__global__ void Round(const int round, EdgeIn * src, uint2 * dst, u32 * srcIdx, u32 * dstIdx) {
-  const int part = blockIdx.x & PART_MASK;
-  const int group = blockIdx.x >> PART_BITS;
-  const int lid = threadIdx.x;
+template
+__global__ void Round(const u32 round, EdgeIn * src, uint2 * dst, u32 * srcIdx, u32 * dstIdx) {
+  const u32 part = blockIdx.x & PART_MASK;
+  const u32 group = blockIdx.x >> PART_BITS;
+  const u32 lid = threadIdx.x;
 
-  const int BITMAPWORDS = BITMAPBYTES / sizeof(u32);
+  const u32 BITMAPWORDS = BITMAPBYTES / sizeof(u32);
   extern __shared__ u32 ebitmap[];
 
-  const int nloops = (min(srcIdx[group], maxIn) - lid + tpb-1) / tpb;
+  const u32 nloops = (min(srcIdx[group], maxIn) - lid + TRIM_TPB-1) / TRIM_TPB;
 
-#if tpb && BITMAPWORDS % tpb == 0
-  for (int i = 0; i < BITMAPWORDS/tpb; i++)
-#else
-  for (int i = 0; i < (BITMAPWORDS - lid + tpb-1) / tpb; i++)
-#endif
-    ebitmap[lid + tpb * i] = 0;
+  static_assert(BITMAPWORDS % TRIM_TPB == 0, "TRIM_TPB must divide BITMAPWORDS");
+
+  for (u32 i = 0; i < BITMAPWORDS/TRIM_TPB; i++)
+    ebitmap[lid + TRIM_TPB * i] = 0;
 
  src += maxIn * group + lid;
 
  __syncthreads();
 
-  for (int i = 0; i < nloops; i++) {
-    const EdgeIn edge = src[i * tpb]; // EdgeIn edge = __ldg(&src[index]);
+  for (u32 i = 0; i < nloops; i++) {
+    const EdgeIn edge = src[i * TRIM_TPB]; // EdgeIn edge = __ldg(&src[index]);
    const u32 z = endpoint(edge, 0) & ZMASK;
    if ((z >> NONPART_BITS) == part)
      bitmapset(ebitmap, z & NONPART_MASK);
@@ -317,14 +328,14 @@ __global__ void Round(const u32 round, EdgeIn * src, uint2 * dst, u32 * srcIdx,
 
  __syncthreads();
 
-  for (int i = 0; i < nloops; i++) {
-    const EdgeIn edge = src[i * tpb]; // EdgeIn edge = __ldg(&src[index]);
+  for (u32 i = 0; i < nloops; i++) {
+    const EdgeIn edge = src[i * TRIM_TPB]; // EdgeIn edge = __ldg(&src[index]);
    const u32 node = endpoint(edge, 0);
    const u32 z = node & ZMASK;
    if ((z >> NONPART_BITS) == part && bitmaptest(ebitmap, (z & NONPART_MASK) ^ 1)) {
      u32 node2 = endpoint(edge, 1);
-      const int bucket = node2 >> ZBITS;
-      const int bktIdx = min(atomicAdd(dstIdx + bucket, 1), maxOut - 1);
+      const u32 bucket = node2 >> ZBITS;
+      const u32 bktIdx = min(atomicAdd(dstIdx + bucket, 1), maxOut - 1);
      dst[bucket * maxOut + bktIdx] = make_uint2(node2, node);
    }
  }
@@ -341,26 +352,28 @@ const u32 LISTMASK = NLISTS - 1;
 #define NNEXTS NLISTS
 #endif
 
-template
+const u32 COPYFLAG = NZ;
+
+template
 __global__ void Relay(const u32 round, const uint2 * source, uint2 * destination, const u32 * sourceIndexes, u32 * destinationIndexes, bool TAGGED) {
-  const int lid = threadIdx.x;
-  const int group = blockIdx.x;
+  const u32 lid = threadIdx.x;
+  const u32 group = blockIdx.x;
 
   __shared__ u32 lists[NLISTS];
   __shared__ u32 nexts[NNEXTS];
 
-  const int nloops = (min(sourceIndexes[group], NNEXTS) - lid + tpb-1) / tpb;
+  const u32 nloops = (min(sourceIndexes[group], NNEXTS) - lid + RELAY_TPB-1) / RELAY_TPB;
 
  source += bktInSize * group;
 
-  for (int i = 0; i < NLISTS/tpb; i++)
-    lists[i * tpb + lid] = ~0;
+  for (u32 i = 0; i < NLISTS/RELAY_TPB; i++)
+    lists[i * RELAY_TPB + lid] = ~0;
 
  __syncthreads();
 
-  for (int i = 0; i < nloops; i++) {
-    const u32 index = i * tpb + lid;
+  for (u32 i = 0; i < nloops; i++) {
+    const u32 index = i * RELAY_TPB + lid;
    const u32 list = endpoint(source[index], 0) & LISTMASK;
    nexts[index] = atomicExch(&lists[list], index);
  }
@@ -368,9 +381,9 @@ __global__ void Relay(const u32 round, const uint2 * source, uint2 * destinatio
 
  __syncthreads();
 
  for (int i = nloops; --i >= 0;) {
-    const u32 index = i * tpb + lid;
+    const u32 index = i * RELAY_TPB + lid;
    const uint2 edge = source[index];
-    if (edge.y & NEDGES) continue; // copies don't relay
+    if (TAGGED && (edge.x & COPYFLAG)) continue; // copies don't relay
    u32 bucket = edge.y >> ZBITS;
    u32 copybit = 0;
    const u32 list = (edge.x & LISTMASK) ^ 1;
@@ -378,31 +391,31 @@ __global__ void Relay(const u32 round, const uint2 * source, uint2 * destinatio
      uint2 tagged = source[idx];
      if ((tagged.x ^ edge.x ^ 1) & ZMASK) continue;
      u32 bktIdx = min(atomicAdd(destinationIndexes + bucket, 1), bktOutSize - 1);
-      u32 tag = TAGGED ? tagged.x >> ZBITS : tagged.y >> 1;
-      destination[(bucket * bktOutSize) + bktIdx] = make_uint2((tag << ZBITS) | (edge.y & ZMASK), copybit | (group << ZBITS) | (edge.x & ZMASK));
-      copybit = NEDGES;
+      u32 tag = (TAGGED ? tagged.x >> ZBITS : tagged.y >> 1) & ~1 | copybit;
+      destination[(bucket * bktOutSize) + bktIdx] = make_uint2((tag << ZBITS) | (edge.y & ZMASK), (group << ZBITS) | (edge.x & ZMASK));
+      copybit = 1;
    }
  }
 }
 
-template
+template
 __global__ void Tail(const uint2 *source, uint4 *destination, const u32 *sourceIndexes, u32 *destinationIndexes) {
-  const int lid = threadIdx.x;
-  const int group = blockIdx.x;
+  const u32 lid = threadIdx.x;
+  const u32 group = blockIdx.x;
 
   __shared__ u32 lists[NLISTS];
   __shared__ u32 nexts[NNEXTS];
 
-  const int nloops = (min(sourceIndexes[group], NNEXTS) - lid + tpb-1) / tpb;
+  const u32 nloops = (min(sourceIndexes[group], NNEXTS) - lid + tpb-1) / tpb;
 
  source += maxIn * group;
 
-  for (int i = 0; i < NLISTS/tpb; i++)
+  for (u32 i = 0; i < NLISTS/tpb; i++)
    lists[i * tpb + lid] = ~0;
 
  __syncthreads();
 
-  for (int i = 0; i < nloops; i++) {
+  for (u32 i = 0; i < nloops; i++) {
    const u32 index = i * tpb + lid;
    const u32 list = source[index].x & LISTMASK;
    nexts[index] = atomicExch(&lists[list], index);
@@ -413,17 +426,13 @@ __global__ void Tail(const uint2 *source, uint4 *destination, const u32 *sourceI
  for (int i = nloops; --i >= 0;) {
    const u32 index = i * tpb + lid;
    const uint2 edge = source[index];
-#ifdef DBG101
-    if (((edge.x^0x1d3cc2ae)&ZMASK)<2) printf("Tail group %x x %x y %x tag %x\n", group, edge.x, edge.y, edge.x>>ZBITS);
-    if (((edge.y^0x1d3cc2ae)&ZMASK)<2) printf("Tail group %x x %x y %x tag %x\n", group, edge.x, edge.y, edge.x>>ZBITS);
-#endif
    if (edge.x & 1) continue;
    const u32 list = (edge.x & LISTMASK) ^ 1;
    for (u32 idx = lists[list]; idx != ~0; idx = nexts[idx]) {
      uint2 other = source[idx];
-      if ((other.x ^ edge.x) != 1) continue;
+      if (((other.x ^ edge.x) & ~COPYFLAG) != 1) continue;
      u32 bktIdx = atomicAdd(destinationIndexes, 2);
-      destination[bktIdx/2] = make_uint4(edge.y & (NEDGES-1), (group << ZBITS) | (edge.x & ZMASK),
+      destination[bktIdx/2] = make_uint4(edge.y, (group << ZBITS) | (edge.x & ZMASK),
                                         other.y & (NEDGES-1), (group << ZBITS) | (other.x & ZMASK));
    }
  }
 }
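
Both Relay and Tail bucket their input edges into short chains in shared memory: lists[] holds one head index per list, nexts[] the per-edge link, and the chains are built with atomicExch and then walked from the paired list (low bit of the key flipped). A standalone sketch of that pattern, illustrative only and with TPB/NLISTS values chosen for the example:

// Sketch of the lists/nexts chaining pattern used by Relay and Tail (not part of the patch).
#include <cstdint>
typedef uint32_t u32;

#define TPB    256
#define NLISTS 256            // NLISTS/TPB heads cleared per thread; here it is 1
#define LISTMASK (NLISTS - 1)

// Assumes n <= NLISTS, matching the min(sourceIndexes[group], NNEXTS) cap in the kernels.
__global__ void chain(const u32 *keys, u32 n, u32 *out) {
  __shared__ u32 lists[NLISTS];   // head index of each chain, ~0 means empty
  __shared__ u32 nexts[NLISTS];   // link from an element to the previous head
  const u32 lid = threadIdx.x;

  for (u32 i = 0; i < NLISTS/TPB; i++)
    lists[i * TPB + lid] = ~0;
  __syncthreads();

  // phase 1: push every element onto the chain selected by its low key bits
  for (u32 i = lid; i < n; i += TPB)
    nexts[i] = atomicExch(&lists[keys[i] & LISTMASK], i);
  __syncthreads();

  // phase 2: walk the chain of the paired list (low bit flipped), the way
  // Relay and Tail look for the matching half of an edge
  for (u32 i = lid; i < n; i += TPB) {
    u32 matches = 0;
    for (u32 idx = lists[(keys[i] & LISTMASK) ^ 1]; idx != ~0u; idx = nexts[idx])
      matches++;
    out[i] = matches;
  }
}
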
@@ -445,19 +454,19 @@ inline int gpuAssert(cudaError_t code, const char *file, int line, bool abort=tr
 }
 
 __global__ void Recovery(u32 *indexes) {
-  const int gid = blockDim.x * blockIdx.x + threadIdx.x;
-  const int lid = threadIdx.x;
-  const int nthreads = blockDim.x * gridDim.x;
+  const u32 gid = blockDim.x * blockIdx.x + threadIdx.x;
+  const u32 lid = threadIdx.x;
+  const u32 nthreads = blockDim.x * gridDim.x;
   __shared__ u32 nonces[PROOFSIZE];
-  const int nloops = (NEDGES -gid + nthreads-1) / nthreads;
+  const u32 nloops = (NEDGES -gid + nthreads-1) / nthreads;
   if (lid < PROOFSIZE) nonces[lid] = 0;
   __syncthreads();
-  for (int i = 0; i < nloops; i++) {
+  for (u32 i = 0; i < nloops; i++) {
    u64 nonce = gid * nloops + i;
    u64 u = endpoint(nonce, 0);
    u64 v = endpoint(nonce, 1);
-    for (int i = 0; i < PROOFSIZE; i++) {
+    for (u32 i = 0; i < PROOFSIZE; i++) {
      if (recoveredges[i].x == v && recoveredges[i].y == u)
        nonces[i] = nonce;
    }
@@ -474,25 +483,6 @@ struct blockstpb {
   u16 tpb;
 };
 
-#ifndef SEED_TPB
-#define SEED_TPB 256
-#endif
-#ifndef TRIM0_TPB
-#define TRIM0_TPB 1024
-#endif
-#ifndef TRIM1_TPB
-#define TRIM1_TPB 512
-#endif
-#ifndef TRIM_TPB
-#define TRIM_TPB 512
-#endif
-#ifndef RELAY_TPB
-#define RELAY_TPB 512
-#endif
-#ifndef TAIL_TPB
-#define TAIL_TPB SEED_TPB
-#endif
-
 struct trimparams {
   u16 ntrims;
   blockstpb seed;
@@ -504,16 +494,16 @@ struct trimparams {
 
   trimparams() {
    ntrims = 31;
-    seed.blocks = 1024;
+    seed.blocks = NB_NTLB;
    seed.tpb = SEED_TPB;
-    trim0.blocks = NX2_NA;
+    trim0.blocks = NB_NMEM;
    trim0.tpb = TRIM0_TPB;
-    trim1.blocks = NX2_NA;
+    trim1.blocks = NB_NMEM;
    trim1.tpb = TRIM1_TPB;
-    trim.blocks = NX2;
+    trim.blocks = NB;
    trim.tpb = TRIM_TPB;
-    tail.blocks = NX2;
-    tail.tpb = TAIL_TPB;;
+    tail.blocks = NB;
+    tail.tpb = TAIL_TPB;
    recover.blocks = 2048;
    recover.tpb = 256;
  }
 };
@@ -526,8 +516,9 @@ struct edgetrimmer {
   trimparams tp;
   edgetrimmer *dt;
   size_t sizeA, sizeB;
-  const size_t indexesSize = NX * NY * sizeof(u32);
-  const size_t indexesSizeNA = NA * indexesSize;
+  const size_t indexesSize = NB * sizeof(u32);
+  const size_t indexesSizeNTLB = NTLB * indexesSize;
+  const size_t indexesSizeNMEM = NMEM * indexesSize;
   u8 *bufferA;
   u8 *bufferB;
   u8 *bufferA1;
@@ -539,32 +530,34 @@ struct edgetrimmer {
   bool abort;
   bool initsuccess = false;
 
+  static_assert((NTLB & (NTLB-1)) == 0, "ensure NTLB is a 2 power");
+  static_assert((NMEM & (NMEM-1)) == 0, "ensure NMEM is a 2 power");
+
   edgetrimmer(const trimparams _tp) : tp(_tp) {
    checkCudaErrors_V(cudaMalloc((void**)&dt, sizeof(edgetrimmer)));
    checkCudaErrors_V(cudaMalloc((void**)&uvnodes, PROOFSIZE * 2 * sizeof(u32)));
-    checkCudaErrors_V(cudaMalloc((void**)&indexesA, indexesSizeNA));
-    checkCudaErrors_V(cudaMalloc((void**)&indexesB, indexesSizeNA));
-    sizeA = (u64)ROW_EDGES_A * NX * sizeof(u32);
-    sizeB = (u64)ROW_EDGES_B * NX * sizeof(u32);
-    const size_t bufferSize = sizeA + sizeB / NA;
+    checkCudaErrors_V(cudaMalloc((void**)&indexesA, indexesSizeNTLB));
+    checkCudaErrors_V(cudaMalloc((void**)&indexesB, indexesSizeNMEM));
+    sizeA = (u64)EDGES_A * NB * sizeof(u32);
+    sizeB = (u64)EDGES_B * NB * sizeof(u32);
+    const size_t bufferSize = sizeA + sizeB / NMEM;
    checkCudaErrors_V(cudaMalloc((void**)&bufferB, bufferSize));
-    bufferA = bufferB + sizeB / NA;
+    bufferA = bufferB + sizeB / NMEM;
    bufferA1 = bufferB + sizeB;
-    print_log("allocated %lld bytes bufferB %llx bufferA %llx bufferA1 %llx\n", bufferSize, bufferB, bufferA, bufferA1);
-    print_log("endB %llx endA %llx endBuffer %llx\n", bufferB+sizeB, bufferA+sizeA, bufferB+bufferSize);
-    assert((NA & (NA-1)) == 0); // ensure NA is a 2 power
+    // print_log("allocated %lld bytes bufferB %llx bufferA %llx bufferA1 %llx\n", bufferSize, bufferB, bufferA, bufferA1);
+    // print_log("endB %llx endA %llx endBuffer %llx\n", bufferB+sizeB, bufferA+sizeA, bufferB+bufferSize);
    cudaMemcpy(dt, this, sizeof(edgetrimmer), cudaMemcpyHostToDevice);
    initsuccess = true;
-    int maxbytes = 0x10000; // 64 KB
-    cudaFuncSetAttribute(Seed < SEED_TPB, EDGES_A/NA >, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
-    cudaFuncSetAttribute(Round0, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
-    cudaFuncSetAttribute(Round1, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
-    cudaFuncSetAttribute(Round < TRIM_TPB, EDGES_B/2, u32, EDGES_A/4>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
-    cudaFuncSetAttribute(Round < TRIM_TPB, EDGES_A/4, uint2, EDGES_B/4>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
-    cudaFuncSetAttribute(Round < TRIM_TPB, EDGES_B/4, uint2, EDGES_B/4>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
+    u32 maxbytes = 0x10000; // 64 KB
+    cudaFuncSetAttribute(Seed , cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
+    cudaFuncSetAttribute(Round0, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
+    cudaFuncSetAttribute(Round1, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
+    cudaFuncSetAttribute(Round , cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
+    cudaFuncSetAttribute(Round , cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
+    cudaFuncSetAttribute(Round , cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
  }
   u64 globalbytes() const {
-    return sizeA + sizeB/NA + 2 * indexesSizeNA + sizeof(siphash_keys) + PROOFSIZE * 2*sizeof(u32) + sizeof(edgetrimmer);
+    return sizeA + sizeB/NMEM + indexesSizeNTLB + indexesSizeNMEM + sizeof(siphash_keys) + PROOFSIZE * 2*sizeof(u32) + sizeof(edgetrimmer);
  }
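
For the default EDGEBITS=29, BUCKBITS=12, NMEM=4, the allocation above comes to roughly 2.4 GiB plus the index arrays. A quick host-side recomputation of those figures, illustrative only:

// Illustrative recomputation of the buffer sizes allocated above (not part of the patch).
// Defaults assumed: EDGEBITS=29, BUCKBITS=12, NEPS_A=133, NEPS_B=85, NMEM=4.
#include <cstdint>
#include <cstdio>
typedef uint64_t u64;

int main() {
  const u64 NB = 1 << 12, NZ = 1 << (29 - 12);
  const u64 EDGES_A = NZ * 133 / 128;                  // 136192 edges per bucket
  const u64 EDGES_B = NZ * 85 / 128;                   //  87040 edges per bucket
  const u64 sizeA = EDGES_A * NB * sizeof(uint32_t);   // ~2.08 GiB
  const u64 sizeB = EDGES_B * NB * sizeof(uint32_t);   // ~1.33 GiB
  const u64 NMEM = 4;
  // One allocation of sizeA + sizeB/NMEM bytes: bufferA = bufferB + sizeB/NMEM,
  // so A ends exactly at the end of the allocation, and bufferA1 = bufferB + sizeB
  // lives inside its tail.
  const u64 bufferSize = sizeA + sizeB / NMEM;
  printf("sizeA %llu  sizeB %llu  bufferSize %llu (%.2f GiB)\n",
         (unsigned long long)sizeA, (unsigned long long)sizeB,
         (unsigned long long)bufferSize, bufferSize / (double)(1ULL << 30));
  return 0;
}
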
   ~edgetrimmer() {
    checkCudaErrors_V(cudaFree(bufferB));
@@ -581,12 +574,17 @@ struct edgetrimmer {
   void indexcount(u32 round, const u32 *indexes) {
 #ifdef VERBOSE
-    u32 nedges;
-    cudaMemcpy(&nedges, indexes+VBIDX, sizeof(u32), cudaMemcpyDeviceToHost);
+    u32 nedges[NB];
+    cudaMemcpy(nedges, indexes, NB * sizeof(u32), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
-    print_log("round %d edges %d\n", round, nedges);
+    u32 sum, max;
+    for (u32 i = sum = max = 0; i < NB; i++) {
+      sum += nedges[i];
+      if (nedges[i] > max)
+        max = nedges[i];
+    }
+    print_log("round %d edges avg %d max %d\n", round, sum/NB, max);
 #endif
-
  }
   u32 trim() {
    cudaEvent_t start, stop;
@@ -597,9 +595,9 @@ struct edgetrimmer {
    float durationA;
    cudaEventRecord(start, NULL);
 
-    cudaMemset(indexesA, 0, indexesSizeNA);
-    for (u32 i = 0; i < NA; i++) {
-      Seed<<>>((u32*)(bufferA+i*(sizeA/NA2)), indexesA+i*NX2, i*(NEDGES/NA));
+    cudaMemset(indexesA, 0, indexesSizeNTLB);
+    for (u32 i = 0; i < NTLB; i++) {
+      Seed<<>>((u32*)(bufferA+i*(sizeA/NMEMTLB)), indexesA+i*NB, i*(NEDGES/NTLB));
      cudaDeviceSynchronize();
      if (abort) return false;
    }
@@ -610,46 +608,47 @@ struct edgetrimmer {
    checkCudaErrors(cudaEventRecord(stop, NULL)); checkCudaErrors(cudaEventSynchronize(stop)); checkCudaErrors(cudaEventElapsedTime(&durationA, start, stop));
    // checkCudaErrors(cudaEventDestroy(start)); checkCudaErrors(cudaEventDestroy(stop));
    print_log("Seeding completed in %.0f ms\n", durationA);
+    indexcount(0, indexesA);
    if (abort) return false;
 
-    cudaMemset(indexesB, 0, indexesSizeNA);
+    cudaMemset(indexesB, 0, indexesSizeNMEM);
 
-    const size_t qB = sizeB / NA;
-    for (u32 i = 0; i < NA; i++) {
-      Round0<<>>((u32*)bufferA, (u32*)(bufferB+i*qB), indexesA, indexesB, i*NX2_NA); // to .632
+    const size_t qB = sizeB / NMEM;
+    for (u32 i = 0; i < NMEM; i++) {
+      Round0<<>>((u32*)bufferA, (u32*)(bufferB+i*qB), indexesA, indexesB, i*NB_NMEM); // to .632
      if (abort) return false;
    }
    indexcount(1, indexesB);
    cudaMemset(indexesA, 0, indexesSize);
-    Round1<<>>((u32*)bufferB, (u32*)bufferA1, indexesB, indexesA); // to .296
+    Round1<<>>((u32*)bufferB, (u32*)bufferA1, indexesB, indexesA); // to .296
    if (abort) return false;
    indexcount(2, indexesA);
 
    cudaMemset(indexesB, 0, indexesSize);
-    Round<<>>(2, (u32 *)bufferA1, (uint2 *)bufferB, indexesA, indexesB); // to .176
+    Round<<>>(2, (u32 *)bufferA1, (uint2 *)bufferB, indexesA, indexesB); // to .176
    if (abort) return false;
    indexcount(3, indexesB);
    cudaMemset(indexesA, 0, indexesSize);
-    Round<<>>(3, (uint2 *)bufferB, (uint2 *)bufferA1, indexesB, indexesA); // to .116
+    Round<<>>(3, (uint2 *)bufferB, (uint2 *)bufferA1, indexesB, indexesA); // to .116
    if (abort) return false;
    indexcount(4, indexesA);
 
-    for (int round = 5; round < tp.ntrims + PROOFSIZE/2-1; round += 2) {
+    for (u32 round = 5; round < tp.ntrims + PROOFSIZE/2-1; round += 2) {
      cudaMemset(indexesB, 0, indexesSize);
      if (round >= tp.ntrims)
-        Relay<<>>(round-1, (uint2 *)bufferA1, (uint2 *)bufferB, indexesA, indexesB, round > tp.ntrims);
-      else Round<<>>(round-1, (uint2 *)bufferA1, (uint2 *)bufferB, indexesA, indexesB);
+        Relay<<>>(round-1, (uint2 *)bufferA1, (uint2 *)bufferB, indexesA, indexesB, round > tp.ntrims);
+      else Round<<>>(round-1, (uint2 *)bufferA1, (uint2 *)bufferB, indexesA, indexesB);
      indexcount(round, indexesB);
      if (abort) return false;
      cudaMemset(indexesA, 0, indexesSize);
      if (round+1 >= tp.ntrims)
-        Relay<<>>(round, (uint2 *)bufferB, (uint2 *)bufferA1, indexesB, indexesA, round+1 > tp.ntrims);
-      else Round<<>>(round, (uint2 *)bufferB, (uint2 *)bufferA1, indexesB, indexesA);
+        Relay<<>>(round, (uint2 *)bufferB, (uint2 *)bufferA1, indexesB, indexesA, round+1 > tp.ntrims);
+      else Round<<>>(round, (uint2 *)bufferB, (uint2 *)bufferA1, indexesB, indexesA);
      indexcount(round+1, indexesA);
      if (abort) return false;
    }
 
@@ -657,7 +656,7 @@ struct edgetrimmer {
    cudaMemset(indexesB, 0, indexesSize);
    cudaDeviceSynchronize();
 
-    Tail<<>>((const uint2 *)bufferA1, (uint4 *)bufferB, indexesA, indexesB);
+    Tail<<>>((const uint2 *)bufferA1, (uint4 *)bufferB, indexesA, indexesB);
    cudaMemcpy(&nedges, indexesB, sizeof(u32), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    return nedges;
  }
@@ -717,7 +716,7 @@ struct solver_ctx {
    return ndupes;
  }
 
-  int solve() {
+  u32 solve() {
    u64 time0, time1;
    u32 timems,timems2;
 
@@ -806,12 +805,12 @@ CALL_CONVENTION int run_solver(SolverCtx* ctx,
        for (u32 i = 0; i < PROOFSIZE; i++)
          solutions->sols[sumnsols+s].proof[i] = (u64) prf[i];
      }
-      int pow_rc = verify(prf, &ctx->trimmer.sipkeys);
+      u32 pow_rc = verify(prf, &ctx->trimmer.sipkeys);
      if (pow_rc == POW_OK) {
        print_log("Verified with cyclehash ");
        unsigned char cyclehash[32];
        blake2b((void *)cyclehash, sizeof(cyclehash), (const void *)prf, sizeof(proof), 0, 0);
-        for (int i=0; i<32; i++)
+        for (u32 i=0; i<32; i++)
          print_log("%02x", cyclehash[i]);
        print_log("\n");
      } else {
@@ -941,19 +940,19 @@ int main(int argc, char **argv) {
  cudaDeviceProp prop;
  checkCudaErrors(cudaGetDeviceProperties(&prop, device));
  u64 dbytes = prop.totalGlobalMem;
-  int dunit;
+  u32 dunit;
  for (dunit=0; dbytes >= 102400; dbytes>>=10,dunit++) ;
  print_log("%s with %d%cB @ %d bits x %dMHz\n", prop.name, (u32)dbytes, " KMGT"[dunit], prop.memoryBusWidth, prop.memoryClockRate/1000);
 
  print_log("Looking for %d-cycle on cuckatoo%d(\"%s\",%d", PROOFSIZE, EDGEBITS, header, nonce);
  if (range > 1)
    print_log("-%d", nonce+range-1);
-  print_log(") with 50%% edges, %d*%d buckets, %d trims, and %d thread blocks.\n", NX, NY, params.ntrims, NX);
+  print_log(") with 50%% edges, %d buckets, and %d trims.\n", NB, params.ntrims);
 
  SolverCtx* ctx = create_solver_ctx(&params);
 
  u64 bytes = ctx->trimmer.globalbytes();
-  int unit;
+  u32 unit;
  for (unit=0; bytes >= 102400; bytes>>=10,unit++) ;
  print_log("Using %d%cB of global memory.\n", (u32)bytes, " KMGT"[unit]);
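
The NEPS_A/NEPS_B headrooms introduced near the top of the patch line up with the survival fractions noted in trim()'s comments (the "// to .632" after Round0 matches 1 - 1/e). A small compile-time check for the EDGEBITS=29, BUCKBITS=12 defaults, illustrative only:

// Illustrative check of bucket-capacity headroom (not part of the patch).
// Mean edges per bucket after seeding is NEDGES/NB = NZ; the "// to .632"
// comment says about a 1 - 1/e fraction of edges survives round 0.
#include <cstdint>
typedef uint32_t u32;

constexpr u32 NZ      = 1u << (29 - 12);   // 131072 = mean edges per bucket after seeding
constexpr u32 NEPS    = 128;
constexpr u32 EDGES_A = NZ * 133 / NEPS;   // 136192: ~3.9% headroom over the mean
constexpr u32 EDGES_B = NZ * 85 / NEPS;    //  87040: ~0.664*NZ, just above 0.632*NZ survivors

static_assert(EDGES_A > NZ, "seeding buckets need headroom above the mean");
static_assert(EDGES_B * 1000 > NZ * 632, "trimmed buckets must hold the ~63.2% survivors");

int main() { return 0; }
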