From efbf4f865e2ed412f000fd487b324c08e420b735 Mon Sep 17 00:00:00 2001 From: John Tromp Date: Wed, 15 Jul 2020 15:54:17 +0200 Subject: [PATCH] working cuckarooz CUDA solver --- src/cuckarooz/Makefile | 4 +- src/cuckarooz/mean.cu | 761 +++++++++++++++++++++++++---------------- src/cuckarooz/mean.hpp | 330 +++++++----------- 3 files changed, 587 insertions(+), 508 deletions(-) diff --git a/src/cuckarooz/Makefile b/src/cuckarooz/Makefile index 43e8af5..7911d0e 100644 --- a/src/cuckarooz/Makefile +++ b/src/cuckarooz/Makefile @@ -55,8 +55,8 @@ mean30x1: cuckarooz.hpp bitmap.hpp graph.hpp ../threads/barrier.hpp ../crypto/s mean30x8: cuckarooz.hpp bitmap.hpp graph.hpp ../threads/barrier.hpp ../crypto/siphash.hpp mean.hpp mean.cpp Makefile $(GPP) -o $@ -mavx2 -DNSIPHASH=8 -DEXPANDROUND=10 -DCOMPRESSROUND=22 -DEDGEBITS=30 mean.cpp $(BLAKE_2B_SRC) -cuda19: ../crypto/siphash.cuh compress.hpp graph.hpp mean.cu Makefile +cuda19: ../crypto/siphash.cuh cuckarooz.hpp compress.hpp graph.hpp mean.cu Makefile $(NVCC) -o $@ -DEPS_A=4 -DEPS_B=3 -DIDXSHIFT=2 -DEDGEBITS=19 -arch sm_35 mean.cu $(BLAKE_2B_SRC) -cuda29: ../crypto/siphash.cuh compress.hpp graph.hpp mean.cu Makefile +cuda29: ../crypto/siphash.cuh cuckarooz.hpp compress.hpp graph.hpp mean.cu Makefile $(NVCC) -o $@ -DEDGEBITS=29 -arch sm_35 mean.cu $(BLAKE_2B_SRC) diff --git a/src/cuckarooz/mean.cu b/src/cuckarooz/mean.cu index 3fc3560..aa50c14 100644 --- a/src/cuckarooz/mean.cu +++ b/src/cuckarooz/mean.cu @@ -1,5 +1,5 @@ // Cuckarooz Cycle, a memory-hard proof-of-work by John Tromp -// Copyright (c) 2018-2020 Wilke Trei, Jiri Vadura (photon) and John Tromp +// Copyright (c) 2018-2020 Wilke Trei (Lolliedieb), Jiri Vadura (photon) and John Tromp // This software is covered by the FAIR MINING license //Includes for IntelliSense @@ -21,22 +21,17 @@ #include "../crypto/blake2.h" // number of invocations of seeding kernel -// values larger than 1 reduce pressure on TLB +// larger values reduce pressure on TLB // which appears to be an issue on GTX cards -#ifndef NTLB -#define NTLB 2 -#endif +#define NTLB 4 // number of slices in edge splitting kernel // larger values reduce memory use // which is proportional to 1+1/NMEM -#ifndef NMEM #define NMEM 4 -#endif // after seeding, main memory buffer is a NMEM x NTLB matrix // of blocks of buckets #define NMEMTLB (NMEM * NTLB) -#define NODE1MASK NODEMASK #include "../crypto/siphash.cuh" typedef uint8_t u8; @@ -50,12 +45,14 @@ typedef uint32_t u32; #endif const u32 NB = 1 << BUCKBITS; +const u32 NB2 = NB / 2; const u32 NB_NTLB = NB / NTLB; const u32 NB_NMEM = NB / NMEM; +const u32 NODEBITS = EDGEBITS+1; const u32 ZBITS = NODEBITS - BUCKBITS; -const u32 ZBITS1 = ZBITS + 1; const u32 NZ = 1 << ZBITS; const u32 ZMASK = NZ - 1; +const u32 ZBITS1 = ZBITS + 1; __device__ __forceinline__ uint2 ld_cs_u32_v2(const uint2 *p_src) { @@ -103,7 +100,7 @@ __device__ __forceinline__ bool testbit(const u64 edgemap, const int slot) } __constant__ siphash_keys dipkeys; -__constant__ u64 recovery[42]; +__constant__ u64 recovery[2*PROOFSIZE]; #define ROTL(x,b) ( ((x) << (b)) | ( (x) >> (64 - (b))) ) #define SIPROUND {\ @@ -130,7 +127,7 @@ __constant__ u64 recovery[42]; #define SPLIT_TPB 1024 #endif #ifndef TRIM_TPB -#define TRIM_TPB 1024 +#define TRIM_TPB 512 #endif #ifndef UNSPLIT_TPB #define UNSPLIT_TPB 256 @@ -144,9 +141,11 @@ __constant__ u64 recovery[42]; template __global__ void Seed(uint4 * __restrict__ buffer, u32 * __restrict__ indexes, const u32 part) +// Seed<<>>((uint4*)bufferA, indexesA, i); { - const int gid = blockDim.x 
* blockIdx.x + threadIdx.x; + const int group = blockIdx.x; const int lid = threadIdx.x; + const int gid = group * SEED_TPB + lid; const int nthreads = gridDim.x * SEED_TPB; const int nloops = (NEDGES / NTLB / EDGE_BLOCK_SIZE - gid + nthreads-1) / nthreads; @@ -161,7 +160,7 @@ __global__ void Seed(uint4 * __restrict__ buffer, u32 * __restrict__ indexes, co __syncthreads(); u64 offset = part * (NEDGES / NTLB); - buffer += part * (maxOut * NB_NMEM / 2); + buffer += part * (maxOut * NB_NMEM / 2); // buffer indexed by edgepairs rather than edges indexes += part * NB; for (int i = 0; i < nloops; i++) { @@ -187,10 +186,26 @@ __global__ void Seed(uint4 * __restrict__ buffer, u32 * __restrict__ indexes, co u64 last = 0; +#define BBITS2 (BUCKBITS/2) +#define NBB2 (1 << BBITS2) +#define BB2MASK (NBB2 - 1) +#define BBITS21 (BBITS2 - 1) +#define NBB21 (1 << BBITS21) +#define BB21MASK (NBB21 - 1) + #define DUMP(E) {\ u64 lookup = (E);\ const uint2 edge1 = make_uint2(lookup & NODEMASK, (lookup >> 32) & NODEMASK);\ - int bucket = edge1.x >> ZBITS;\ + const u32 uX = edge1.x >> (ZBITS + BBITS2);\ + assert(uX < NBB2);\ + const u32 vX = edge1.y >> (ZBITS + BBITS2);\ + assert(vX < NBB2);\ + const u32 uvXa = 2 * (uX >> BBITS21) + (vX >> BBITS21);\ + assert(uvXa < 4);\ + const u32 uvXb = (uX & BB21MASK) << BBITS21 | (vX & BB21MASK);\ + assert(uvXb < NB/4);\ + const u32 bucket = uvXa << (BUCKBITS-2) | uvXb;\ + assert(bucket < NB);\ u64 edge64 = (((u64)edge1.y) << 32) | edge1.x;\ for (u64 ret = atomicCAS(&magazine[bucket], 0, edge64); ret; ) {\ u64 ret2 = atomicCAS(&magazine[bucket], ret, 0);\ @@ -200,7 +215,7 @@ __global__ void Seed(uint4 * __restrict__ buffer, u32 * __restrict__ indexes, co printf("Seed dropping edges %llx %llx\n", ret, edge64);\ break;\ }\ - int idx = (bucket + (bucket / NB_NMEM) * (NTLB-1) * NB_NMEM) * maxOut + position;\ + const u32 idx = (uvXa * NTLB * NB_NMEM + uvXb) * maxOut + position;\ buffer[idx/2] = make_uint4(ret, ret >> 32, edge1.x, edge1.y);\ break;\ }\ @@ -228,91 +243,98 @@ __global__ void Seed(uint4 * __restrict__ buffer, u32 * __restrict__ indexes, co printf("Seed dropping edge %llx\n", edge); continue; } - int idx = (bucket + (bucket / NB_NMEM) * (NTLB-1) * NB_NMEM) * maxOut + position;\ + const u32 idx = ((bucket/NB_NMEM) * NTLB * NB_NMEM + bucket % NB_NMEM) * maxOut + position; buffer[idx/2] = make_uint4(edge, edge >> 32, 0, 0); } } } -#ifndef EXTSHARED -#define EXTSHARED 0xc000 -#endif - -#ifndef CYCLE_V0 -#define CYCLE_V0 0x59f7ada -#define CYCLE_V1 0x106a0753 -#endif -#define BUCK0 (CYCLE_V0 >> ZBITS) -#define REST0 (CYCLE_V0 & ZMASK) -#define BUCK1 (CYCLE_V1 >> ZBITS) -#define REST1 (CYCLE_V1 & ZMASK) - template -__global__ void EdgeSplit(const uint2 *src, uint2 *dst, const u32 *srcIdx, u32 *dstIdx, u64 *edgemap, const int part) +__global__ void EdgeSplit(const uint2 *src, uint2 *dst, const u32 *srcIdx, u32 *dstIdx, u32 *edgeslots, const int uvXa) +// EdgeSplit<<>>((uint2*)bufferA, (uint2*)bufferB, indexesA, indexesB, edgeSlots, i); { + const int uvYa = blockIdx.x % 4; + const int uvXb = blockIdx.x / 4; const int lid = threadIdx.x; - const int group = blockIdx.x; - extern __shared__ unsigned long long magazine[]; - u32 *emap = (u32 *)(magazine + NB); + __shared__ unsigned long long magazine[NB]; int nloops[NTLB]; for (int i = 0; i < NB/SPLIT_TPB; i++) { magazine[lid + SPLIT_TPB * i] = 0; - emap[lid + SPLIT_TPB * i] = 0; + // emap[lid + SPLIT_TPB * i] = 0; } - const int offset = part * NB_NMEM; - const int Group = offset + group; - edgemap += NB * Group; - -#if EXTSHARED 
>= 0xc004 - u32 *ourIdx = emap + NB; -#else - u32 *ourIdx = (u32 *)edgemap; -#endif - - if (!lid) - *ourIdx = 0; + const u32 offset = uvXa * NB_NMEM; + const u32 uvX = offset + uvXb; + const u32 uXb = uvXb >> BBITS21; // 5 bits + const u32 vXb = uvXb & BB21MASK; // 5 bits + const u32 uX = (uvXa/2) << BBITS21 | uXb; // 6 bits + const u32 vX = (uvXa%2) << BBITS21 | vXb; // 6 bits + edgeslots += (uX * NBB2 + vX) * NB; + if (!lid) assert(uXb < 32); + if (!lid) assert(vXb < 32); for (int a = 0; a < NTLB; a++) - nloops[a] = (min(srcIdx[a * NB + Group], maxIn) - lid + SPLIT_TPB-1) / SPLIT_TPB; + nloops[a] = (min(srcIdx[a * NB + uvX], maxIn) - lid + SPLIT_TPB-1) / SPLIT_TPB; - src += (offset * NTLB + group) * maxIn + lid; - dst += part * NB * maxOut; - uint2 *dstX = dst + (NB_NMEM + group) * NTLB * maxIn / 2; + src += (offset * NTLB + uvXb) * maxIn + lid; + dst += uvXa * 2 * NB * maxOut; + uint2 *ydst = dst + NB * maxOut; + u32 *xdstIdx = dstIdx + uvXa * 2 * NB; + u32 *ydstIdx = xdstIdx + NB; __syncthreads(); - const int rowOffset = offset * NMEM; for (int a = 0; a < NTLB; a++) { const u32 delta = a * NB_NMEM * maxIn; for (int i = 0; i < nloops[a]; i++) { uint2 edge = src[delta + i * SPLIT_TPB]; if (edge.x == edge.y) continue; - int bucket = edge.y >> ZBITS; + const u32 uXYa = edge.x >> (ZBITS-1); // 13 bits + const u32 vXYa = edge.y >> (ZBITS-1); // 13 bits + const u32 uXY = uXYa >> 1; + const u32 vXY = vXYa >> 1; + assert(uXY < NB); + assert(vXY < NB); + assert((uXY >> BBITS2) == ((uvXa/2) << BBITS21 | uXb)); // match 6 bits + assert((vXY >> BBITS2) == ((uvXa%2) << BBITS21 | vXb)); // match 6 bits + if (2 * (uXYa % 2) + (vXYa % 2) != uvYa) continue; + const u32 bucket = (uXY % NBB2) * NBB2 + vXY % NBB2; + assert(bucket < NB); u64 edge64 = (((u64)edge.y) << 32) | edge.x; for (u64 ret = atomicCAS(&magazine[bucket], 0, edge64); ret; ) { u64 ret2 = atomicCAS(&magazine[bucket], ret, 0); if (ret2 == ret) { - const u32 slot = atomicAdd(emap + bucket, 2); + const u32 slot = atomicAdd(edgeslots + bucket, 2); if (slot >= 64) { #ifdef VERBOSE printf("dropped edges %llx %llx\n", ret, edge64); #endif break; - } + } else if (slot >= 63) { +#ifdef VERBOSE + printf("dropped edge %llx\n", ret); +#endif + break; + } const u32 slots = (slot+1) << 6 | slot; // slot for edge, slot+1 for ret - int bktIdx = atomicAdd(ourIdx, 1); - dstX[bktIdx] = make_uint2(bucket << ZBITS | edge.x & ZMASK, slots << ZBITS | ret & ZMASK); - bktIdx = atomicAdd(dstIdx + rowOffset + bucket, 1); - if (bktIdx >= maxOut/2) { + const u32 xidx = atomicAdd(xdstIdx + uXYa % NB, 1); + if (xidx >= maxOut) { +#ifdef VERBOSE + printf("dropped u-halfedge %llx\n", edge64); +#endif + break; + } + dst[(uXYa % NB) * maxOut + xidx] = make_uint2((vXY << ZBITS) | (edge.x & ZMASK), slots << ZBITS | ret & ZMASK); + const u32 yIdx = atomicAdd(ydstIdx + vXYa % NB, 1); + if (yIdx >= maxOut) { #ifdef VERBOSE - printf("dropped halfedge %llx\n", edge); + printf("dropped v-halfedge %llx\n", edge); #endif break; } - dst[bucket * maxOut/2 + bktIdx] = make_uint2(Group << ZBITS | edge.y & ZMASK, slots << ZBITS | (ret>>32) & ZMASK); + ydst[(vXYa % NB) * maxOut + yIdx] = make_uint2((uXY << ZBITS) | (edge.y & ZMASK), slots << ZBITS | (ret>>32) & ZMASK); break; } ret = ret2 ? 
ret2 : atomicCAS(&magazine[bucket], 0, edge64); @@ -326,41 +348,56 @@ __global__ void EdgeSplit(const uint2 *src, uint2 *dst, const u32 *srcIdx, u32 * int bucket = lid + SPLIT_TPB * i; u64 edge = magazine[bucket]; if (edge != 0) { - const u32 slot = atomicAdd(emap + bucket, 1); + const u32 slot = atomicAdd(edgeslots + bucket, 1); if (slot >= 64) { #ifdef VERBOSE printf("dropped edge %llx\n", edge); #endif continue; } + const u32 uXY = uX * NBB2 + bucket / NBB2; + const u32 vXY = vX * NBB2 + bucket % NBB2; + const u32 uXYa = uXY << 1 | (uvYa / 2); + const u32 vXYa = vXY << 1 | (uvYa % 2); + assert(((edge & 0xffffffff) >> (ZBITS-1)) == uXYa); + assert((edge >> (32+ZBITS-1)) == vXYa); const u32 slots = slot << 6 | slot; // duplicate slot indicates singleton pair - int bktIdx = atomicAdd(ourIdx, 1); - dstX[bktIdx] = make_uint2(bucket << ZBITS | edge & ZMASK, slots << ZBITS | edge & ZMASK); - bktIdx = atomicAdd(dstIdx + rowOffset + bucket, 1); - if (bktIdx >= maxOut/2) { + int bktIdx = atomicAdd(xdstIdx + uXYa % NB, 1); + if (bktIdx >= maxOut) { #ifdef VERBOSE - printf("dropped halfedge %llx\n", edge); + printf("dropped u-halfedge %llx\n", edge); #endif - continue; + break; + } + dst[(uXYa % NB) * maxOut + bktIdx] = make_uint2((vXY << ZBITS) | (edge & ZMASK), slots << ZBITS | edge & ZMASK); + bktIdx = atomicAdd(ydstIdx + vXYa % NB, 1); + if (bktIdx >= maxOut) { +#ifdef VERBOSE + printf("dropped v-halfedge %llx\n", edge); +#endif + break; } - dst[bucket * maxOut/2 + bktIdx] = make_uint2(Group << ZBITS | (edge>>32) & ZMASK, slots << ZBITS | (edge>>32) & ZMASK); + ydst[(vXYa % NB) * maxOut + bktIdx] = make_uint2((uXY << ZBITS) | (edge>>32) & ZMASK, slots << ZBITS | (edge>>32) & ZMASK); } } +} - __syncthreads(); - - if (!lid) { - u32 *dstXIdx = dstIdx + NMEM * NB; - dstXIdx[Group] = *ourIdx; - } +__global__ void setEdgeMap(u32 *edgeslots, u64 *edgemap) +//setEdgeMap<<>>(edgeSlots, edgeMap); +{ + const int uX = blockIdx.x / NBB2; + const int vX = blockIdx.x % NBB2; + const int lid = threadIdx.x; + edgeslots += blockIdx.x * NB; for (int i = 0; i < NB/SPLIT_TPB; i++) { - int idx = lid + SPLIT_TPB * i; - edgemap[idx] = emap[idx] < 64 ? (1ULL << emap[idx]) - 1 : ~0ULL; + int bucket = lid + SPLIT_TPB * i; + edgemap[(uX*NBB2 + bucket/NBB2) * NB + vX*NBB2 + bucket%NBB2] = edgeslots[bucket] < 64 ? 
(1ULL << edgeslots[bucket]) - 1 : ~0ULL; } } __device__ __forceinline__ void addhalfedge(u32 *magazine, const u32 bucket, const u32 rest, const u32 slot, u32 *indices, uint2* dst) { +// addhalfedge(magazine, bucket, pair.x&ZMASK, slot0, sourceIndexes+NMEM*NB+group, source+delta); u32 halfedge = slot << ZBITS | rest; for (u32 ret = atomicCAS(&magazine[bucket], 0, halfedge); ret; ) { u64 ret2 = atomicCAS(&magazine[bucket], ret, 0); @@ -387,184 +424,306 @@ __device__ __forceinline__ void flushhalfedges(u32 *magazine, const int lid, u32 } } -template // maxIn is size of small buckets in half-edges -__global__ void Round(uint2 *source, u32 *sourceIndexes, u64 *edgemap) +static const int NODEDEGWORDS = NZ / 32; +// nodeDeg holds two bitmaps of NODEDEGWORDS/2 words each: the first marks half-nodes seen at least once, the second (read by nodeTest) marks those seen at least twice + +__device__ __forceinline__ void nodeSet(u32 *nodeDeg, const int bucket) { + int word = bucket >> 5; + unsigned char bit = bucket & 0x1F; + u32 mask = 1 << bit; + + u32 old = atomicOr(nodeDeg + word, mask) & mask; + if (old) + atomicOr(nodeDeg + NODEDEGWORDS/2 + word, mask); +} + +__device__ __forceinline__ bool nodeTest(u32 *nodeDeg, const int bucket) { + int word = bucket >> 5; + unsigned char bit = bucket & 0x1F; + + return (nodeDeg[NODEDEGWORDS/2 + word] >> bit) & 1; +} + +template // maxIn is size of buckets in half-edge pairs +__global__ void Round(uint2 *source, u32 *sourceIndexes, u64 *edgemap) +// Round<<<2*NB, TRIM_TPB>>>((uint2*)bufferB, indexesB, edgeMap); { const int lid = threadIdx.x; const int group = blockIdx.x; - int bktsize[NMEM]; - int npairs[NMEM]; + int xsize[2], ysize[2]; + int nxpairs[2], nypairs[2]; - __shared__ u32 htnode[2*NZ/32/NPARTS]; // u & v node bits + __shared__ u32 nodeDeg[NODEDEGWORDS]; // node degrees __shared__ u32 magazine[NB]; - for (int i = 0; i < 2*NZ/32/NPARTS/TRIM_TPB; i++) - htnode[lid + i * TRIM_TPB] = 0; + for (int i = 0; i < NODEDEGWORDS/TRIM_TPB; i++) + nodeDeg[lid + i * TRIM_TPB] = 0; for (int i = 0; i < NB/TRIM_TPB; i++) magazine[lid + i * TRIM_TPB] = 0; - for (int a = 0; a < NMEM; a++) { - bktsize[a] = sourceIndexes[a * NB + group]; - npairs[a] = (bktsize[a] + TRIM_TPB-1) / TRIM_TPB; + const u32 groupa = group / NB; + if (!lid) assert(groupa < 2); + const int groupb = group % NB; + for (int ai = 0; ai < 2; ai++) { + xsize[ai] = sourceIndexes[(2 * groupa + ai) * 2*NB + groupb]; + nxpairs[ai] = (xsize[ai] + TRIM_TPB-1) / TRIM_TPB; + ysize[ai] = sourceIndexes[(2 * ai + groupa) * 2*NB + NB + groupb]; + nypairs[ai] = (ysize[ai] + TRIM_TPB-1) / TRIM_TPB; } - const u32 bsize = sourceIndexes[NMEM * NB + group]; - const u32 np = (bsize + TRIM_TPB-1) / TRIM_TPB; __syncthreads(); - if (lid <= NMEM) - sourceIndexes[lid * NB + group] = 0; + if (lid < NMEM) { + if (lid / 2 == groupa) + sourceIndexes[lid * 2*NB + groupb] = 0; + if (lid % 2 == groupa) + sourceIndexes[lid * 2*NB + NB + groupb] = 0; + } - const u32 delta = (group / NB_NMEM) * NB * maxIn + (NB + (group % NB_NMEM) * NMEM) * maxIn/2; - for (int i = 0; i < np; i++) { - __syncthreads(); - u32 idx = lid + i * TRIM_TPB; - if (idx >= bsize) continue; - uint2 pair = source[delta + idx]; - u32 bucket = pair.x >> ZBITS; - u32 slots = pair.y >> ZBITS; - u32 slot0 = slots % 64, slot1 = slots / 64; - u64 es = edgemap[group * NB + bucket]; - if (testbit(es, slot0)) - setbit(htnode, pair.x & ZMASK); - if (slot1 != slot0 && testbit(es, slot1)) - setbit(htnode, pair.y & ZMASK); +#if 1 + for (int ai = 0; ai < 2; ai++) { + int a = 2 * groupa + ai; + const u32 delta = (a * 2*NB + groupb) * maxIn; + // if (!group && !lid && !a) printf("xsize[%d] = %d nxpairs[a] = %d\n", a, xsize[ai], nxpairs[ai]); + for (int 
i = 0; i < nxpairs[ai]; i++) { + __syncthreads(); + const u32 idx = lid + i * TRIM_TPB; + if (idx >= xsize[ai]) continue; // not to be optimized away with every iteration synced + const uint2 pair = source[delta + idx]; + const u32 bucket = pair.x >> ZBITS; + assert(bucket < NB); + const u32 slots = pair.y >> ZBITS; + const u32 slot0 = slots % 64, slot1 = slots / 64; + assert(group/2 < NB); + const u64 es = edgemap[(group/2) * NB + bucket]; // group/2 * NB + // if (!group && !lid && !a) printf("delta+idx %d bucket %x pair %x %x\n", delta+idx, bucket, pair.x, pair.y); + if (testbit(es, slot0)) + nodeSet(nodeDeg, pair.x & ZMASK/2); + if (slot1 != slot0 && testbit(es, slot1)) + nodeSet(nodeDeg, pair.y & ZMASK/2); + } + } +#endif + +#if 1 + for (int ai = 0; ai < 2; ai++) { + int a = 2 * ai + groupa; + const u32 delta = (a * 2*NB + NB + groupb) * maxIn; + for (int i = 0; i < nypairs[ai]; i++) { + __syncthreads(); + const u32 idx = lid + i * TRIM_TPB; + if (idx >= ysize[ai]) continue; // not to be optimized away with every iteration synced + const uint2 pair = source[delta + idx]; + const u32 bucket = pair.x >> ZBITS; + assert(bucket < NB); + const u32 slots = pair.y >> ZBITS; + const u32 slot0 = slots % 64, slot1 = slots / 64; + const u64 es = edgemap[bucket * NB + group/2]; + if (testbit(es, slot0)) + nodeSet(nodeDeg, pair.x & ZMASK/2); + if (slot1 != slot0 && testbit(es, slot1)) + nodeSet(nodeDeg, pair.y & ZMASK/2); + } } +#endif - for (int a = 0; a < NMEM; a++) { - const u32 delta = a * NB * maxIn + group * maxIn/2; // / 2 for indexing pairs, * 2 for alternate buckets - for (int i = 0; i < npairs[a]; i++) { +#if 1 + for (int ai = 0; ai < 2; ai++) { + int a = 2 * groupa + ai; + const u32 delta = (a * 2*NB + groupb) * maxIn; + for (int i = 0; i < nxpairs[ai]; i++) { __syncthreads(); - u32 idx = lid + i * TRIM_TPB; - if (idx >= bktsize[a]) continue; - uint2 pair = source[delta + idx]; - u32 bucket = pair.x >> ZBITS; - u32 slots = pair.y >> ZBITS; - u32 slot0 = slots % 64, slot1 = slots / 64; - u64 es = edgemap[bucket * NB + group]; + const u32 idx = lid + i * TRIM_TPB; + if (idx >= xsize[ai]) continue; // not to be optimized away with every iteration synced + const uint2 pair = source[delta + idx]; + const u32 bucket = pair.x >> ZBITS; + assert(bucket < NB); + const u32 slots = pair.y >> ZBITS; + const u32 slot0 = slots % 64, slot1 = slots / 64; + const u64 es = edgemap[(group/2) * NB + bucket]; if (testbit(es, slot0)) { - if (testbit(htnode, pair.x & ZMASK)) { - setbit(htnode, pair.x & ZMASK); + if (nodeTest(nodeDeg, pair.x & ZMASK/2)) { + addhalfedge(magazine, bucket, pair.x&ZMASK, slot0, sourceIndexes+a*2*NB+groupb, source+delta); + } else { + // if (!group && !bucket) printf("u-halfedge %08x slot %d dies\n", pair.x & ZMASK, slot0); + resetbit(edgemap + (group/2) * NB + bucket, slot0); } } if (slot1 != slot0 && testbit(es, slot1)) { - if (testbit(htnode, pair.y & ZMASK)) { - setbit(htnode, pair.y & ZMASK); + if (nodeTest(nodeDeg, pair.y & ZMASK/2)) { + addhalfedge(magazine, bucket, pair.y&ZMASK, slot1, sourceIndexes+a*2*NB+groupb, source+delta); + } else { + // if (!group && !bucket) printf("u-halfedge %08x slot %d dies\n", pair.y & ZMASK, slot1); + resetbit(edgemap + (group/2) * NB + bucket, slot1); } } } __syncthreads(); + flushhalfedges(magazine, lid, sourceIndexes+a*2*NB+groupb, source+delta); } +#endif - for (int a = 0; a < NMEM; a++) { - const u32 delta = a * NB * maxIn + group * maxIn/2; // / 2 for indexing pairs, * 2 for alternate buckets - for (int i = 0; i < npairs[a]; i++) { +#if 
1 + for (int ai = 0; ai < 2; ai++) { + int a = 2 * ai + groupa; + const u32 delta = (a * 2*NB + NB + groupb) * maxIn; + for (int i = 0; i < nypairs[ai]; i++) { __syncthreads(); - u32 idx = lid + i * TRIM_TPB; - if (idx >= bktsize[a]) continue; - uint2 pair = source[delta + idx]; - u32 bucket = pair.x >> ZBITS; - u32 slots = pair.y >> ZBITS; - u32 slot0 = slots % 64, slot1 = slots / 64; - u64 es = edgemap[bucket * NB + group]; + const u32 idx = lid + i * TRIM_TPB; + if (idx >= ysize[ai]) continue; // not to be optimized away with every iteration synced + const uint2 pair = source[delta + idx]; + const u32 bucket = pair.x >> ZBITS; + assert(bucket < NB); + const u32 slots = pair.y >> ZBITS; + const u32 slot0 = slots % 64, slot1 = slots / 64; + const u64 es = edgemap[bucket * NB + group/2]; if (testbit(es, slot0)) { - if (testbit(htnode, pair.x & ZMASK)) { - addhalfedge(magazine, bucket, pair.x&ZMASK, slot0, sourceIndexes+a*NB+group, source+delta); - } else resetbit(edgemap + bucket * NB + group, slot0); + if (nodeTest(nodeDeg, pair.x & ZMASK/2)) { + addhalfedge(magazine, bucket, pair.x&ZMASK, slot0, sourceIndexes+a*2*NB+NB+groupb, source+delta); + } else { + // if (!group && !bucket) printf("v-halfedge %08x slot %d dies\n", pair.x & ZMASK, slot0); + resetbit(edgemap + bucket * NB + group/2, slot0); + } } if (slot1 != slot0 && testbit(es, slot1)) { - if (testbit(htnode, pair.y & ZMASK)) { - addhalfedge(magazine, bucket, pair.y&ZMASK, slot1, sourceIndexes+a*NB+group, source+delta); - } else resetbit(edgemap + bucket * NB + group, slot1); + if (nodeTest(nodeDeg, pair.y & ZMASK/2)) { + addhalfedge(magazine, bucket, pair.y&ZMASK, slot1, sourceIndexes+a*2*NB+NB+groupb, source+delta); + } else { + // if (!group && !bucket) printf("v-halfedge %08x slot %d dies\n", pair.y & ZMASK, slot1); + resetbit(edgemap + bucket * NB + group/2, slot1); + } } } __syncthreads(); - - flushhalfedges(magazine, lid, sourceIndexes+a*NB+group, source+delta); + flushhalfedges(magazine, lid, sourceIndexes+a*2*NB+NB+groupb, source+delta); } +#endif +} + +template +__global__ void EdgesMeet(uint2 *source, u32 *sourceIndexes, u32 *destIndexes) +// EdgesMeet<<<4*NB, TRIM_TPB>>>((uint2*)bufferB, indexesB, indexesA); +{ + const int lid = threadIdx.x; + const int groupa = blockIdx.x / NB; + assert(groupa < 4); + const int groupb = blockIdx.x % NB; + + int vsize = sourceIndexes[groupa * 2*NB + NB + groupb]; + int nvpairs = (vsize + TRIM_TPB-1) / TRIM_TPB; + + __syncthreads(); - for (int i = 0; i < np; i++) { + const u32 deltaOut = groupa * 2*NB * maxIn; + const u32 deltaIn = deltaOut + (NB + groupb) * maxIn; + const int vX = (groupa%2) * NB2 + groupb/2; + for (int i = 0; i < nvpairs; i++) { __syncthreads(); u32 idx = lid + i * TRIM_TPB; - if (idx >= bsize) continue; - uint2 pair = source[delta + idx]; - u32 bucket = pair.x >> ZBITS; - u32 slots = pair.y >> ZBITS; - u32 slot0 = slots % 64, slot1 = slots / 64; - u64 es = edgemap[group * NB + bucket]; - if (testbit(es, slot0)) { - if (testbit(htnode, pair.x & ZMASK)) { - addhalfedge(magazine, bucket, pair.x&ZMASK, slot0, sourceIndexes+NMEM*NB+group, source+delta); - } else resetbit(edgemap + group * NB + bucket, slot0); - } - if (slot1 != slot0 && testbit(es, slot1)) { - if (testbit(htnode, pair.y & ZMASK)) { - addhalfedge(magazine, bucket, pair.y&ZMASK, slot1, sourceIndexes+NMEM*NB+group, source+delta); - } else resetbit(edgemap + group * NB + bucket, slot1); - } + if (idx >= vsize) continue; + uint2 pair = source[deltaIn + idx]; + u32 uXb = (pair.x >> (ZBITS-1)) % NB; + pair.x = 
(vX << ZBITS) | (pair.x & ZMASK); + int i = atomicAdd(destIndexes + groupa * 2*NB + uXb, 1); + source[deltaOut + uXb * maxIn + i] = pair; } - __syncthreads(); - - flushhalfedges(magazine, lid, sourceIndexes+NMEM*NB+group, source+delta); } -template // maxIn is size of small buckets in half-edges -__global__ void EdgesMeet(uint2 *source, u32 *sourceIndexes, u32 *destIndexes) +template +__global__ void EdgesMoveU(uint2 *source, u32 *indexes, u32 *dstIndexes) +// EdgesMoveU<<>>((uint2*)bufferB, indexesB, indexesB+NMEM*2*NB); { const int lid = threadIdx.x; const int group = blockIdx.x; - int bktsize[NMEM]; - int npairs[NMEM]; - - for (int a = 0; a < NMEM; a++) { - bktsize[a] = sourceIndexes[a * NB + group]; - npairs[a] = (bktsize[a] + TRIM_TPB-1) / TRIM_TPB; + const int aOut = group / NB_NMEM; + const int uXa = aOut / 2; + assert(uXa < 2); + const int groupb = group % NB2; + const int groupbb = group % NB_NMEM; + + const u32 deltaOut0 = (aOut * 2*NB + NB) * maxIn + groupbb * maxOut; + u32 deltaOut = deltaOut0; + for (int vXa = 0; vXa < 2; vXa++) { + int aIn = 2 * uXa + vXa; + for (int uYa = 0; uYa < 2; uYa++) { + const int groupba = 2*groupb +uYa; + int usize = indexes[aIn * 2*NB + groupba]; + int nupairs = (usize + TRIM_TPB-1) / TRIM_TPB; + const u32 deltaIn = (aIn * 2*NB + groupba) * maxIn; + for (int i = 0; i < nupairs; i++) { + u32 idx = lid + i * TRIM_TPB; + if (idx >= usize) continue; + uint2 pair = source[deltaIn + idx]; + // if (group==0 && lid>=0) printf("pair %08x %08x\n", pair.x, pair.y); + source[deltaOut + idx] = pair; + } + deltaOut += usize; + } } + if (!lid) + dstIndexes[group] = deltaOut - deltaOut0; +} - __syncthreads(); - - for (int a = 0; a < NMEM; a++) { - const u32 delta = a * NB * maxIn + group * maxIn/2; // / 2 for indexing pairs, * 2 for alternate buckets - const u32 delta2 = a * NB * maxIn + NB * maxIn/2; - for (int i = 0; i < npairs[a]; i++) { - __syncthreads(); - u32 idx = lid + i * TRIM_TPB; - if (idx >= bktsize[a]) continue; - uint2 pair = source[delta + idx]; - u32 bucket = pair.x >> ZBITS; - pair.x = (group << ZBITS) | (pair.x & ZMASK); - int i = atomicAdd(destIndexes + bucket, 1); - const u32 dlt = (bucket % NB_NMEM) * NMEM * maxIn/2; - source[delta2 + dlt + i] = pair; +template +__global__ void EdgesMoveV(uint2 *source, u32 *indexes, u32 *indexes2, u32 *dstIndexes) +// EdgesMoveV<<>>((uint2*)bufferB, indexesB, indexesA, indexesB+NMEM*2*NB); +{ + const int lid = threadIdx.x; + const int group = blockIdx.x; + const int aOut = group / NB_NMEM; + const int uXa = aOut / 2; + assert(uXa < 2); + const int groupb = group % NB2; + const int groupbb = group % NB_NMEM; + + const u32 deltaOut0 = (aOut * 2*NB + NB) * maxIn + groupbb * maxOut; + const u32 uOutsize = dstIndexes[group]; + u32 deltaOut = deltaOut0 + uOutsize; + for (int vXa = 0; vXa < 2; vXa++) { + int aIn = 2 * uXa + vXa; + for (int uYa = 0; uYa < 2; uYa++) { + const int groupba = 2*groupb +uYa; + int usize = indexes[aIn * 2*NB + groupba]; + int vsize = indexes2[aIn * 2*NB + groupba] - usize; + assert(vsize >= 0); + int nvpairs = (vsize + TRIM_TPB-1) / TRIM_TPB; + const u32 deltaIn = (aIn * 2*NB + groupba) * maxIn + usize; + for (int i = 0; i < nvpairs; i++) { + u32 idx = lid + i * TRIM_TPB; + if (idx >= vsize) continue; + uint2 pair = source[deltaIn + idx]; + // if (group==0 && lid>=0) printf("pair %08x %08x\n", pair.x, pair.y); + source[deltaOut + idx] = pair; + } + deltaOut += vsize; } } + if (!lid) + dstIndexes[NB+group] = deltaOut - deltaOut0; } template -__device__ __forceinline__ void addedge(const u32 
buckx, const u32 restx, const u32 bucky, const u32 resty, u32 *indices, uint2* dest, u32* dstidx) { - int idx = atomicAdd(indices + bucky, 1); - dest[ bucky *maxOut + idx] = make_uint2(restx << ZBITS1 | resty, buckx << ZBITS | restx); // extra +1 shift makes a 0 copybit - int idx2 = atomicAdd(dstidx, 1); - dest[(NB+buckx)*maxOut + idx2] = make_uint2(resty << ZBITS1 | restx, bucky << ZBITS | resty); +__device__ __forceinline__ void addedge(const u32 buckx, const u32 restx, const u32 bucky, const u32 resty, u32 *indices, uint2* dest) { + // if (restx==0x150f3 || resty==0x150f3) printf("x %x %x y %x %x\n", buckx, restx, bucky, resty); + int idx = atomicAdd(indices + bucky, 1); + dest[bucky * maxOut + idx] = make_uint2(restx << ZBITS1 | resty, buckx << ZBITS | restx); // extra +1 shift makes a 0 copybit + int idx2 = atomicAdd(indices + buckx, 1); + dest[buckx * maxOut + idx2] = make_uint2(resty << ZBITS1 | restx, bucky << ZBITS | resty); } template // maxIn is size of small buckets in half-edges __global__ void UnsplitEdges(uint2 *source, u32 *srcIndexes, u32 *srcIndexes2, uint2 *dest, u32 *dstIndexes) +// UnsplitEdges<<>>((uint2*)bufferB, indexesB+NMEM*NB, indexesA, bufferC, indexesB); { const int lid = threadIdx.x; const int group = blockIdx.x; __shared__ u32 lists[NB]; __shared__ u32 nexts[NB]; - __shared__ u32 dstidx; for (int i = 0; i < NB/UNSPLIT_TPB; i++) lists[i * UNSPLIT_TPB + lid] = ~0; - if (!lid) - dstidx = 0; - const u32 size1 = srcIndexes[group]; const u32 np1 = (size1 + UNSPLIT_TPB-1) / UNSPLIT_TPB; const u32 size2 = srcIndexes2[group]; @@ -572,12 +731,13 @@ __global__ void UnsplitEdges(uint2 *source, u32 *srcIndexes, u32 *srcIndexes2, u __syncthreads(); - const u32 delta = (group / NB_NMEM) * NB * maxIn + (NB + (group % NB_NMEM) * NMEM) * maxIn/2; + const u32 delta = (group / NB_NMEM) * NB * maxIn + (NB + (group % NB_NMEM) * NMEM) * maxIn/2; source += delta; for (int i = 0; i < np1; i++) { const u32 index = lid + i * UNSPLIT_TPB; if (index >= size1) continue; const uint2 pair = source[index]; + // if (group==0 && lid==0) printf("pair %08x %08x\n", pair.x, pair.y); const u32 bucket = pair.x >> ZBITS; nexts[index] = atomicExch(&lists[bucket], index); } @@ -595,28 +755,24 @@ __global__ void UnsplitEdges(uint2 *source, u32 *srcIndexes, u32 *srcIndexes2, u const uint2 pair2 = source[idx]; const u32 slots2 = pair2.y >> ZBITS; const u32 slot20 = slots2 % 64, slot21 = slots2 / 64; + // if (group==0 && lid==0) printf("pair %08x %08x pair2 %08x %08x\n", pair.x, pair.y, pair2.x, pair2.y); if (slot20 == slot0) - addedge(group, pair2.x & ZMASK, bucket, pair.x & ZMASK, dstIndexes, dest, &dstidx); + addedge(group, pair2.x & ZMASK, bucket, pair.x & ZMASK, dstIndexes, dest); else if (slot20 == slot1) - addedge(group, pair2.x & ZMASK, bucket, pair.y & ZMASK, dstIndexes, dest, &dstidx); + addedge(group, pair2.x & ZMASK, bucket, pair.y & ZMASK, dstIndexes, dest); if (slot21 != slot20) { if (slot21 == slot0) - addedge(group, pair2.y & ZMASK, bucket, pair.x & ZMASK, dstIndexes, dest, &dstidx); + addedge(group, pair2.y & ZMASK, bucket, pair.x & ZMASK, dstIndexes, dest); else if (slot21 == slot1) - addedge(group, pair2.y & ZMASK, bucket, pair.y & ZMASK, dstIndexes, dest, &dstidx); + addedge(group, pair2.y & ZMASK, bucket, pair.y & ZMASK, dstIndexes, dest); } // if (slots != slots2) printf("pair slots %d %d pair2 slots %d %d\n", slot0, slot1, slot20, slot21); } } - - __syncthreads(); - - if (!lid) - dstIndexes[NB + group] = dstidx; } #ifndef LISTBITS -#define LISTBITS 11 +#define LISTBITS 12 #endif 
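// NLISTS and LISTMASK drive the shared-memory chaining used by Tag_Relay and Tail
// below: lists[] holds the most recently inserted edge index for each LISTMASK-keyed
// class, nexts[] links the older entries behind it, an insert is a single atomicExch,
// and a lookup walks the chain until the ~0 sentinel (UnsplitEdges uses the same
// pattern, keyed on the pair.x >> ZBITS bucket instead). Minimal sketch of the
// pattern, reusing the names the kernels use:
//   lists[list] = ~0;                                             // empty list heads
//   nexts[index] = atomicExch(&lists[edge.x & LISTMASK], index);  // insert
//   for (u32 idx = lists[list]; idx != ~0; idx = nexts[idx]) ...  // traverse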
const u32 NLISTS = 1 << LISTBITS; @@ -626,7 +782,9 @@ const u32 NLISTS = 1 << LISTBITS; #endif template -__global__ void Tag_Relay(const uint2 *source, uint2 *destination, const u32 *sourceIndexes, u32 *destinationIndexes) +__global__ void Tag_Relay(const uint2 *source, uint2 *destination, const u32 *sourceIndexes, u32 *destinationIndexes) +// Tag_Relay<<>>(bufferC, bufferD, indexesB, indexesA); +// Tag_Relay<<>>(bufferD, bufferC, indexesA, indexesB); { const int lid = threadIdx.x; const int group = blockIdx.x; @@ -634,115 +792,94 @@ __global__ void Tag_Relay(const uint2 *source, uint2 *destination, const u32 *so const u32 TAGMASK = (~1U) << ZBITS; const u32 COPYFLAG = NZ; - __shared__ u32 lists0[NLISTS]; - __shared__ u32 nexts0[NNEXTS]; - __shared__ u32 lists1[NLISTS]; - __shared__ u32 nexts1[NNEXTS]; + __shared__ u32 lists[NLISTS]; + __shared__ u32 nexts[NNEXTS]; - const int nloops0 = (min(sourceIndexes[ group], NNEXTS) - lid + RELAY_TPB-1) / RELAY_TPB; - const int nloops1 = (min(sourceIndexes[NB+group], NNEXTS) - lid + RELAY_TPB-1) / RELAY_TPB; + const int nloops = (min(sourceIndexes[group], NNEXTS) - lid + RELAY_TPB-1) / RELAY_TPB; source += group * maxIn; for (int i = 0; i < NLISTS/RELAY_TPB; i++) - lists0[i * RELAY_TPB + lid] = lists1[i * RELAY_TPB + lid] = ~0; + lists[i * RELAY_TPB + lid] = ~0; __syncthreads(); - for (int i = 0; i < nloops0; i++) { + for (int i = 0; i < nloops; i++) { const u32 index = i * RELAY_TPB + lid; const uint2 edge = source[index]; + // if (group==0 && lid>=0) printf("edge %08x %08x\n", edge.x, edge.y); const u32 list = edge.x & LISTMASK; - nexts0[index] = atomicExch(&lists0[list], index); + nexts[index] = atomicExch(&lists[list], index); } __syncthreads(); - for (int i = 0; i < nloops1; i++) { + for (int i = 0; i < nloops; i++) { const u32 index = i * RELAY_TPB + lid; - const uint2 edge = source[NB * maxIn + index]; - const u32 list = edge.x & LISTMASK; - nexts1[index] = atomicExch(&lists1[list], index); + const uint2 edge = source[index]; if (edge.x & COPYFLAG) continue; // copies don't relay + const u32 list = edge.x & LISTMASK; + // nexts1[index] = atomicExch(&lists1[list], index); u32 bucket = edge.y >> ZBITS; u32 copybit = 0; - for (u32 idx = lists0[list]; idx != ~0; idx = nexts0[idx]) { + for (u32 idx = lists[list]; idx != ~0; idx = nexts[idx]) { uint2 tagged = source[idx]; // printf("compare with tagged %08x %08x xor %x\n", tagged.x, tagged.y, (tagged.x ^ edge.x) & ZMASK); - if ((tagged.x ^ edge.x) & ZMASK) continue; - u32 bktIdx = min(atomicAdd(destinationIndexes + bucket, 1), maxOut - 1); + if (idx == index || (tagged.x ^ edge.x) & ZMASK || tagged.y == edge.y) continue; + if (((group << ZBITS) | (edge.x & ZMASK)) == edge.y) { printf("OOPS! 
SELF\n"); continue; } + u32 bktIdx = atomicAdd(destinationIndexes + bucket, 1); // , maxOut - 1); + assert(bktIdx < maxOut); // printf("relaying from tagged %08x %08x bktIdx %d\n", tagged.x, tagged.y, bktIdx); u32 tag = (tagged.x & TAGMASK) | copybit; destination[bucket * maxOut + bktIdx] = make_uint2(tag | (edge.y & ZMASK), (group << ZBITS) | (edge.x & ZMASK)); copybit = COPYFLAG; } } - - __syncthreads(); - - for (int i = 0; i < nloops0; i++) { - const u32 index = i * RELAY_TPB + lid; - const uint2 edge = source[index]; - const u32 list = edge.x & LISTMASK; - if (edge.x & COPYFLAG) continue; // copies don't relay - u32 bucket = edge.y >> ZBITS; - u32 copybit = 0; - for (u32 idx = lists1[list]; idx != ~0; idx = nexts1[idx]) { - uint2 tagged = source[NB * maxIn + idx]; - if ((tagged.x ^ edge.x) & ZMASK) continue; - u32 bktIdx = min(atomicAdd(destinationIndexes + NB + bucket, 1), maxOut - 1); - // printf("relaying from tagged %08x %08x bktIdx %d\n", tagged.x, tagged.y, bktIdx); - u32 tag = (tagged.x & TAGMASK) | copybit; - destination[(NB + bucket) * maxOut + bktIdx] = make_uint2(tag | (edge.y & ZMASK), (group << ZBITS) | (edge.x & ZMASK)); - copybit = COPYFLAG; - } - } } template -__global__ void Tail(const uint2 *source, uint2 *destination, const u32 *sourceIndexes, u32 *destinationIndexes) +__global__ void Tail(const uint2 *source, uint2 *destination, const u32 *sourceIndexes, u32 *destinationIndexes) { const int lid = threadIdx.x; const int group = blockIdx.x; const u32 LISTMASK = NLISTS - 1; const u32 COPYFLAG = NZ; - __shared__ u32 lists0[NLISTS]; - __shared__ u32 nexts0[NNEXTS]; + __shared__ u32 lists[NLISTS]; + __shared__ u32 nexts[NNEXTS]; - const int nloops0 = (min(sourceIndexes[ group], NNEXTS) - lid + TAIL_TPB-1) / TAIL_TPB; - const int nloops1 = (min(sourceIndexes[NB+group], NNEXTS) - lid + TAIL_TPB-1) / TAIL_TPB; + const int nloops = (min(sourceIndexes[group], NNEXTS) - lid + TAIL_TPB-1) / TAIL_TPB; source += group * maxIn; for (int i = 0; i < NLISTS/TAIL_TPB; i++) - lists0[i * TAIL_TPB + lid] = ~0; + lists[i * TAIL_TPB + lid] = ~0; __syncthreads(); - for (int i = 0; i < nloops0; i++) { + for (int i = 0; i < nloops; i++) { const u32 index = i * TAIL_TPB + lid; const uint2 edge = source[index]; const u32 list = edge.x & LISTMASK; - nexts0[index] = atomicExch(&lists0[list], index); + nexts[index] = atomicExch(&lists[list], index); } __syncthreads(); - for (int i = 0; i < nloops1; i++) { + for (int i = 0; i < nloops; i++) { const u32 index = i * TAIL_TPB + lid; - const uint2 edge = source[NB * maxIn + index]; + const uint2 edge = source[index]; const u32 list = edge.x & LISTMASK; - for (u32 idx = lists0[list]; idx != ~0; idx = nexts0[idx]) { + for (u32 idx = lists[list]; idx != ~0; idx = nexts[idx]) { uint2 tagged = source[idx]; - if ((tagged.x ^ edge.x) & ~COPYFLAG) continue; + if (idx == index || (tagged.x ^ edge.x) & ~COPYFLAG) continue; u32 bktIdx = atomicAdd(destinationIndexes, 1); destination[bktIdx] = make_uint2((group << ZBITS) | (edge.x & ZMASK), edge.y); } } } -__global__ void Recovery(u32 * indexes) +__global__ void Recovery(u32 * indexes) { const int gid = blockDim.x * blockIdx.x + threadIdx.x; const int lid = threadIdx.x; @@ -763,6 +900,7 @@ __global__ void Recovery(u32 * indexes) for (int i = 0; i < nloops; i++) { u64 blockNonce = gid * nloops * EDGE_BLOCK_SIZE + i * EDGE_BLOCK_SIZE; + assert(blockNonce < NEDGES); v0 = dipkeys.k0; v1 = dipkeys.k1; @@ -787,7 +925,7 @@ __global__ void Recovery(u32 * indexes) for (int s = EDGE_BLOCK_SIZE; --s >= 0; last = lookup) { 
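      // recovery[] now holds 2*PROOFSIZE entries: the solution edges followed by
      // endpoint-swapped copies (cf. soledges[PROOFSIZE+j] in solver_ctx), so the
      // test below matches a recovered edge in either orientation.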
lookup = sipblock[s] ^ last; for (int i = 0; i < PROOFSIZE; i++) { - if (recovery[i] == (lookup & NODEMASK2)) + if (recovery[i] == (lookup & NODEMASK2) || recovery[PROOFSIZE+i] == (lookup & NODEMASK2)) nonces[i] = blockNonce + s; } } @@ -813,18 +951,15 @@ typedef uint16_t u16; const u32 MAXEDGES = NEDGES >> IDXSHIFT; #ifndef NEPS_A -#define NEPS_A 135 +#define NEPS_A 144 #endif #ifndef NEPS_B -#define NEPS_B 88 -#endif -#ifndef NEPS_C -#define NEPS_C 55 +#define NEPS_B 92 #endif #define NEPS 128 -const u32 EDGES_A = NZ * NEPS_A / NEPS; -const u32 EDGES_B = NZ * NEPS_B / NEPS; +const u32 EDGES_A = (NEDGES / NB) * NEPS_A / NEPS; +const u32 EDGES_B = (NEDGES / NB) * NEPS_B / NEPS; const u32 ALL_EDGES_A = EDGES_A * NB; const u32 ALL_EDGES_B = EDGES_B * NB; @@ -887,12 +1022,15 @@ struct edgetrimmer { const size_t indexesSize = NB * sizeof(u32); const size_t indexesSizeNTLB = NTLB * indexesSize; const size_t indexesSizeNMEM = NMEM * indexesSize; - const size_t edgemapSize = 2 * NEDGES / 8; // 8 bits per byte; twice that for bucket (2^24 in all) size variance + const size_t edgeSlotsSize = NB * NB * sizeof(u32); // count slots in edgeMap + const size_t edgeMapSize = 2 * NEDGES / 8; // 8 bits per byte; twice that for bucket (2^24 in all) size variance u8 *bufferA; u8 *bufferB; + u8 *bufferA1; u32 *indexesA; u32 *indexesB; - u64 *edgemap; + u32 *edgeSlots; + u64 *edgeMap; u32 nedges; siphash_keys sipkeys; bool abort; @@ -900,29 +1038,33 @@ struct edgetrimmer { edgetrimmer(const trimparams _tp) : tp(_tp) { checkCudaErrors_V(cudaMalloc((void**)&dt, sizeof(edgetrimmer))); - checkCudaErrors_V(cudaMalloc((void**)&indexesA, indexesSizeNTLB)); - checkCudaErrors_V(cudaMalloc((void**)&indexesB, indexesSizeNMEM + indexesSize)); - checkCudaErrors_V(cudaMalloc((void**)&edgemap, edgemapSize)); + checkCudaErrors_V(cudaMalloc((void**)&indexesA, 2*indexesSizeNTLB)); + checkCudaErrors_V(cudaMalloc((void**)&indexesB, 3*indexesSizeNMEM)); + checkCudaErrors_V(cudaMalloc((void**)&edgeSlots, edgeSlotsSize)); checkCudaErrors_V(cudaMalloc((void**)&bufferB, bufferSize)); bufferA = bufferB + sizeA / NMEM; - // print_log("allocated %lld bytes bufferB %llx endBuffer %llx\n", bufferSize, bufferB, bufferB+bufferSize); + assert(edgeMapSize < sizeA / NMEM); + assert(edgeMapSize == NB * NB * sizeof(u64)); + edgeMap = (u64 *)(bufferB + bufferSize - edgeMapSize); + bufferA1 = bufferB + bufferSize - sizeA / NMEM; + print_log("allocated %lld bytes bufferB %llx endBuffer %llx\n", bufferSize, bufferB, bufferB+bufferSize); + print_log("bufferA %llx bufferA1 %llx\n", bufferA, bufferA1); cudaMemcpy(dt, this, sizeof(edgetrimmer), cudaMemcpyHostToDevice); initsuccess = true; - cudaFuncSetAttribute(EdgeSplit, cudaFuncAttributeMaxDynamicSharedMemorySize, EXTSHARED); } u64 globalbytes() const { - return bufferSize + indexesSizeNTLB + indexesSizeNMEM + indexesSize + edgemapSize + sizeof(siphash_keys) + sizeof(edgetrimmer); + return bufferSize + 2*indexesSizeNTLB + 3*indexesSizeNMEM + edgeSlotsSize + edgeMapSize + sizeof(siphash_keys) + sizeof(edgetrimmer); } ~edgetrimmer() { checkCudaErrors_V(cudaFree(bufferB)); checkCudaErrors_V(cudaFree(indexesA)); checkCudaErrors_V(cudaFree(indexesB)); - checkCudaErrors_V(cudaFree(edgemap)); + checkCudaErrors_V(cudaFree(edgeSlots)); checkCudaErrors_V(cudaFree(dt)); cudaDeviceReset(); } void indexcount(u32 round, const u32 *indexes) { -#ifdef VERBOSE +#if VERBOSE u32 nedges[NB]; for (int i = 0; i < NB; i++) nedges[i] = 0; @@ -982,37 +1124,41 @@ struct edgetrimmer { print_log("Seeding completed in %.0f ms\n", 
durationA); print_log("EdgeSplit<<<%d,%d>>>\n", NB_NMEM, SPLIT_TPB); // 1024x1024 #endif + if (abort) return false; - cudaMemset(indexesB, 0, indexesSizeNMEM + indexesSize); + assert((uint2*)bufferB + NB * EDGES_A/NMEM == (uint2 *)bufferA); + cudaMemset(edgeSlots, 0, edgeSlotsSize); + cudaMemset(indexesB, 0, 2*indexesSizeNMEM); for (u32 i=0; i < NMEM; i++) { - EdgeSplit<<>>((uint2*)bufferA, (uint2*)bufferB, indexesA, indexesB, edgemap, i); + EdgeSplit<<>>((uint2*)bufferA, (uint2*)bufferB, indexesA, indexesB, edgeSlots, i); if (abort) return false; } checkCudaErrors(cudaDeviceSynchronize()); cudaEventRecord(stop, NULL); cudaEventSynchronize(stop); cudaEventElapsedTime(&durationB, start, stop); - cudaEventRecord(start, NULL); + setEdgeMap<<>>(edgeSlots, edgeMap); + checkCudaErrors(cudaDeviceSynchronize()); + #ifdef VERBOSE - for (u32 i=0; i < NMEM; i++) + for (u32 i=0; i < 2*NMEM; i++) indexcount(1, indexesB+i*NB); - indexcount(1, indexesB+NMEM*NB); - edgemapcount(1, edgemap); // .400 + edgemapcount(1, edgeMap); // .400 print_log("EdgeSplit completed in %.0f ms\n", durationB); print_log("Round<<<%d,%d>>>\n", NB, TRIM_TPB); // 4096x1024 #endif - Round<<>>((uint2*)bufferB, indexesB, edgemap); + cudaEventRecord(start, NULL); + Round<<<2*NB, TRIM_TPB>>>((uint2*)bufferB, indexesB, edgeMap); checkCudaErrors(cudaDeviceSynchronize()); cudaEventRecord(stop, NULL); cudaEventSynchronize(stop); cudaEventElapsedTime(&durationC, start, stop); checkCudaErrors(cudaEventDestroy(start)); checkCudaErrors(cudaEventDestroy(stop)); print_log("Round completed in %.0f ms\n", durationC); - for (u32 i=0; i < NMEM; i++) + for (u32 i=0; i < 2*NMEM; i++) indexcount(2, indexesB+i*NB); - indexcount(2, indexesB+NMEM*NB); - edgemapcount(2, edgemap); + edgemapcount(2, edgeMap); if (abort) return false; #ifdef VERBOSE @@ -1020,17 +1166,32 @@ struct edgetrimmer { #endif for (int r = 3; r < tp.ntrims; r++) { - Round<<>>((uint2*)bufferB, indexesB, edgemap); + Round<<<2*NB, TRIM_TPB>>>((uint2*)bufferB, indexesB, edgeMap); checkCudaErrors(cudaDeviceSynchronize()); - edgemapcount(r, edgemap); + indexcount(r, indexesB); + edgemapcount(r, edgeMap); } - cudaMemcpy((void *)indexesA, indexesB+NMEM*NB, indexesSize, cudaMemcpyDeviceToDevice); - EdgesMeet<<>>((uint2*)bufferB, indexesB, indexesA); + cudaMemcpy((void *)indexesA, indexesB, 2*indexesSizeNMEM, cudaMemcpyDeviceToDevice); // assumes NMEM == NTLB == 4 + EdgesMeet<<<4*NB, TRIM_TPB>>>((uint2*)bufferB, indexesB, indexesA); + checkCudaErrors(cudaDeviceSynchronize()); + EdgesMoveU<<>>((uint2*)bufferB, indexesB, indexesB+NMEM*2*NB); + checkCudaErrors(cudaDeviceSynchronize()); + EdgesMoveV<<>>((uint2*)bufferB, indexesB, indexesA, indexesB+NMEM*2*NB); + checkCudaErrors(cudaDeviceSynchronize()); + cudaMemcpy((void *)indexesA, indexesB+NMEM*2*NB+NB, indexesSize, cudaMemcpyDeviceToDevice); + + indexcount(tp.ntrims, indexesB+NMEM*2*NB); + indexcount(tp.ntrims, indexesA); + +#ifdef VERBOSE + print_log("Done moving\n"); +#endif uint2 *bufferC = (uint2 *)bufferA; - cudaMemset(indexesB, 0, indexesSize); // UnsplitEdges zeroes second indexes set - UnsplitEdges<<>>((uint2*)bufferB, indexesB+NMEM*NB, indexesA, bufferC, indexesB); + uint2 *bufferD = bufferC+NB_NMEM*EDGES_A; + cudaMemset(indexesB, 0, indexesSize); + UnsplitEdges<<>>((uint2*)bufferB, indexesB+NMEM*2*NB, indexesA, bufferC, indexesB); checkCudaErrors(cudaDeviceSynchronize()); indexcount(tp.ntrims, indexesB); indexcount(tp.ntrims, indexesB+NB); @@ -1039,17 +1200,18 @@ struct edgetrimmer { print_log("UnsplitEdges<><<<%d,%d>>>\n", NB, 
UNSPLIT_TPB); #endif - static_assert(NTLB >= 2, "2 index sets need to fit in indexesA"); - static_assert(NMEM >= 2, "2 index sets need to fit in indexesB"); + static_assert(NTLB == 4, "4 index sets need to fit in indexesA"); + static_assert(NMEM == 4, "4 index sets need to fit in indexesB"); for (int r = 0; r < PROOFSIZE/2 - 1; r++) { - // printf("Relay round %d\n", r); if (r % 2 == 0) { - cudaMemset(indexesA, 0, 2*indexesSize); - Tag_Relay<<>>(bufferC, bufferC+NB_NMEM*EDGES_A, indexesB, indexesA); + cudaMemset(indexesA, 0, indexesSize); + Tag_Relay<<>>(bufferC, bufferD, indexesB, indexesA); + indexcount(tp.ntrims, indexesA); } else { - cudaMemset(indexesB, 0, 2*indexesSize); - Tag_Relay<<>>(bufferC+NB_NMEM*EDGES_A, bufferC, indexesA, indexesB); + cudaMemset(indexesB, 0, indexesSize); + Tag_Relay<<>>(bufferD, bufferC, indexesA, indexesB); + indexcount(tp.ntrims, indexesB); } checkCudaErrors(cudaDeviceSynchronize()); } @@ -1058,14 +1220,14 @@ struct edgetrimmer { print_log("Tail<><<<%d,%d>>>\n", NB, TAIL_TPB); #endif - cudaMemset(indexesA, 0, sizeof(u32)); + cudaMemset(indexesB+NB, 0, sizeof(u32)); if ((PROOFSIZE/2 - 1) % 2 == 0) { - Tail<<>>(bufferC, (uint2 *)bufferB, indexesB, indexesB+2*NB); + Tail<<>>(bufferC, (uint2 *)bufferB, indexesB, indexesB+NB); } else { - Tail<<>>(bufferC+NB_NMEM*EDGES_A, (uint2 *)bufferB, indexesA, indexesB+2*NB); + Tail<<>>(bufferD, (uint2 *)bufferB, indexesA, indexesB+NB); } - cudaMemcpy(&nedges, indexesB+2*NB, sizeof(u32), cudaMemcpyDeviceToHost); + cudaMemcpy(&nedges, indexesB+NB, sizeof(u32), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); print_log("%d rounds %d edges\n", tp.ntrims, nedges); return nedges; @@ -1077,7 +1239,7 @@ struct solver_ctx { bool mutatenonce; uint2 *edges; graph cg; - uint2 soledges[PROOFSIZE]; + uint2 soledges[2*PROOFSIZE]; std::vector sols; // concatenation of all proof's indices solver_ctx(const trimparams tp, bool mutate_nonce) : trimmer(tp), cg(MAXEDGES, MAXEDGES, MAX_SOLS, IDXSHIFT) { @@ -1107,6 +1269,7 @@ struct solver_ctx { #endif for (u32 j = 0; j < PROOFSIZE; j++) { soledges[j] = edges[cg.sols[s][j]]; + soledges[PROOFSIZE+j] = make_uint2(soledges[j].y, soledges[j].x); #ifdef VERBOSE print_log(" (%x, %x)", soledges[j].x, soledges[j].y); #endif @@ -1120,8 +1283,8 @@ struct solver_ctx { #ifdef VERBOSE print_log("Recovery<><<<%d,%d>>>\n", trimmer.tp.recover.blocks, trimmer.tp.recover.tpb); #endif - Recovery<<>>((u32 *)trimmer.bufferA); - cudaMemcpy(&sols[sols.size()-PROOFSIZE], trimmer.bufferA, PROOFSIZE * sizeof(u32), cudaMemcpyDeviceToHost); + Recovery<<>>((u32 *)trimmer.bufferA1); + cudaMemcpy(&sols[sols.size()-PROOFSIZE], trimmer.bufferA1, PROOFSIZE * sizeof(u32), cudaMemcpyDeviceToHost); checkCudaErrors(cudaDeviceSynchronize()); qsort(&sols[sols.size()-PROOFSIZE], PROOFSIZE, sizeof(u32), cg.nonce_cmp); } @@ -1288,10 +1451,10 @@ CALL_CONVENTION void fill_default_params(SolverParams* params) { params->cpuload = false; } +static_assert(BUCKBITS % 2 == 0, "BUCKBITS must be even"); static_assert(NB % SEED_TPB == 0, "SEED_TPB must divide NB"); static_assert(NB % SPLIT_TPB == 0, "SPLIT_TPB must divide NB"); static_assert(NLISTS % (RELAY_TPB) == 0, "RELAY_TPB must divide NLISTS"); // for Tag_Edges lists init -static_assert(NZ % (32 * SPLIT_TPB) == 0, "SPLIT_TPB must divide NZ/32"); // for EdgeSplit htnode init static_assert(NZ % (32 * TRIM_TPB) == 0, "TRIM_TPB must divide NZ/32"); // for Round htnode init int main(int argc, char **argv) { @@ -1326,7 +1489,7 @@ int main(int argc, char **argv) { for (u32 i=0; i= 30 -typedef u64 
offset_t; -#else -typedef u32 offset_t; -#endif - -#if BIGSIZE0 > 4 -typedef u64 BIGTYPE0; -#else -typedef u32 BIGTYPE0; -#endif - // node bits have two groups of bucketbits (X for big and Y for small) and a remaining group Z of degree bits const u32 NX = 1 << XBITS; const u32 XMASK = NX - 1; -const u32 NY = 1 << YBITS; -const u32 YMASK = NY - 1; -const u32 XYBITS = XBITS + YBITS; -const u32 NXY = 1 << XYBITS; -const u32 ZBITS = EDGEBITS - XYBITS; +const u32 ZBITS = NODEBITS - XBITS; const u32 NZ = 1 << ZBITS; const u32 ZMASK = NZ - 1; -const u32 YZBITS = EDGEBITS - XBITS; -const u32 NYZ = 1 << YZBITS; -const u32 YZMASK = NYZ - 1; -const u32 YZ1BITS = YZBITS < 16 ? YZBITS : 16; // compressed YZ bits -const u32 NYZ1 = 1 << YZ1BITS; -const u32 MAXNZNYZ1 = NZ < NYZ1 ? NYZ1 : NZ; -const u32 YZ1MASK = NYZ1 - 1; -const u32 Z1BITS = YZ1BITS - YBITS; -const u32 NZ1 = 1 << Z1BITS; -const u32 Z1MASK = NZ1 - 1; -const u32 YZ2BITS = YZBITS < 12 ? YZBITS : 12; // more compressed YZ bits -const u32 NYZ2 = 1 << YZ2BITS; -const u32 YZ2MASK = NYZ2 - 1; -const u32 Z2BITS = YZ2BITS - YBITS; -const u32 NZ2 = 1 << Z2BITS; -const u32 Z2MASK = NZ2 - 1; -const u32 YZZBITS = YZBITS + ZBITS; -const u32 YZZ1BITS = YZ1BITS + ZBITS; - -const u32 MAXEDGES = NX * NYZ2; - -const u32 BIGSLOTBITS = BIGSIZE * 8; -const u32 SMALLSLOTBITS = SMALLSIZE * 8; -const u64 BIGSLOTMASK = (1ULL << BIGSLOTBITS) - 1ULL; -const u64 SMALLSLOTMASK = (1ULL << SMALLSLOTBITS) - 1ULL; -const u32 BIGSLOTBITS0 = BIGSIZE0 * 8; -const u64 BIGSLOTMASK0 = (1ULL << BIGSLOTBITS0) - 1ULL; + +const u32 MAXEDGES = NEDGES >> 13; // bound on #edges remaining after trimming // for p close to 0, Pr(X>=k) < e^{-n*p*eps^2} where k=n*p*(1+eps) // see https://en.wikipedia.org/wiki/Binomial_distribution#Tail_bounds @@ -138,56 +67,50 @@ const u64 BIGSLOTMASK0 = (1ULL << BIGSLOTBITS0) - 1ULL; #define BIGEPS 3/64 #endif -// 176/256 is safely over 1-e(-1) ~ 0.63 trimming fraction -#ifndef TRIMFRAC256 -#define TRIMFRAC256 176 -#endif - const u32 NTRIMMEDZ = NZ * TRIMFRAC256 / 256; const u32 ZBUCKETSLOTS = NZ + NZ * BIGEPS; const u32 ZBUCKETSIZE = ZBUCKETSLOTS * BIGSIZE0; const u32 TBUCKETSIZE = ZBUCKETSLOTS * BIGSIZE1; +typedef struct { // half-edge pair + unsigned int otherX : XBITS; + unsigned int Z0 : ZBITS; + unsigned int Z1 : ZBITS; + unsigned int slot0 : 6; + unsigned int slot1 : 6; +} hep; + +static_assert(sizeof(hep) == 8); + template struct zbucket { u32 size; // should avoid different values of RENAMESIZE in different threads of one process - static const u32 RENAMESIZE = NZ2 + (COMPRESSROUND ? NZ1 : 0); - union alignas(16) { - u8 bytes[BUCKETSIZE]; - struct { - u32 words[BUCKETSIZE/sizeof(u32) - RENAMESIZE]; - u32 renameu1[NZ2/2]; - u32 renamev1[NZ2/2]; - u32 renameu[COMPRESSROUND ? NZ1/2 : 0]; - u32 renamev[COMPRESSROUND ? 
NZ1/2 : 0]; - }; - }; - u32 setsize(u8 const *end) { - size = end - bytes; + // union alignas(16) { + hep words[BUCKETSIZE]; + u32 setsize(hep const *end) { + size = end - words; assert(size <= BUCKETSIZE); return size; } }; template -using yzbucket = zbucket[NY]; -template -using matrix = yzbucket[NX]; +using buckets_t = zbucket[NX]; template struct indexer { - offset_t index[NX]; + word_t index[NX]; void matrixv(const u32 y) { const yzbucket *foo = 0; for (u32 x = 0; x < NX; x++) index[x] = foo[x][y].bytes - (u8 *)foo; } - offset_t storev(yzbucket *buckets, const u32 y) { + word_t storev(yzbucket *buckets, const u32 y) { u8 const *base = (u8 *)buckets; - offset_t sumsize = 0; + word_t sumsize = 0; for (u32 x = 0; x < NX; x++) sumsize += buckets[x][y].setsize(base+index[x]); return sumsize; @@ -197,9 +120,9 @@ struct indexer { for (u32 y = 0; y < NY; y++) index[y] = foo[x][y].bytes - (u8 *)foo; } - offset_t storeu(yzbucket *buckets, const u32 x) { + word_t storeu(yzbucket *buckets, const u32 x) { u8 const *base = (u8 *)buckets; - offset_t sumsize = 0; + word_t sumsize = 0; for (u32 y = 0; y < NY; y++) sumsize += buckets[x][y].setsize(base+index[y]); return sumsize; @@ -217,7 +140,7 @@ typedef struct { edgetrimmer *et; } thread_ctx; -typedef u8 zbucket8[MAXNZNYZ1]; +typedef u8 zbucket8[2*MAXNZNYZ1]; typedef u16 zbucket16[NTRIMMEDZ]; typedef u32 zbucket32[NTRIMMEDZ]; @@ -225,32 +148,27 @@ typedef u32 zbucket32[NTRIMMEDZ]; class edgetrimmer { public: siphash_keys sip_keys; - yzbucket *buckets; - yzbucket *tbuckets; + buckets_t *buckets; zbucket8 *tdegs; - offset_t *tcounts; + word_t *tcounts; u32 ntrims; u32 nthreads; bool showall; thread_ctx *threads; trim_barrier barry; - void touch(u8 *p, const offset_t n) { - for (offset_t i=0; i) == NX * sizeof(yzbucket)); - assert(sizeof(matrix) == NX * sizeof(yzbucket)); nthreads = n_threads; ntrims = n_trims; showall = show_all; - buckets = new yzbucket[NX]; - touch((u8 *)buckets, sizeof(matrix)); - tbuckets = new yzbucket[nthreads]; - touch((u8 *)tbuckets, sizeof(yzbucket[nthreads])); + buckets = new zbucket[NX]; + touch((u8 *)buckets, sizeof(buckets_t)); tdegs = new zbucket8[nthreads]; - tcounts = new offset_t[nthreads]; + tcounts = new word_t[nthreads]; threads = new thread_ctx[nthreads]; } ~edgetrimmer() { @@ -260,8 +178,8 @@ class edgetrimmer { delete[] tdegs; delete[] tcounts; } - offset_t count() const { - offset_t cnt = 0; + word_t count() const { + word_t cnt = 0; for (u32 t = 0; t < nthreads; t++) cnt += tcounts[t]; return cnt; @@ -278,7 +196,7 @@ class edgetrimmer { const u32 starty = NY * id / nthreads; const u32 endy = NY * (id+1) / nthreads; u32 edge0 = starty << YZBITS, endedge0 = edge0 + NYZ; - offset_t sumsize = 0; + word_t sumsize = 0; for (u32 my = starty; my < endy; my++, endedge0 += NYZ) { dst.matrixv(my); for (; edge0 < endedge0; edge0 += NEBS) { @@ -338,10 +256,9 @@ class edgetrimmer { const u64 buflast = buf[NEBS - NSIPHASH + ns]; buf[NEBS - NSIPHASH + ns] = 0; for (u32 e = 0; e < NEBS; e += NSIPHASH) { - const u32 dir = (e / NSIPHASH) & 1; const u64 nodes = buf[e + ns] ^ buflast; - const u32 node0 = (nodes & NODE1MASK) << 1 | dir; - const u32 node1 = ((nodes >> 31) & (NODE1MASK << 1)) | dir; + const u32 node0 = nodes & EDGEMASK; + const u32 node1 = (nodes >> 32) & EDGEMASK; const u32 ux = node0 >> YZBITS; // bit 50...22 21..15 14..0 // write VXXYYZZ UYYYYY UZZZZ @@ -363,7 +280,7 @@ class edgetrimmer { indexer small; rdtsc0 = __rdtsc(); - offset_t sumsize = 0; + word_t sumsize = 0; u8 const *base = (u8 *)buckets; u8 const 
*small0 = (u8 *)tbuckets[id]; const u32 startux = NX * id / nthreads; @@ -373,6 +290,7 @@ class edgetrimmer { for (u32 my = 0 ; my < NY; my++) { u8 *readbig = buckets[ux][my].bytes; u8 const *endreadbig = readbig + buckets[ux][my].size; +// print_log("id %d x %d y %d size %u read %d\n", id, ux, my, buckets[ux][my].size, readbig-base); for (; readbig < endreadbig; readbig += BIGSIZE0) { // bit 50...22 21..15 14..0 // write VXXYYZZ UYYYYY UZZZZ within UX partition @@ -383,6 +301,7 @@ class edgetrimmer { // bit 43...15 14..0 // write VXXYYZZ UZZZZ within UX UY partition *(u64 *)(small0+small.index[uy]) = ((u64)vxyz << ZBITS) | (e & ZMASK); +// print_log("id %d ux %d y %d e %010lx e' %010x\n", id, ux, my, e, ((u64)edge << ZBITS) | (e >> YBITS)); small.index[uy] += SMALLSIZE; } } @@ -391,10 +310,11 @@ class edgetrimmer { dst.matrixu(ux); for (u32 uy = 0 ; uy < NY; uy++) { assert(NZ <= sizeof(zbucket8)); - memset(degs, 0, NZ); + memset(degs, 0xff, NZ); u8 *readsmall = tbuckets[id][uy].bytes, *endreadsmall = readsmall + tbuckets[id][uy].size; +// if (id==1) print_log("id %d ux %d y %d size %u sumsize %u\n", id, ux, uy, tbuckets[id][uy].size/BIGSIZE1, sumsize); for (u8 *rdsmall = readsmall; rdsmall < endreadsmall; rdsmall+=SMALLSIZE) - degs[*(u32 *)rdsmall & ZMASK] = BIGSIZE; + degs[*(u32 *)rdsmall & ZMASK]++; u64 uy37 = (int64_t)uy << YZZBITS; for (u8 *rdsmall = readsmall; rdsmall < endreadsmall; rdsmall+=SMALLSIZE) { @@ -406,7 +326,8 @@ class edgetrimmer { // bit 43/39..37 36..22 21..15 14..0 // write UYYYYY UZZZZZ VYYYYY VZZZZ within UX VX partition *(u64 *)(base+dst.index[vx]) = uy37 | ((u64)uz << YZBITS) | ((e >> ZBITS) & YZMASK); - dst.index[vx] += degs[uz ^ 1]; +// print_log("id %d ux %d y %d edge %08x e' %010lx vx %d\n", id, ux, uy, *readedge, uy34 | ((u64)(node & YZMASK) << ZBITS) | *readz, vx); + dst.index[vx] += degs[uz] ? BIGSIZE : 0; } } sumsize += dst.storeu(buckets, ux); @@ -431,7 +352,7 @@ class edgetrimmer { indexer small; rdtsc0 = __rdtsc(); - offset_t sumsize = 0; + word_t sumsize = 0; u8 const *base = (u8 *)buckets; u8 const *small0 = (u8 *)tbuckets[id]; const u32 startvx = NY * id / nthreads; @@ -442,11 +363,13 @@ class edgetrimmer { u32 uxyz = ux << YZBITS; zbucket &zb = TRIMONV ? buckets[ux][vx] : buckets[vx][ux]; const u8 *readbig = zb.bytes, *endreadbig = readbig + zb.size; +// print_log("id %d vx %d ux %d size %u\n", id, vx, ux, zb.size/SRCSIZE); for (; readbig < endreadbig; readbig += SRCSIZE) { // bit 43..37 36..22 21..15 14..0 // write UYYYYY UZZZZZ VYYYYY VZZZZ within VX partition const u64 e = *(u64 *)readbig & SRCSLOTMASK; uxyz += ((u32)(e >> YZBITS) - uxyz) & SRCPREFMASK; +// if (round==6) print_log("id %d vx %d ux %d e %010lx suffUXYZ %05x suffUXY %03x UXYZ %08x UXY %04x mask %x\n", id, vx, ux, e, (u32)(e >> YZBITS), (u32)(e >> YZZBITS), uxyz, uxyz>>ZBITS, SRCPREFMASK); const u32 vy = (e >> ZBITS) & YMASK; // bit 43/39..37 36..30 29..15 14..0 // write UXXXXX UYYYYY UZZZZZ VZZZZ within VX VY partition @@ -462,20 +385,22 @@ class edgetrimmer { TRIMONV ? 
dst.matrixv(vx) : dst.matrixu(vx); for (u32 vy = 0 ; vy < NY; vy++) { const u64 vy34 = (u64)vy << YZZBITS; - memset(degs, 0, NZ); + memset(degs, 0xff, NZ); u8 *readsmall = tbuckets[id][vy].bytes, *endreadsmall = readsmall + tbuckets[id][vy].size; +// print_log("id %d vx %d vy %d size %u sumsize %u\n", id, vx, vy, tbuckets[id][vx].size/BIGSIZE1, sumsize); for (u8 *rdsmall = readsmall; rdsmall < endreadsmall; rdsmall += DSTSIZE) - degs[*(u32 *)rdsmall & ZMASK] = DSTSIZE; + degs[*(u32 *)rdsmall & ZMASK]++; u32 ux = 0; for (u8 *rdsmall = readsmall; rdsmall < endreadsmall; rdsmall += DSTSIZE) { // bit 39..37 36..30 29..15 14..0 with XBITS==YBITS==7 // read UXXXXX UYYYYY UZZZZZ VZZZZ within VX VY partition const u64 e = *(u64 *)rdsmall & DSTSLOTMASK; ux += ((u32)(e >> YZZBITS) - ux) & DSTPREFMASK; +// print_log("id %d vx %d vy %d e %010lx suffUX %02x UX %x mask %x\n", id, vx, vy, e, (u32)(e >> YZZBITS), ux, SRCPREFMASK); // bit 41/39..34 33..21 20..13 12..0 // write VYYYYY VZZZZZ UYYYYY UZZZZ within UX partition *(u64 *)(base+dst.index[ux]) = vy34 | ((e & ZMASK) << YZBITS) | ((e >> ZBITS) & YZMASK); - dst.index[ux] += degs[(e & ZMASK) ^ 1]; + dst.index[ux] += degs[e & ZMASK] ? DSTSIZE : 0; } if (unlikely(ux >> DSTPREFBITS != XMASK >> DSTPREFBITS)) { print_log("OOPS4: id %d vx %x ux %x vs %x\n", id, vx, ux, XMASK); exit(1); } @@ -502,7 +427,7 @@ class edgetrimmer { u32 maxnnid = 0; rdtsc0 = __rdtsc(); - offset_t sumsize = 0; + word_t sumsize = 0; u8 const *base = (u8 *)buckets; u8 const *small0 = (u8 *)tbuckets[id]; const u32 startvx = NY * id / nthreads; @@ -514,6 +439,7 @@ class edgetrimmer { u32 uyz = 0; zbucket &zb = TRIMONV ? buckets[ux][vx] : buckets[vx][ux]; const u8 *readbig = zb.bytes, *endreadbig = readbig + zb.size; +// print_log("id %d vx %d ux %d size %u\n", id, vx, ux, zb.size/SRCSIZE); for (; readbig < endreadbig; readbig += SRCSIZE) { // bit 39..37 36..22 21..15 14..0 // write UYYYYY UZZZZZ VYYYYY VZZZZ within VX partition if TRIMONV @@ -523,29 +449,32 @@ class edgetrimmer { if (TRIMONV) uyz += ((u32)(e >> YZBITS) - uyz) & SRCPREFMASK; else uyz = e >> YZBITS; +// if (round==32 && ux==25) print_log("id %d vx %d ux %d e %010lx suffUXYZ %05x suffUXY %03x UXYZ %08x UXY %04x mask %x\n", id, vx, ux, e, (u32)(e >> YZBITS), (u32)(e >> YZZBITS), uxyz, uxyz>>ZBITS, SRCPREFMASK); const u32 vy = (e >> ZBITS) & YMASK; // bit 39..37 36..30 29..15 14..0 // write UXXXXX UYYYYY UZZZZZ VZZZZ within VX VY partition if TRIMONV // bit 37...31 30...15 14..0 // write VXXXXXX VYYYZZ' UZZZZ within UX UY partition if !TRIMONV *(u64 *)(small0+small.index[vy]) = ((u64)(ux << (TRIMONV ? YZBITS : YZ1BITS) | uyz) << ZBITS) | (e & ZMASK); +// if (TRIMONV&&vx==75&&vy==83) print_log("id %d vx %d vy %d e %010lx e15 %x ux %x\n", id, vx, vy, ((u64)uxyz << ZBITS) | (e & ZMASK), uxyz, uxyz>>YZBITS); if (TRIMONV) uyz &= ~ZMASK; small.index[vy] += SRCSIZE; } } - u8 *degs = tdegs[id]; + u16 *degs = (u16 *)tdegs[id]; small.storeu(tbuckets+id, 0); TRIMONV ? dst.matrixv(vx) : dst.matrixu(vx); u32 newnodeid = 0; u32 *renames = TRIMONV ? 
@@ -523,29 +449,32 @@ class edgetrimmer {
           if (TRIMONV)
             uyz += ((u32)(e >> YZBITS) - uyz) & SRCPREFMASK;
           else uyz = e >> YZBITS;
+// if (round==32 && ux==25) print_log("id %d vx %d ux %d e %010lx suffUXYZ %05x suffUXY %03x UXYZ %08x UXY %04x mask %x\n", id, vx, ux, e, (u32)(e >> YZBITS), (u32)(e >> YZZBITS), uxyz, uxyz>>ZBITS, SRCPREFMASK);
           const u32 vy = (e >> ZBITS) & YMASK;
 // bit        39..37    36..30     29..15     14..0
 // write      UXXXXX    UYYYYY     UZZZZZ     VZZZZ   within VX VY partition  if TRIMONV
 // bit           37...31     30...15     14..0
 // write        VXXXXXX     VYYYZZ'     UZZZZ   within UX UY partition  if !TRIMONV
           *(u64 *)(small0+small.index[vy]) = ((u64)(ux << (TRIMONV ? YZBITS : YZ1BITS) | uyz) << ZBITS) | (e & ZMASK);
+// if (TRIMONV&&vx==75&&vy==83) print_log("id %d vx %d vy %d e %010lx e15 %x ux %x\n", id, vx, vy, ((u64)uxyz << ZBITS) | (e & ZMASK), uxyz, uxyz>>YZBITS);
           if (TRIMONV)
             uyz &= ~ZMASK;
           small.index[vy] += SRCSIZE;
         }
       }
-      u8 *degs = tdegs[id];
+      u16 *degs = (u16 *)tdegs[id];
       small.storeu(tbuckets+id, 0);
       TRIMONV ? dst.matrixv(vx) : dst.matrixu(vx);
       u32 newnodeid = 0;
       u32 *renames = TRIMONV ? buckets[0][vx].renamev : buckets[vx][0].renameu;
-      u32 *endrenames = renames + NZ1/2;
+      u32 *endrenames = renames + NZ1;
       for (u32 vy = 0 ; vy < NY; vy++) {
         assert(2*NZ <= sizeof(zbucket8));
-        memset(degs, 0, NZ);
+        memset(degs, 0xff, 2 * NZ); // sets each u16 entry to 0xffff
         u8 *readsmall = tbuckets[id][vy].bytes, *endreadsmall = readsmall + tbuckets[id][vy].size;
+// print_log("id %d vx %d vy %d size %u sumsize %u\n", id, vx, vy, tbuckets[id][vx].size/BIGSIZE1, sumsize);
         for (u8 *rdsmall = readsmall; rdsmall < endreadsmall; rdsmall += SRCSIZE)
-          degs[*(u32 *)rdsmall & ZMASK] = 1;
+          degs[*(u32 *)rdsmall & ZMASK]++;
         u32 ux = 0;
         u32 nrenames = 0;
         for (u8 *rdsmall = readsmall; rdsmall < endreadsmall; rdsmall += SRCSIZE) {
@@ -558,27 +487,26 @@ class edgetrimmer {
             ux += ((u32)(e >> YZZBITS) - ux) & SRCPREFMASK2;
           else ux = e >> YZZ1BITS;
           const u32 vz = e & ZMASK;
-          u16 *pdeg = (u16 *)degs + (vz >> 1);
-          u16 vdeg = *pdeg;
-          if (vdeg >= 0x0101) {
-            if (vdeg == 0x0101) {
-              *pdeg = vdeg = 0x0102 + nrenames++;
-              *renames++ = vy << ZBITS | (vz & ~1); // neutralize parity, to be restored upon use
+          u16 vdeg = degs[vz];
+          if (vdeg) {
+            if (vdeg < 32) {
+              degs[vz] = vdeg = 32 + nrenames++;
+              *renames++ = vy << ZBITS | vz;
               if (renames == endrenames) {
                 endrenames += (TRIMONV ? sizeof(yzbucket) : sizeof(zbucket)) / sizeof(u32);
-                renames = endrenames - NZ1/2;
+                renames = endrenames - NZ1;
               }
             }
-            vdeg = ((vdeg-0x0102) << 1) | (vz & 1); // preserve parity
 // bit       37..22     21..15     14..0
 // write     VYYZZ'     UYYYYY     UZZZZ   within UX partition  if TRIMONV
             if (TRIMONV)
-              *(u64 *)(base+dst.index[ux]) = ((u64)(newnodeid + vdeg) << YZBITS ) | ((e >> ZBITS) & YZMASK);
-            else *(u32 *)(base+dst.index[ux]) = ( (newnodeid + vdeg) << YZ1BITS) | ((e >> ZBITS) & YZ1MASK);
+              *(u64 *)(base+dst.index[ux]) = ((u64)(newnodeid + vdeg-32) << YZBITS ) | ((e >> ZBITS) & YZMASK);
+            else *(u32 *)(base+dst.index[ux]) = ( (newnodeid + vdeg-32) << YZ1BITS) | ((e >> ZBITS) & YZ1MASK);
+// if (vx==44&&vy==58) print_log(" id %d vx %d vy %d newe %010lx\n", id, vx, vy, vy28 | ((vdeg) << YZBITS) | ((e >> ZBITS) & YZMASK));
             dst.index[ux] += DSTSIZE;
           }
         }
-        newnodeid += 2 * nrenames;
+        newnodeid += nrenames;
         if (TRIMONV && unlikely(ux >> SRCPREFBITS2 != XMASK >> SRCPREFBITS2)) { print_log("OOPS6: id %d vx %d vy %d ux %x vs %x\n", id, vx, vy, ux, XMASK); exit(0); }
       }
@@ -599,7 +527,7 @@ class edgetrimmer {
     indexer dst;
 
     rdtsc0 = __rdtsc();
-    offset_t sumsize = 0;
+    word_t sumsize = 0;
     u8 *degs = tdegs[id];
     u8 const *base = (u8 *)buckets;
     const u32 startvx = NY * id / nthreads;
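// A minimal sketch (not part of the patch; sizes and node values are made up) of the
// combined degree-counter / rename map used in the trimrename hunks above: after counting,
// values 1..31 mean "kept but not yet renamed"; the first kept occurrence overwrites the
// counter with 32 + <new compact id> and logs the original coordinate in `renames`, so
// later occurrences of the same node map to the same compact id.
#include <cstdint>
#include <cstring>
#include <cstdio>

int main() {
  const uint32_t NZ = 16;                       // toy bucket size
  uint16_t degs[NZ];
  uint32_t renames[NZ], nrenames = 0;
  memset(degs, 0xff, sizeof(degs));             // every u16 entry becomes 0xffff
  const uint32_t nodes[] = {2, 9, 2, 13, 9, 2};
  for (uint32_t z : nodes) degs[z]++;           // degree-counting pass
  for (uint32_t z : nodes) {                    // rename pass
    uint16_t d = degs[z];
    if (!d) { printf("node %2u trimmed\n", z); continue; }
    if (d < 32) { degs[z] = d = 32 + nrenames; renames[nrenames++] = z; }
    printf("node %2u -> compact id %u (original %u)\n", z, d - 32, renames[d - 32]);
  }
  return 0;
}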
@@ -607,13 +535,13 @@ class edgetrimmer {
     for (u32 vx = startvx; vx < endvx; vx++) {
       TRIMONV ? dst.matrixv(vx) : dst.matrixu(vx);
       assert(NYZ1 <= sizeof(zbucket8));
-      memset(degs, 0, NYZ1);
+      memset(degs, 0xff, NYZ1);
       for (u32 ux = 0 ; ux < NX; ux++) {
         zbucket &zb = TRIMONV ? buckets[ux][vx] : buckets[vx][ux];
         u32 *readbig = zb.words, *endreadbig = readbig + zb.size/sizeof(u32);
 // print_log("id %d vx %d ux %d size %d\n", id, vx, ux, zb.size/SRCSIZE);
         for (; readbig < endreadbig; readbig++)
-          degs[*readbig & YZ1MASK] = sizeof(u32);
+          degs[*readbig & YZ1MASK]++;
       }
       for (u32 ux = 0 ; ux < NX; ux++) {
         zbucket &zb = TRIMONV ? buckets[ux][vx] : buckets[vx][ux];
@@ -623,10 +551,11 @@ class edgetrimmer {
 // read       UYYZZZ'     VYYZZ'   within VX partition
           const u32 e = *readbig;
           const u32 vyz = e & YZ1MASK;
+          // print_log("id %d vx %d ux %d e %08lx vyz %04x uyz %04x\n", id, vx, ux, e, vyz, e >> YZ1BITS);
 // bit       31...16     15...0
 // write     VYYZZZ'     UYYZZ'   within UX partition
           *(u32 *)(base+dst.index[ux]) = (vyz << YZ1BITS) | (e >> YZ1BITS);
-          dst.index[ux] += degs[vyz ^ 1];
+          dst.index[ux] += degs[vyz] ? sizeof(u32) : 0;
         }
       }
       sumsize += TRIMONV ? dst.storev(buckets, vx) : dst.storeu(buckets, vx);
@@ -644,23 +573,24 @@ class edgetrimmer {
     u32 maxnnid = 0;
 
     rdtsc0 = __rdtsc();
-    offset_t sumsize = 0;
-    u8 *degs = tdegs[id];
+    word_t sumsize = 0;
+    u16 *degs = (u16 *)tdegs[id];
     u8 const *base = (u8 *)buckets;
     const u32 startvx = NY * id / nthreads;
     const u32 endvx = NY * (id+1) / nthreads;
     for (u32 vx = startvx; vx < endvx; vx++) {
       TRIMONV ? dst.matrixv(vx) : dst.matrixu(vx);
-      memset(degs, 0, NYZ1);
+      memset(degs, 0xff, 2 * NYZ1); // sets each u16 entry to 0xffff
       for (u32 ux = 0 ; ux < NX; ux++) {
         zbucket &zb = TRIMONV ? buckets[ux][vx] : buckets[vx][ux];
         u32 *readbig = zb.words, *endreadbig = readbig + zb.size/sizeof(u32);
+        // print_log("id %d vx %d ux %d size %d\n", id, vx, ux, zb.size/SRCSIZE);
         for (; readbig < endreadbig; readbig++)
-          degs[*readbig & YZ1MASK] = 1;
+          degs[*readbig & YZ1MASK]++;
       }
-      u32 newnodepairid = 0;
+      u32 newnodeid = 0;
       u32 *renames = TRIMONV ? buckets[0][vx].renamev1 : buckets[vx][0].renameu1;
-      u32 *endrenames = renames + NZ2/2;
+      u32 *endrenames = renames + NZ2;
       for (u32 ux = 0 ; ux < NX; ux++) {
         zbucket &zb = TRIMONV ? buckets[ux][vx] : buckets[vx][ux];
         u32 *readbig = zb.words, *endreadbig = readbig + zb.size/sizeof(u32);
@@ -669,28 +599,26 @@ class edgetrimmer {
 // read       UYYYZZ'     VYYZZ'   within VX partition
           const u32 e = *readbig;
           const u32 vyz = e & YZ1MASK;
-          u16 *pdeg = (u16 *)degs + (vyz >> 1);
-          u16 vdeg = *pdeg;
-          if (vdeg >= 0x0101) {
-            if (vdeg == 0x0101) {
-              *pdeg = vdeg = 0x0102 + newnodepairid++;
-              *renames++ = vyz & ~1; // neutralize parity, to be restored upon use
+          u16 vdeg = degs[vyz];
+          if (vdeg) {
+            if (vdeg < 32) {
+              degs[vyz] = vdeg = 32 + newnodeid++;
+              *renames++ = vyz;
               if (renames == endrenames) {
                 endrenames += (TRIMONV ? sizeof(yzbucket) : sizeof(zbucket)) / sizeof(u32);
-                renames = endrenames - NZ2/2;
+                renames = endrenames - NZ2;
                 assert(renames < buckets[NX][0].renameu1);
               }
             }
-            vdeg = ((vdeg-0x0102) << 1) | (vyz & 1); // preserve parity
 // bit       26...16     15...0
 // write     VYYZZZ"     UYYZZ'   within UX partition
-            *(u32 *)(base+dst.index[ux]) = (vdeg << (TRIMONV ? YZ1BITS : YZ2BITS)) | (e >> YZ1BITS);
+            *(u32 *)(base+dst.index[ux]) = ((vdeg - 32) << (TRIMONV ? YZ1BITS : YZ2BITS)) | (e >> YZ1BITS);
             dst.index[ux] += sizeof(u32);
           }
         }
       }
-      if (2*newnodepairid > maxnnid)
-        maxnnid = 2*newnodepairid;
+      if (newnodeid > maxnnid)
+        maxnnid = newnodeid;
       sumsize += TRIMONV ? dst.storev(buckets, vx) : dst.storeu(buckets, vx);
     }
     rdtsc1 = __rdtsc();
@@ -709,6 +637,7 @@ class edgetrimmer {
       int err = pthread_create(&threads[t].thread, NULL, etworker, (void *)&threads[t]);
       assert(err == 0);
     }
+    // sleep(7); abort();
     for (u32 t = 0; t < nthreads; t++) {
       int err = pthread_join(threads[t].thread, NULL);
       assert(err == 0);
@@ -797,7 +726,6 @@ class solver_ctx {
   bool mutatenonce;
   proof cycleus;
   proof cyclevs;
-  word_t rowsize[NX];
   std::bitset uxymap;
   std::vector sols; // concatanation of all proof's indices
@@ -812,7 +740,7 @@ class solver_ctx {
   solver_ctx(const u32 nthreads, const u32 n_trims, bool allrounds, bool show_cycle, bool mutate_nonce)
     : trimmer(nthreads, n_trims, allrounds),
-      cg(MAXEDGES, MAXEDGES, MAX_SOLS, 0, (char *)trimmer.tbuckets) {
+      cg(MAXEDGES, MAXEDGES, MAX_SOLS, (char *)trimmer.tbuckets) {
     assert(cg.bytes() <= sizeof(yzbucket[nthreads])); // check that graph cg can fit in tbucket's memory
     showcycle = show_cycle;
     mutatenonce = mutate_nonce;
@@ -832,37 +760,26 @@ class solver_ctx {
   u32 threadbytes() const { return sizeof(thread_ctx) + sizeof(yzbucket) + sizeof(zbucket8) + sizeof(zbucket16) + sizeof(zbucket32); }
 
-  void recordedge(const u32 i, const u32 u1, const u32 v1) {
-    assert(u1 < NX * NYZ2);
+  void recordedge(const u32 i, const u32 u1, const u32 v2) {
     const u32 ux = u1 >> YZ2BITS;
-    u32 uyz = trimmer.buckets[ux][(u1 >> Z2BITS) & YMASK].renameu1[(u1 & Z2MASK) >> 1] | (u1 & 1);
-    assert(v1 < NX * NYZ2);
+    u32 uyz = trimmer.buckets[ux][(u1 >> Z2BITS) & YMASK].renameu1[u1 & Z2MASK];
+    const u32 v1 = v2 - MAXEDGES;
     const u32 vx = v1 >> YZ2BITS;
-    u32 vyz = trimmer.buckets[(v1 >> Z2BITS) & YMASK][vx].renamev1[(v1 & Z2MASK) >> 1] | (v1 & 1);
+    u32 vyz = trimmer.buckets[(v1 >> Z2BITS) & YMASK][vx].renamev1[v1 & Z2MASK];
 #if COMPRESSROUND > 0
-    uyz = trimmer.buckets[ux][uyz >> Z1BITS].renameu[(uyz & Z1MASK) >> 1] | (u1 & 1);
-    vyz = trimmer.buckets[vyz >> Z1BITS][vx].renamev[(vyz & Z1MASK) >> 1] | (v1 & 1);
+    uyz = trimmer.buckets[ux][uyz >> Z1BITS].renameu[uyz & Z1MASK];
+    vyz = trimmer.buckets[vyz >> Z1BITS][vx].renamev[vyz & Z1MASK];
 #endif
     const u32 u = cycleus[i] = (ux << YZBITS) | uyz;
     cyclevs[i] = (vx << YZBITS) | vyz;
+    // print_log(" (%x,%x)", u, cyclevs[i]);
    uxymap[u >> ZBITS] = 1;
   }
 
   void solution(const proof sol) {
     // print_log("Nodes");
-    for (u32 i = 0; i < PROOFSIZE; i++) {
-      u32 ux, vx, idx = sol[i];
-      for (vx = 0; idx >= rowsize[vx]; vx++) {
-        idx -= rowsize[vx];
-      }
-      for (ux = 0; idx >= trimmer.buckets[ux][vx].size/sizeof(u32); ux++) {
-        idx -= trimmer.buckets[ux][vx].size/sizeof(u32);
-      }
-      const u32 e = trimmer.buckets[ux][vx].words[idx];
-      const u32 u = (ux << YZ2BITS) | (e >> YZ2BITS);
-      const u32 v = (vx << YZ2BITS) | (e & YZ2MASK);
-      recordedge(i, u, v);
-    }
+    for (u32 i = 0; i < PROOFSIZE; i++)
+      recordedge(i, cg.links[2*sol[i]].to, cg.links[2*sol[i]+1].to);
     // print_log("\n");
     if (showcycle) {
       void *matchworker(void *vp);
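// A minimal sketch (not part of the patch; table contents and indexing are made up and,
// unlike recordedge above, ignore the per-bucket x-coordinate selection) of undoing the
// two rename stages once a cycle is found: the stage-2 table maps a compact node id back
// to a stage-1 id, which the stage-1 table maps back to the original y/z coordinates.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t renameu [4] = {0x1a2b, 0x0033, 0x2f01, 0x1000}; // stage-1 id -> original yz (hypothetical)
  uint32_t renameu1[2] = {2, 0};                           // stage-2 id -> stage-1 id (hypothetical)
  const uint32_t compact = 1;                              // node id as stored in the final edges
  const uint32_t stage1  = renameu1[compact];
  const uint32_t origyz  = renameu[stage1];
  printf("compact %u -> stage1 %u -> original yz %#x\n", compact, stage1, origyz);
  return 0;
}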
@@ -890,18 +807,18 @@ class solver_ctx {
     rdtsc0 = __rdtsc();
     cg.reset();
     for (u32 vx = 0; vx < NX; vx++) {
-      rowsize[vx] = 0;
       for (u32 ux = 0 ; ux < NX; ux++) {
         zbucket &zb = trimmer.buckets[ux][vx];
         u32 *readbig = zb.words, *endreadbig = readbig + zb.size/sizeof(u32);
-        rowsize[vx] += zb.size/sizeof(u32);
+// print_log("vx %d ux %d size %u\n", vx, ux, zb.size/4);
         for (; readbig < endreadbig; readbig++) {
 // bit      21..11     10...0
 // write   UYYZZZ'    VYYZZ'   within VX partition
           const u32 e = *readbig;
           const u32 u = (ux << YZ2BITS) | (e >> YZ2BITS);
           const u32 v = (vx << YZ2BITS) | (e & YZ2MASK);
-          cg.add_edge(u>>1, v>>1, u&1);
+          // print_log("add_edge(%x, %x)\n", u, v);
+          cg.add_edge(u, v);
         }
       }
     }
@@ -909,7 +826,7 @@ class solver_ctx {
       solution(cg.sols[s]);
     }
     rdtsc1 = __rdtsc();
-    print_log("findcycles %u edges rdtsc: %lu\n", cg.nlinks, rdtsc1-rdtsc0);
+    print_log("findcycles rdtsc: %lu\n", rdtsc1-rdtsc0);
   }
 
   void abort() {
@@ -990,10 +907,9 @@ class solver_ctx {
       const u64 buflast = buf[NEBS - NSIPHASH + ns];
       buf[NEBS - NSIPHASH + ns] = 0;
       for (u32 e = 0; e < NEBS; e += NSIPHASH) {
-        const u32 dir = (e / NSIPHASH) & 1;
         const u64 nodes = buf[e + ns] ^ buflast;
-        const u32 node0 = (nodes & NODE1MASK) << 1 | dir;
-        const u32 node1 = ((nodes >> 31) & (NODE1MASK << 1)) | dir;
+        const u32 node0 = nodes & EDGEMASK;
+        const u32 node1 = (nodes >> 32) & EDGEMASK;
         if (uxymap[node0 >> ZBITS]) {
           for (u32 j = 0; j < PROOFSIZE; j++) {
             if (cycleus[j] == node0 && cyclevs[j] == node1) {