From efbf4f865e2ed412f000fd487b324c08e420b735 Mon Sep 17 00:00:00 2001 From: John Tromp Date: Wed, 15 Jul 2020 15:54:17 +0200 Subject: [PATCH] working cuckarooz CUDA solver --- src/cuckarooz/Makefile | 4 +- src/cuckarooz/mean.cu | 761 +++++++++++++++++++++++++---------------- src/cuckarooz/mean.hpp | 330 +++++++----------- 3 files changed, 587 insertions(+), 508 deletions(-) diff --git a/src/cuckarooz/Makefile b/src/cuckarooz/Makefile index 43e8af5..7911d0e 100644 --- a/src/cuckarooz/Makefile +++ b/src/cuckarooz/Makefile @@ -55,8 +55,8 @@ mean30x1: cuckarooz.hpp bitmap.hpp graph.hpp ../threads/barrier.hpp ../crypto/s mean30x8: cuckarooz.hpp bitmap.hpp graph.hpp ../threads/barrier.hpp ../crypto/siphash.hpp mean.hpp mean.cpp Makefile $(GPP) -o $@ -mavx2 -DNSIPHASH=8 -DEXPANDROUND=10 -DCOMPRESSROUND=22 -DEDGEBITS=30 mean.cpp $(BLAKE_2B_SRC) -cuda19: ../crypto/siphash.cuh compress.hpp graph.hpp mean.cu Makefile +cuda19: ../crypto/siphash.cuh cuckarooz.hpp compress.hpp graph.hpp mean.cu Makefile $(NVCC) -o $@ -DEPS_A=4 -DEPS_B=3 -DIDXSHIFT=2 -DEDGEBITS=19 -arch sm_35 mean.cu $(BLAKE_2B_SRC) -cuda29: ../crypto/siphash.cuh compress.hpp graph.hpp mean.cu Makefile +cuda29: ../crypto/siphash.cuh cuckarooz.hpp compress.hpp graph.hpp mean.cu Makefile $(NVCC) -o $@ -DEDGEBITS=29 -arch sm_35 mean.cu $(BLAKE_2B_SRC) diff --git a/src/cuckarooz/mean.cu b/src/cuckarooz/mean.cu index 3fc3560..aa50c14 100644 --- a/src/cuckarooz/mean.cu +++ b/src/cuckarooz/mean.cu @@ -1,5 +1,5 @@ // Cuckarooz Cycle, a memory-hard proof-of-work by John Tromp -// Copyright (c) 2018-2020 Wilke Trei, Jiri Vadura (photon) and John Tromp +// Copyright (c) 2018-2020 Wilke Trei (Lolliedieb), Jiri Vadura (photon) and John Tromp // This software is covered by the FAIR MINING license //Includes for IntelliSense @@ -21,22 +21,17 @@ #include "../crypto/blake2.h" // number of invocations of seeding kernel -// values larger than 1 reduce pressure on TLB +// larger values reduce pressure on TLB // which appears to be an issue on GTX cards -#ifndef NTLB -#define NTLB 2 -#endif +#define NTLB 4 // number of slices in edge splitting kernel // larger values reduce memory use // which is proportional to 1+1/NMEM -#ifndef NMEM #define NMEM 4 -#endif // after seeding, main memory buffer is a NMEM x NTLB matrix // of blocks of buckets #define NMEMTLB (NMEM * NTLB) -#define NODE1MASK NODEMASK #include "../crypto/siphash.cuh" typedef uint8_t u8; @@ -50,12 +45,14 @@ typedef uint32_t u32; #endif const u32 NB = 1 << BUCKBITS; +const u32 NB2 = NB / 2; const u32 NB_NTLB = NB / NTLB; const u32 NB_NMEM = NB / NMEM; +const u32 NODEBITS = EDGEBITS+1; const u32 ZBITS = NODEBITS - BUCKBITS; -const u32 ZBITS1 = ZBITS + 1; const u32 NZ = 1 << ZBITS; const u32 ZMASK = NZ - 1; +const u32 ZBITS1 = ZBITS + 1; __device__ __forceinline__ uint2 ld_cs_u32_v2(const uint2 *p_src) { @@ -103,7 +100,7 @@ __device__ __forceinline__ bool testbit(const u64 edgemap, const int slot) } __constant__ siphash_keys dipkeys; -__constant__ u64 recovery[42]; +__constant__ u64 recovery[2*PROOFSIZE]; #define ROTL(x,b) ( ((x) << (b)) | ( (x) >> (64 - (b))) ) #define SIPROUND {\ @@ -130,7 +127,7 @@ __constant__ u64 recovery[42]; #define SPLIT_TPB 1024 #endif #ifndef TRIM_TPB -#define TRIM_TPB 1024 +#define TRIM_TPB 512 #endif #ifndef UNSPLIT_TPB #define UNSPLIT_TPB 256 @@ -144,9 +141,11 @@ __constant__ u64 recovery[42]; template __global__ void Seed(uint4 * __restrict__ buffer, u32 * __restrict__ indexes, const u32 part) +// Seed<<>>((uint4*)bufferA, indexesA, i); { - const int gid = blockDim.x 
* blockIdx.x + threadIdx.x; + const int group = blockIdx.x; const int lid = threadIdx.x; + const int gid = group * SEED_TPB + lid; const int nthreads = gridDim.x * SEED_TPB; const int nloops = (NEDGES / NTLB / EDGE_BLOCK_SIZE - gid + nthreads-1) / nthreads; @@ -161,7 +160,7 @@ __global__ void Seed(uint4 * __restrict__ buffer, u32 * __restrict__ indexes, co __syncthreads(); u64 offset = part * (NEDGES / NTLB); - buffer += part * (maxOut * NB_NMEM / 2); + buffer += part * (maxOut * NB_NMEM / 2); // buffer indexed by edgepairs rather than edges indexes += part * NB; for (int i = 0; i < nloops; i++) { @@ -187,10 +186,26 @@ __global__ void Seed(uint4 * __restrict__ buffer, u32 * __restrict__ indexes, co u64 last = 0; +#define BBITS2 (BUCKBITS/2) +#define NBB2 (1 << BBITS2) +#define BB2MASK (NBB2 - 1) +#define BBITS21 (BBITS2 - 1) +#define NBB21 (1 << BBITS21) +#define BB21MASK (NBB21 - 1) + #define DUMP(E) {\ u64 lookup = (E);\ const uint2 edge1 = make_uint2(lookup & NODEMASK, (lookup >> 32) & NODEMASK);\ - int bucket = edge1.x >> ZBITS;\ + const u32 uX = edge1.x >> (ZBITS + BBITS2);\ + assert(uX < NBB2);\ + const u32 vX = edge1.y >> (ZBITS + BBITS2);\ + assert(vX < NBB2);\ + const u32 uvXa = 2 * (uX >> BBITS21) + (vX >> BBITS21);\ + assert(uvXa < 4);\ + const u32 uvXb = (uX & BB21MASK) << BBITS21 | (vX & BB21MASK);\ + assert(uvXb < NB/4);\ + const u32 bucket = uvXa << (BUCKBITS-2) | uvXb;\ + assert(bucket < NB);\ u64 edge64 = (((u64)edge1.y) << 32) | edge1.x;\ for (u64 ret = atomicCAS(&magazine[bucket], 0, edge64); ret; ) {\ u64 ret2 = atomicCAS(&magazine[bucket], ret, 0);\ @@ -200,7 +215,7 @@ __global__ void Seed(uint4 * __restrict__ buffer, u32 * __restrict__ indexes, co printf("Seed dropping edges %llx %llx\n", ret, edge64);\ break;\ }\ - int idx = (bucket + (bucket / NB_NMEM) * (NTLB-1) * NB_NMEM) * maxOut + position;\ + const u32 idx = (uvXa * NTLB * NB_NMEM + uvXb) * maxOut + position;\ buffer[idx/2] = make_uint4(ret, ret >> 32, edge1.x, edge1.y);\ break;\ }\ @@ -228,91 +243,98 @@ __global__ void Seed(uint4 * __restrict__ buffer, u32 * __restrict__ indexes, co printf("Seed dropping edge %llx\n", edge); continue; } - int idx = (bucket + (bucket / NB_NMEM) * (NTLB-1) * NB_NMEM) * maxOut + position;\ + const u32 idx = ((bucket/NB_NMEM) * NTLB * NB_NMEM + bucket % NB_NMEM) * maxOut + position; buffer[idx/2] = make_uint4(edge, edge >> 32, 0, 0); } } } -#ifndef EXTSHARED -#define EXTSHARED 0xc000 -#endif - -#ifndef CYCLE_V0 -#define CYCLE_V0 0x59f7ada -#define CYCLE_V1 0x106a0753 -#endif -#define BUCK0 (CYCLE_V0 >> ZBITS) -#define REST0 (CYCLE_V0 & ZMASK) -#define BUCK1 (CYCLE_V1 >> ZBITS) -#define REST1 (CYCLE_V1 & ZMASK) - template -__global__ void EdgeSplit(const uint2 *src, uint2 *dst, const u32 *srcIdx, u32 *dstIdx, u64 *edgemap, const int part) +__global__ void EdgeSplit(const uint2 *src, uint2 *dst, const u32 *srcIdx, u32 *dstIdx, u32 *edgeslots, const int uvXa) +// EdgeSplit<<>>((uint2*)bufferA, (uint2*)bufferB, indexesA, indexesB, edgeSlots, i); { + const int uvYa = blockIdx.x % 4; + const int uvXb = blockIdx.x / 4; const int lid = threadIdx.x; - const int group = blockIdx.x; - extern __shared__ unsigned long long magazine[]; - u32 *emap = (u32 *)(magazine + NB); + __shared__ unsigned long long magazine[NB]; int nloops[NTLB]; for (int i = 0; i < NB/SPLIT_TPB; i++) { magazine[lid + SPLIT_TPB * i] = 0; - emap[lid + SPLIT_TPB * i] = 0; + // emap[lid + SPLIT_TPB * i] = 0; } - const int offset = part * NB_NMEM; - const int Group = offset + group; - edgemap += NB * Group; - -#if EXTSHARED 
>= 0xc004 - u32 *ourIdx = emap + NB; -#else - u32 *ourIdx = (u32 *)edgemap; -#endif - - if (!lid) - *ourIdx = 0; + const u32 offset = uvXa * NB_NMEM; + const u32 uvX = offset + uvXb; + const u32 uXb = uvXb >> BBITS21; // 5 bits + const u32 vXb = uvXb & BB21MASK; // 5 bits + const u32 uX = (uvXa/2) << BBITS21 | uXb; // 6 bits + const u32 vX = (uvXa%2) << BBITS21 | vXb; // 6 bits + edgeslots += (uX * NBB2 + vX) * NB; + if (!lid) assert(uXb < 32); + if (!lid) assert(vXb < 32); for (int a = 0; a < NTLB; a++) - nloops[a] = (min(srcIdx[a * NB + Group], maxIn) - lid + SPLIT_TPB-1) / SPLIT_TPB; + nloops[a] = (min(srcIdx[a * NB + uvX], maxIn) - lid + SPLIT_TPB-1) / SPLIT_TPB; - src += (offset * NTLB + group) * maxIn + lid; - dst += part * NB * maxOut; - uint2 *dstX = dst + (NB_NMEM + group) * NTLB * maxIn / 2; + src += (offset * NTLB + uvXb) * maxIn + lid; + dst += uvXa * 2 * NB * maxOut; + uint2 *ydst = dst + NB * maxOut; + u32 *xdstIdx = dstIdx + uvXa * 2 * NB; + u32 *ydstIdx = xdstIdx + NB; __syncthreads(); - const int rowOffset = offset * NMEM; for (int a = 0; a < NTLB; a++) { const u32 delta = a * NB_NMEM * maxIn; for (int i = 0; i < nloops[a]; i++) { uint2 edge = src[delta + i * SPLIT_TPB]; if (edge.x == edge.y) continue; - int bucket = edge.y >> ZBITS; + const u32 uXYa = edge.x >> (ZBITS-1); // 13 bits + const u32 vXYa = edge.y >> (ZBITS-1); // 13 bits + const u32 uXY = uXYa >> 1; + const u32 vXY = vXYa >> 1; + assert(uXY < NB); + assert(vXY < NB); + assert((uXY >> BBITS2) == ((uvXa/2) << BBITS21 | uXb)); // match 6 bits + assert((vXY >> BBITS2) == ((uvXa%2) << BBITS21 | vXb)); // match 6 bits + if (2 * (uXYa % 2) + (vXYa % 2) != uvYa) continue; + const u32 bucket = (uXY % NBB2) * NBB2 + vXY % NBB2; + assert(bucket < NB); u64 edge64 = (((u64)edge.y) << 32) | edge.x; for (u64 ret = atomicCAS(&magazine[bucket], 0, edge64); ret; ) { u64 ret2 = atomicCAS(&magazine[bucket], ret, 0); if (ret2 == ret) { - const u32 slot = atomicAdd(emap + bucket, 2); + const u32 slot = atomicAdd(edgeslots + bucket, 2); if (slot >= 64) { #ifdef VERBOSE printf("dropped edges %llx %llx\n", ret, edge64); #endif break; - } + } else if (slot >= 63) { +#ifdef VERBOSE + printf("dropped edge %llx\n", ret); +#endif + break; + } const u32 slots = (slot+1) << 6 | slot; // slot for edge, slot+1 for ret - int bktIdx = atomicAdd(ourIdx, 1); - dstX[bktIdx] = make_uint2(bucket << ZBITS | edge.x & ZMASK, slots << ZBITS | ret & ZMASK); - bktIdx = atomicAdd(dstIdx + rowOffset + bucket, 1); - if (bktIdx >= maxOut/2) { + const u32 xidx = atomicAdd(xdstIdx + uXYa % NB, 1); + if (xidx >= maxOut) { +#ifdef VERBOSE + printf("dropped u-halfedge %llx\n", edge64); +#endif + break; + } + dst[(uXYa % NB) * maxOut + xidx] = make_uint2((vXY << ZBITS) | (edge.x & ZMASK), slots << ZBITS | ret & ZMASK); + const u32 yIdx = atomicAdd(ydstIdx + vXYa % NB, 1); + if (yIdx >= maxOut) { #ifdef VERBOSE - printf("dropped halfedge %llx\n", edge); + printf("dropped v-halfedge %llx\n", edge); #endif break; } - dst[bucket * maxOut/2 + bktIdx] = make_uint2(Group << ZBITS | edge.y & ZMASK, slots << ZBITS | (ret>>32) & ZMASK); + ydst[(vXYa % NB) * maxOut + yIdx] = make_uint2((uXY << ZBITS) | (edge.y & ZMASK), slots << ZBITS | (ret>>32) & ZMASK); break; } ret = ret2 ? 
ret2 : atomicCAS(&magazine[bucket], 0, edge64); @@ -326,41 +348,56 @@ __global__ void EdgeSplit(const uint2 *src, uint2 *dst, const u32 *srcIdx, u32 * int bucket = lid + SPLIT_TPB * i; u64 edge = magazine[bucket]; if (edge != 0) { - const u32 slot = atomicAdd(emap + bucket, 1); + const u32 slot = atomicAdd(edgeslots + bucket, 1); if (slot >= 64) { #ifdef VERBOSE printf("dropped edge %llx\n", edge); #endif continue; } + const u32 uXY = uX * NBB2 + bucket / NBB2; + const u32 vXY = vX * NBB2 + bucket % NBB2; + const u32 uXYa = uXY << 1 | (uvYa / 2); + const u32 vXYa = vXY << 1 | (uvYa % 2); + assert(((edge & 0xffffffff) >> (ZBITS-1)) == uXYa); + assert((edge >> (32+ZBITS-1)) == vXYa); const u32 slots = slot << 6 | slot; // duplicate slot indicates singleton pair - int bktIdx = atomicAdd(ourIdx, 1); - dstX[bktIdx] = make_uint2(bucket << ZBITS | edge & ZMASK, slots << ZBITS | edge & ZMASK); - bktIdx = atomicAdd(dstIdx + rowOffset + bucket, 1); - if (bktIdx >= maxOut/2) { + int bktIdx = atomicAdd(xdstIdx + uXYa % NB, 1); + if (bktIdx >= maxOut) { #ifdef VERBOSE - printf("dropped halfedge %llx\n", edge); + printf("dropped u-halfedge %llx\n", edge); #endif - continue; + break; + } + dst[(uXYa % NB) * maxOut + bktIdx] = make_uint2((vXY << ZBITS) | (edge & ZMASK), slots << ZBITS | edge & ZMASK); + bktIdx = atomicAdd(ydstIdx + vXYa % NB, 1); + if (bktIdx >= maxOut) { +#ifdef VERBOSE + printf("dropped v-halfedge %llx\n", edge); +#endif + break; } - dst[bucket * maxOut/2 + bktIdx] = make_uint2(Group << ZBITS | (edge>>32) & ZMASK, slots << ZBITS | (edge>>32) & ZMASK); + ydst[(vXYa % NB) * maxOut + bktIdx] = make_uint2((uXY << ZBITS) | (edge>>32) & ZMASK, slots << ZBITS | (edge>>32) & ZMASK); } } +} - __syncthreads(); - - if (!lid) { - u32 *dstXIdx = dstIdx + NMEM * NB; - dstXIdx[Group] = *ourIdx; - } +__global__ void setEdgeMap(u32 *edgeslots, u64 *edgemap) +//setEdgeMap<<>>(edgeSlots, edgeMap); +{ + const int uX = blockIdx.x / NBB2; + const int vX = blockIdx.x % NBB2; + const int lid = threadIdx.x; + edgeslots += blockIdx.x * NB; for (int i = 0; i < NB/SPLIT_TPB; i++) { - int idx = lid + SPLIT_TPB * i; - edgemap[idx] = emap[idx] < 64 ? (1ULL << emap[idx]) - 1 : ~0ULL; + int bucket = lid + SPLIT_TPB * i; + edgemap[(uX*NBB2 + bucket/NBB2) * NB + vX*NBB2 + bucket%NBB2] = edgeslots[bucket] < 64 ? 
(1ULL << edgeslots[bucket]) - 1 : ~0ULL; } } __device__ __forceinline__ void addhalfedge(u32 *magazine, const u32 bucket, const u32 rest, const u32 slot, u32 *indices, uint2* dst) { +// addhalfedge(magazine, bucket, pair.x&ZMASK, slot0, sourceIndexes+NMEM*NB+group, source+delta); u32 halfedge = slot << ZBITS | rest; for (u32 ret = atomicCAS(&magazine[bucket], 0, halfedge); ret; ) { u64 ret2 = atomicCAS(&magazine[bucket], ret, 0); @@ -387,184 +424,306 @@ __device__ __forceinline__ void flushhalfedges(u32 *magazine, const int lid, u32 } } -template // maxIn is size of small buckets in half-edges -__global__ void Round(uint2 *source, u32 *sourceIndexes, u64 *edgemap) +static const int NODEDEGWORDS = NZ / 32; +// nodeDeg holds two bitmaps of NODEDEGWORDS/2 words each: the first marks half-nodes seen at least once, the second (read by nodeTest) marks those seen at least twice + +__device__ __forceinline__ void nodeSet(u32 *nodeDeg, const int bucket) { + int word = bucket >> 5; + unsigned char bit = bucket & 0x1F; + u32 mask = 1 << bit; + + u32 old = atomicOr(nodeDeg + word, mask) & mask; + if (old) + atomicOr(nodeDeg + NODEDEGWORDS/2 + word, mask); +} + +__device__ __forceinline__ bool nodeTest(u32 *nodeDeg, const int bucket) { + int word = bucket >> 5; + unsigned char bit = bucket & 0x1F; + + return (nodeDeg[NODEDEGWORDS/2 + word] >> bit) & 1; +} + +template // maxIn is size of buckets in half-edge pairs +__global__ void Round(uint2 *source, u32 *sourceIndexes, u64 *edgemap) +// Round<<<2*NB, TRIM_TPB>>>((uint2*)bufferB, indexesB, edgeMap); { const int lid = threadIdx.x; const int group = blockIdx.x; - int bktsize[NMEM]; - int npairs[NMEM]; + int xsize[2], ysize[2]; + int nxpairs[2], nypairs[2]; - __shared__ u32 htnode[2*NZ/32/NPARTS]; // u & v node bits + __shared__ u32 nodeDeg[NODEDEGWORDS]; // node degrees __shared__ u32 magazine[NB]; - for (int i = 0; i < 2*NZ/32/NPARTS/TRIM_TPB; i++) - htnode[lid + i * TRIM_TPB] = 0; + for (int i = 0; i < NODEDEGWORDS/TRIM_TPB; i++) + nodeDeg[lid + i * TRIM_TPB] = 0; for (int i = 0; i < NB/TRIM_TPB; i++) magazine[lid + i * TRIM_TPB] = 0; - for (int a = 0; a < NMEM; a++) { - bktsize[a] = sourceIndexes[a * NB + group]; - npairs[a] = (bktsize[a] + TRIM_TPB-1) / TRIM_TPB; + const u32 groupa = group / NB; + if (!lid) assert(groupa < 2); + const int groupb = group % NB; + for (int ai = 0; ai < 2; ai++) { + xsize[ai] = sourceIndexes[(2 * groupa + ai) * 2*NB + groupb]; + nxpairs[ai] = (xsize[ai] + TRIM_TPB-1) / TRIM_TPB; + ysize[ai] = sourceIndexes[(2 * ai + groupa) * 2*NB + NB + groupb]; + nypairs[ai] = (ysize[ai] + TRIM_TPB-1) / TRIM_TPB; } - const u32 bsize = sourceIndexes[NMEM * NB + group]; - const u32 np = (bsize + TRIM_TPB-1) / TRIM_TPB; __syncthreads(); - if (lid <= NMEM) - sourceIndexes[lid * NB + group] = 0; + if (lid < NMEM) { + if (lid / 2 == groupa) + sourceIndexes[lid * 2*NB + groupb] = 0; + if (lid % 2 == groupa) + sourceIndexes[lid * 2*NB + NB + groupb] = 0; + } - const u32 delta = (group / NB_NMEM) * NB * maxIn + (NB + (group % NB_NMEM) * NMEM) * maxIn/2; - for (int i = 0; i < np; i++) { - __syncthreads(); - u32 idx = lid + i * TRIM_TPB; - if (idx >= bsize) continue; - uint2 pair = source[delta + idx]; - u32 bucket = pair.x >> ZBITS; - u32 slots = pair.y >> ZBITS; - u32 slot0 = slots % 64, slot1 = slots / 64; - u64 es = edgemap[group * NB + bucket]; - if (testbit(es, slot0)) - setbit(htnode, pair.x & ZMASK); - if (slot1 != slot0 && testbit(es, slot1)) - setbit(htnode, pair.y & ZMASK); +#if 1 + for (int ai = 0; ai < 2; ai++) { + int a = 2 * groupa + ai; + const u32 delta = (a * 2*NB + groupb) * maxIn; + // if (!group && !lid && !a) printf("xsize[%d] = %d nxpairs[a] = %d\n", a, xsize[ai], nxpairs[ai]); + for (int 
i = 0; i < nxpairs[ai]; i++) { + __syncthreads(); + const u32 idx = lid + i * TRIM_TPB; + if (idx >= xsize[ai]) continue; // not to be optimized away with every iteration synced + const uint2 pair = source[delta + idx]; + const u32 bucket = pair.x >> ZBITS; + assert(bucket < NB); + const u32 slots = pair.y >> ZBITS; + const u32 slot0 = slots % 64, slot1 = slots / 64; + assert(group/2 < NB); + const u64 es = edgemap[(group/2) * NB + bucket]; // group/2 * NB + // if (!group && !lid && !a) printf("delta+idx %d bucket %x pair %x %x\n", delta+idx, bucket, pair.x, pair.y); + if (testbit(es, slot0)) + nodeSet(nodeDeg, pair.x & ZMASK/2); + if (slot1 != slot0 && testbit(es, slot1)) + nodeSet(nodeDeg, pair.y & ZMASK/2); + } + } +#endif + +#if 1 + for (int ai = 0; ai < 2; ai++) { + int a = 2 * ai + groupa; + const u32 delta = (a * 2*NB + NB + groupb) * maxIn; + for (int i = 0; i < nypairs[ai]; i++) { + __syncthreads(); + const u32 idx = lid + i * TRIM_TPB; + if (idx >= ysize[ai]) continue; // not to be optimized away with every iteration synced + const uint2 pair = source[delta + idx]; + const u32 bucket = pair.x >> ZBITS; + assert(bucket < NB); + const u32 slots = pair.y >> ZBITS; + const u32 slot0 = slots % 64, slot1 = slots / 64; + const u64 es = edgemap[bucket * NB + group/2]; + if (testbit(es, slot0)) + nodeSet(nodeDeg, pair.x & ZMASK/2); + if (slot1 != slot0 && testbit(es, slot1)) + nodeSet(nodeDeg, pair.y & ZMASK/2); + } } +#endif - for (int a = 0; a < NMEM; a++) { - const u32 delta = a * NB * maxIn + group * maxIn/2; // / 2 for indexing pairs, * 2 for alternate buckets - for (int i = 0; i < npairs[a]; i++) { +#if 1 + for (int ai = 0; ai < 2; ai++) { + int a = 2 * groupa + ai; + const u32 delta = (a * 2*NB + groupb) * maxIn; + for (int i = 0; i < nxpairs[ai]; i++) { __syncthreads(); - u32 idx = lid + i * TRIM_TPB; - if (idx >= bktsize[a]) continue; - uint2 pair = source[delta + idx]; - u32 bucket = pair.x >> ZBITS; - u32 slots = pair.y >> ZBITS; - u32 slot0 = slots % 64, slot1 = slots / 64; - u64 es = edgemap[bucket * NB + group]; + const u32 idx = lid + i * TRIM_TPB; + if (idx >= xsize[ai]) continue; // not to be optimized away with every iteration synced + const uint2 pair = source[delta + idx]; + const u32 bucket = pair.x >> ZBITS; + assert(bucket < NB); + const u32 slots = pair.y >> ZBITS; + const u32 slot0 = slots % 64, slot1 = slots / 64; + const u64 es = edgemap[(group/2) * NB + bucket]; if (testbit(es, slot0)) { - if (testbit(htnode, pair.x & ZMASK)) { - setbit(htnode, pair.x & ZMASK); + if (nodeTest(nodeDeg, pair.x & ZMASK/2)) { + addhalfedge(magazine, bucket, pair.x&ZMASK, slot0, sourceIndexes+a*2*NB+groupb, source+delta); + } else { + // if (!group && !bucket) printf("u-halfedge %08x slot %d dies\n", pair.x & ZMASK, slot0); + resetbit(edgemap + (group/2) * NB + bucket, slot0); } } if (slot1 != slot0 && testbit(es, slot1)) { - if (testbit(htnode, pair.y & ZMASK)) { - setbit(htnode, pair.y & ZMASK); + if (nodeTest(nodeDeg, pair.y & ZMASK/2)) { + addhalfedge(magazine, bucket, pair.y&ZMASK, slot1, sourceIndexes+a*2*NB+groupb, source+delta); + } else { + // if (!group && !bucket) printf("u-halfedge %08x slot %d dies\n", pair.y & ZMASK, slot1); + resetbit(edgemap + (group/2) * NB + bucket, slot1); } } } __syncthreads(); + flushhalfedges(magazine, lid, sourceIndexes+a*2*NB+groupb, source+delta); } +#endif - for (int a = 0; a < NMEM; a++) { - const u32 delta = a * NB * maxIn + group * maxIn/2; // / 2 for indexing pairs, * 2 for alternate buckets - for (int i = 0; i < npairs[a]; i++) { +#if 
1 + for (int ai = 0; ai < 2; ai++) { + int a = 2 * ai + groupa; + const u32 delta = (a * 2*NB + NB + groupb) * maxIn; + for (int i = 0; i < nypairs[ai]; i++) { __syncthreads(); - u32 idx = lid + i * TRIM_TPB; - if (idx >= bktsize[a]) continue; - uint2 pair = source[delta + idx]; - u32 bucket = pair.x >> ZBITS; - u32 slots = pair.y >> ZBITS; - u32 slot0 = slots % 64, slot1 = slots / 64; - u64 es = edgemap[bucket * NB + group]; + const u32 idx = lid + i * TRIM_TPB; + if (idx >= ysize[ai]) continue; // not to be optimized away with every iteration synced + const uint2 pair = source[delta + idx]; + const u32 bucket = pair.x >> ZBITS; + assert(bucket < NB); + const u32 slots = pair.y >> ZBITS; + const u32 slot0 = slots % 64, slot1 = slots / 64; + const u64 es = edgemap[bucket * NB + group/2]; if (testbit(es, slot0)) { - if (testbit(htnode, pair.x & ZMASK)) { - addhalfedge(magazine, bucket, pair.x&ZMASK, slot0, sourceIndexes+a*NB+group, source+delta); - } else resetbit(edgemap + bucket * NB + group, slot0); + if (nodeTest(nodeDeg, pair.x & ZMASK/2)) { + addhalfedge(magazine, bucket, pair.x&ZMASK, slot0, sourceIndexes+a*2*NB+NB+groupb, source+delta); + } else { + // if (!group && !bucket) printf("v-halfedge %08x slot %d dies\n", pair.x & ZMASK, slot0); + resetbit(edgemap + bucket * NB + group/2, slot0); + } } if (slot1 != slot0 && testbit(es, slot1)) { - if (testbit(htnode, pair.y & ZMASK)) { - addhalfedge(magazine, bucket, pair.y&ZMASK, slot1, sourceIndexes+a*NB+group, source+delta); - } else resetbit(edgemap + bucket * NB + group, slot1); + if (nodeTest(nodeDeg, pair.y & ZMASK/2)) { + addhalfedge(magazine, bucket, pair.y&ZMASK, slot1, sourceIndexes+a*2*NB+NB+groupb, source+delta); + } else { + // if (!group && !bucket) printf("v-halfedge %08x slot %d dies\n", pair.y & ZMASK, slot1); + resetbit(edgemap + bucket * NB + group/2, slot1); + } } } __syncthreads(); - - flushhalfedges(magazine, lid, sourceIndexes+a*NB+group, source+delta); + flushhalfedges(magazine, lid, sourceIndexes+a*2*NB+NB+groupb, source+delta); } +#endif +} + +template +__global__ void EdgesMeet(uint2 *source, u32 *sourceIndexes, u32 *destIndexes) +// EdgesMeet<<<4*NB, TRIM_TPB>>>((uint2*)bufferB, indexesB, indexesA); +{ + const int lid = threadIdx.x; + const int groupa = blockIdx.x / NB; + assert(groupa < 4); + const int groupb = blockIdx.x % NB; + + int vsize = sourceIndexes[groupa * 2*NB + NB + groupb]; + int nvpairs = (vsize + TRIM_TPB-1) / TRIM_TPB; + + __syncthreads(); - for (int i = 0; i < np; i++) { + const u32 deltaOut = groupa * 2*NB * maxIn; + const u32 deltaIn = deltaOut + (NB + groupb) * maxIn; + const int vX = (groupa%2) * NB2 + groupb/2; + for (int i = 0; i < nvpairs; i++) { __syncthreads(); u32 idx = lid + i * TRIM_TPB; - if (idx >= bsize) continue; - uint2 pair = source[delta + idx]; - u32 bucket = pair.x >> ZBITS; - u32 slots = pair.y >> ZBITS; - u32 slot0 = slots % 64, slot1 = slots / 64; - u64 es = edgemap[group * NB + bucket]; - if (testbit(es, slot0)) { - if (testbit(htnode, pair.x & ZMASK)) { - addhalfedge(magazine, bucket, pair.x&ZMASK, slot0, sourceIndexes+NMEM*NB+group, source+delta); - } else resetbit(edgemap + group * NB + bucket, slot0); - } - if (slot1 != slot0 && testbit(es, slot1)) { - if (testbit(htnode, pair.y & ZMASK)) { - addhalfedge(magazine, bucket, pair.y&ZMASK, slot1, sourceIndexes+NMEM*NB+group, source+delta); - } else resetbit(edgemap + group * NB + bucket, slot1); - } + if (idx >= vsize) continue; + uint2 pair = source[deltaIn + idx]; + u32 uXb = (pair.x >> (ZBITS-1)) % NB; + pair.x = 
(vX << ZBITS) | (pair.x & ZMASK); + int i = atomicAdd(destIndexes + groupa * 2*NB + uXb, 1); + source[deltaOut + uXb * maxIn + i] = pair; } - __syncthreads(); - - flushhalfedges(magazine, lid, sourceIndexes+NMEM*NB+group, source+delta); } -template // maxIn is size of small buckets in half-edges -__global__ void EdgesMeet(uint2 *source, u32 *sourceIndexes, u32 *destIndexes) +template +__global__ void EdgesMoveU(uint2 *source, u32 *indexes, u32 *dstIndexes) +// EdgesMoveU<<>>((uint2*)bufferB, indexesB, indexesB+NMEM*2*NB); { const int lid = threadIdx.x; const int group = blockIdx.x; - int bktsize[NMEM]; - int npairs[NMEM]; - - for (int a = 0; a < NMEM; a++) { - bktsize[a] = sourceIndexes[a * NB + group]; - npairs[a] = (bktsize[a] + TRIM_TPB-1) / TRIM_TPB; + const int aOut = group / NB_NMEM; + const int uXa = aOut / 2; + assert(uXa < 2); + const int groupb = group % NB2; + const int groupbb = group % NB_NMEM; + + const u32 deltaOut0 = (aOut * 2*NB + NB) * maxIn + groupbb * maxOut; + u32 deltaOut = deltaOut0; + for (int vXa = 0; vXa < 2; vXa++) { + int aIn = 2 * uXa + vXa; + for (int uYa = 0; uYa < 2; uYa++) { + const int groupba = 2*groupb +uYa; + int usize = indexes[aIn * 2*NB + groupba]; + int nupairs = (usize + TRIM_TPB-1) / TRIM_TPB; + const u32 deltaIn = (aIn * 2*NB + groupba) * maxIn; + for (int i = 0; i < nupairs; i++) { + u32 idx = lid + i * TRIM_TPB; + if (idx >= usize) continue; + uint2 pair = source[deltaIn + idx]; + // if (group==0 && lid>=0) printf("pair %08x %08x\n", pair.x, pair.y); + source[deltaOut + idx] = pair; + } + deltaOut += usize; + } } + if (!lid) + dstIndexes[group] = deltaOut - deltaOut0; +} - __syncthreads(); - - for (int a = 0; a < NMEM; a++) { - const u32 delta = a * NB * maxIn + group * maxIn/2; // / 2 for indexing pairs, * 2 for alternate buckets - const u32 delta2 = a * NB * maxIn + NB * maxIn/2; - for (int i = 0; i < npairs[a]; i++) { - __syncthreads(); - u32 idx = lid + i * TRIM_TPB; - if (idx >= bktsize[a]) continue; - uint2 pair = source[delta + idx]; - u32 bucket = pair.x >> ZBITS; - pair.x = (group << ZBITS) | (pair.x & ZMASK); - int i = atomicAdd(destIndexes + bucket, 1); - const u32 dlt = (bucket % NB_NMEM) * NMEM * maxIn/2; - source[delta2 + dlt + i] = pair; +template +__global__ void EdgesMoveV(uint2 *source, u32 *indexes, u32 *indexes2, u32 *dstIndexes) +// EdgesMoveV<<>>((uint2*)bufferB, indexesB, indexesA, indexesB+NMEM*2*NB); +{ + const int lid = threadIdx.x; + const int group = blockIdx.x; + const int aOut = group / NB_NMEM; + const int uXa = aOut / 2; + assert(uXa < 2); + const int groupb = group % NB2; + const int groupbb = group % NB_NMEM; + + const u32 deltaOut0 = (aOut * 2*NB + NB) * maxIn + groupbb * maxOut; + const u32 uOutsize = dstIndexes[group]; + u32 deltaOut = deltaOut0 + uOutsize; + for (int vXa = 0; vXa < 2; vXa++) { + int aIn = 2 * uXa + vXa; + for (int uYa = 0; uYa < 2; uYa++) { + const int groupba = 2*groupb +uYa; + int usize = indexes[aIn * 2*NB + groupba]; + int vsize = indexes2[aIn * 2*NB + groupba] - usize; + assert(vsize >= 0); + int nvpairs = (vsize + TRIM_TPB-1) / TRIM_TPB; + const u32 deltaIn = (aIn * 2*NB + groupba) * maxIn + usize; + for (int i = 0; i < nvpairs; i++) { + u32 idx = lid + i * TRIM_TPB; + if (idx >= vsize) continue; + uint2 pair = source[deltaIn + idx]; + // if (group==0 && lid>=0) printf("pair %08x %08x\n", pair.x, pair.y); + source[deltaOut + idx] = pair; + } + deltaOut += vsize; } } + if (!lid) + dstIndexes[NB+group] = deltaOut - deltaOut0; } template -__device__ __forceinline__ void addedge(const u32 
buckx, const u32 restx, const u32 bucky, const u32 resty, u32 *indices, uint2* dest, u32* dstidx) { - int idx = atomicAdd(indices + bucky, 1); - dest[ bucky *maxOut + idx] = make_uint2(restx << ZBITS1 | resty, buckx << ZBITS | restx); // extra +1 shift makes a 0 copybit - int idx2 = atomicAdd(dstidx, 1); - dest[(NB+buckx)*maxOut + idx2] = make_uint2(resty << ZBITS1 | restx, bucky << ZBITS | resty); +__device__ __forceinline__ void addedge(const u32 buckx, const u32 restx, const u32 bucky, const u32 resty, u32 *indices, uint2* dest) { + // if (restx==0x150f3 || resty==0x150f3) printf("x %x %x y %x %x\n", buckx, restx, bucky, resty); + int idx = atomicAdd(indices + bucky, 1); + dest[bucky * maxOut + idx] = make_uint2(restx << ZBITS1 | resty, buckx << ZBITS | restx); // extra +1 shift makes a 0 copybit + int idx2 = atomicAdd(indices + buckx, 1); + dest[buckx * maxOut + idx2] = make_uint2(resty << ZBITS1 | restx, bucky << ZBITS | resty); } template // maxIn is size of small buckets in half-edges __global__ void UnsplitEdges(uint2 *source, u32 *srcIndexes, u32 *srcIndexes2, uint2 *dest, u32 *dstIndexes) +// UnsplitEdges<<>>((uint2*)bufferB, indexesB+NMEM*NB, indexesA, bufferC, indexesB); { const int lid = threadIdx.x; const int group = blockIdx.x; __shared__ u32 lists[NB]; __shared__ u32 nexts[NB]; - __shared__ u32 dstidx; for (int i = 0; i < NB/UNSPLIT_TPB; i++) lists[i * UNSPLIT_TPB + lid] = ~0; - if (!lid) - dstidx = 0; - const u32 size1 = srcIndexes[group]; const u32 np1 = (size1 + UNSPLIT_TPB-1) / UNSPLIT_TPB; const u32 size2 = srcIndexes2[group]; @@ -572,12 +731,13 @@ __global__ void UnsplitEdges(uint2 *source, u32 *srcIndexes, u32 *srcIndexes2, u __syncthreads(); - const u32 delta = (group / NB_NMEM) * NB * maxIn + (NB + (group % NB_NMEM) * NMEM) * maxIn/2; + const u32 delta = (group / NB_NMEM) * NB * maxIn + (NB + (group % NB_NMEM) * NMEM) * maxIn/2; source += delta; for (int i = 0; i < np1; i++) { const u32 index = lid + i * UNSPLIT_TPB; if (index >= size1) continue; const uint2 pair = source[index]; + // if (group==0 && lid==0) printf("pair %08x %08x\n", pair.x, pair.y); const u32 bucket = pair.x >> ZBITS; nexts[index] = atomicExch(&lists[bucket], index); } @@ -595,28 +755,24 @@ __global__ void UnsplitEdges(uint2 *source, u32 *srcIndexes, u32 *srcIndexes2, u const uint2 pair2 = source[idx]; const u32 slots2 = pair2.y >> ZBITS; const u32 slot20 = slots2 % 64, slot21 = slots2 / 64; + // if (group==0 && lid==0) printf("pair %08x %08x pair2 %08x %08x\n", pair.x, pair.y, pair2.x, pair2.y); if (slot20 == slot0) - addedge(group, pair2.x & ZMASK, bucket, pair.x & ZMASK, dstIndexes, dest, &dstidx); + addedge(group, pair2.x & ZMASK, bucket, pair.x & ZMASK, dstIndexes, dest); else if (slot20 == slot1) - addedge(group, pair2.x & ZMASK, bucket, pair.y & ZMASK, dstIndexes, dest, &dstidx); + addedge(group, pair2.x & ZMASK, bucket, pair.y & ZMASK, dstIndexes, dest); if (slot21 != slot20) { if (slot21 == slot0) - addedge(group, pair2.y & ZMASK, bucket, pair.x & ZMASK, dstIndexes, dest, &dstidx); + addedge(group, pair2.y & ZMASK, bucket, pair.x & ZMASK, dstIndexes, dest); else if (slot21 == slot1) - addedge(group, pair2.y & ZMASK, bucket, pair.y & ZMASK, dstIndexes, dest, &dstidx); + addedge(group, pair2.y & ZMASK, bucket, pair.y & ZMASK, dstIndexes, dest); } // if (slots != slots2) printf("pair slots %d %d pair2 slots %d %d\n", slot0, slot1, slot20, slot21); } } - - __syncthreads(); - - if (!lid) - dstIndexes[NB + group] = dstidx; } #ifndef LISTBITS -#define LISTBITS 11 +#define LISTBITS 12 #endif 
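// NLISTS and LISTMASK drive the shared-memory chaining used by Tag_Relay and Tail
// below: lists[] holds the most recently inserted edge index for each LISTMASK-keyed
// class, nexts[] links the older entries behind it, an insert is a single atomicExch,
// and a lookup walks the chain until the ~0 sentinel (UnsplitEdges uses the same
// pattern, keyed on the pair.x >> ZBITS bucket instead). Minimal sketch of the
// pattern, reusing the names the kernels use:
//   lists[list] = ~0;                                             // empty list heads
//   nexts[index] = atomicExch(&lists[edge.x & LISTMASK], index);  // insert
//   for (u32 idx = lists[list]; idx != ~0; idx = nexts[idx]) ...  // traverse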
const u32 NLISTS = 1 << LISTBITS; @@ -626,7 +782,9 @@ const u32 NLISTS = 1 << LISTBITS; #endif template -__global__ void Tag_Relay(const uint2 *source, uint2 *destination, const u32 *sourceIndexes, u32 *destinationIndexes) +__global__ void Tag_Relay(const uint2 *source, uint2 *destination, const u32 *sourceIndexes, u32 *destinationIndexes) +// Tag_Relay<<>>(bufferC, bufferD, indexesB, indexesA); +// Tag_Relay<<>>(bufferD, bufferC, indexesA, indexesB); { const int lid = threadIdx.x; const int group = blockIdx.x; @@ -634,115 +792,94 @@ __global__ void Tag_Relay(const uint2 *source, uint2 *destination, const u32 *so const u32 TAGMASK = (~1U) << ZBITS; const u32 COPYFLAG = NZ; - __shared__ u32 lists0[NLISTS]; - __shared__ u32 nexts0[NNEXTS]; - __shared__ u32 lists1[NLISTS]; - __shared__ u32 nexts1[NNEXTS]; + __shared__ u32 lists[NLISTS]; + __shared__ u32 nexts[NNEXTS]; - const int nloops0 = (min(sourceIndexes[ group], NNEXTS) - lid + RELAY_TPB-1) / RELAY_TPB; - const int nloops1 = (min(sourceIndexes[NB+group], NNEXTS) - lid + RELAY_TPB-1) / RELAY_TPB; + const int nloops = (min(sourceIndexes[group], NNEXTS) - lid + RELAY_TPB-1) / RELAY_TPB; source += group * maxIn; for (int i = 0; i < NLISTS/RELAY_TPB; i++) - lists0[i * RELAY_TPB + lid] = lists1[i * RELAY_TPB + lid] = ~0; + lists[i * RELAY_TPB + lid] = ~0; __syncthreads(); - for (int i = 0; i < nloops0; i++) { + for (int i = 0; i < nloops; i++) { const u32 index = i * RELAY_TPB + lid; const uint2 edge = source[index]; + // if (group==0 && lid>=0) printf("edge %08x %08x\n", edge.x, edge.y); const u32 list = edge.x & LISTMASK; - nexts0[index] = atomicExch(&lists0[list], index); + nexts[index] = atomicExch(&lists[list], index); } __syncthreads(); - for (int i = 0; i < nloops1; i++) { + for (int i = 0; i < nloops; i++) { const u32 index = i * RELAY_TPB + lid; - const uint2 edge = source[NB * maxIn + index]; - const u32 list = edge.x & LISTMASK; - nexts1[index] = atomicExch(&lists1[list], index); + const uint2 edge = source[index]; if (edge.x & COPYFLAG) continue; // copies don't relay + const u32 list = edge.x & LISTMASK; + // nexts1[index] = atomicExch(&lists1[list], index); u32 bucket = edge.y >> ZBITS; u32 copybit = 0; - for (u32 idx = lists0[list]; idx != ~0; idx = nexts0[idx]) { + for (u32 idx = lists[list]; idx != ~0; idx = nexts[idx]) { uint2 tagged = source[idx]; // printf("compare with tagged %08x %08x xor %x\n", tagged.x, tagged.y, (tagged.x ^ edge.x) & ZMASK); - if ((tagged.x ^ edge.x) & ZMASK) continue; - u32 bktIdx = min(atomicAdd(destinationIndexes + bucket, 1), maxOut - 1); + if (idx == index || (tagged.x ^ edge.x) & ZMASK || tagged.y == edge.y) continue; + if (((group << ZBITS) | (edge.x & ZMASK)) == edge.y) { printf("OOPS! 
SELF\n"); continue; } + u32 bktIdx = atomicAdd(destinationIndexes + bucket, 1); // , maxOut - 1); + assert(bktIdx < maxOut); // printf("relaying from tagged %08x %08x bktIdx %d\n", tagged.x, tagged.y, bktIdx); u32 tag = (tagged.x & TAGMASK) | copybit; destination[bucket * maxOut + bktIdx] = make_uint2(tag | (edge.y & ZMASK), (group << ZBITS) | (edge.x & ZMASK)); copybit = COPYFLAG; } } - - __syncthreads(); - - for (int i = 0; i < nloops0; i++) { - const u32 index = i * RELAY_TPB + lid; - const uint2 edge = source[index]; - const u32 list = edge.x & LISTMASK; - if (edge.x & COPYFLAG) continue; // copies don't relay - u32 bucket = edge.y >> ZBITS; - u32 copybit = 0; - for (u32 idx = lists1[list]; idx != ~0; idx = nexts1[idx]) { - uint2 tagged = source[NB * maxIn + idx]; - if ((tagged.x ^ edge.x) & ZMASK) continue; - u32 bktIdx = min(atomicAdd(destinationIndexes + NB + bucket, 1), maxOut - 1); - // printf("relaying from tagged %08x %08x bktIdx %d\n", tagged.x, tagged.y, bktIdx); - u32 tag = (tagged.x & TAGMASK) | copybit; - destination[(NB + bucket) * maxOut + bktIdx] = make_uint2(tag | (edge.y & ZMASK), (group << ZBITS) | (edge.x & ZMASK)); - copybit = COPYFLAG; - } - } } template -__global__ void Tail(const uint2 *source, uint2 *destination, const u32 *sourceIndexes, u32 *destinationIndexes) +__global__ void Tail(const uint2 *source, uint2 *destination, const u32 *sourceIndexes, u32 *destinationIndexes) { const int lid = threadIdx.x; const int group = blockIdx.x; const u32 LISTMASK = NLISTS - 1; const u32 COPYFLAG = NZ; - __shared__ u32 lists0[NLISTS]; - __shared__ u32 nexts0[NNEXTS]; + __shared__ u32 lists[NLISTS]; + __shared__ u32 nexts[NNEXTS]; - const int nloops0 = (min(sourceIndexes[ group], NNEXTS) - lid + TAIL_TPB-1) / TAIL_TPB; - const int nloops1 = (min(sourceIndexes[NB+group], NNEXTS) - lid + TAIL_TPB-1) / TAIL_TPB; + const int nloops = (min(sourceIndexes[group], NNEXTS) - lid + TAIL_TPB-1) / TAIL_TPB; source += group * maxIn; for (int i = 0; i < NLISTS/TAIL_TPB; i++) - lists0[i * TAIL_TPB + lid] = ~0; + lists[i * TAIL_TPB + lid] = ~0; __syncthreads(); - for (int i = 0; i < nloops0; i++) { + for (int i = 0; i < nloops; i++) { const u32 index = i * TAIL_TPB + lid; const uint2 edge = source[index]; const u32 list = edge.x & LISTMASK; - nexts0[index] = atomicExch(&lists0[list], index); + nexts[index] = atomicExch(&lists[list], index); } __syncthreads(); - for (int i = 0; i < nloops1; i++) { + for (int i = 0; i < nloops; i++) { const u32 index = i * TAIL_TPB + lid; - const uint2 edge = source[NB * maxIn + index]; + const uint2 edge = source[index]; const u32 list = edge.x & LISTMASK; - for (u32 idx = lists0[list]; idx != ~0; idx = nexts0[idx]) { + for (u32 idx = lists[list]; idx != ~0; idx = nexts[idx]) { uint2 tagged = source[idx]; - if ((tagged.x ^ edge.x) & ~COPYFLAG) continue; + if (idx == index || (tagged.x ^ edge.x) & ~COPYFLAG) continue; u32 bktIdx = atomicAdd(destinationIndexes, 1); destination[bktIdx] = make_uint2((group << ZBITS) | (edge.x & ZMASK), edge.y); } } } -__global__ void Recovery(u32 * indexes) +__global__ void Recovery(u32 * indexes) { const int gid = blockDim.x * blockIdx.x + threadIdx.x; const int lid = threadIdx.x; @@ -763,6 +900,7 @@ __global__ void Recovery(u32 * indexes) for (int i = 0; i < nloops; i++) { u64 blockNonce = gid * nloops * EDGE_BLOCK_SIZE + i * EDGE_BLOCK_SIZE; + assert(blockNonce < NEDGES); v0 = dipkeys.k0; v1 = dipkeys.k1; @@ -787,7 +925,7 @@ __global__ void Recovery(u32 * indexes) for (int s = EDGE_BLOCK_SIZE; --s >= 0; last = lookup) { 
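      // recovery[] now holds 2*PROOFSIZE entries: the solution edges followed by
      // endpoint-swapped copies (cf. soledges[PROOFSIZE+j] in solver_ctx), so the
      // test below matches a recovered edge in either orientation.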
lookup = sipblock[s] ^ last; for (int i = 0; i < PROOFSIZE; i++) { - if (recovery[i] == (lookup & NODEMASK2)) + if (recovery[i] == (lookup & NODEMASK2) || recovery[PROOFSIZE+i] == (lookup & NODEMASK2)) nonces[i] = blockNonce + s; } } @@ -813,18 +951,15 @@ typedef uint16_t u16; const u32 MAXEDGES = NEDGES >> IDXSHIFT; #ifndef NEPS_A -#define NEPS_A 135 +#define NEPS_A 144 #endif #ifndef NEPS_B -#define NEPS_B 88 -#endif -#ifndef NEPS_C -#define NEPS_C 55 +#define NEPS_B 92 #endif #define NEPS 128 -const u32 EDGES_A = NZ * NEPS_A / NEPS; -const u32 EDGES_B = NZ * NEPS_B / NEPS; +const u32 EDGES_A = (NEDGES / NB) * NEPS_A / NEPS; +const u32 EDGES_B = (NEDGES / NB) * NEPS_B / NEPS; const u32 ALL_EDGES_A = EDGES_A * NB; const u32 ALL_EDGES_B = EDGES_B * NB; @@ -887,12 +1022,15 @@ struct edgetrimmer { const size_t indexesSize = NB * sizeof(u32); const size_t indexesSizeNTLB = NTLB * indexesSize; const size_t indexesSizeNMEM = NMEM * indexesSize; - const size_t edgemapSize = 2 * NEDGES / 8; // 8 bits per byte; twice that for bucket (2^24 in all) size variance + const size_t edgeSlotsSize = NB * NB * sizeof(u32); // count slots in edgeMap + const size_t edgeMapSize = 2 * NEDGES / 8; // 8 bits per byte; twice that for bucket (2^24 in all) size variance u8 *bufferA; u8 *bufferB; + u8 *bufferA1; u32 *indexesA; u32 *indexesB; - u64 *edgemap; + u32 *edgeSlots; + u64 *edgeMap; u32 nedges; siphash_keys sipkeys; bool abort; @@ -900,29 +1038,33 @@ struct edgetrimmer { edgetrimmer(const trimparams _tp) : tp(_tp) { checkCudaErrors_V(cudaMalloc((void**)&dt, sizeof(edgetrimmer))); - checkCudaErrors_V(cudaMalloc((void**)&indexesA, indexesSizeNTLB)); - checkCudaErrors_V(cudaMalloc((void**)&indexesB, indexesSizeNMEM + indexesSize)); - checkCudaErrors_V(cudaMalloc((void**)&edgemap, edgemapSize)); + checkCudaErrors_V(cudaMalloc((void**)&indexesA, 2*indexesSizeNTLB)); + checkCudaErrors_V(cudaMalloc((void**)&indexesB, 3*indexesSizeNMEM)); + checkCudaErrors_V(cudaMalloc((void**)&edgeSlots, edgeSlotsSize)); checkCudaErrors_V(cudaMalloc((void**)&bufferB, bufferSize)); bufferA = bufferB + sizeA / NMEM; - // print_log("allocated %lld bytes bufferB %llx endBuffer %llx\n", bufferSize, bufferB, bufferB+bufferSize); + assert(edgeMapSize < sizeA / NMEM); + assert(edgeMapSize == NB * NB * sizeof(u64)); + edgeMap = (u64 *)(bufferB + bufferSize - edgeMapSize); + bufferA1 = bufferB + bufferSize - sizeA / NMEM; + print_log("allocated %lld bytes bufferB %llx endBuffer %llx\n", bufferSize, bufferB, bufferB+bufferSize); + print_log("bufferA %llx bufferA1 %llx\n", bufferA, bufferA1); cudaMemcpy(dt, this, sizeof(edgetrimmer), cudaMemcpyHostToDevice); initsuccess = true; - cudaFuncSetAttribute(EdgeSplit, cudaFuncAttributeMaxDynamicSharedMemorySize, EXTSHARED); } u64 globalbytes() const { - return bufferSize + indexesSizeNTLB + indexesSizeNMEM + indexesSize + edgemapSize + sizeof(siphash_keys) + sizeof(edgetrimmer); + return bufferSize + 2*indexesSizeNTLB + 3*indexesSizeNMEM + edgeSlotsSize + edgeMapSize + sizeof(siphash_keys) + sizeof(edgetrimmer); } ~edgetrimmer() { checkCudaErrors_V(cudaFree(bufferB)); checkCudaErrors_V(cudaFree(indexesA)); checkCudaErrors_V(cudaFree(indexesB)); - checkCudaErrors_V(cudaFree(edgemap)); + checkCudaErrors_V(cudaFree(edgeSlots)); checkCudaErrors_V(cudaFree(dt)); cudaDeviceReset(); } void indexcount(u32 round, const u32 *indexes) { -#ifdef VERBOSE +#if VERBOSE u32 nedges[NB]; for (int i = 0; i < NB; i++) nedges[i] = 0; @@ -982,37 +1124,41 @@ struct edgetrimmer { print_log("Seeding completed in %.0f ms\n", 
durationA); print_log("EdgeSplit<<<%d,%d>>>\n", NB_NMEM, SPLIT_TPB); // 1024x1024 #endif + if (abort) return false; - cudaMemset(indexesB, 0, indexesSizeNMEM + indexesSize); + assert((uint2*)bufferB + NB * EDGES_A/NMEM == (uint2 *)bufferA); + cudaMemset(edgeSlots, 0, edgeSlotsSize); + cudaMemset(indexesB, 0, 2*indexesSizeNMEM); for (u32 i=0; i < NMEM; i++) { - EdgeSplit<<>>((uint2*)bufferA, (uint2*)bufferB, indexesA, indexesB, edgemap, i); + EdgeSplit<<>>((uint2*)bufferA, (uint2*)bufferB, indexesA, indexesB, edgeSlots, i); if (abort) return false; } checkCudaErrors(cudaDeviceSynchronize()); cudaEventRecord(stop, NULL); cudaEventSynchronize(stop); cudaEventElapsedTime(&durationB, start, stop); - cudaEventRecord(start, NULL); + setEdgeMap<<>>(edgeSlots, edgeMap); + checkCudaErrors(cudaDeviceSynchronize()); + #ifdef VERBOSE - for (u32 i=0; i < NMEM; i++) + for (u32 i=0; i < 2*NMEM; i++) indexcount(1, indexesB+i*NB); - indexcount(1, indexesB+NMEM*NB); - edgemapcount(1, edgemap); // .400 + edgemapcount(1, edgeMap); // .400 print_log("EdgeSplit completed in %.0f ms\n", durationB); print_log("Round<<<%d,%d>>>\n", NB, TRIM_TPB); // 4096x1024 #endif - Round<<>>((uint2*)bufferB, indexesB, edgemap); + cudaEventRecord(start, NULL); + Round<<<2*NB, TRIM_TPB>>>((uint2*)bufferB, indexesB, edgeMap); checkCudaErrors(cudaDeviceSynchronize()); cudaEventRecord(stop, NULL); cudaEventSynchronize(stop); cudaEventElapsedTime(&durationC, start, stop); checkCudaErrors(cudaEventDestroy(start)); checkCudaErrors(cudaEventDestroy(stop)); print_log("Round completed in %.0f ms\n", durationC); - for (u32 i=0; i < NMEM; i++) + for (u32 i=0; i < 2*NMEM; i++) indexcount(2, indexesB+i*NB); - indexcount(2, indexesB+NMEM*NB); - edgemapcount(2, edgemap); + edgemapcount(2, edgeMap); if (abort) return false; #ifdef VERBOSE @@ -1020,17 +1166,32 @@ struct edgetrimmer { #endif for (int r = 3; r < tp.ntrims; r++) { - Round<<>>((uint2*)bufferB, indexesB, edgemap); + Round<<<2*NB, TRIM_TPB>>>((uint2*)bufferB, indexesB, edgeMap); checkCudaErrors(cudaDeviceSynchronize()); - edgemapcount(r, edgemap); + indexcount(r, indexesB); + edgemapcount(r, edgeMap); } - cudaMemcpy((void *)indexesA, indexesB+NMEM*NB, indexesSize, cudaMemcpyDeviceToDevice); - EdgesMeet<<>>((uint2*)bufferB, indexesB, indexesA); + cudaMemcpy((void *)indexesA, indexesB, 2*indexesSizeNMEM, cudaMemcpyDeviceToDevice); // assumes NMEM == NTLB == 4 + EdgesMeet<<<4*NB, TRIM_TPB>>>((uint2*)bufferB, indexesB, indexesA); + checkCudaErrors(cudaDeviceSynchronize()); + EdgesMoveU<<>>((uint2*)bufferB, indexesB, indexesB+NMEM*2*NB); + checkCudaErrors(cudaDeviceSynchronize()); + EdgesMoveV<<>>((uint2*)bufferB, indexesB, indexesA, indexesB+NMEM*2*NB); + checkCudaErrors(cudaDeviceSynchronize()); + cudaMemcpy((void *)indexesA, indexesB+NMEM*2*NB+NB, indexesSize, cudaMemcpyDeviceToDevice); + + indexcount(tp.ntrims, indexesB+NMEM*2*NB); + indexcount(tp.ntrims, indexesA); + +#ifdef VERBOSE + print_log("Done moving\n"); +#endif uint2 *bufferC = (uint2 *)bufferA; - cudaMemset(indexesB, 0, indexesSize); // UnsplitEdges zeroes second indexes set - UnsplitEdges<<>>((uint2*)bufferB, indexesB+NMEM*NB, indexesA, bufferC, indexesB); + uint2 *bufferD = bufferC+NB_NMEM*EDGES_A; + cudaMemset(indexesB, 0, indexesSize); + UnsplitEdges<<>>((uint2*)bufferB, indexesB+NMEM*2*NB, indexesA, bufferC, indexesB); checkCudaErrors(cudaDeviceSynchronize()); indexcount(tp.ntrims, indexesB); indexcount(tp.ntrims, indexesB+NB); @@ -1039,17 +1200,18 @@ struct edgetrimmer { print_log("UnsplitEdges<><<<%d,%d>>>\n", NB, 
UNSPLIT_TPB); #endif - static_assert(NTLB >= 2, "2 index sets need to fit in indexesA"); - static_assert(NMEM >= 2, "2 index sets need to fit in indexesB"); + static_assert(NTLB == 4, "4 index sets need to fit in indexesA"); + static_assert(NMEM == 4, "4 index sets need to fit in indexesB"); for (int r = 0; r < PROOFSIZE/2 - 1; r++) { - // printf("Relay round %d\n", r); if (r % 2 == 0) { - cudaMemset(indexesA, 0, 2*indexesSize); - Tag_Relay<<>>(bufferC, bufferC+NB_NMEM*EDGES_A, indexesB, indexesA); + cudaMemset(indexesA, 0, indexesSize); + Tag_Relay<<>>(bufferC, bufferD, indexesB, indexesA); + indexcount(tp.ntrims, indexesA); } else { - cudaMemset(indexesB, 0, 2*indexesSize); - Tag_Relay<<>>(bufferC+NB_NMEM*EDGES_A, bufferC, indexesA, indexesB); + cudaMemset(indexesB, 0, indexesSize); + Tag_Relay<<>>(bufferD, bufferC, indexesA, indexesB); + indexcount(tp.ntrims, indexesB); } checkCudaErrors(cudaDeviceSynchronize()); } @@ -1058,14 +1220,14 @@ struct edgetrimmer { print_log("Tail<><<<%d,%d>>>\n", NB, TAIL_TPB); #endif - cudaMemset(indexesA, 0, sizeof(u32)); + cudaMemset(indexesB+NB, 0, sizeof(u32)); if ((PROOFSIZE/2 - 1) % 2 == 0) { - Tail<<>>(bufferC, (uint2 *)bufferB, indexesB, indexesB+2*NB); + Tail<<>>(bufferC, (uint2 *)bufferB, indexesB, indexesB+NB); } else { - Tail<<>>(bufferC+NB_NMEM*EDGES_A, (uint2 *)bufferB, indexesA, indexesB+2*NB); + Tail<<>>(bufferD, (uint2 *)bufferB, indexesA, indexesB+NB); } - cudaMemcpy(&nedges, indexesB+2*NB, sizeof(u32), cudaMemcpyDeviceToHost); + cudaMemcpy(&nedges, indexesB+NB, sizeof(u32), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); print_log("%d rounds %d edges\n", tp.ntrims, nedges); return nedges; @@ -1077,7 +1239,7 @@ struct solver_ctx { bool mutatenonce; uint2 *edges; graph cg; - uint2 soledges[PROOFSIZE]; + uint2 soledges[2*PROOFSIZE]; std::vector sols; // concatenation of all proof's indices solver_ctx(const trimparams tp, bool mutate_nonce) : trimmer(tp), cg(MAXEDGES, MAXEDGES, MAX_SOLS, IDXSHIFT) { @@ -1107,6 +1269,7 @@ struct solver_ctx { #endif for (u32 j = 0; j < PROOFSIZE; j++) { soledges[j] = edges[cg.sols[s][j]]; + soledges[PROOFSIZE+j] = make_uint2(soledges[j].y, soledges[j].x); #ifdef VERBOSE print_log(" (%x, %x)", soledges[j].x, soledges[j].y); #endif @@ -1120,8 +1283,8 @@ struct solver_ctx { #ifdef VERBOSE print_log("Recovery<><<<%d,%d>>>\n", trimmer.tp.recover.blocks, trimmer.tp.recover.tpb); #endif - Recovery<<>>((u32 *)trimmer.bufferA); - cudaMemcpy(&sols[sols.size()-PROOFSIZE], trimmer.bufferA, PROOFSIZE * sizeof(u32), cudaMemcpyDeviceToHost); + Recovery<<>>((u32 *)trimmer.bufferA1); + cudaMemcpy(&sols[sols.size()-PROOFSIZE], trimmer.bufferA1, PROOFSIZE * sizeof(u32), cudaMemcpyDeviceToHost); checkCudaErrors(cudaDeviceSynchronize()); qsort(&sols[sols.size()-PROOFSIZE], PROOFSIZE, sizeof(u32), cg.nonce_cmp); } @@ -1288,10 +1451,10 @@ CALL_CONVENTION void fill_default_params(SolverParams* params) { params->cpuload = false; } +static_assert(BUCKBITS % 2 == 0, "BUCKBITS must be even"); static_assert(NB % SEED_TPB == 0, "SEED_TPB must divide NB"); static_assert(NB % SPLIT_TPB == 0, "SPLIT_TPB must divide NB"); static_assert(NLISTS % (RELAY_TPB) == 0, "RELAY_TPB must divide NLISTS"); // for Tag_Edges lists init -static_assert(NZ % (32 * SPLIT_TPB) == 0, "SPLIT_TPB must divide NZ/32"); // for EdgeSplit htnode init static_assert(NZ % (32 * TRIM_TPB) == 0, "TRIM_TPB must divide NZ/32"); // for Round htnode init int main(int argc, char **argv) { @@ -1326,7 +1489,7 @@ int main(int argc, char **argv) { for (u32 i=0; i= 30 -typedef u64 
offset_t; -#else -typedef u32 offset_t; -#endif - -#if BIGSIZE0 > 4 -typedef u64 BIGTYPE0; -#else -typedef u32 BIGTYPE0; -#endif - // node bits have two groups of bucketbits (X for big and Y for small) and a remaining group Z of degree bits const u32 NX = 1 << XBITS; const u32 XMASK = NX - 1; -const u32 NY = 1 << YBITS; -const u32 YMASK = NY - 1; -const u32 XYBITS = XBITS + YBITS; -const u32 NXY = 1 << XYBITS; -const u32 ZBITS = EDGEBITS - XYBITS; +const u32 ZBITS = NODEBITS - XBITS; const u32 NZ = 1 << ZBITS; const u32 ZMASK = NZ - 1; -const u32 YZBITS = EDGEBITS - XBITS; -const u32 NYZ = 1 << YZBITS; -const u32 YZMASK = NYZ - 1; -const u32 YZ1BITS = YZBITS < 16 ? YZBITS : 16; // compressed YZ bits -const u32 NYZ1 = 1 << YZ1BITS; -const u32 MAXNZNYZ1 = NZ < NYZ1 ? NYZ1 : NZ; -const u32 YZ1MASK = NYZ1 - 1; -const u32 Z1BITS = YZ1BITS - YBITS; -const u32 NZ1 = 1 << Z1BITS; -const u32 Z1MASK = NZ1 - 1; -const u32 YZ2BITS = YZBITS < 12 ? YZBITS : 12; // more compressed YZ bits -const u32 NYZ2 = 1 << YZ2BITS; -const u32 YZ2MASK = NYZ2 - 1; -const u32 Z2BITS = YZ2BITS - YBITS; -const u32 NZ2 = 1 << Z2BITS; -const u32 Z2MASK = NZ2 - 1; -const u32 YZZBITS = YZBITS + ZBITS; -const u32 YZZ1BITS = YZ1BITS + ZBITS; - -const u32 MAXEDGES = NX * NYZ2; - -const u32 BIGSLOTBITS = BIGSIZE * 8; -const u32 SMALLSLOTBITS = SMALLSIZE * 8; -const u64 BIGSLOTMASK = (1ULL << BIGSLOTBITS) - 1ULL; -const u64 SMALLSLOTMASK = (1ULL << SMALLSLOTBITS) - 1ULL; -const u32 BIGSLOTBITS0 = BIGSIZE0 * 8; -const u64 BIGSLOTMASK0 = (1ULL << BIGSLOTBITS0) - 1ULL; + +const u32 MAXEDGES = NEDGES >> 13; // bound on #edges remaining after trimming // for p close to 0, Pr(X>=k) < e^{-n*p*eps^2} where k=n*p*(1+eps) // see https://en.wikipedia.org/wiki/Binomial_distribution#Tail_bounds @@ -138,56 +67,50 @@ const u64 BIGSLOTMASK0 = (1ULL << BIGSLOTBITS0) - 1ULL; #define BIGEPS 3/64 #endif -// 176/256 is safely over 1-e(-1) ~ 0.63 trimming fraction -#ifndef TRIMFRAC256 -#define TRIMFRAC256 176 -#endif - const u32 NTRIMMEDZ = NZ * TRIMFRAC256 / 256; const u32 ZBUCKETSLOTS = NZ + NZ * BIGEPS; const u32 ZBUCKETSIZE = ZBUCKETSLOTS * BIGSIZE0; const u32 TBUCKETSIZE = ZBUCKETSLOTS * BIGSIZE1; +typedef struct { // half-edge pair + unsigned int otherX : XBITS; + unsigned int Z0 : ZBITS; + unsigned int Z1 : ZBITS; + unsigned int slot0 : 6; + unsigned int slot1 : 6; +} hep; + +static_assert(sizeof(hep) == 8); + template struct zbucket { u32 size; // should avoid different values of RENAMESIZE in different threads of one process - static const u32 RENAMESIZE = NZ2 + (COMPRESSROUND ? NZ1 : 0); - union alignas(16) { - u8 bytes[BUCKETSIZE]; - struct { - u32 words[BUCKETSIZE/sizeof(u32) - RENAMESIZE]; - u32 renameu1[NZ2/2]; - u32 renamev1[NZ2/2]; - u32 renameu[COMPRESSROUND ? NZ1/2 : 0]; - u32 renamev[COMPRESSROUND ? 
NZ1/2 : 0]; - }; - }; - u32 setsize(u8 const *end) { - size = end - bytes; + // union alignas(16) { + hep words[BUCKETSIZE]; + u32 setsize(hep const *end) { + size = end - words; assert(size <= BUCKETSIZE); return size; } }; template -using yzbucket = zbucket[NY]; -template -using matrix = yzbucket[NX]; +using buckets_t = zbucket[NX]; template struct indexer { - offset_t index[NX]; + word_t index[NX]; void matrixv(const u32 y) { const yzbucket *foo = 0; for (u32 x = 0; x < NX; x++) index[x] = foo[x][y].bytes - (u8 *)foo; } - offset_t storev(yzbucket *buckets, const u32 y) { + word_t storev(yzbucket *buckets, const u32 y) { u8 const *base = (u8 *)buckets; - offset_t sumsize = 0; + word_t sumsize = 0; for (u32 x = 0; x < NX; x++) sumsize += buckets[x][y].setsize(base+index[x]); return sumsize; @@ -197,9 +120,9 @@ struct indexer { for (u32 y = 0; y < NY; y++) index[y] = foo[x][y].bytes - (u8 *)foo; } - offset_t storeu(yzbucket *buckets, const u32 x) { + word_t storeu(yzbucket *buckets, const u32 x) { u8 const *base = (u8 *)buckets; - offset_t sumsize = 0; + word_t sumsize = 0; for (u32 y = 0; y < NY; y++) sumsize += buckets[x][y].setsize(base+index[y]); return sumsize; @@ -217,7 +140,7 @@ typedef struct { edgetrimmer *et; } thread_ctx; -typedef u8 zbucket8[MAXNZNYZ1]; +typedef u8 zbucket8[2*MAXNZNYZ1]; typedef u16 zbucket16[NTRIMMEDZ]; typedef u32 zbucket32[NTRIMMEDZ]; @@ -225,32 +148,27 @@ typedef u32 zbucket32[NTRIMMEDZ]; class edgetrimmer { public: siphash_keys sip_keys; - yzbucket *buckets; - yzbucket *tbuckets; + buckets_t *buckets; zbucket8 *tdegs; - offset_t *tcounts; + word_t *tcounts; u32 ntrims; u32 nthreads; bool showall; thread_ctx *threads; trim_barrier barry; - void touch(u8 *p, const offset_t n) { - for (offset_t i=0; i) == NX * sizeof(yzbucket)); - assert(sizeof(matrix) == NX * sizeof(yzbucket)); nthreads = n_threads; ntrims = n_trims; showall = show_all; - buckets = new yzbucket[NX]; - touch((u8 *)buckets, sizeof(matrix)); - tbuckets = new yzbucket[nthreads]; - touch((u8 *)tbuckets, sizeof(yzbucket[nthreads])); + buckets = new zbucket[NX]; + touch((u8 *)buckets, sizeof(buckets_t)); tdegs = new zbucket8[nthreads]; - tcounts = new offset_t[nthreads]; + tcounts = new word_t[nthreads]; threads = new thread_ctx[nthreads]; } ~edgetrimmer() { @@ -260,8 +178,8 @@ class edgetrimmer { delete[] tdegs; delete[] tcounts; } - offset_t count() const { - offset_t cnt = 0; + word_t count() const { + word_t cnt = 0; for (u32 t = 0; t < nthreads; t++) cnt += tcounts[t]; return cnt; @@ -278,7 +196,7 @@ class edgetrimmer { const u32 starty = NY * id / nthreads; const u32 endy = NY * (id+1) / nthreads; u32 edge0 = starty << YZBITS, endedge0 = edge0 + NYZ; - offset_t sumsize = 0; + word_t sumsize = 0; for (u32 my = starty; my < endy; my++, endedge0 += NYZ) { dst.matrixv(my); for (; edge0 < endedge0; edge0 += NEBS) { @@ -338,10 +256,9 @@ class edgetrimmer { const u64 buflast = buf[NEBS - NSIPHASH + ns]; buf[NEBS - NSIPHASH + ns] = 0; for (u32 e = 0; e < NEBS; e += NSIPHASH) { - const u32 dir = (e / NSIPHASH) & 1; const u64 nodes = buf[e + ns] ^ buflast; - const u32 node0 = (nodes & NODE1MASK) << 1 | dir; - const u32 node1 = ((nodes >> 31) & (NODE1MASK << 1)) | dir; + const u32 node0 = nodes & EDGEMASK; + const u32 node1 = (nodes >> 32) & EDGEMASK; const u32 ux = node0 >> YZBITS; // bit 50...22 21..15 14..0 // write VXXYYZZ UYYYYY UZZZZ @@ -363,7 +280,7 @@ class edgetrimmer { indexer small; rdtsc0 = __rdtsc(); - offset_t sumsize = 0; + word_t sumsize = 0; u8 const *base = (u8 *)buckets; u8 const 
*small0 = (u8 *)tbuckets[id]; const u32 startux = NX * id / nthreads; @@ -373,6 +290,7 @@ class edgetrimmer { for (u32 my = 0 ; my < NY; my++) { u8 *readbig = buckets[ux][my].bytes; u8 const *endreadbig = readbig + buckets[ux][my].size; +// print_log("id %d x %d y %d size %u read %d\n", id, ux, my, buckets[ux][my].size, readbig-base); for (; readbig < endreadbig; readbig += BIGSIZE0) { // bit 50...22 21..15 14..0 // write VXXYYZZ UYYYYY UZZZZ within UX partition @@ -383,6 +301,7 @@ class edgetrimmer { // bit 43...15 14..0 // write VXXYYZZ UZZZZ within UX UY partition *(u64 *)(small0+small.index[uy]) = ((u64)vxyz << ZBITS) | (e & ZMASK); +// print_log("id %d ux %d y %d e %010lx e' %010x\n", id, ux, my, e, ((u64)edge << ZBITS) | (e >> YBITS)); small.index[uy] += SMALLSIZE; } } @@ -391,10 +310,11 @@ class edgetrimmer { dst.matrixu(ux); for (u32 uy = 0 ; uy < NY; uy++) { assert(NZ <= sizeof(zbucket8)); - memset(degs, 0, NZ); + memset(degs, 0xff, NZ); u8 *readsmall = tbuckets[id][uy].bytes, *endreadsmall = readsmall + tbuckets[id][uy].size; +// if (id==1) print_log("id %d ux %d y %d size %u sumsize %u\n", id, ux, uy, tbuckets[id][uy].size/BIGSIZE1, sumsize); for (u8 *rdsmall = readsmall; rdsmall < endreadsmall; rdsmall+=SMALLSIZE) - degs[*(u32 *)rdsmall & ZMASK] = BIGSIZE; + degs[*(u32 *)rdsmall & ZMASK]++; u64 uy37 = (int64_t)uy << YZZBITS; for (u8 *rdsmall = readsmall; rdsmall < endreadsmall; rdsmall+=SMALLSIZE) { @@ -406,7 +326,8 @@ class edgetrimmer { // bit 43/39..37 36..22 21..15 14..0 // write UYYYYY UZZZZZ VYYYYY VZZZZ within UX VX partition *(u64 *)(base+dst.index[vx]) = uy37 | ((u64)uz << YZBITS) | ((e >> ZBITS) & YZMASK); - dst.index[vx] += degs[uz ^ 1]; +// print_log("id %d ux %d y %d edge %08x e' %010lx vx %d\n", id, ux, uy, *readedge, uy34 | ((u64)(node & YZMASK) << ZBITS) | *readz, vx); + dst.index[vx] += degs[uz] ? BIGSIZE : 0; } } sumsize += dst.storeu(buckets, ux); @@ -431,7 +352,7 @@ class edgetrimmer { indexer small; rdtsc0 = __rdtsc(); - offset_t sumsize = 0; + word_t sumsize = 0; u8 const *base = (u8 *)buckets; u8 const *small0 = (u8 *)tbuckets[id]; const u32 startvx = NY * id / nthreads; @@ -442,11 +363,13 @@ class edgetrimmer { u32 uxyz = ux << YZBITS; zbucket &zb = TRIMONV ? buckets[ux][vx] : buckets[vx][ux]; const u8 *readbig = zb.bytes, *endreadbig = readbig + zb.size; +// print_log("id %d vx %d ux %d size %u\n", id, vx, ux, zb.size/SRCSIZE); for (; readbig < endreadbig; readbig += SRCSIZE) { // bit 43..37 36..22 21..15 14..0 // write UYYYYY UZZZZZ VYYYYY VZZZZ within VX partition const u64 e = *(u64 *)readbig & SRCSLOTMASK; uxyz += ((u32)(e >> YZBITS) - uxyz) & SRCPREFMASK; +// if (round==6) print_log("id %d vx %d ux %d e %010lx suffUXYZ %05x suffUXY %03x UXYZ %08x UXY %04x mask %x\n", id, vx, ux, e, (u32)(e >> YZBITS), (u32)(e >> YZZBITS), uxyz, uxyz>>ZBITS, SRCPREFMASK); const u32 vy = (e >> ZBITS) & YMASK; // bit 43/39..37 36..30 29..15 14..0 // write UXXXXX UYYYYY UZZZZZ VZZZZ within VX VY partition @@ -462,20 +385,22 @@ class edgetrimmer { TRIMONV ? 
dst.matrixv(vx) : dst.matrixu(vx); for (u32 vy = 0 ; vy < NY; vy++) { const u64 vy34 = (u64)vy << YZZBITS; - memset(degs, 0, NZ); + memset(degs, 0xff, NZ); u8 *readsmall = tbuckets[id][vy].bytes, *endreadsmall = readsmall + tbuckets[id][vy].size; +// print_log("id %d vx %d vy %d size %u sumsize %u\n", id, vx, vy, tbuckets[id][vx].size/BIGSIZE1, sumsize); for (u8 *rdsmall = readsmall; rdsmall < endreadsmall; rdsmall += DSTSIZE) - degs[*(u32 *)rdsmall & ZMASK] = DSTSIZE; + degs[*(u32 *)rdsmall & ZMASK]++; u32 ux = 0; for (u8 *rdsmall = readsmall; rdsmall < endreadsmall; rdsmall += DSTSIZE) { // bit 39..37 36..30 29..15 14..0 with XBITS==YBITS==7 // read UXXXXX UYYYYY UZZZZZ VZZZZ within VX VY partition const u64 e = *(u64 *)rdsmall & DSTSLOTMASK; ux += ((u32)(e >> YZZBITS) - ux) & DSTPREFMASK; +// print_log("id %d vx %d vy %d e %010lx suffUX %02x UX %x mask %x\n", id, vx, vy, e, (u32)(e >> YZZBITS), ux, SRCPREFMASK); // bit 41/39..34 33..21 20..13 12..0 // write VYYYYY VZZZZZ UYYYYY UZZZZ within UX partition *(u64 *)(base+dst.index[ux]) = vy34 | ((e & ZMASK) << YZBITS) | ((e >> ZBITS) & YZMASK); - dst.index[ux] += degs[(e & ZMASK) ^ 1]; + dst.index[ux] += degs[e & ZMASK] ? DSTSIZE : 0; } if (unlikely(ux >> DSTPREFBITS != XMASK >> DSTPREFBITS)) { print_log("OOPS4: id %d vx %x ux %x vs %x\n", id, vx, ux, XMASK); exit(1); } @@ -502,7 +427,7 @@ class edgetrimmer { u32 maxnnid = 0; rdtsc0 = __rdtsc(); - offset_t sumsize = 0; + word_t sumsize = 0; u8 const *base = (u8 *)buckets; u8 const *small0 = (u8 *)tbuckets[id]; const u32 startvx = NY * id / nthreads; @@ -514,6 +439,7 @@ class edgetrimmer { u32 uyz = 0; zbucket &zb = TRIMONV ? buckets[ux][vx] : buckets[vx][ux]; const u8 *readbig = zb.bytes, *endreadbig = readbig + zb.size; +// print_log("id %d vx %d ux %d size %u\n", id, vx, ux, zb.size/SRCSIZE); for (; readbig < endreadbig; readbig += SRCSIZE) { // bit 39..37 36..22 21..15 14..0 // write UYYYYY UZZZZZ VYYYYY VZZZZ within VX partition if TRIMONV @@ -523,29 +449,32 @@ class edgetrimmer { if (TRIMONV) uyz += ((u32)(e >> YZBITS) - uyz) & SRCPREFMASK; else uyz = e >> YZBITS; +// if (round==32 && ux==25) print_log("id %d vx %d ux %d e %010lx suffUXYZ %05x suffUXY %03x UXYZ %08x UXY %04x mask %x\n", id, vx, ux, e, (u32)(e >> YZBITS), (u32)(e >> YZZBITS), uxyz, uxyz>>ZBITS, SRCPREFMASK); const u32 vy = (e >> ZBITS) & YMASK; // bit 39..37 36..30 29..15 14..0 // write UXXXXX UYYYYY UZZZZZ VZZZZ within VX VY partition if TRIMONV // bit 37...31 30...15 14..0 // write VXXXXXX VYYYZZ' UZZZZ within UX UY partition if !TRIMONV *(u64 *)(small0+small.index[vy]) = ((u64)(ux << (TRIMONV ? YZBITS : YZ1BITS) | uyz) << ZBITS) | (e & ZMASK); +// if (TRIMONV&&vx==75&&vy==83) print_log("id %d vx %d vy %d e %010lx e15 %x ux %x\n", id, vx, vy, ((u64)uxyz << ZBITS) | (e & ZMASK), uxyz, uxyz>>YZBITS); if (TRIMONV) uyz &= ~ZMASK; small.index[vy] += SRCSIZE; } } - u8 *degs = tdegs[id]; + u16 *degs = (u16 *)tdegs[id]; small.storeu(tbuckets+id, 0); TRIMONV ? dst.matrixv(vx) : dst.matrixu(vx); u32 newnodeid = 0; u32 *renames = TRIMONV ? 
@@ -523,29 +449,32 @@ class edgetrimmer {
           if (TRIMONV)
             uyz += ((u32)(e >> YZBITS) - uyz) & SRCPREFMASK;
           else uyz = e >> YZBITS;
+// if (round==32 && ux==25) print_log("id %d vx %d ux %d e %010lx suffUXYZ %05x suffUXY %03x UXYZ %08x UXY %04x mask %x\n", id, vx, ux, e, (u32)(e >> YZBITS), (u32)(e >> YZZBITS), uxyz, uxyz>>ZBITS, SRCPREFMASK);
           const u32 vy = (e >> ZBITS) & YMASK;
 // bit        39..37    36..30     29..15     14..0
 // write      UXXXXX    UYYYYY     UZZZZZ     VZZZZ   within VX VY partition  if TRIMONV
 // bit           37...31     30...15     14..0
 // write        VXXXXXX     VYYYZZ'     UZZZZ   within UX UY partition  if !TRIMONV
           *(u64 *)(small0+small.index[vy]) = ((u64)(ux << (TRIMONV ? YZBITS : YZ1BITS) | uyz) << ZBITS) | (e & ZMASK);
+// if (TRIMONV&&vx==75&&vy==83) print_log("id %d vx %d vy %d e %010lx e15 %x ux %x\n", id, vx, vy, ((u64)uxyz << ZBITS) | (e & ZMASK), uxyz, uxyz>>YZBITS);
           if (TRIMONV)
             uyz &= ~ZMASK;
           small.index[vy] += SRCSIZE;
         }
       }
-      u8 *degs = tdegs[id];
+      u16 *degs = (u16 *)tdegs[id];
       small.storeu(tbuckets+id, 0);
       TRIMONV ? dst.matrixv(vx) : dst.matrixu(vx);
       u32 newnodeid = 0;
       u32 *renames = TRIMONV ? buckets[0][vx].renamev : buckets[vx][0].renameu;
-      u32 *endrenames = renames + NZ1/2;
+      u32 *endrenames = renames + NZ1;
       for (u32 vy = 0 ; vy < NY; vy++) {
         assert(2*NZ <= sizeof(zbucket8));
-        memset(degs, 0, NZ);
+        memset(degs, 0xff, 2 * NZ); // sets each u16 entry to 0xffff
         u8 *readsmall = tbuckets[id][vy].bytes, *endreadsmall = readsmall + tbuckets[id][vy].size;
+// print_log("id %d vx %d vy %d size %u sumsize %u\n", id, vx, vy, tbuckets[id][vx].size/BIGSIZE1, sumsize);
         for (u8 *rdsmall = readsmall; rdsmall < endreadsmall; rdsmall += SRCSIZE)
-          degs[*(u32 *)rdsmall & ZMASK] = 1;
+          degs[*(u32 *)rdsmall & ZMASK]++;
         u32 ux = 0;
         u32 nrenames = 0;
         for (u8 *rdsmall = readsmall; rdsmall < endreadsmall; rdsmall += SRCSIZE) {
@@ -558,27 +487,26 @@ class edgetrimmer {
             ux += ((u32)(e >> YZZBITS) - ux) & SRCPREFMASK2;
           else ux = e >> YZZ1BITS;
           const u32 vz = e & ZMASK;
-          u16 *pdeg = (u16 *)degs + (vz >> 1);
-          u16 vdeg = *pdeg;
-          if (vdeg >= 0x0101) {
-            if (vdeg == 0x0101) {
-              *pdeg = vdeg = 0x0102 + nrenames++;
-              *renames++ = vy << ZBITS | (vz & ~1); // neutralize parity, to be restored upon use
+          u16 vdeg = degs[vz];
+          if (vdeg) {
+            if (vdeg < 32) {
+              degs[vz] = vdeg = 32 + nrenames++;
+              *renames++ = vy << ZBITS | vz;
               if (renames == endrenames) {
                 endrenames += (TRIMONV ? sizeof(yzbucket) : sizeof(zbucket)) / sizeof(u32);
-                renames = endrenames - NZ1/2;
+                renames = endrenames - NZ1;
               }
             }
-            vdeg = ((vdeg-0x0102) << 1) | (vz & 1); // preserve parity
 // bit       37..22     21..15     14..0
 // write     VYYZZ'     UYYYYY     UZZZZ   within UX partition  if TRIMONV
             if (TRIMONV)
-              *(u64 *)(base+dst.index[ux]) = ((u64)(newnodeid + vdeg) << YZBITS ) | ((e >> ZBITS) & YZMASK);
-            else *(u32 *)(base+dst.index[ux]) = ( (newnodeid + vdeg) << YZ1BITS) | ((e >> ZBITS) & YZ1MASK);
+              *(u64 *)(base+dst.index[ux]) = ((u64)(newnodeid + vdeg-32) << YZBITS ) | ((e >> ZBITS) & YZMASK);
+            else *(u32 *)(base+dst.index[ux]) = ( (newnodeid + vdeg-32) << YZ1BITS) | ((e >> ZBITS) & YZ1MASK);
+// if (vx==44&&vy==58) print_log(" id %d vx %d vy %d newe %010lx\n", id, vx, vy, vy28 | ((vdeg) << YZBITS) | ((e >> ZBITS) & YZMASK));
             dst.index[ux] += DSTSIZE;
           }
         }
-        newnodeid += 2 * nrenames;
+        newnodeid += nrenames;
         if (TRIMONV && unlikely(ux >> SRCPREFBITS2 != XMASK >> SRCPREFBITS2)) { print_log("OOPS6: id %d vx %d vy %d ux %x vs %x\n", id, vx, vy, ux, XMASK); exit(0); }
       }
@@ -599,7 +527,7 @@ class edgetrimmer {
     indexer dst;
 
     rdtsc0 = __rdtsc();
-    offset_t sumsize = 0;
+    word_t sumsize = 0;
     u8 *degs = tdegs[id];
     u8 const *base = (u8 *)buckets;
     const u32 startvx = NY * id / nthreads;
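// A minimal sketch (not part of the patch; sizes and node values are made up) of the
// combined degree-counter / rename map used in the trimrename hunks above: after counting,
// values 1..31 mean "kept but not yet renamed"; the first kept occurrence overwrites the
// counter with 32 + <new compact id> and logs the original coordinate in `renames`, so
// later occurrences of the same node map to the same compact id.
#include <cstdint>
#include <cstring>
#include <cstdio>

int main() {
  const uint32_t NZ = 16;                       // toy bucket size
  uint16_t degs[NZ];
  uint32_t renames[NZ], nrenames = 0;
  memset(degs, 0xff, sizeof(degs));             // every u16 entry becomes 0xffff
  const uint32_t nodes[] = {2, 9, 2, 13, 9, 2};
  for (uint32_t z : nodes) degs[z]++;           // degree-counting pass
  for (uint32_t z : nodes) {                    // rename pass
    uint16_t d = degs[z];
    if (!d) { printf("node %2u trimmed\n", z); continue; }
    if (d < 32) { degs[z] = d = 32 + nrenames; renames[nrenames++] = z; }
    printf("node %2u -> compact id %u (original %u)\n", z, d - 32, renames[d - 32]);
  }
  return 0;
}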
@@ -607,13 +535,13 @@ class edgetrimmer {
     for (u32 vx = startvx; vx < endvx; vx++) {
       TRIMONV ? dst.matrixv(vx) : dst.matrixu(vx);
       assert(NYZ1 <= sizeof(zbucket8));
-      memset(degs, 0, NYZ1);
+      memset(degs, 0xff, NYZ1);
       for (u32 ux = 0 ; ux < NX; ux++) {
         zbucket &zb = TRIMONV ? buckets[ux][vx] : buckets[vx][ux];
         u32 *readbig = zb.words, *endreadbig = readbig + zb.size/sizeof(u32);
 // print_log("id %d vx %d ux %d size %d\n", id, vx, ux, zb.size/SRCSIZE);
         for (; readbig < endreadbig; readbig++)
-          degs[*readbig & YZ1MASK] = sizeof(u32);
+          degs[*readbig & YZ1MASK]++;
       }
       for (u32 ux = 0 ; ux < NX; ux++) {
         zbucket &zb = TRIMONV ? buckets[ux][vx] : buckets[vx][ux];
@@ -623,10 +551,11 @@ class edgetrimmer {
 // read       UYYZZZ'     VYYZZ'   within VX partition
           const u32 e = *readbig;
           const u32 vyz = e & YZ1MASK;
+          // print_log("id %d vx %d ux %d e %08lx vyz %04x uyz %04x\n", id, vx, ux, e, vyz, e >> YZ1BITS);
 // bit       31...16     15...0
 // write     VYYZZZ'     UYYZZ'   within UX partition
           *(u32 *)(base+dst.index[ux]) = (vyz << YZ1BITS) | (e >> YZ1BITS);
-          dst.index[ux] += degs[vyz ^ 1];
+          dst.index[ux] += degs[vyz] ? sizeof(u32) : 0;
         }
       }
       sumsize += TRIMONV ? dst.storev(buckets, vx) : dst.storeu(buckets, vx);
@@ -644,23 +573,24 @@ class edgetrimmer {
     u32 maxnnid = 0;
 
     rdtsc0 = __rdtsc();
-    offset_t sumsize = 0;
-    u8 *degs = tdegs[id];
+    word_t sumsize = 0;
+    u16 *degs = (u16 *)tdegs[id];
     u8 const *base = (u8 *)buckets;
     const u32 startvx = NY * id / nthreads;
     const u32 endvx = NY * (id+1) / nthreads;
     for (u32 vx = startvx; vx < endvx; vx++) {
       TRIMONV ? dst.matrixv(vx) : dst.matrixu(vx);
-      memset(degs, 0, NYZ1);
+      memset(degs, 0xff, 2 * NYZ1); // sets each u16 entry to 0xffff
       for (u32 ux = 0 ; ux < NX; ux++) {
         zbucket &zb = TRIMONV ? buckets[ux][vx] : buckets[vx][ux];
         u32 *readbig = zb.words, *endreadbig = readbig + zb.size/sizeof(u32);
+        // print_log("id %d vx %d ux %d size %d\n", id, vx, ux, zb.size/SRCSIZE);
         for (; readbig < endreadbig; readbig++)
-          degs[*readbig & YZ1MASK] = 1;
+          degs[*readbig & YZ1MASK]++;
       }
-      u32 newnodepairid = 0;
+      u32 newnodeid = 0;
       u32 *renames = TRIMONV ? buckets[0][vx].renamev1 : buckets[vx][0].renameu1;
-      u32 *endrenames = renames + NZ2/2;
+      u32 *endrenames = renames + NZ2;
       for (u32 ux = 0 ; ux < NX; ux++) {
         zbucket &zb = TRIMONV ? buckets[ux][vx] : buckets[vx][ux];
         u32 *readbig = zb.words, *endreadbig = readbig + zb.size/sizeof(u32);
@@ -669,28 +599,26 @@ class edgetrimmer {
 // read       UYYYZZ'     VYYZZ'   within VX partition
           const u32 e = *readbig;
           const u32 vyz = e & YZ1MASK;
-          u16 *pdeg = (u16 *)degs + (vyz >> 1);
-          u16 vdeg = *pdeg;
-          if (vdeg >= 0x0101) {
-            if (vdeg == 0x0101) {
-              *pdeg = vdeg = 0x0102 + newnodepairid++;
-              *renames++ = vyz & ~1; // neutralize parity, to be restored upon use
+          u16 vdeg = degs[vyz];
+          if (vdeg) {
+            if (vdeg < 32) {
+              degs[vyz] = vdeg = 32 + newnodeid++;
+              *renames++ = vyz;
               if (renames == endrenames) {
                 endrenames += (TRIMONV ? sizeof(yzbucket) : sizeof(zbucket)) / sizeof(u32);
-                renames = endrenames - NZ2/2;
+                renames = endrenames - NZ2;
                 assert(renames < buckets[NX][0].renameu1);
               }
             }
-            vdeg = ((vdeg-0x0102) << 1) | (vyz & 1); // preserve parity
 // bit       26...16     15...0
 // write     VYYZZZ"     UYYZZ'   within UX partition
-            *(u32 *)(base+dst.index[ux]) = (vdeg << (TRIMONV ? YZ1BITS : YZ2BITS)) | (e >> YZ1BITS);
+            *(u32 *)(base+dst.index[ux]) = ((vdeg - 32) << (TRIMONV ? YZ1BITS : YZ2BITS)) | (e >> YZ1BITS);
             dst.index[ux] += sizeof(u32);
           }
         }
       }
-      if (2*newnodepairid > maxnnid)
-        maxnnid = 2*newnodepairid;
+      if (newnodeid > maxnnid)
+        maxnnid = newnodeid;
       sumsize += TRIMONV ? dst.storev(buckets, vx) : dst.storeu(buckets, vx);
     }
     rdtsc1 = __rdtsc();
@@ -709,6 +637,7 @@ class edgetrimmer {
       int err = pthread_create(&threads[t].thread, NULL, etworker, (void *)&threads[t]);
       assert(err == 0);
     }
+    // sleep(7); abort();
     for (u32 t = 0; t < nthreads; t++) {
       int err = pthread_join(threads[t].thread, NULL);
       assert(err == 0);
@@ -797,7 +726,6 @@ class solver_ctx {
   bool mutatenonce;
   proof cycleus;
   proof cyclevs;
-  word_t rowsize[NX];
   std::bitset uxymap;
   std::vector sols; // concatanation of all proof's indices
@@ -812,7 +740,7 @@ class solver_ctx {
   solver_ctx(const u32 nthreads, const u32 n_trims, bool allrounds, bool show_cycle, bool mutate_nonce)
     : trimmer(nthreads, n_trims, allrounds),
-      cg(MAXEDGES, MAXEDGES, MAX_SOLS, 0, (char *)trimmer.tbuckets) {
+      cg(MAXEDGES, MAXEDGES, MAX_SOLS, (char *)trimmer.tbuckets) {
     assert(cg.bytes() <= sizeof(yzbucket[nthreads])); // check that graph cg can fit in tbucket's memory
     showcycle = show_cycle;
     mutatenonce = mutate_nonce;
@@ -832,37 +760,26 @@ class solver_ctx {
   u32 threadbytes() const { return sizeof(thread_ctx) + sizeof(yzbucket) + sizeof(zbucket8) + sizeof(zbucket16) + sizeof(zbucket32); }
 
-  void recordedge(const u32 i, const u32 u1, const u32 v1) {
-    assert(u1 < NX * NYZ2);
+  void recordedge(const u32 i, const u32 u1, const u32 v2) {
     const u32 ux = u1 >> YZ2BITS;
-    u32 uyz = trimmer.buckets[ux][(u1 >> Z2BITS) & YMASK].renameu1[(u1 & Z2MASK) >> 1] | (u1 & 1);
-    assert(v1 < NX * NYZ2);
+    u32 uyz = trimmer.buckets[ux][(u1 >> Z2BITS) & YMASK].renameu1[u1 & Z2MASK];
+    const u32 v1 = v2 - MAXEDGES;
     const u32 vx = v1 >> YZ2BITS;
-    u32 vyz = trimmer.buckets[(v1 >> Z2BITS) & YMASK][vx].renamev1[(v1 & Z2MASK) >> 1] | (v1 & 1);
+    u32 vyz = trimmer.buckets[(v1 >> Z2BITS) & YMASK][vx].renamev1[v1 & Z2MASK];
 #if COMPRESSROUND > 0
-    uyz = trimmer.buckets[ux][uyz >> Z1BITS].renameu[(uyz & Z1MASK) >> 1] | (u1 & 1);
-    vyz = trimmer.buckets[vyz >> Z1BITS][vx].renamev[(vyz & Z1MASK) >> 1] | (v1 & 1);
+    uyz = trimmer.buckets[ux][uyz >> Z1BITS].renameu[uyz & Z1MASK];
+    vyz = trimmer.buckets[vyz >> Z1BITS][vx].renamev[vyz & Z1MASK];
 #endif
     const u32 u = cycleus[i] = (ux << YZBITS) | uyz;
     cyclevs[i] = (vx << YZBITS) | vyz;
+    // print_log(" (%x,%x)", u, cyclevs[i]);
    uxymap[u >> ZBITS] = 1;
   }
 
   void solution(const proof sol) {
     // print_log("Nodes");
-    for (u32 i = 0; i < PROOFSIZE; i++) {
-      u32 ux, vx, idx = sol[i];
-      for (vx = 0; idx >= rowsize[vx]; vx++) {
-        idx -= rowsize[vx];
-      }
-      for (ux = 0; idx >= trimmer.buckets[ux][vx].size/sizeof(u32); ux++) {
-        idx -= trimmer.buckets[ux][vx].size/sizeof(u32);
-      }
-      const u32 e = trimmer.buckets[ux][vx].words[idx];
-      const u32 u = (ux << YZ2BITS) | (e >> YZ2BITS);
-      const u32 v = (vx << YZ2BITS) | (e & YZ2MASK);
-      recordedge(i, u, v);
-    }
+    for (u32 i = 0; i < PROOFSIZE; i++)
+      recordedge(i, cg.links[2*sol[i]].to, cg.links[2*sol[i]+1].to);
     // print_log("\n");
     if (showcycle) {
       void *matchworker(void *vp);
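// A minimal sketch (not part of the patch; table contents and indexing are made up and,
// unlike recordedge above, ignore the per-bucket x-coordinate selection) of undoing the
// two rename stages once a cycle is found: the stage-2 table maps a compact node id back
// to a stage-1 id, which the stage-1 table maps back to the original y/z coordinates.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t renameu [4] = {0x1a2b, 0x0033, 0x2f01, 0x1000}; // stage-1 id -> original yz (hypothetical)
  uint32_t renameu1[2] = {2, 0};                           // stage-2 id -> stage-1 id (hypothetical)
  const uint32_t compact = 1;                              // node id as stored in the final edges
  const uint32_t stage1  = renameu1[compact];
  const uint32_t origyz  = renameu[stage1];
  printf("compact %u -> stage1 %u -> original yz %#x\n", compact, stage1, origyz);
  return 0;
}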
@@ -890,18 +807,18 @@ class solver_ctx {
     rdtsc0 = __rdtsc();
     cg.reset();
     for (u32 vx = 0; vx < NX; vx++) {
-      rowsize[vx] = 0;
       for (u32 ux = 0 ; ux < NX; ux++) {
         zbucket &zb = trimmer.buckets[ux][vx];
         u32 *readbig = zb.words, *endreadbig = readbig + zb.size/sizeof(u32);
-        rowsize[vx] += zb.size/sizeof(u32);
+// print_log("vx %d ux %d size %u\n", vx, ux, zb.size/4);
         for (; readbig < endreadbig; readbig++) {
 // bit      21..11     10...0
 // write   UYYZZZ'    VYYZZ'   within VX partition
           const u32 e = *readbig;
           const u32 u = (ux << YZ2BITS) | (e >> YZ2BITS);
           const u32 v = (vx << YZ2BITS) | (e & YZ2MASK);
-          cg.add_edge(u>>1, v>>1, u&1);
+          // print_log("add_edge(%x, %x)\n", u, v);
+          cg.add_edge(u, v);
         }
       }
     }
@@ -909,7 +826,7 @@ class solver_ctx {
       solution(cg.sols[s]);
     }
     rdtsc1 = __rdtsc();
-    print_log("findcycles %u edges rdtsc: %lu\n", cg.nlinks, rdtsc1-rdtsc0);
+    print_log("findcycles rdtsc: %lu\n", rdtsc1-rdtsc0);
   }
 
   void abort() {
@@ -990,10 +907,9 @@ class solver_ctx {
       const u64 buflast = buf[NEBS - NSIPHASH + ns];
       buf[NEBS - NSIPHASH + ns] = 0;
       for (u32 e = 0; e < NEBS; e += NSIPHASH) {
-        const u32 dir = (e / NSIPHASH) & 1;
         const u64 nodes = buf[e + ns] ^ buflast;
-        const u32 node0 = (nodes & NODE1MASK) << 1 | dir;
-        const u32 node1 = ((nodes >> 31) & (NODE1MASK << 1)) | dir;
+        const u32 node0 = nodes & EDGEMASK;
+        const u32 node1 = (nodes >> 32) & EDGEMASK;
         if (uxymap[node0 >> ZBITS]) {
           for (u32 j = 0; j < PROOFSIZE; j++) {
             if (cycleus[j] == node0 && cyclevs[j] == node1) {