diff --git a/src/cuckaroom/Makefile b/src/cuckaroom/Makefile
index 301876a..92e07aa 100644
--- a/src/cuckaroom/Makefile
+++ b/src/cuckaroom/Makefile
@@ -60,3 +60,6 @@ cuda19:	../crypto/siphash.cuh compress.hpp graph.hpp mean.cu Makefile
 cuda29:	../crypto/siphash.cuh compress.hpp graph.hpp mean.cu kernel.cuh Makefile
 	$(NVCC) -o $@ -DEDGEBITS=29 -arch sm_35 mean.cu $(BLAKE_2B_SRC)
+
+cd29:	../crypto/siphash.cuh compress.hpp graph.hpp meaner.cu Makefile
+	$(NVCC) -o $@ -DEDGEBITS=29 -arch sm_35 meaner.cu $(BLAKE_2B_SRC)
diff --git a/src/cuckatoo/mean.cu b/src/cuckatoo/mean.cu
index 43deec0..150798d 100644
--- a/src/cuckatoo/mean.cu
+++ b/src/cuckatoo/mean.cu
@@ -14,10 +14,21 @@ typedef uint8_t u8;
 typedef uint16_t u16;
 
-#ifndef NA
-#define NA 4
+// number of invocations of seeding kernel
+// larger values reduce pressure on TLB
+// which appears to be an issue on GTX cards
+#ifndef NTLB
+#define NTLB 1
 #endif
-#define NA2 (NA * NA)
+// number of slices in edge splitting kernel
+// larger values reduce memory use
+// which is proportional to 1+1/NMEM
+#ifndef NMEM
+#define NMEM 4
+#endif
+// after seeding, main memory buffer is a NMEM x NTLB matrix
+// of blocks of buckets
+#define NMEMTLB (NMEM * NTLB)
 
 #ifndef MAXSOLS
 #define MAXSOLS 4
@@ -27,6 +38,7 @@ typedef uint16_t u16;
 // number of bits of compression of surviving edge endpoints
 // reduces space used in cycle finding, but too high a value
 // results in NODE OVERFLOW warnings and fake cycles
+// tag relay achieves a 2^12 reduction in edges
 #define IDXSHIFT 12
 #endif
 
@@ -34,64 +46,44 @@ const u32 MAXEDGES = NEDGES >> IDXSHIFT;
 
 typedef uint64_t u64; // save some typing
 
-#ifndef XBITS
+#ifndef BUCKBITS
 // assumes at least 2^18 bits of shared mem (32 KB) on thread block
-// #define XBITS ((EDGEBITS-18+1)/2)
-// scrap that; too few buckets inhibits parallellism
-#define XBITS 6
+// #define BUCKBITS (EDGEBITS-18)
+#define BUCKBITS 12
 #endif
 
-const u32 NX = 1 << XBITS;
-const u32 NX2 = NX * NX;
-const u32 NX2_NA = NX2 / NA;
-const u32 YBITS = XBITS;
-const u32 NY = 1 << YBITS;
-const u32 YZBITS = EDGEBITS - XBITS;
-const u32 ZBITS = YZBITS - YBITS;
+const u32 NB = 1 << BUCKBITS;
+const u32 NB_NTLB = NB / NTLB;
+const u32 NB_NMEM = NB / NMEM;
+const u32 ZBITS = EDGEBITS - BUCKBITS;
 const u32 NZ = 1 << ZBITS;
 const u32 ZMASK = NZ - 1;
 
+// buckets are a little oversized to allow for variance in distribution
+#define NEPS 128
 #ifndef NEPS_A
 #define NEPS_A 133
 #endif
+// max number of edges per bucket after seeding
+const u32 EDGES_A = NZ * NEPS_A / NEPS;
 #ifndef NEPS_B
 #define NEPS_B 85
 #endif
-#define NEPS 128
-
-const u32 EDGES_A = NZ * NEPS_A / NEPS;
+// max number of edges per bucket after one trim
 const u32 EDGES_B = NZ * NEPS_B / NEPS;
 
-const u32 ROW_EDGES_A = EDGES_A * NY;
-const u32 ROW_EDGES_B = EDGES_B * NY;
-
-// Number of rows in bufferB not overlapping bufferA
-#ifndef NRB1
-#define NRB1 (NX / 2)
-#endif
-#define NRB2 (NX - NRB1)
-#define NB 2
-
 __constant__ uint2 recoveredges[PROOFSIZE];
 __constant__ siphash_keys dipkeys;
 
-#ifndef FLUSHA // should perhaps be in trimparams and passed as template parameter
-#define FLUSHA 4
-#endif
-
-#ifndef NFLUSH
-#define NFLUSH (8 / FLUSHA)
-#endif
-
-__device__ __forceinline__ void bitmapset(u32 *ebitmap, const int bucket) {
-  int word = bucket >> 5;
+__device__ __forceinline__ void bitmapset(u32 *ebitmap, const u32 bucket) {
+  u32 word = bucket >> 5;
   unsigned char bit = bucket & 0x1F;
   const u32 mask = 1 << bit;
   atomicOr(ebitmap + word, mask);
 }
 
-__device__ __forceinline__ bool bitmaptest(u32 *ebitmap, const int bucket) {
-  int word = bucket >> 5;
+__device__ __forceinline__ bool bitmaptest(u32 *ebitmap, const u32 bucket) {
+  u32 word = bucket >> 5;
   unsigned char bit = bucket & 0x1F;
   return (ebitmap[word] >> bit) & 1;
 }
@@ -104,41 +96,65 @@ __device__ __forceinline__ u32 endpoint(uint2 nodes, bool uorv) {
   return uorv ? nodes.y : nodes.x;
 }
 
-template
-__global__ void Seed(u32 * __restrict__ buffer, u32 * __restrict__ indexes, const u64 offset) {
-  const int group = blockIdx.x;
-  const int dim = blockDim.x;
-  const int lid = threadIdx.x;
-  const int gid = group * dim + lid;
-  const int nthreads = gridDim.x * dim;
+#ifndef SEED_TPB
+#define SEED_TPB 256
+#endif
+#ifndef TRIM0_TPB
+#define TRIM0_TPB 1024
+#endif
+#ifndef TRIM1_TPB
+#define TRIM1_TPB 512
+#endif
+#ifndef TRIM_TPB
+#define TRIM_TPB 512
+#endif
+#ifndef RELAY_TPB
+#define RELAY_TPB 512
+#endif
+#ifndef TAIL_TPB
+#define TAIL_TPB SEED_TPB
+#endif
 
-  extern __shared__ u32 tmp[][FLUSHA-1];
-  int *counters = (int *)(tmp[NX2]);
+#ifndef FLUSHA // should perhaps be in trimparams and passed as template parameter
+#define FLUSHA 4
+#endif
 
-#if tpb && NX2 % tpb == 0
-  for (int i = 0; i < NX2/tpb; i++)
-#else
-  for (int i = 0; i < (NX2 - lid + tpb-1) / tpb; i++)
+#ifndef NFLUSH
+#define NFLUSH (8 / FLUSHA)
 #endif
-    counters[lid + tpb * i] = 0;
+
+static_assert(NB % SEED_TPB == 0, "SEED_TPB must divide NB");
+
+template
+__global__ void Seed(u32 * __restrict__ buffer, u32 * __restrict__ indexes, const u32 offset) {
+  const u32 group = blockIdx.x;
+  const u32 lid = threadIdx.x;
+  const u32 gid = group * SEED_TPB + lid;
+  const u32 nthreads = gridDim.x * SEED_TPB;
+
+  extern __shared__ u32 tmp[][FLUSHA-1];
+  u32 *counters = (u32 *)(tmp[NB]);
+
+  for (u32 i = 0; i < NB/SEED_TPB; i++)
+    counters[lid + SEED_TPB * i] = 0;
   __syncthreads();
 
-  const int nloops = (NEDGES / NA - gid + nthreads-1) / nthreads;
-  for (int i = 0; i < nloops; i++) {
+  const u32 nloops = (NEDGES / NTLB - gid + nthreads-1) / nthreads;
+  for (u32 i = 0; i < nloops; i++) {
     const u32 nonce = offset + gid * nloops + i;
     const u32 node0 = endpoint(nonce, 0);
-    const int bucket = node0 >> ZBITS;
-    u32 counter = (int)atomicAdd(counters + bucket, 1);
-    for (int nf=0; ; nf++) {
-      if (counter < FLUSHA-1)
+    const u32 bucket = node0 >> ZBITS;
+    u32 counter = atomicAdd(counters + bucket, 1);
+    for (u32 nf=0; ; nf++) {
+      if (counter < FLUSHA-1) // int counter would need additional test counter >= 0
        tmp[bucket][counter] = nonce;
      __syncthreads();
      if (nf == NFLUSH) break;
      if (counter == FLUSHA-1) {
-        const u64 pos = min((int)atomicAdd(indexes + bucket, FLUSHA), (int)(maxOut - FLUSHA));
-        const u64 idx = (bucket + (bucket / NX2_NA) * (NX2 - NX2_NA)) * maxOut + pos;
+        const u32 pos = min((u32)atomicAdd(indexes + bucket, FLUSHA), (u32)(maxOut - FLUSHA));
+        const u64 idx = (u64)(bucket + (bucket / NB_NMEM) * (NTLB-1) * NB_NMEM) * maxOut + pos;
 #if FLUSHA==4
        ((uint4 *)buffer)[idx/4] = make_uint4(tmp[bucket][0], tmp[bucket][1], tmp[bucket][2], nonce);
 #elif FLUSHA==2
@@ -149,14 +165,15 @@ __global__ void Seed(u32 * __restrict__ buffer, u32 * __restrict__ indexes, cons
      __syncthreads();
      counter -= FLUSHA;
    }
+    // negative counters are fine
    if ((int)counter >= FLUSHA-1) printf("WHOOPS!\n");
  }
-  for (int i = 0; i < NX2 / tpb; i++) {
-    const int bucket = lid + i * tpb;
-    const int cnt = counters[bucket];
+  for (u32 i = 0; i < NB / SEED_TPB; i++) {
+    const u32 bucket = lid + i * SEED_TPB;
+    const u32 cnt = counters[bucket];
    if (cnt) {
-      const u64 pos = min((int)atomicAdd(indexes + bucket, FLUSHA), (int)(maxOut - FLUSHA));
-      const u64 idx = (bucket + (bucket / NX2_NA) * (NX2 - NX2_NA)) * maxOut + pos;
+      const u32 pos = min((u32)atomicAdd(indexes + bucket, FLUSHA), (u32)(maxOut - FLUSHA));
+      const u64 idx = (u64)(bucket + (bucket / NB_NMEM) * (NTLB-1) * NB_NMEM) * maxOut + pos;
 #if FLUSHA==4
      ((uint4 *)buffer)[idx/4] = make_uint4(tmp[bucket][0], cnt >= 2 ? tmp[bucket][1] : 0, cnt >= 3 ? tmp[bucket][2] : 0, 0);
 #elif FLUSHA==2
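
The new comments describe the seeded buffer as a NMEM x NTLB matrix of blocks of buckets. The following host-side sketch (illustrative only, not part of the patch) spells out the index arithmetic Seed uses for that layout; it assumes the Seed template parameter maxOut is the per-invocation bucket capacity EDGES_A/NTLB, since the actual template arguments are not visible in this hunk. The resulting slot order is exactly what Round0 later reads back via (offset*NTLB + group)*maxIn plus a per-NTLB delta.

// Illustrative sketch of the seeded-buffer layout (assumes maxOut == EDGES_A/NTLB).
#include <cassert>
#include <cstdint>
typedef uint32_t u32;
typedef uint64_t u64;

const u32 BUCKBITS = 12, EDGEBITS = 29, NMEM = 4, NTLB = 4;   // NTLB=4 chosen for illustration
const u32 NB = 1 << BUCKBITS, NB_NMEM = NB / NMEM;
const u32 NZ = 1 << (EDGEBITS - BUCKBITS);
const u32 EDGES_A = NZ * 133 / 128;
const u32 maxOut = EDGES_A / NTLB;            // assumed per-invocation bucket capacity

// u32 slot where Seed invocation `tlb` stores edge number `pos` of `bucket`
u64 seedSlot(u32 tlb, u32 bucket, u32 pos) {
  const u64 base = (u64)tlb * NB_NMEM * maxOut;  // bufferA + tlb*(sizeA/NMEMTLB)
  const u64 idx  = (u64)(bucket + (bucket / NB_NMEM) * (NTLB-1) * NB_NMEM) * maxOut + pos;
  return base + idx;
}

int main() {
  // NMEM slices of NTLB*NB_NMEM bucket blocks: slice m = bucket/NB_NMEM owns a
  // contiguous region, and within it invocation tlb owns blocks [tlb*NB_NMEM, (tlb+1)*NB_NMEM)
  for (u32 tlb = 0; tlb < NTLB; tlb++)
    for (u32 bucket = 0; bucket < NB; bucket++) {
      u32 m = bucket / NB_NMEM, r = bucket % NB_NMEM;
      assert(seedSlot(tlb, bucket, 0) == ((u64)m * NTLB * NB_NMEM + (u64)tlb * NB_NMEM + r) * maxOut);
    }
}
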
@@ -177,91 +194,87 @@ __global__ void Seed(u32 * __restrict__ buffer, u32 * __restrict__ indexes, cons
 const u32 PART_MASK = (1 << PART_BITS) - 1;
 const u32 NONPART_BITS = ZBITS - PART_BITS;
 const word_t NONPART_MASK = (1 << NONPART_BITS) - 1;
-const int BITMAPBYTES = (NZ >> PART_BITS) / 8;
+const u32 BITMAPBYTES = (NZ >> PART_BITS) / 8;
 
-template
-__global__ void Round0(u32 * src, u32 * dst, u32 * srcIdx, u32 * dstIdx, const int offset) {
-  const int part = blockIdx.x & PART_MASK;
-  const int group = blockIdx.x >> PART_BITS;
-  const int lid = threadIdx.x;
-  const int BITMAPWORDS = BITMAPBYTES / sizeof(u32);
-  int nloops[NA];
+template
+__global__ void Round0(u32 * src, u32 * dst, u32 * srcIdx, u32 * dstIdx, const u32 offset) {
+  const u32 part = blockIdx.x & PART_MASK;
+  const u32 group = blockIdx.x >> PART_BITS;
+  const u32 lid = threadIdx.x;
+  const u32 BITMAPWORDS = BITMAPBYTES / sizeof(u32);
+  u32 nloops[NTLB];
   extern __shared__ u32 ebitmap[];
 
-#if tpb && BITMAPWORDS% tpb == 0
-  for (int i = 0; i < BITMAPWORDS/tpb; i++)
-#else
-  for (int i = 0; i < (BITMAPWORDS- lid + tpb-1) / tpb; i++)
-#endif
-    ebitmap[lid + tpb * i] = 0;
+  static_assert(BITMAPWORDS % TRIM0_TPB == 0, "TRIM0_TPB must divide BITMAPWORDS");
 
-  for (int a = 0; a < NA; a++)
-    nloops[a] = (min(srcIdx[a * NX2 + offset + group], maxIn) - lid + tpb-1) / tpb;
+  for (u32 i = 0; i < BITMAPWORDS/TRIM0_TPB; i++)
+    ebitmap[lid + TRIM0_TPB * i] = 0;
 
-  const int rowOffset = offset * NA;
-  src += maxIn * (rowOffset + group) + lid;
+  for (u32 a = 0; a < NTLB; a++)
+    nloops[a] = (min(srcIdx[a * NB + offset + group], maxIn) - lid + TRIM0_TPB-1) / TRIM0_TPB;
+
+  src += (u64)(offset * NTLB + group) * maxIn + lid;
 
   __syncthreads();
 
-  for (int a = 0; a < NA; a++) {
-    const int delta = a * NX2_NA * maxIn;
-    for (int i = 0; i < nloops[a]; i++) {
-      u32 edge = src[delta + i * tpb];
+  for (u32 a = 0; a < NTLB; a++) {
+    const u64 delta = a * NB_NMEM * (u64)maxIn;
+    for (u32 i = 0; i < nloops[a]; i++) {
+      u32 edge = src[delta + i * TRIM0_TPB];
      if (!edge) continue;
      u32 z = endpoint(edge, 0) & ZMASK;
-      if ((z >> NONPART_BITS) == part)
+      if (!PART_BITS || (z >> NONPART_BITS) == part)
        bitmapset(ebitmap, z & NONPART_MASK);
    }
  }
 
  __syncthreads();
 
-  for (int a = 0; a < NA; a++) {
-    const int delta = a * NX2_NA * maxIn;
-    for (int i = 0; i < nloops[a]; i++) {
-      const u32 edge = src[delta + i * tpb];
+  const u32 rowOffset = offset * NMEM;
+  for (u32 a = 0; a < NTLB; a++) {
+    const u64 delta = a * NB_NMEM * (u64)maxIn;
+    for (u32 i = 0; i < nloops[a]; i++) {
+      const u32 edge = src[delta + i * TRIM0_TPB];
      if (!edge) continue;
      const u32 node0 = endpoint(edge, 0);
      const u32 z = node0 & ZMASK;
-      if ((z >> NONPART_BITS) == part && bitmaptest(ebitmap, (z & NONPART_MASK) ^ 1)) {
-        const int bucket = endpoint(edge, 1) >> ZBITS;
-        const int bktIdx = min(atomicAdd(dstIdx + bucket + rowOffset, 1), maxOut - 1);
+      if ((!PART_BITS || (z >> NONPART_BITS) == part) && bitmaptest(ebitmap, (z & NONPART_MASK) ^ 1)) {
+        const u32 bucket = endpoint(edge, 1) >> ZBITS;
+        const u32 bktIdx = min(atomicAdd(dstIdx + rowOffset + bucket, 1), maxOut - 1);
        dst[bucket * maxOut + bktIdx] = edge;
      }
    }
  }
 }
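
Round0 (and the later Round kernels) zero a per-bucket node bitmap in dynamic shared memory, sized by BITMAPBYTES. A compile-time sketch of how those sizes work out for the defaults EDGEBITS=29 and BUCKBITS=12 follows; it assumes PART_BITS = 0, which is an assumption here since PART_BITS is defined outside this diff.

// Illustrative shared-memory sizing check (not part of the patch; assumes PART_BITS = 0).
#include <cstdint>
typedef uint32_t u32;

constexpr u32 EDGEBITS  = 29;
constexpr u32 BUCKBITS  = 12;
constexpr u32 PART_BITS = 0;                            // assumed default
constexpr u32 ZBITS     = EDGEBITS - BUCKBITS;          // 17
constexpr u32 NZ        = 1 << ZBITS;                   // 131072 node values per bucket
constexpr u32 BITMAPBYTES = (NZ >> PART_BITS) / 8;      // one bit per node = 16 KB
constexpr u32 BITMAPWORDS = BITMAPBYTES / sizeof(u32);  // 4096 words
constexpr u32 TRIM0_TPB = 1024;

static_assert(BITMAPBYTES == 16384, "16 KB of dynamic shared memory per thread block");
static_assert(BITMAPWORDS % TRIM0_TPB == 0, "each thread clears BITMAPWORDS/TRIM0_TPB words");
static_assert(BITMAPBYTES <= 0x10000, "within the 64 KB opt-in set via cudaFuncSetAttribute");

int main() { return 0; }
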
-template
+template
 __global__ void Round1(u32 * src, u32 * dst, u32 * srcIdx, u32 * dstIdx) {
-  const int part = blockIdx.x & PART_MASK;
-  const int group = blockIdx.x >> PART_BITS;
-  const int lid = threadIdx.x;
+  const u32 part = blockIdx.x & PART_MASK;
+  const u32 group = blockIdx.x >> PART_BITS;
+  const u32 lid = threadIdx.x;
 
-  const int BITMAPWORDS = BITMAPBYTES / sizeof(u32);
-  int nloops[NA];
+  const u32 BITMAPWORDS = BITMAPBYTES / sizeof(u32);
+  u32 nloops[NMEM];
   extern __shared__ u32 ebitmap[];
 
-#if tpb && BITMAPWORDS% tpb == 0
-  for (int i = 0; i < BITMAPWORDS/tpb; i++)
-#else
-  for (int i = 0; i < (BITMAPWORDS- lid + tpb-1) / tpb; i++)
-#endif
-    ebitmap[lid + tpb * i] = 0;
+  static_assert(BITMAPWORDS % TRIM1_TPB == 0, "TRIM1_TPB must divide BITMAPWORDS");
 
-  for (int a = 0; a < NA; a++)
-    nloops[a] = (min(srcIdx[a * NX2 + group], maxIn) - lid + tpb-1) / tpb;
+  for (u32 i = 0; i < BITMAPWORDS/TRIM1_TPB; i++)
+    ebitmap[lid + TRIM1_TPB * i] = 0;
 
-  src += maxIn * group + lid;
+  for (u32 a = 0; a < NMEM; a++)
+    nloops[a] = (min(srcIdx[a * NB + group], maxIn) - lid + TRIM1_TPB-1) / TRIM1_TPB;
+
+  src += (u64)maxIn * group + lid;
 
  __syncthreads();
 
-  for (int a = 0; a < NA; a++) {
-    const int delta = a * maxIn * NX2;
-    for (int i = 0; i < nloops[a]; i++) {
-      u32 edge = src[delta + i * tpb];
+  for (u32 a = 0; a < NMEM; a++) {
+    const u64 delta = a * (u64)maxIn * NB;
+    for (u32 i = 0; i < nloops[a]; i++) {
+      u32 edge = src[delta + i * TRIM1_TPB];
      u32 z = endpoint(edge, 1) & ZMASK;
      if ((z >> NONPART_BITS) == part)
        bitmapset(ebitmap, z & NONPART_MASK);
@@ -270,46 +283,44 @@ __global__ void Round1(u32 * src, u32 * dst, u32 * srcIdx, u32 * dstIdx) {
 
  __syncthreads();
 
-  for (int a = 0; a < NA; a++) {
-    const int delta = a * maxIn * NX2;
-    for (int i = 0; i < nloops[a]; i++) {
-      const u32 edge = src[delta + i * tpb];
+  for (u32 a = 0; a < NMEM; a++) {
+    const u64 delta = a * (u64)maxIn * NB;
+    for (u32 i = 0; i < nloops[a]; i++) {
+      const u32 edge = src[delta + i * TRIM1_TPB];
      const u32 node1 = endpoint(edge, 1);
      const u32 z = node1 & ZMASK;
      if ((z >> NONPART_BITS) == part && bitmaptest(ebitmap, (z & NONPART_MASK) ^ 1)) {
-        const int bucket = endpoint(edge, 0) >> ZBITS;
-        const int bktIdx = min(atomicAdd(dstIdx + bucket, 1), maxOut - 1);
+        const u32 bucket = endpoint(edge, 0) >> ZBITS;
+        const u32 bktIdx = min(atomicAdd(dstIdx + bucket, 1), maxOut - 1);
        dst[bucket * maxOut + bktIdx] = edge;
      }
    }
  }
 }
 
-template
-__global__ void Round(const int round, EdgeIn * src, uint2 * dst, u32 * srcIdx, u32 * dstIdx) {
-  const int part = blockIdx.x & PART_MASK;
-  const int group = blockIdx.x >> PART_BITS;
-  const int lid = threadIdx.x;
+template
+__global__ void Round(const u32 round, EdgeIn * src, uint2 * dst, u32 * srcIdx, u32 * dstIdx) {
+  const u32 part = blockIdx.x & PART_MASK;
+  const u32 group = blockIdx.x >> PART_BITS;
+  const u32 lid = threadIdx.x;
 
-  const int BITMAPWORDS = BITMAPBYTES / sizeof(u32);
+  const u32 BITMAPWORDS = BITMAPBYTES / sizeof(u32);
   extern __shared__ u32 ebitmap[];
 
-  const int nloops = (min(srcIdx[group], maxIn) - lid + tpb-1) / tpb;
+  const u32 nloops = (min(srcIdx[group], maxIn) - lid + TRIM_TPB-1) / TRIM_TPB;
 
-#if tpb && BITMAPWORDS % tpb == 0
-  for (int i = 0; i < BITMAPWORDS/tpb; i++)
-#else
-  for (int i = 0; i < (BITMAPWORDS - lid + tpb-1) / tpb; i++)
-#endif
-    ebitmap[lid + tpb * i] = 0;
+  static_assert(BITMAPWORDS % TRIM_TPB == 0, "TRIM_TPB must divide BITMAPWORDS");
+
+  for (u32 i = 0; i < BITMAPWORDS/TRIM_TPB; i++)
+    ebitmap[lid + TRIM_TPB * i] = 0;
 
  src += maxIn * group + lid;
 
  __syncthreads();
 
-  for (int i = 0; i < nloops; i++) {
-    const EdgeIn edge = src[i * tpb]; // EdgeIn edge = __ldg(&src[index]);
+  for (u32 i = 0; i < nloops; i++) {
+    const EdgeIn edge = src[i * TRIM_TPB]; // EdgeIn edge = __ldg(&src[index]);
    const u32 z = endpoint(edge, 0) & ZMASK;
    if ((z >> NONPART_BITS) == part)
      bitmapset(ebitmap, z & NONPART_MASK);
@@ -317,14 +328,14 @@ __global__ void Round(const u32 round, EdgeIn * src, uint2 * dst, u32 * srcIdx,
 
  __syncthreads();
 
-  for (int i = 0; i < nloops; i++) {
-    const EdgeIn edge = src[i * tpb]; // EdgeIn edge = __ldg(&src[index]);
+  for (u32 i = 0; i < nloops; i++) {
+    const EdgeIn edge = src[i * TRIM_TPB]; // EdgeIn edge = __ldg(&src[index]);
    const u32 node = endpoint(edge, 0);
    const u32 z = node & ZMASK;
    if ((z >> NONPART_BITS) == part && bitmaptest(ebitmap, (z & NONPART_MASK) ^ 1)) {
      u32 node2 = endpoint(edge, 1);
-      const int bucket = node2 >> ZBITS;
-      const int bktIdx = min(atomicAdd(dstIdx + bucket, 1), maxOut - 1);
+      const u32 bucket = node2 >> ZBITS;
+      const u32 bktIdx = min(atomicAdd(dstIdx + bucket, 1), maxOut - 1);
      dst[bucket * maxOut + bktIdx] = make_uint2(node2, node);
    }
  }
@@ -341,26 +352,28 @@ const u32 LISTMASK = NLISTS - 1;
 #define NNEXTS NLISTS
 #endif
 
-template
+const u32 COPYFLAG = NZ;
+
+template
 __global__ void Relay(const u32 round, const uint2 * source, uint2 * destination, const u32 * sourceIndexes, u32 * destinationIndexes, bool TAGGED) {
-  const int lid = threadIdx.x;
-  const int group = blockIdx.x;
+  const u32 lid = threadIdx.x;
+  const u32 group = blockIdx.x;
 
   __shared__ u32 lists[NLISTS];
   __shared__ u32 nexts[NNEXTS];
 
-  const int nloops = (min(sourceIndexes[group], NNEXTS) - lid + tpb-1) / tpb;
+  const u32 nloops = (min(sourceIndexes[group], NNEXTS) - lid + RELAY_TPB-1) / RELAY_TPB;
 
  source += bktInSize * group;
 
-  for (int i = 0; i < NLISTS/tpb; i++)
-    lists[i * tpb + lid] = ~0;
+  for (u32 i = 0; i < NLISTS/RELAY_TPB; i++)
+    lists[i * RELAY_TPB + lid] = ~0;
 
  __syncthreads();
 
-  for (int i = 0; i < nloops; i++) {
-    const u32 index = i * tpb + lid;
+  for (u32 i = 0; i < nloops; i++) {
+    const u32 index = i * RELAY_TPB + lid;
    const u32 list = endpoint(source[index], 0) & LISTMASK;
    nexts[index] = atomicExch(&lists[list], index);
  }
@@ -368,9 +381,9 @@ __global__ void Relay(const u32 round, const uint2 * source, uint2 * destinatio
 
  __syncthreads();
 
  for (int i = nloops; --i >= 0;) {
-    const u32 index = i * tpb + lid;
+    const u32 index = i * RELAY_TPB + lid;
    const uint2 edge = source[index];
-    if (edge.y & NEDGES) continue; // copies don't relay
+    if (TAGGED && (edge.x & COPYFLAG)) continue; // copies don't relay
    u32 bucket = edge.y >> ZBITS;
    u32 copybit = 0;
    const u32 list = (edge.x & LISTMASK) ^ 1;
@@ -378,31 +391,31 @@ __global__ void Relay(const u32 round, const uint2 * source, uint2 * destinatio
      uint2 tagged = source[idx];
      if ((tagged.x ^ edge.x ^ 1) & ZMASK) continue;
      u32 bktIdx = min(atomicAdd(destinationIndexes + bucket, 1), bktOutSize - 1);
-      u32 tag = TAGGED ? tagged.x >> ZBITS : tagged.y >> 1;
-      destination[(bucket * bktOutSize) + bktIdx] = make_uint2((tag << ZBITS) | (edge.y & ZMASK), copybit | (group << ZBITS) | (edge.x & ZMASK));
-      copybit = NEDGES;
+      u32 tag = (TAGGED ? tagged.x >> ZBITS : tagged.y >> 1) & ~1 | copybit;
+      destination[(bucket * bktOutSize) + bktIdx] = make_uint2((tag << ZBITS) | (edge.y & ZMASK), (group << ZBITS) | (edge.x & ZMASK));
+      copybit = 1;
    }
  }
 }
 
-template
+template
 __global__ void Tail(const uint2 *source, uint4 *destination, const u32 *sourceIndexes, u32 *destinationIndexes) {
-  const int lid = threadIdx.x;
-  const int group = blockIdx.x;
+  const u32 lid = threadIdx.x;
+  const u32 group = blockIdx.x;
 
   __shared__ u32 lists[NLISTS];
   __shared__ u32 nexts[NNEXTS];
 
-  const int nloops = (min(sourceIndexes[group], NNEXTS) - lid + tpb-1) / tpb;
+  const u32 nloops = (min(sourceIndexes[group], NNEXTS) - lid + tpb-1) / tpb;
 
  source += maxIn * group;
 
-  for (int i = 0; i < NLISTS/tpb; i++)
+  for (u32 i = 0; i < NLISTS/tpb; i++)
    lists[i * tpb + lid] = ~0;
 
  __syncthreads();
 
-  for (int i = 0; i < nloops; i++) {
+  for (u32 i = 0; i < nloops; i++) {
    const u32 index = i * tpb + lid;
    const u32 list = source[index].x & LISTMASK;
    nexts[index] = atomicExch(&lists[list], index);
@@ -413,17 +426,13 @@ __global__ void Tail(const uint2 *source, uint4 *destination, const u32 *sourceI
  for (int i = nloops; --i >= 0;) {
    const u32 index = i * tpb + lid;
    const uint2 edge = source[index];
-#ifdef DBG101
-    if (((edge.x^0x1d3cc2ae)&ZMASK)<2) printf("Tail group %x x %x y %x tag %x\n", group, edge.x, edge.y, edge.x>>ZBITS);
-    if (((edge.y^0x1d3cc2ae)&ZMASK)<2) printf("Tail group %x x %x y %x tag %x\n", group, edge.x, edge.y, edge.x>>ZBITS);
-#endif
    if (edge.x & 1) continue;
    const u32 list = (edge.x & LISTMASK) ^ 1;
    for (u32 idx = lists[list]; idx != ~0; idx = nexts[idx]) {
      uint2 other = source[idx];
-      if ((other.x ^ edge.x) != 1) continue;
+      if (((other.x ^ edge.x) & ~COPYFLAG) != 1) continue;
      u32 bktIdx = atomicAdd(destinationIndexes, 2);
-      destination[bktIdx/2] = make_uint4(edge.y & (NEDGES-1), (group << ZBITS) | (edge.x & ZMASK),
+      destination[bktIdx/2] = make_uint4(edge.y, (group << ZBITS) | (edge.x & ZMASK),
                                         other.y & (NEDGES-1), (group << ZBITS) | (other.x & ZMASK));
    }
  }
 }
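
Both Relay and Tail bucket their input edges into short chains in shared memory: lists[] holds one head index per list, nexts[] the per-edge link, and the chains are built with atomicExch and then walked from the paired list (low bit of the key flipped). A standalone sketch of that pattern, illustrative only and with TPB/NLISTS values chosen for the example:

// Sketch of the lists/nexts chaining pattern used by Relay and Tail (not part of the patch).
#include <cstdint>
typedef uint32_t u32;

#define TPB    256
#define NLISTS 256            // NLISTS/TPB heads cleared per thread; here it is 1
#define LISTMASK (NLISTS - 1)

// Assumes n <= NLISTS, matching the min(sourceIndexes[group], NNEXTS) cap in the kernels.
__global__ void chain(const u32 *keys, u32 n, u32 *out) {
  __shared__ u32 lists[NLISTS];   // head index of each chain, ~0 means empty
  __shared__ u32 nexts[NLISTS];   // link from an element to the previous head
  const u32 lid = threadIdx.x;

  for (u32 i = 0; i < NLISTS/TPB; i++)
    lists[i * TPB + lid] = ~0;
  __syncthreads();

  // phase 1: push every element onto the chain selected by its low key bits
  for (u32 i = lid; i < n; i += TPB)
    nexts[i] = atomicExch(&lists[keys[i] & LISTMASK], i);
  __syncthreads();

  // phase 2: walk the chain of the paired list (low bit flipped), the way
  // Relay and Tail look for the matching half of an edge
  for (u32 i = lid; i < n; i += TPB) {
    u32 matches = 0;
    for (u32 idx = lists[(keys[i] & LISTMASK) ^ 1]; idx != ~0u; idx = nexts[idx])
      matches++;
    out[i] = matches;
  }
}
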
@@ -445,19 +454,19 @@ inline int gpuAssert(cudaError_t code, const char *file, int line, bool abort=tr
 }
 
 __global__ void Recovery(u32 *indexes) {
-  const int gid = blockDim.x * blockIdx.x + threadIdx.x;
-  const int lid = threadIdx.x;
-  const int nthreads = blockDim.x * gridDim.x;
+  const u32 gid = blockDim.x * blockIdx.x + threadIdx.x;
+  const u32 lid = threadIdx.x;
+  const u32 nthreads = blockDim.x * gridDim.x;
   __shared__ u32 nonces[PROOFSIZE];
-  const int nloops = (NEDGES -gid + nthreads-1) / nthreads;
+  const u32 nloops = (NEDGES -gid + nthreads-1) / nthreads;
   if (lid < PROOFSIZE) nonces[lid] = 0;
   __syncthreads();
-  for (int i = 0; i < nloops; i++) {
+  for (u32 i = 0; i < nloops; i++) {
    u64 nonce = gid * nloops + i;
    u64 u = endpoint(nonce, 0);
    u64 v = endpoint(nonce, 1);
-    for (int i = 0; i < PROOFSIZE; i++) {
+    for (u32 i = 0; i < PROOFSIZE; i++) {
      if (recoveredges[i].x == v && recoveredges[i].y == u)
        nonces[i] = nonce;
    }
@@ -474,25 +483,6 @@ struct blockstpb {
   u16 tpb;
 };
 
-#ifndef SEED_TPB
-#define SEED_TPB 256
-#endif
-#ifndef TRIM0_TPB
-#define TRIM0_TPB 1024
-#endif
-#ifndef TRIM1_TPB
-#define TRIM1_TPB 512
-#endif
-#ifndef TRIM_TPB
-#define TRIM_TPB 512
-#endif
-#ifndef RELAY_TPB
-#define RELAY_TPB 512
-#endif
-#ifndef TAIL_TPB
-#define TAIL_TPB SEED_TPB
-#endif
-
 struct trimparams {
   u16 ntrims;
   blockstpb seed;
@@ -504,16 +494,16 @@ struct trimparams {
 
   trimparams() {
    ntrims = 31;
-    seed.blocks = 1024;
+    seed.blocks = NB_NTLB;
    seed.tpb = SEED_TPB;
-    trim0.blocks = NX2_NA;
+    trim0.blocks = NB_NMEM;
    trim0.tpb = TRIM0_TPB;
-    trim1.blocks = NX2_NA;
+    trim1.blocks = NB_NMEM;
    trim1.tpb = TRIM1_TPB;
-    trim.blocks = NX2;
+    trim.blocks = NB;
    trim.tpb = TRIM_TPB;
-    tail.blocks = NX2;
-    tail.tpb = TAIL_TPB;;
+    tail.blocks = NB;
+    tail.tpb = TAIL_TPB;
    recover.blocks = 2048;
    recover.tpb = 256;
  }
 };
@@ -526,8 +516,9 @@ struct edgetrimmer {
   trimparams tp;
   edgetrimmer *dt;
   size_t sizeA, sizeB;
-  const size_t indexesSize = NX * NY * sizeof(u32);
-  const size_t indexesSizeNA = NA * indexesSize;
+  const size_t indexesSize = NB * sizeof(u32);
+  const size_t indexesSizeNTLB = NTLB * indexesSize;
+  const size_t indexesSizeNMEM = NMEM * indexesSize;
   u8 *bufferA;
   u8 *bufferB;
   u8 *bufferA1;
@@ -539,32 +530,34 @@ struct edgetrimmer {
   bool abort;
   bool initsuccess = false;
 
+  static_assert((NTLB & (NTLB-1)) == 0, "ensure NTLB is a 2 power");
+  static_assert((NMEM & (NMEM-1)) == 0, "ensure NMEM is a 2 power");
+
   edgetrimmer(const trimparams _tp) : tp(_tp) {
    checkCudaErrors_V(cudaMalloc((void**)&dt, sizeof(edgetrimmer)));
    checkCudaErrors_V(cudaMalloc((void**)&uvnodes, PROOFSIZE * 2 * sizeof(u32)));
-    checkCudaErrors_V(cudaMalloc((void**)&indexesA, indexesSizeNA));
-    checkCudaErrors_V(cudaMalloc((void**)&indexesB, indexesSizeNA));
-    sizeA = (u64)ROW_EDGES_A * NX * sizeof(u32);
-    sizeB = (u64)ROW_EDGES_B * NX * sizeof(u32);
-    const size_t bufferSize = sizeA + sizeB / NA;
+    checkCudaErrors_V(cudaMalloc((void**)&indexesA, indexesSizeNTLB));
+    checkCudaErrors_V(cudaMalloc((void**)&indexesB, indexesSizeNMEM));
+    sizeA = (u64)EDGES_A * NB * sizeof(u32);
+    sizeB = (u64)EDGES_B * NB * sizeof(u32);
+    const size_t bufferSize = sizeA + sizeB / NMEM;
    checkCudaErrors_V(cudaMalloc((void**)&bufferB, bufferSize));
-    bufferA = bufferB + sizeB / NA;
+    bufferA = bufferB + sizeB / NMEM;
    bufferA1 = bufferB + sizeB;
-    print_log("allocated %lld bytes bufferB %llx bufferA %llx bufferA1 %llx\n", bufferSize, bufferB, bufferA, bufferA1);
-    print_log("endB %llx endA %llx endBuffer %llx\n", bufferB+sizeB, bufferA+sizeA, bufferB+bufferSize);
-    assert((NA & (NA-1)) == 0); // ensure NA is a 2 power
+    // print_log("allocated %lld bytes bufferB %llx bufferA %llx bufferA1 %llx\n", bufferSize, bufferB, bufferA, bufferA1);
+    // print_log("endB %llx endA %llx endBuffer %llx\n", bufferB+sizeB, bufferA+sizeA, bufferB+bufferSize);
    cudaMemcpy(dt, this, sizeof(edgetrimmer), cudaMemcpyHostToDevice);
    initsuccess = true;
-    int maxbytes = 0x10000; // 64 KB
-    cudaFuncSetAttribute(Seed < SEED_TPB, EDGES_A/NA >, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
-    cudaFuncSetAttribute(Round0, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
-    cudaFuncSetAttribute(Round1, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
-    cudaFuncSetAttribute(Round < TRIM_TPB, EDGES_B/2, u32, EDGES_A/4>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
-    cudaFuncSetAttribute(Round < TRIM_TPB, EDGES_A/4, uint2, EDGES_B/4>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
-    cudaFuncSetAttribute(Round < TRIM_TPB, EDGES_B/4, uint2, EDGES_B/4>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
+    u32 maxbytes = 0x10000; // 64 KB
+    cudaFuncSetAttribute(Seed , cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
+    cudaFuncSetAttribute(Round0, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
+    cudaFuncSetAttribute(Round1, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
+    cudaFuncSetAttribute(Round , cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
+    cudaFuncSetAttribute(Round , cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
+    cudaFuncSetAttribute(Round , cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
  }
   u64 globalbytes() const {
-    return sizeA + sizeB/NA + 2 * indexesSizeNA + sizeof(siphash_keys) + PROOFSIZE * 2*sizeof(u32) + sizeof(edgetrimmer);
+    return sizeA + sizeB/NMEM + indexesSizeNTLB + indexesSizeNMEM + sizeof(siphash_keys) + PROOFSIZE * 2*sizeof(u32) + sizeof(edgetrimmer);
  }
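
For the default EDGEBITS=29, BUCKBITS=12, NMEM=4, the allocation above comes to roughly 2.4 GiB plus the index arrays. A quick host-side recomputation of those figures, illustrative only:

// Illustrative recomputation of the buffer sizes allocated above (not part of the patch).
// Defaults assumed: EDGEBITS=29, BUCKBITS=12, NEPS_A=133, NEPS_B=85, NMEM=4.
#include <cstdint>
#include <cstdio>
typedef uint64_t u64;

int main() {
  const u64 NB = 1 << 12, NZ = 1 << (29 - 12);
  const u64 EDGES_A = NZ * 133 / 128;                  // 136192 edges per bucket
  const u64 EDGES_B = NZ * 85 / 128;                   //  87040 edges per bucket
  const u64 sizeA = EDGES_A * NB * sizeof(uint32_t);   // ~2.08 GiB
  const u64 sizeB = EDGES_B * NB * sizeof(uint32_t);   // ~1.33 GiB
  const u64 NMEM = 4;
  // One allocation of sizeA + sizeB/NMEM bytes: bufferA = bufferB + sizeB/NMEM,
  // so A ends exactly at the end of the allocation, and bufferA1 = bufferB + sizeB
  // lives inside its tail.
  const u64 bufferSize = sizeA + sizeB / NMEM;
  printf("sizeA %llu  sizeB %llu  bufferSize %llu (%.2f GiB)\n",
         (unsigned long long)sizeA, (unsigned long long)sizeB,
         (unsigned long long)bufferSize, bufferSize / (double)(1ULL << 30));
  return 0;
}
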
   ~edgetrimmer() {
    checkCudaErrors_V(cudaFree(bufferB));
@@ -581,12 +574,17 @@ struct edgetrimmer {
   void indexcount(u32 round, const u32 *indexes) {
 #ifdef VERBOSE
-    u32 nedges;
-    cudaMemcpy(&nedges, indexes+VBIDX, sizeof(u32), cudaMemcpyDeviceToHost);
+    u32 nedges[NB];
+    cudaMemcpy(nedges, indexes, NB * sizeof(u32), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
-    print_log("round %d edges %d\n", round, nedges);
+    u32 sum, max;
+    for (u32 i = sum = max = 0; i < NB; i++) {
+      sum += nedges[i];
+      if (nedges[i] > max)
+        max = nedges[i];
+    }
+    print_log("round %d edges avg %d max %d\n", round, sum/NB, max);
 #endif
-
  }
   u32 trim() {
    cudaEvent_t start, stop;
@@ -597,9 +595,9 @@ struct edgetrimmer {
    float durationA;
    cudaEventRecord(start, NULL);
 
-    cudaMemset(indexesA, 0, indexesSizeNA);
-    for (u32 i = 0; i < NA; i++) {
-      Seed<<>>((u32*)(bufferA+i*(sizeA/NA2)), indexesA+i*NX2, i*(NEDGES/NA));
+    cudaMemset(indexesA, 0, indexesSizeNTLB);
+    for (u32 i = 0; i < NTLB; i++) {
+      Seed<<>>((u32*)(bufferA+i*(sizeA/NMEMTLB)), indexesA+i*NB, i*(NEDGES/NTLB));
      cudaDeviceSynchronize();
      if (abort) return false;
    }
@@ -610,46 +608,47 @@ struct edgetrimmer {
    checkCudaErrors(cudaEventRecord(stop, NULL)); checkCudaErrors(cudaEventSynchronize(stop)); checkCudaErrors(cudaEventElapsedTime(&durationA, start, stop));
    // checkCudaErrors(cudaEventDestroy(start)); checkCudaErrors(cudaEventDestroy(stop));
    print_log("Seeding completed in %.0f ms\n", durationA);
+    indexcount(0, indexesA);
    if (abort) return false;
 
-    cudaMemset(indexesB, 0, indexesSizeNA);
+    cudaMemset(indexesB, 0, indexesSizeNMEM);
 
-    const size_t qB = sizeB / NA;
-    for (u32 i = 0; i < NA; i++) {
-      Round0<<>>((u32*)bufferA, (u32*)(bufferB+i*qB), indexesA, indexesB, i*NX2_NA); // to .632
+    const size_t qB = sizeB / NMEM;
+    for (u32 i = 0; i < NMEM; i++) {
+      Round0<<>>((u32*)bufferA, (u32*)(bufferB+i*qB), indexesA, indexesB, i*NB_NMEM); // to .632
      if (abort) return false;
    }
    indexcount(1, indexesB);
    cudaMemset(indexesA, 0, indexesSize);
-    Round1<<>>((u32*)bufferB, (u32*)bufferA1, indexesB, indexesA); // to .296
+    Round1<<>>((u32*)bufferB, (u32*)bufferA1, indexesB, indexesA); // to .296
    if (abort) return false;
    indexcount(2, indexesA);
 
    cudaMemset(indexesB, 0, indexesSize);
-    Round<<>>(2, (u32 *)bufferA1, (uint2 *)bufferB, indexesA, indexesB); // to .176
+    Round<<>>(2, (u32 *)bufferA1, (uint2 *)bufferB, indexesA, indexesB); // to .176
    if (abort) return false;
    indexcount(3, indexesB);
    cudaMemset(indexesA, 0, indexesSize);
-    Round<<>>(3, (uint2 *)bufferB, (uint2 *)bufferA1, indexesB, indexesA); // to .116
+    Round<<>>(3, (uint2 *)bufferB, (uint2 *)bufferA1, indexesB, indexesA); // to .116
    if (abort) return false;
    indexcount(4, indexesA);
 
-    for (int round = 5; round < tp.ntrims + PROOFSIZE/2-1; round += 2) {
+    for (u32 round = 5; round < tp.ntrims + PROOFSIZE/2-1; round += 2) {
      cudaMemset(indexesB, 0, indexesSize);
      if (round >= tp.ntrims)
-        Relay<<>>(round-1, (uint2 *)bufferA1, (uint2 *)bufferB, indexesA, indexesB, round > tp.ntrims);
-      else Round<<>>(round-1, (uint2 *)bufferA1, (uint2 *)bufferB, indexesA, indexesB);
+        Relay<<>>(round-1, (uint2 *)bufferA1, (uint2 *)bufferB, indexesA, indexesB, round > tp.ntrims);
+      else Round<<>>(round-1, (uint2 *)bufferA1, (uint2 *)bufferB, indexesA, indexesB);
      indexcount(round, indexesB);
      if (abort) return false;
      cudaMemset(indexesA, 0, indexesSize);
      if (round+1 >= tp.ntrims)
-        Relay<<>>(round, (uint2 *)bufferB, (uint2 *)bufferA1, indexesB, indexesA, round+1 > tp.ntrims);
-      else Round<<>>(round, (uint2 *)bufferB, (uint2 *)bufferA1, indexesB, indexesA);
+        Relay<<>>(round, (uint2 *)bufferB, (uint2 *)bufferA1, indexesB, indexesA, round+1 > tp.ntrims);
+      else Round<<>>(round, (uint2 *)bufferB, (uint2 *)bufferA1, indexesB, indexesA);
      indexcount(round+1, indexesA);
      if (abort) return false;
    }
 
@@ -657,7 +656,7 @@ struct edgetrimmer {
    cudaMemset(indexesB, 0, indexesSize);
    cudaDeviceSynchronize();
 
-    Tail<<>>((const uint2 *)bufferA1, (uint4 *)bufferB, indexesA, indexesB);
+    Tail<<>>((const uint2 *)bufferA1, (uint4 *)bufferB, indexesA, indexesB);
    cudaMemcpy(&nedges, indexesB, sizeof(u32), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    return nedges;
  }
@@ -717,7 +716,7 @@ struct solver_ctx {
    return ndupes;
  }
 
-  int solve() {
+  u32 solve() {
    u64 time0, time1;
    u32 timems,timems2;
 
@@ -806,12 +805,12 @@ CALL_CONVENTION int run_solver(SolverCtx* ctx,
        for (u32 i = 0; i < PROOFSIZE; i++)
          solutions->sols[sumnsols+s].proof[i] = (u64) prf[i];
      }
-      int pow_rc = verify(prf, &ctx->trimmer.sipkeys);
+      u32 pow_rc = verify(prf, &ctx->trimmer.sipkeys);
      if (pow_rc == POW_OK) {
        print_log("Verified with cyclehash ");
        unsigned char cyclehash[32];
        blake2b((void *)cyclehash, sizeof(cyclehash), (const void *)prf, sizeof(proof), 0, 0);
-        for (int i=0; i<32; i++)
+        for (u32 i=0; i<32; i++)
          print_log("%02x", cyclehash[i]);
        print_log("\n");
      } else {
@@ -941,19 +940,19 @@ int main(int argc, char **argv) {
  cudaDeviceProp prop;
  checkCudaErrors(cudaGetDeviceProperties(&prop, device));
  u64 dbytes = prop.totalGlobalMem;
-  int dunit;
+  u32 dunit;
  for (dunit=0; dbytes >= 102400; dbytes>>=10,dunit++) ;
  print_log("%s with %d%cB @ %d bits x %dMHz\n", prop.name, (u32)dbytes, " KMGT"[dunit], prop.memoryBusWidth, prop.memoryClockRate/1000);
 
  print_log("Looking for %d-cycle on cuckatoo%d(\"%s\",%d", PROOFSIZE, EDGEBITS, header, nonce);
  if (range > 1)
    print_log("-%d", nonce+range-1);
-  print_log(") with 50%% edges, %d*%d buckets, %d trims, and %d thread blocks.\n", NX, NY, params.ntrims, NX);
+  print_log(") with 50%% edges, %d buckets, and %d trims.\n", NB, params.ntrims);
 
  SolverCtx* ctx = create_solver_ctx(&params);
 
  u64 bytes = ctx->trimmer.globalbytes();
-  int unit;
+  u32 unit;
  for (unit=0; bytes >= 102400; bytes>>=10,unit++) ;
  print_log("Using %d%cB of global memory.\n", (u32)bytes, " KMGT"[unit]);
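
The NEPS_A/NEPS_B headrooms introduced near the top of the patch line up with the survival fractions noted in trim()'s comments (the "// to .632" after Round0 matches 1 - 1/e). A small compile-time check for the EDGEBITS=29, BUCKBITS=12 defaults, illustrative only:

// Illustrative check of bucket-capacity headroom (not part of the patch).
// Mean edges per bucket after seeding is NEDGES/NB = NZ; the "// to .632"
// comment says about a 1 - 1/e fraction of edges survives round 0.
#include <cstdint>
typedef uint32_t u32;

constexpr u32 NZ      = 1u << (29 - 12);   // 131072 = mean edges per bucket after seeding
constexpr u32 NEPS    = 128;
constexpr u32 EDGES_A = NZ * 133 / NEPS;   // 136192: ~3.9% headroom over the mean
constexpr u32 EDGES_B = NZ * 85 / NEPS;    //  87040: ~0.664*NZ, just above 0.632*NZ survivors

static_assert(EDGES_A > NZ, "seeding buckets need headroom above the mean");
static_assert(EDGES_B * 1000 > NZ * 632, "trimmed buckets must hold the ~63.2% survivors");

int main() { return 0; }
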