From 567086469e95df7cad18244a65d4e9ef62dccc77 Mon Sep 17 00:00:00 2001
From: Ben Schmidt <bmschmidt@gmail.com>
Date: Wed, 18 Dec 2024 16:51:22 -0500
Subject: [PATCH] webGPU pipelines

---
 package-lock.json         |   7 +-
 package.json              |   2 +-
 src/webGPU/buffertools.ts | 220 +++++++++++------------
 src/webGPU/forests.ts     | 107 ++++++------
 src/webGPU/lib.ts         | 356 +++++++++++++++++++++++---------------
 5 files changed, 385 insertions(+), 307 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index de69b85e..ef7cdbd0 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "deepscatter",
-  "version": "3.0.0-next.43",
+  "version": "3.0.0-next.45",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "deepscatter",
-      "version": "3.0.0-next.43",
+      "version": "3.0.0-next.45",
       "license": "CC BY-NC-SA 4.0",
       "dependencies": {
         "d3-array": "^3.2.4",
@@ -59,7 +59,7 @@
         "vitest": "^2.1.4"
       },
       "peerDependencies": {
-        "apache-arrow": ">=11.0.0"
+        "apache-arrow": "^17.0.0"
       }
     },
     "node_modules/@75lb/deep-merge": {
@@ -1193,6 +1193,7 @@
       "version": "17.0.0",
       "resolved": "https://registry.npmjs.org/apache-arrow/-/apache-arrow-17.0.0.tgz",
       "integrity": "sha512-X0p7auzdnGuhYMVKYINdQssS4EcKec9TCXyez/qtJt32DrIMGbzqiaMiQ0X6fQlQpw8Fl0Qygcv4dfRAr5Gu9Q==",
+      "license": "Apache-2.0",
       "peer": true,
       "dependencies": {
         "@swc/helpers": "^0.5.11",
diff --git a/package.json b/package.json
index 250ec588..e46584a0 100644
--- a/package.json
+++ b/package.json
@@ -45,7 +45,7 @@
   },
   "homepage": "https://github.com/nomic-ai/deepscatter#readme",
   "peerDependencies": {
-    "apache-arrow": ">=11.0.0"
+    "apache-arrow": "^17.0.0"
   },
   "dependencies": {
     "d3-array": "^3.2.4",
diff --git a/src/webGPU/buffertools.ts b/src/webGPU/buffertools.ts
index b980ad08..06ecefd6 100644
--- a/src/webGPU/buffertools.ts
+++ b/src/webGPU/buffertools.ts
@@ -1,127 +1,133 @@
 import { isTypedArray, type TypedArray } from 'webgpu-utils';
 import { BufferSet } from '../regl_rendering';
 import { WebGPUBufferLocation } from '../types';
-// I track locations on buffers like this.
-// We keep track of both size -- the number of meaningful data bytes
-// and paddedSize -- the number of bytes including 256-byte padding.
-
-export class WebGPUBufferSet extends BufferSet<GPUBuffer, WebGPUBufferLocation> {
-	// Copied with alterations from deepscatter
+import { Some, TupleMap } from '../utilityFunctions';
 
-	// An abstraction creating an expandable set of buffers that can be subdivided
-	// to put more than one variable on the same
-	// block of memory. Reusing buffers this way can have performance benefits over allocating
-	// multiple different buffers for each small block used.
+// Unlike in webgl, we keep track of both size -- the number of meaningful data bytes
+// and paddedSize -- the number of bytes including 256-byte padding.
 
-	// The general purpose here is to call 'allocate_block' that releases a block of memory
-	// to use in creating a new array to be passed to regl.
+export class WebGPUBufferSet extends BufferSet<
+  GPUBuffer,
+  WebGPUBufferLocation
+> {
+  public device: GPUDevice;
+  private stagingBuffer: GPUBuffer;
+  public usage: number;
 
-	public device: GPUDevice;
-	private stagingBuffer: GPUBuffer;
-	public usage: number;
+  public store: TupleMap<string, WebGPUBufferLocation> = new TupleMap();
 
-	public store: Map<string, WebGPUBufferLocation> = new Map();
+  /**
+   *
+   * @param device the GPUDevice that buffers are created on.
+   * @param buffer_size The number of bytes on each strip of memory that we'll ask for.
+   */
 
-	/**
-	 *
-	 * @param regl the Regl context we're using.
-	 * @param buffer_size The number of bytes on each strip of memory that we'll ask for.
-	 */
+  constructor(
+    device: GPUDevice,
+    buffer_size: number,
+    usage: number = GPUBufferUsage.STORAGE |
+      GPUBufferUsage.COPY_DST |
+      GPUBufferUsage.COPY_SRC,
+  ) {
+    super(buffer_size);
+    this.device = device;
+    // Track the ends in case we want to allocate smaller items.
+    this.usage = usage;
+    this.generate_new_buffer();
+    this.stagingBuffer = device.createBuffer({
+      size: buffer_size,
+      usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE,
+      mappedAtCreation: false, // saves a little trouble in the passThrough function
+    });
+  }
 
-	constructor(
-		device: GPUDevice,
-		buffer_size: number,
-		usage: number = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC
-	) {
-		super(buffer_size)
-		this.device = device;
-		// Track the ends in case we want to allocate smaller items.
-		this.usage = usage;
-		this.generate_new_buffer();
-		this.stagingBuffer = device.createBuffer({
-			size: buffer_size,
-			usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE,
-			mappedAtCreation: false // saves a little trouble in the passThrough function
-		});
-	}
+  private async passThroughStagingBuffer(
+    values: Uint32Array,
+    bufferLocation: WebGPUBufferLocation,
+  ) {
+    // Write values into the mappable staging buffer, then copy them on the GPU to their final location.
+    const { buffer, offset, paddedSize } = bufferLocation;
+    while (this.stagingBuffer.mapState !== 'unmapped') {
+      // Wait in line for a millisecond.
+      // Would be better to hold a queue and apply more than one of these at once.
+      await new Promise((resolve) => setTimeout(resolve, 1));
+    }
+    await this.stagingBuffer.mapAsync(GPUMapMode.WRITE, 0, paddedSize);
+    new Uint32Array(
+      this.stagingBuffer.getMappedRange(0, values.byteLength),
+    ).set(values);
+    this.stagingBuffer.unmap();
+    const commandEncoder = this.device.createCommandEncoder();
+    commandEncoder.copyBufferToBuffer(
+      this.stagingBuffer,
+      0,
+      buffer,
+      offset,
+      paddedSize,
+    );
+    this.device.queue.submit([commandEncoder.finish()]);
+  }
 
-	private async passThroughStagingBuffer(values: Uint32Array, bufferLocation: WebGPUBufferLocation) {
-		// WebGPU 
-		const { buffer, offset, paddedSize } = bufferLocation;
-		while (this.stagingBuffer.mapState !== 'unmapped') {
-			// Wait in line for a millisecond.
-			// Would be better to hold a queue and apply more than one of these at once.
-			await new Promise((resolve) => setTimeout(resolve, 1));
-		}
-		await this.stagingBuffer.mapAsync(GPUMapMode.WRITE, 0, paddedSize);
-		new Uint32Array(this.stagingBuffer.getMappedRange(0, values.byteLength)).set(values);
-		this.stagingBuffer.unmap();
-		const commandEncoder = this.device.createCommandEncoder();
-		commandEncoder.copyBufferToBuffer(this.stagingBuffer, 0, buffer, offset, paddedSize);
-		this.device.queue.submit([commandEncoder.finish()]);
-	}
+  register(k: Some<string>, v: WebGPUBufferLocation) {
+    this.store.set(k, v);
+  }
 
-	register(k: string, v: WebGPUBufferLocation) {
-		this.store.set(k, v);
-	}
+  async set(key: Some<string>, value: TypedArray) {
+    if (this.store.has(key)) {
+      throw new Error(`Key ${key.join(', ')} already exists in buffer set.`);
+    }
+    const size = value.byteLength;
+    const paddedSize = Math.ceil(size / 256) * 256;
 
-	async set(key: string, value: TypedArray) {
-		if (this.store.has(key)) {
-			throw new Error(`Key ${key} already exists in buffer set.`);
-		}
-		const size = value.byteLength;
-		const paddedSize = Math.ceil(size / 256) * 256;
+    const { buffer, offset } = this.allocate_block(paddedSize);
 
-		const { buffer, offset } = this.allocate_block(paddedSize);
+    // If it's a typed array, we can just copy it directly.
+    // cast it to uint32array
+    const v2 = value;
+    const data = new Uint32Array(v2.buffer, v2.byteOffset, v2.byteLength / 4);
+    const description = { buffer, offset, size, paddedSize };
+    await this.passThroughStagingBuffer(data, description);
+    this.register(key, description);
+  }
 
-		// If it's a typed array, we can just copy it directly.
-		// cast it to uint32array
-		const v2 = value;
-		const data = new Uint32Array(v2.buffer, v2.byteOffset, v2.byteLength / 4);
-		const description = { buffer, offset, size, paddedSize };
-		await this.passThroughStagingBuffer(data, description);
-		this.register(key, description);
-	}
+  _create_buffer(): GPUBuffer {
+    return this.device.createBuffer({
+      size: this.buffer_size,
+      usage: this.usage,
+      mappedAtCreation: false,
+    });
+  }
 
-		_create_buffer() : GPUBuffer {
-			return this.device.createBuffer({
-				size: this.buffer_size,
-				usage: this.usage,
-				mappedAtCreation: false
-			})
-		}
-	
-		_create_leftover_buffer() : WebGPUBufferLocation {
-			return {
-					buffer: this.buffers[0],
-					offset: this.pointer,
-					stride: 4, // meaningless here.
-					byte_size: this.buffer_size - this.pointer,
-					paddedSize: this.buffer_size - this.pointer
-			}
-		}
+  _create_leftover_buffer(): WebGPUBufferLocation {
+    return {
+      buffer: this.buffers[0],
+      offset: this.pointer,
+      stride: 4, // meaningless here.
+      byte_size: this.buffer_size - this.pointer,
+      paddedSize: this.buffer_size - this.pointer,
+    };
+  }
 }
 
-
 export function createSingletonBuffer(
-	device: GPUDevice,
-	data: Uint32Array | Int32Array | Float32Array | ArrayBuffer,
-	usage: number
+  device: GPUDevice,
+  data: TypedArray | ArrayBuffer,
+  usage: number,
 ): GPUBuffer {
-	// Creates a disposable singleton buffer.
-	// ReadonlyBufferSet ought to provide better performance; but
-	// this allows more different buffer sizes and easier destruction.
-	const buffer = device.createBuffer({
-		size: data.byteLength,
-		usage,
-		mappedAtCreation: true
-	});
-	const mappedRange = buffer.getMappedRange();
-	if (isTypedArray(data)) {
-		new Uint32Array(mappedRange).set(data as TypedArray);
-	} else {
-		new Uint32Array(mappedRange).set(new Uint32Array(data as ArrayBuffer));
-	}
-	buffer.unmap();
-	return buffer;
+  // Creates a disposable singleton buffer holding a byte-for-byte copy of `data`.
+  // ReadonlyBufferSet ought to provide better performance; but
+  // this allows more different buffer sizes and easier destruction.
+  const buffer = device.createBuffer({
+    size: Math.ceil(data.byteLength / 4) * 4, // mappedAtCreation needs a multiple of 4
+    usage,
+    mappedAtCreation: true,
+  });
+  // Copy raw bytes: a Uint32Array view would convert element values numerically
+  // (truncating floats, and failing outright for 8/16-bit arrays) instead of keeping their bits.
+  const source = isTypedArray(data) ? (data as TypedArray).buffer : data;
+  const start = isTypedArray(data) ? (data as TypedArray).byteOffset : 0;
+  const bytes = new Uint8Array(source as ArrayBuffer, start, data.byteLength);
+  new Uint8Array(buffer.getMappedRange()).set(bytes);
+  buffer.unmap();
+  return buffer;
 }
diff --git a/src/webGPU/forests.ts b/src/webGPU/forests.ts
index 2c24626f..cf26ed94 100644
--- a/src/webGPU/forests.ts
+++ b/src/webGPU/forests.ts
@@ -1,5 +1,6 @@
-import { createSingletonBuffer, WebGPUBufferSet } from "./buffertools";
-import { StatefulGPU } from "./lib";
+import { Deeptable } from '../deepscatter';
+import { createSingletonBuffer, WebGPUBufferSet } from './buffertools';
+import { DeepGPU } from './lib';
 
 type TinyForestParams = {
   nTrees: number;
@@ -7,28 +8,31 @@ type TinyForestParams = {
   // The number of features to consider at each split.
   maxFeatures: number;
   D: number;
-}
+};
 
-const defaultTinyForestParams : TinyForestParams = {
+const defaultTinyForestParams: TinyForestParams = {
   nTrees: 128,
   depth: 8,
   maxFeatures: 32,
   D: 768,
-}
+};
 
-export class TinyForest extends StatefulGPU {
+export class TinyForest extends DeepGPU {
   params: TinyForestParams;
-  
+
   private _bootstrapSamples?: GPUBuffer; // On the order of 100 KB
-  protected _forests?: GPUBuffer // On the order of 10 MB.
+  protected _forests?: GPUBuffer; // On the order of 10 MB.
   // private trainedThrough: number = 0;
   constructor(
-    device: GPUDevice, 
-    bufferSize = 1024 * 1024 * 256, 
-    t: Partial<TinyForestParams> = {}) {
-    super(device, bufferSize)
-    this.params = {...defaultTinyForestParams, ...t}
-    this.initializeForestsToZero()
+    device: GPUDevice,
+    bufferSize = 1024 * 1024 * 256,
+    t: Partial<TinyForestParams> = {},
+    deeptable: Deeptable
+  ) {
+    throw new Error("Not implemented")
+    super(device, deeptable);
+    this.params = { ...defaultTinyForestParams, ...t };
+    this.initializeForestsToZero();
     this.bufferSet = new WebGPUBufferSet(device, bufferSize);
   }
 
@@ -48,48 +52,51 @@ export class TinyForest extends StatefulGPU {
           // features buffer;
           binding: 0,
           visibility: GPUShaderStage.COMPUTE,
-          buffer: { type: 'storage' }
+          buffer: { type: 'storage' },
         },
         {
           // dims to check array;
           binding: 1,
           visibility: GPUShaderStage.COMPUTE,
-          buffer: { type: 'storage' }
+          buffer: { type: 'storage' },
         },
         {
           // output count buffer.
           binding: 2,
           visibility: GPUShaderStage.COMPUTE,
-          buffer: { type: 'storage' }
-        }
-      ]
-    })
+          buffer: { type: 'storage' },
+        },
+      ],
+    });
 
     // const subsetsToCheck = this.chooseNextFeatures();
-    const pipelineLayout = device.createPipelineLayout({ bindGroupLayouts: [layout] });
+    const pipelineLayout = device.createPipelineLayout({
+      bindGroupLayouts: [layout],
+    });
 
-    const shaderModule = device.createShaderModule({ code: `
+    const shaderModule = device.createShaderModule({
+      code: `
       @group(0) @binding(0) var<storage, read> features: array<u32>;
       @group(0) @binding(1) var<storage, read> dimsToCheck: array<u16>;
       @group(0) @binding(2) var<storage, write> counts: array<u32>;
 
       @compute @workgroup_size(64)
       //TODOD HERE
-      ` });
-
+      `,
+    });
 
     return device.createComputePipeline({
       layout: pipelineLayout,
       compute: {
         module: shaderModule,
-        entryPoint: 'main'
-      }
+        entryPoint: 'main',
+      },
     });
   }
 
   //@ts-expect-error foo
   private chooseNextFeatures(n = 32) {
-    console.log({n})
+    console.log({ n });
     const { maxFeatures, nTrees, D } = this.params;
     const features = new Uint16Array(maxFeatures * D);
     for (let i = 0; i < nTrees; i++) {
@@ -100,71 +107,59 @@ export class TinyForest extends StatefulGPU {
       const arr = new Uint16Array([...set].sort());
       features.set(arr, i * maxFeatures);
     }
-    return createSingletonBuffer(
-      this.device,
-      features,
-      GPUBufferUsage.STORAGE
-    )
+    return createSingletonBuffer(this.device, features, GPUBufferUsage.STORAGE);
   }
 
-
-
   initializeForestsToZero() {
-    // Each tree is a set of bits; For every possible configuration 
-    // the first D indicating 
+    // Each tree is a set of bits; For every possible configuration
+    // the first D indicating
     // the desired outcome for the dimension,
     // the second D indicating whether the bits in those
     // positions are to be considered in checking if the tree
     // fits. There are 2**depth bitmasks for each dimension--each point
     // will match only one, and part of the inference task is determining which one.
 
-    const treeSizeInBytes = 
-      2 * this.params.D * (2 ** this.params.depth) / 8;
+    const treeSizeInBytes = (2 * this.params.D * 2 ** this.params.depth) / 8;
 
-    const data = new Uint8Array(treeSizeInBytes * this.params.nTrees)
+    const data = new Uint8Array(treeSizeInBytes * this.params.nTrees);
     this._forests = createSingletonBuffer(
       this.device,
       data,
-      GPUBufferUsage.STORAGE
-    )
+      GPUBufferUsage.STORAGE,
+    );
   }
-  
 
   // Rather than actually bootstrap, we generate a single
   // list of 100,000 numbers drawn from a poisson distribution.
-  // These serve as weights for draws with replacement; to 
+  // These serve as weights for draws with replacement; to
   // bootstrap any given record batch, we take a sequence of
-  // numbers from the buffer with offset i. 
+  // numbers from the buffer with offset i.
   get bootstrapSamples() {
     if (this._bootstrapSamples) {
-      return this._bootstrapSamples
+      return this._bootstrapSamples;
     } else {
-      const arr = new Uint8Array(100000)
+      const arr = new Uint8Array(100000);
       for (let i = 0; i < arr.length; i++) {
-        arr[i] = poissonRandomNumber()
+        arr[i] = poissonRandomNumber();
       }
       this._bootstrapSamples = createSingletonBuffer(
         this.device,
         arr,
-        GPUBufferUsage.STORAGE
-      )
-      return this._bootstrapSamples
+        GPUBufferUsage.STORAGE,
+      );
+      return this._bootstrapSamples;
     }
   }
-
-  
 }
 
-
-function poissonRandomNumber() : number {
+function poissonRandomNumber(): number {
   let p = 1.0;
   let k = 0;
 
   do {
     k++;
     p *= Math.random();
-  } while (p > 1/Math.E);
+  } while (p > 1 / Math.E);
 
   return k - 1;
 }
-
diff --git a/src/webGPU/lib.ts b/src/webGPU/lib.ts
index 4ad88694..e1cf03b7 100644
--- a/src/webGPU/lib.ts
+++ b/src/webGPU/lib.ts
@@ -1,135 +1,204 @@
 import { makeShaderDataDefinitions, makeStructuredView } from 'webgpu-utils';
 import { WebGPUBufferSet, createSingletonBuffer } from './buffertools';
-import { Scatterplot, Tile } from '../deepscatter';
-
-export class StatefulGPU {
-	device: GPUDevice;
-	bufferSet: WebGPUBufferSet;
-	constructor(device: GPUDevice, bufferSize = 1024 * 1024 * 256) {
-		this.device = device;
-		this.bufferSet = new WebGPUBufferSet(device, bufferSize);
+import { Deeptable, Scatterplot, Tile } from '../deepscatter';
+import { Bool, Vector, vectorFromArray } from 'apache-arrow';
+
+export class DeepGPU {
+  // This is a stateful class for bundling together GPU buffers and resources.
+  // It's sort of replacing regl? I don't know yet, just feeling this out.
+  device: GPUDevice;
+  bufferSet: WebGPUBufferSet;
+  deeptable: Deeptable;
+
+  /**
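+   * Create a DeepGPU synchronously. Usually call DeepGPU.create() instead, which requests the device for you.
+   * @param device The initialized GPUDevice that buffers are created on.
+   * @param deeptable The Deeptable stored on this instance.
+   * @param bufferSize Byte size handed to the WebGPUBufferSet (defaults to 256 MiB).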
+   */
+  constructor(
+    device: GPUDevice,
+    deeptable: Deeptable,
+    bufferSize = 1024 * 1024 * 256,
+  ) {
+    this.device = device;
+    this.deeptable = deeptable;
+    this.bufferSet = new WebGPUBufferSet(device, bufferSize);
+  }
+
+  static async create(deeptable: Deeptable): Promise<DeepGPU> {
+    // Create a DeepGPU object.
+    if (!navigator.gpu) {
+      throw new Error('WebGPU is not supported in this browser.');
+    }
+
+    const adapter = await navigator.gpu.requestAdapter();
+    if (!adapter) {
+      throw new Error('Failed to get GPU adapter.');
+    }
+
+    const device = await adapter.requestDevice();
+    return new DeepGPU(device, deeptable);
+  }
+
+	async get(field: string, tile: Tile) {
+		// Upload the column the first time this (field, tile) pair is requested.
+		if (!this.bufferSet.store.has([field, tile.key])) {
+			const column = await tile.get_column(field);
+			const values = column.data[0].children[0].values as Uint8Array;
+			await this.bufferSet.set([field, tile.key], values);
+		}
+		// Whether freshly uploaded or already present, hand back the stored location.
+		return this.bufferSet.store.get([field, tile.key])
+	}
+}
+
+
+
+
+export abstract class ReusableWebGPUPipeline {
+	public gpuState: DeepGPU
+	constructor(
+		gpuState: DeepGPU,
+	) {
+		this.gpuState = gpuState
 	}
-	static async initializeWebGPU(): Promise<StatefulGPU> {
-		if (!navigator.gpu) {
-			throw new Error('WebGPU is not supported in this browser.');
+	abstract shaderCode() : string;
+	// eslint-disable-next-line @typescript-eslint/no-explicit-any
+	abstract uniforms(): Record<string, any>;
+	protected uniformBuffer?: GPUBuffer;
+	protected pipeline?: GPUComputePipeline;
+}
+
+export class HammingPipeline extends ReusableWebGPUPipeline {
+	public gpuState: DeepGPU;
+	public dimensionality? : number;
+	public comparisonBuffer: GPUBuffer;
+	private fieldName = '_hamming_embeddings';
+	constructor(
+		gpuState: DeepGPU,
+	) {
+		super(gpuState)
+	}
+
+	bindGroupLayout(device: GPUDevice) {
+		return device.createBindGroupLayout({
+			entries: [
+				{
+					binding: 0,
+					visibility: GPUShaderStage.COMPUTE,
+					buffer: { type: 'read-only-storage' },
+				},
+				{
+					binding: 1,
+					visibility: GPUShaderStage.COMPUTE,
+					buffer: { type: 'read-only-storage' },
+				},
+				{
+					binding: 2,
+					visibility: GPUShaderStage.COMPUTE,
+					buffer: { type: 'storage' },
+				},
+				{
+					binding: 3,
+					visibility: GPUShaderStage.COMPUTE,
+					buffer: { type: 'uniform' },
+				},
+			],
+		});
+	}
+
+	shaderCode() {
+		return `
+		struct SizeEtc {
+			objectSize: u32,
+		};
+		
+		@group(0) @binding(0) var<storage, read> comparisonArray : array<u32>;
+		@group(0) @binding(1) var<storage, read> matrixArray : array<u32>;
+		@group(0) @binding(2) var<storage, read_write> outputArray : array<u32>;
+		@group(0) @binding(3) var<uniform> myUniforms: SizeEtc;
+		
+		@compute @workgroup_size(64)
+		fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+				let idx = global_id.x;
+				let o = myUniforms.objectSize;
+				if (idx < arrayLength(&matrixArray)) {
+						var totalDistance: u32 = 0;
+						for (var i: u32 = 0; i < o; i = i + 1) {
+								for (var j: u32 = 0; j < arrayLength(&comparisonArray) / o; j = j + 1) {
+									totalDistance = totalDistance + countOneBits(comparisonArray[j * o + i] ^ matrixArray[idx * o + i]);
+								}
+						}
+						outputArray[global_id.x] = totalDistance;
+				}
 		}
-	
-		const adapter = await navigator.gpu.requestAdapter();
-		if (!adapter) {
-			throw new Error('Failed to get GPU adapter.');
+	`}
+
+	/** Uploads `arr`'s packed bits as the comparison buffer; `dimensionality` is its width in bits. */
+	setComparisonArray(arr: Vector<Bool>) {
+		const underlying = arr.data[0].values;
+		this.comparisonBuffer = createSingletonBuffer(
+			this.gpuState.device,
+			underlying,
+			GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
+		);
+		// Bool values are bit-packed, so underlying.length counts bytes; uniforms() and runOnTile() need bits.
+		this.dimensionality = arr.length;
+	}
+
+	uniforms() {
+		return {
+			objectSize: this.dimensionality / 32,
 		}
+	}
+
+	prepUniforms() {
+		const defs = makeShaderDataDefinitions(this.shaderCode());
+	
+		const myUniformValues = makeStructuredView(defs.uniforms.myUniforms);
 	
-		const device = await adapter.requestDevice();
-		return new StatefulGPU(device);
+		myUniformValues.set(this.uniforms());
+		return myUniformValues;
 	}
-}
 
-const bindGroupLayout = (device: GPUDevice) =>
-	device.createBindGroupLayout({
-		entries: [
-			{
-				binding: 0,
-				visibility: GPUShaderStage.COMPUTE,
-				buffer: { type: 'read-only-storage' }
-			},
-			{
-				binding: 1,
-				visibility: GPUShaderStage.COMPUTE,
-				buffer: { type: 'read-only-storage' }
-			},
-			{
-				binding: 2,
-				visibility: GPUShaderStage.COMPUTE,
-				buffer: { type: 'storage' }
+	prep() {
+		const { device } = this.gpuState;
+		const layout = device.createPipelineLayout({
+			bindGroupLayouts: [this.bindGroupLayout(device)],
+		});
+		// Create shader module and pipeline
+		const shaderModule = device.createShaderModule({ code: this.shaderCode() });
+		this.pipeline = device.createComputePipeline({
+			layout,
+			compute: {
+				module: shaderModule,
+				entryPoint: 'main',
 			},
-			{
-				binding: 3,
-				visibility: GPUShaderStage.COMPUTE,
-				buffer: { type: 'uniform' }
-			}
-		]
-	});
-
-  export function prepareComputeShader(
-	state: StatefulGPU,
-	comparisonArray: Uint32Array,
-	embeddingSize: number = 128
-): (tile, key) => Promise<Uint32Array> {
-	// Create buffers
-	const { device, bufferSet } = state;
-	const comparisonBuffer = createSingletonBuffer(
-		device,
-		comparisonArray,
-		GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC
-	);
-
-	// Shader code
-	const shaderCode = `
-
-  struct SizeEtc {
-    objectSize: u32,
-  };
-  
-  @group(0) @binding(0) var<storage, read> comparisonArray : array<u32>;
-  @group(0) @binding(1) var<storage, read> matrixArray : array<u32>;
-  @group(0) @binding(2) var<storage, read_write> outputArray : array<u32>;
-  @group(0) @binding(3) var<uniform> myUniforms: SizeEtc;
-  
-  @compute @workgroup_size(64)
-  fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
-      let idx = global_id.x;
-			let o = myUniforms.objectSize;
-      if (idx < arrayLength(&matrixArray)) {
-          var totalDistance: u32 = 0;
-          for (var i: u32 = 0; i < o; i = i + 1) {
-              for (var j: u32 = 0; j < arrayLength(&comparisonArray) / o; j = j + 1) {
-                totalDistance = totalDistance + countOneBits(comparisonArray[j * o + i] ^ matrixArray[idx * o + i]);
-              }
-          }
-          outputArray[global_id.x] = totalDistance;
-      }
-  }
-`;
-
-	const defs = makeShaderDataDefinitions(shaderCode);
-	const myUniformValues = makeStructuredView(defs.uniforms.myUniforms);
-	myUniformValues.set({
-		objectSize: embeddingSize / 32
-	});
-	const layout = device.createPipelineLayout({ bindGroupLayouts: [bindGroupLayout(device)] });
-	// Create shader module and pipeline
-	const shaderModule = device.createShaderModule({ code: shaderCode });
-	const pipeline = device.createComputePipeline({
-		layout,
-		compute: {
-			module: shaderModule,
-			entryPoint: 'main'
-		}
-	});
-	const uniformBuffer = createSingletonBuffer(
-		device,
-		myUniformValues.arrayBuffer,
-		GPUBufferUsage.UNIFORM
-	);
-
-	const run = async function (tile: Tile, fieldName: string) {
+		});
+		this.uniformBuffer = createSingletonBuffer(
+			device,
+			this.prepUniforms().arrayBuffer,
+			GPUBufferUsage.UNIFORM,
+		);
+	}
+
+	async runOnTile(tile: Tile) {
+		const { comparisonBuffer, fieldName, pipeline, uniformBuffer, dimensionality: embeddingSize } = this;
+		const { device } = this.gpuState;
 		const commandEncoder = device.createCommandEncoder();
-		const key = `${tile.key}_${fieldName}`;
-		if (!bufferSet.store.has(key)) {
-			const values = (await tile.get_column(fieldName)).data[0].children[0].values as Uint8Array;
-			await bufferSet.set(key, values);
-		}
-		const { buffer, offset, byte_size: size } = bufferSet.store.get(key);
+	
+		const { buffer, offset, byte_size: size } = await this.gpuState.get(fieldName, tile)
 		const outputSize = (size / embeddingSize) * 8;
 		const paddedSize = Math.ceil(outputSize / 4) * 4;
-
+	
 		// TODO this should be a permanent buffer.
 		const outputBuffer = device.createBuffer({
 			// Put a ceiling on it.
 			size: paddedSize * 4,
-			usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.STORAGE
+			usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.STORAGE,
 		});
-
+	
 		const passEncoder = commandEncoder.beginComputePass();
 		passEncoder.setPipeline(pipeline);
 		passEncoder.setBindGroup(
@@ -140,55 +209,62 @@ const bindGroupLayout = (device: GPUDevice) =>
 					{ binding: 0, resource: { buffer: comparisonBuffer } },
 					{ binding: 1, resource: { buffer, offset, size } },
 					{ binding: 2, resource: { buffer: outputBuffer } },
-					{ binding: 3, resource: { buffer: uniformBuffer } }
-				]
-			})
+					{ binding: 3, resource: { buffer: uniformBuffer } },
+				],
+			}),
 		);
-
+	
 		passEncoder.dispatchWorkgroups(size / 4 / 64);
 		passEncoder.end();
 
 		// Submit the commands
 		const gpuReadBuffer = device.createBuffer({
 			size: paddedSize * 4,
-			usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ
+			usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
 		});
 
-		commandEncoder.copyBufferToBuffer(outputBuffer, 0, gpuReadBuffer, 0, paddedSize * 4);
+		commandEncoder.copyBufferToBuffer(
+			outputBuffer,
+			0,
+			gpuReadBuffer,
+			0,
+			paddedSize * 4,
+		);
 		device.queue.submit([commandEncoder.finish()]);
 
 		// Read back the results
 		await gpuReadBuffer.mapAsync(GPUMapMode.READ);
 		const outputArray = new Uint32Array(gpuReadBuffer.getMappedRange());
-		return outputArray.slice(0, outputSize);
-	};
-	return run;
+		const usable = outputArray.slice(0, outputSize);
+		const returnVal = new Float32Array(usable.length)
+		for (let i = 0; i < returnVal.length; i++) {
+			returnVal[i] = usable[i] / embeddingSize // (originally this was squared??)
+		}
+		return vectorFromArray(returnVal)
+	}
 }
-
+	
 // hide the state in a global variable.
-const dumb: StatefulGPU[] = [];
+const dumb: DeepGPU[] = [];
 
 export async function create_hamming_transform(
 	scatterplot: Scatterplot,
 	id: string,
-	view: Uint8Array,
-	dims: number,
-	column: string
+	view: Vector<Bool>,
 ) {
 	if (dumb.length === 0) {
-		dumb.push(await StatefulGPU.initializeWebGPU());
+		dumb.push(await DeepGPU.create(scatterplot.deeptable));
 	}
 	if (scatterplot.dataset.transformations[id] !== undefined) {
 		return;
 	}
-	// Cast from int8 to int32
-	const comparisonArray = new Uint32Array(view.buffer);
-	const run = prepareComputeShader(dumb[0], comparisonArray, dims);
-
-	scatterplot.dataset.transformations[id] = async function (tile) {
-		const value = await run(tile, column);
-		const scaled = [...value].map((d) => d  / ( comparisonArray.length * 32 / dims));
-		return 	new Float32Array(scaled)
-	};
-	await scatterplot.dataset.root_tile.get_column(id);
+
+	const [gpuState] = dumb;
+	const pipeline = new HammingPipeline(gpuState);
+	pipeline.setComparisonArray(view)
+	pipeline.prep();
+
+	scatterplot.dataset.transformations[id] = (tile) => pipeline.runOnTile(tile)
 }
+
+