From 0905a61579e24a81d746bcf5d04d94c807d8cede Mon Sep 17 00:00:00 2001
From: Carl Johnsen <cjjohnsen@nbi.ku.dk>
Date: Fri, 10 Jun 2022 15:49:12 +0200
Subject: [PATCH 001/136] Restructuring pybind kernels

---
 src/pybind_kernels/Makefile                   | 14 +++----
 src/pybind_kernels/Readme.md                  |  1 +
 src/pybind_kernels/__init__.py                |  0
 src/pybind_kernels/cpu/__init__.py            |  0
 src/pybind_kernels/{ => cpu}/geometry.cc      |  0
 src/pybind_kernels/{ => cpu}/histograms.cc    |  0
 src/pybind_kernels/{ => cpu}/label.cc         |  0
 src/pybind_kernels/{ => include}/datatypes.hh |  0
 src/pybind_kernels/{ => include}/parallel.hh  |  0
 src/pybind_kernels/opencv_pybind.cc           | 34 ----------------
 src/pybind_kernels/opencv_tester.cc           | 26 ------------
 .../{ => pybind}/geometry-pybind.cc           |  0
 src/pybind_kernels/uk_pybind11.py             | 40 -------------------
 13 files changed, 8 insertions(+), 107 deletions(-)
 create mode 100644 src/pybind_kernels/Readme.md
 create mode 100644 src/pybind_kernels/__init__.py
 create mode 100644 src/pybind_kernels/cpu/__init__.py
 rename src/pybind_kernels/{ => cpu}/geometry.cc (100%)
 rename src/pybind_kernels/{ => cpu}/histograms.cc (100%)
 rename src/pybind_kernels/{ => cpu}/label.cc (100%)
 rename src/pybind_kernels/{ => include}/datatypes.hh (100%)
 rename src/pybind_kernels/{ => include}/parallel.hh (100%)
 delete mode 100644 src/pybind_kernels/opencv_pybind.cc
 delete mode 100644 src/pybind_kernels/opencv_tester.cc
 rename src/pybind_kernels/{ => pybind}/geometry-pybind.cc (100%)
 delete mode 100644 src/pybind_kernels/uk_pybind11.py

diff --git a/src/pybind_kernels/Makefile b/src/pybind_kernels/Makefile
index 8b69a8b..49f49b8 100644
--- a/src/pybind_kernels/Makefile
+++ b/src/pybind_kernels/Makefile
@@ -1,16 +1,16 @@
-PYBIND_FLAGS += $(shell python3 -m pybind11 --include) -O3 -march=native -Wall -shared -fPIC -fopenmp -g -std=c++17
+PYBIND_FLAGS += $(shell python3 -m pybind11 --include) -march=native -Wall -shared -fPIC -fopenmp -g -std=c++17
 PYBIND_SUFFIX = $(shell python3-config --extension-suffix)
 
 OPENCV_INCLUDE=$(shell pkg-config opencv4 --cflags)
 OPENCV_LIB=$(shell pkg-config opencv4 --libs)
 
 # Detect if OpenACC can be used
-ifneq (, $(shell which nvc++))
-CXX = nvc++
-CXXFLAGS += -acc=gpu -Minfo=accel -tp=native
-else
-$(info OpenACC compiler nvc++ not found. Compiling without)
-endif
+#ifneq (, $(shell which nvc++))
+#CXX = nvc++
+#CXXFLAGS += -acc=gpu -Minfo=accel -tp=native
+#else
+#$(info OpenACC compiler nvc++ not found. Compiling without)
+#endif
 
 CXXFLAGS += -I../contrib/cpptqdm/
 
diff --git a/src/pybind_kernels/Readme.md b/src/pybind_kernels/Readme.md
new file mode 100644
index 0000000..27d0412
--- /dev/null
+++ b/src/pybind_kernels/Readme.md
@@ -0,0 +1 @@
+# TODO :)
\ No newline at end of file
diff --git a/src/pybind_kernels/__init__.py b/src/pybind_kernels/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/pybind_kernels/cpu/__init__.py b/src/pybind_kernels/cpu/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/pybind_kernels/geometry.cc b/src/pybind_kernels/cpu/geometry.cc
similarity index 100%
rename from src/pybind_kernels/geometry.cc
rename to src/pybind_kernels/cpu/geometry.cc
diff --git a/src/pybind_kernels/histograms.cc b/src/pybind_kernels/cpu/histograms.cc
similarity index 100%
rename from src/pybind_kernels/histograms.cc
rename to src/pybind_kernels/cpu/histograms.cc
diff --git a/src/pybind_kernels/label.cc b/src/pybind_kernels/cpu/label.cc
similarity index 100%
rename from src/pybind_kernels/label.cc
rename to src/pybind_kernels/cpu/label.cc
diff --git a/src/pybind_kernels/datatypes.hh b/src/pybind_kernels/include/datatypes.hh
similarity index 100%
rename from src/pybind_kernels/datatypes.hh
rename to src/pybind_kernels/include/datatypes.hh
diff --git a/src/pybind_kernels/parallel.hh b/src/pybind_kernels/include/parallel.hh
similarity index 100%
rename from src/pybind_kernels/parallel.hh
rename to src/pybind_kernels/include/parallel.hh
diff --git a/src/pybind_kernels/opencv_pybind.cc b/src/pybind_kernels/opencv_pybind.cc
deleted file mode 100644
index 1b0a2d5..0000000
--- a/src/pybind_kernels/opencv_pybind.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-
-#include <opencv2/core.hpp>
-#include <opencv2/imgcodecs.hpp>
-#include <opencv2/highgui.hpp>
-#include <iostream>
-#include <stdio.h>
-#include <inttypes.h>
-
-using namespace std;
-using namespace cv;
-namespace py = pybind11;
-
-void opencv_tester(const py::array_t<uint8_t>& np_image)
-{
-  py::buffer_info image_info = np_image.request();
-  uint64_t 
-    Ny  = image_info.shape[0],
-    Nx  = image_info.shape[1];
-
-  Mat img(Ny, Nx, CV_8UC1, image_info.ptr);
-
-  imshow("opencv_tester window",img);
-  int k = waitKey(0);
-}
-
-
-PYBIND11_MODULE(opencv_pybind, m) {
-    m.doc() = "Test of C++ OpenCV through pybind"; // optional module docstring
-
-    m.def("tester",  &opencv_tester);
-}
-
diff --git a/src/pybind_kernels/opencv_tester.cc b/src/pybind_kernels/opencv_tester.cc
deleted file mode 100644
index 0a39a42..0000000
--- a/src/pybind_kernels/opencv_tester.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <opencv2/core.hpp>
-#include <opencv2/imgcodecs.hpp>
-#include <opencv2/highgui.hpp>
-#include <iostream>
-#include <stdio.h>
-#include <inttypes.h>
-
-using namespace std;
-
-int main(int ac, char **av)
-{
-  fprintf(stderr,"Just starting up, doing nothing.\n");
-  
-  if(ac<2) return -1;
-  
-  cv::Mat img = cv::imread(av[1], cv::IMREAD_COLOR);
-  
-  if(img.empty()) return -2;
-
-  cv::imshow("Window",img);
-
-  int k = cv::waitKey(0);
-
-  return 0;
-
-}
diff --git a/src/pybind_kernels/geometry-pybind.cc b/src/pybind_kernels/pybind/geometry-pybind.cc
similarity index 100%
rename from src/pybind_kernels/geometry-pybind.cc
rename to src/pybind_kernels/pybind/geometry-pybind.cc
diff --git a/src/pybind_kernels/uk_pybind11.py b/src/pybind_kernels/uk_pybind11.py
deleted file mode 100644
index 50b5abd..0000000
--- a/src/pybind_kernels/uk_pybind11.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import histograms, numpy as np
-from time import time;
-import sys
-
-# TODO: Currently specialized to uint16_t
-def masked_minmax(voxels):
-    return histograms.masked_minmax(voxels)
-
-def axes_histogram(voxels, ranges=None, voxel_bins=256):
-    (Nz,Ny,Nx) = voxels.shape
-    Nr = int(np.sqrt((Nx//2)**2 + (Ny//2)**2))+1
-    
-    x_bins   = np.zeros((Nx,voxel_bins),dtype=np.uint64)
-    y_bins   = np.zeros((Ny,voxel_bins),dtype=np.uint64)
-    z_bins   = np.zeros((Nz,voxel_bins),dtype=np.uint64)
-    r_bins   = np.zeros((Nr,voxel_bins),dtype=np.uint64)
-    
-    if ranges is None:
-        vmin, vmax = 1, 4095
-    else:
-        vmin, vmax = ranges
-
-    histograms.axis_histogram(voxels, x_bins, y_bins, z_bins, r_bins, vmin, vmax);
-    return x_bins, y_bins, z_bins, r_bins
-
-
-def field_histogram(voxels, field, ranges=None,field_bins=256, voxel_bins=256):
-    assert(voxels.dtype == np.uint16)
-    
-    bins   = np.zeros((field_bins,voxel_bins),dtype=np.uint64)
-
-    if ranges is None:
-        vmin, vmax = masked_minmax(voxels)
-    else:
-        (vmin,vmax) = ranges
-
-    print("Calculating field histogram",flush=True);        
-    histograms.field_histogram(voxels,field,bins,vmin,vmax)
-    
-    return bins

From 1d8f0bafc985d5f5b5458e2067b55d2264be2e54 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <cjjohnsen@nbi.ku.dk>
Date: Sun, 12 Jun 2022 09:28:56 +0200
Subject: [PATCH 002/136] Moved I/O functions to its own library in the new
 format

---
 src/pybind_kernels/Makefile             | 33 ++++++++++-----
 src/pybind_kernels/cpu/histograms.cc    | 53 +------------------------
 src/pybind_kernels/cpu/io.cc            | 22 ++++++++++
 src/pybind_kernels/cpu_seq/io.cc        | 40 +++++++++++++++++++
 src/pybind_kernels/gpu/io.cc            | 22 ++++++++++
 src/pybind_kernels/include/datatypes.hh |  4 ++
 src/pybind_kernels/include/io.hh        | 12 ++++++
 src/pybind_kernels/pybind/io-pybind.cc  | 48 ++++++++++++++++++++++
 8 files changed, 173 insertions(+), 61 deletions(-)
 create mode 100644 src/pybind_kernels/cpu/io.cc
 create mode 100644 src/pybind_kernels/cpu_seq/io.cc
 create mode 100644 src/pybind_kernels/gpu/io.cc
 create mode 100644 src/pybind_kernels/include/io.hh
 create mode 100644 src/pybind_kernels/pybind/io-pybind.cc

diff --git a/src/pybind_kernels/Makefile b/src/pybind_kernels/Makefile
index 49f49b8..0de8d95 100644
--- a/src/pybind_kernels/Makefile
+++ b/src/pybind_kernels/Makefile
@@ -5,16 +5,20 @@ OPENCV_INCLUDE=$(shell pkg-config opencv4 --cflags)
 OPENCV_LIB=$(shell pkg-config opencv4 --libs)
 
 # Detect if OpenACC can be used
-#ifneq (, $(shell which nvc++))
-#CXX = nvc++
-#CXXFLAGS += -acc=gpu -Minfo=accel -tp=native
-#else
-#$(info OpenACC compiler nvc++ not found. Compiling without)
-#endif
+ifneq (, $(shell which nvc++))
+CXX = nvc++
+CXXFLAGS += -acc=gpu -Minfo=accel -tp=native
+else
+$(info OpenACC compiler nvc++ not found. Compiling without)
+endif
 
-CXXFLAGS += -I../contrib/cpptqdm/
+CXXFLAGS += -I../contrib/cpptqdm/ -Iinclude
 
-all: histograms$(PYBIND_SUFFIX) geometry$(PYBIND_SUFFIX) label$(PYBIND_SUFFIX)
+PLATFORMS=cpu_seq cpu gpu
+LIBS=io
+TARGETS = $(foreach PLATFORM, $(PLATFORMS), $(foreach LIB, $(LIBS), $(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)))
+
+all: $(TARGETS)
 
 histograms$(PYBIND_SUFFIX): histograms.cc
 	$(CXX) $(CXXFLAGS) $(PYBIND_FLAGS) $< -o histograms$(PYBIND_SUFFIX)
@@ -31,8 +35,19 @@ opencv_pybind$(PYBIND_SUFFIX): opencv_pybind.cc
 opencv_tester: opencv_tester.cc
 	$(CXX) $(CXXFLAGS) $(OPENCV_INCLUDE) $(OPENCV_LIB) $< -o opencv_tester
 
+define GEN_RULE
+$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX): pybind/$(LIB)-pybind.cc $(PLATFORM)/$(LIB).cc
+	$(CXX) $(CXXFLAGS) $(PYBIND_FLAGS) -I$(PLATFORM) $$< -o $(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)
+endef
+
+$(foreach PLATFORM, $(PLATFORMS), \
+	$(foreach LIB, $(LIBS), \
+		$(eval $(GEN_RULE)) \
+	) \
+)
+
 clean:
-	rm -f histograms.o histograms$(PYBIND_SUFFIX) opencv_pybind$(PYBIND_SUFFIX) opencv_tester
+	rm -f $(TARGETS)
 
 
 
diff --git a/src/pybind_kernels/cpu/histograms.cc b/src/pybind_kernels/cpu/histograms.cc
index cf1b115..73862ba 100644
--- a/src/pybind_kernels/cpu/histograms.cc
+++ b/src/pybind_kernels/cpu/histograms.cc
@@ -5,17 +5,11 @@
 #include <stdio.h>
 #include <omp.h>
 #include <chrono>
-#include <iostream>
-#include <fstream>
 #include <tqdm.h>
 using namespace std;
 namespace py = pybind11;
 
-typedef uint16_t voxel_type;
-//typedef float    field_type;
-typedef uint16_t field_type;
-typedef uint8_t mask_type;
-typedef float gauss_type;
+#include "datatypes.hh"
 
 #define INLINE __attribute__((always_inline)) inline
 
@@ -302,48 +296,6 @@ pair<float,float> float_minmax(const py::array_t<float> np_field) {
     return make_pair(voxel_min,voxel_max);
 }
 
-void load_slice(py::array_t<voxel_type> &np_data, string filename,
-                const tuple<uint64_t, uint64_t, uint64_t> offset,
-                const tuple<uint64_t, uint64_t, uint64_t> shape) {
-    auto data_info = np_data.request();
-    voxel_type *data = static_cast<voxel_type*>(data_info.ptr);
-    ifstream file;
-    file.open(filename.c_str(), ios::binary);
-    if(!file.is_open()){
-      fprintf(stderr,"load_slice: Error opening %s for reading.\n",filename.c_str());
-      exit(-1);
-    }
-    auto [Nz, Ny, Nx] = shape;
-    auto [oz, oy, ox] = offset;
-    uint64_t flat_offset = (oz*Ny*Nx + oy*Nx + ox) * sizeof(voxel_type);
-    file.seekg(flat_offset, ios::beg);
-    file.read((char*) data, data_info.size * sizeof(voxel_type));
-    file.close();
-}
-
-void write_slice(py::array_t<voxel_type> &np_data, uint64_t offset, string filename) {
-    auto data_info = np_data.request();
-    const voxel_type *data = static_cast<const voxel_type*>(data_info.ptr);
-    ofstream file;
-    file.open(filename.c_str(), ios::binary | ios::in);
-    if (!file.is_open()) {
-        file.clear();
-        file.open(filename.c_str(), ios::binary);
-    }
-    file.seekp(offset * sizeof(voxel_type), ios::beg);
-    file.write((char*) data, data_info.size * sizeof(voxel_type));
-    file.close();
-}
-
-void append_slice(py::array_t<voxel_type> &np_data, string filename) {
-    auto data_info = np_data.request();
-    const voxel_type *data = static_cast<const voxel_type*>(data_info.ptr);
-    ofstream file;
-    file.open(filename.c_str(), ios::binary | ios::app);
-    file.write((char*) data, data_info.size * sizeof(voxel_type));
-    file.close();
-}
-
 // On entry, np_*_bins are assumed to be pre allocated and zeroed.
 void axis_histogram_par_cpu(const py::array_t<voxel_type> np_voxels,
                             const tuple<uint64_t,uint64_t,uint64_t> offset,
@@ -1133,9 +1085,6 @@ void otsu(
 
 PYBIND11_MODULE(histograms, m) {
     m.doc() = "2D histogramming plugin"; // optional module docstring
-    m.def("load_slice", &load_slice);
-    m.def("append_slice", &append_slice);
-    m.def("write_slice", &write_slice);
     m.def("axis_histogram_seq_cpu",  &axis_histogram_seq_cpu);
     m.def("axis_histogram_par_cpu",  &axis_histogram_par_cpu);
     m.def("axis_histogram_par_gpu",  &axis_histogram_par_gpu);
diff --git a/src/pybind_kernels/cpu/io.cc b/src/pybind_kernels/cpu/io.cc
new file mode 100644
index 0000000..41b56ec
--- /dev/null
+++ b/src/pybind_kernels/cpu/io.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include <fstream>
+
+#include "io.hh"
+
+using namespace std;
+
+template <typename T>
+void load_contiguous_slice(T *data,
+        const string filename,
+        const uint64_t offset,
+        const uint64_t size) {
+    throw runtime_error(string("Library doesn't have a parallel cpu implementation of ") + __FUNCTION__);
+}
+
+template <typename T>
+void write_contiguous_slice(const T *data,
+        const string filename,
+        const uint64_t offset,
+        const uint64_t size) {
+    throw runtime_error(string("Library doesn't have a parallel cpu implementation of ") + __FUNCTION__);
+}
diff --git a/src/pybind_kernels/cpu_seq/io.cc b/src/pybind_kernels/cpu_seq/io.cc
new file mode 100644
index 0000000..01cf2f8
--- /dev/null
+++ b/src/pybind_kernels/cpu_seq/io.cc
@@ -0,0 +1,40 @@
+#include <iostream>
+#include <fstream>
+
+#include "io.hh"
+
+using namespace std;
+
+template <typename T>
+void load_contiguous_slice(T *data,
+        const string filename,
+        const uint64_t offset,
+        const uint64_t size) {
+    ifstream file;
+    file.open(filename.c_str(), ios::binary);
+    if (!file.is_open()) {
+        fprintf(stderr, "load_slice: Error opening %s for reading.\n", filename.c_str());
+        exit(-1);
+    }
+    file.seekg(offset * sizeof(T), ios::beg);
+    file.read((char*) data, size * sizeof(T));
+    file.close();
+}
+
+template <typename T>
+void write_contiguous_slice(const T *data,
+        const string filename,
+        const uint64_t offset,
+        const uint64_t size) {
+    ofstream file;
+    file.open(filename.c_str(), ios::binary | ios::in);
+    if (!file.is_open()) {
+        file.clear();
+        file.open(filename.c_str(), ios::binary);
+    }
+    file.seekp(offset * sizeof(T), ios::beg);
+    file.write((char*) data, size * sizeof(T));
+    file.close();
+}
+
+// TODO non-contiguous
diff --git a/src/pybind_kernels/gpu/io.cc b/src/pybind_kernels/gpu/io.cc
new file mode 100644
index 0000000..4eb196a
--- /dev/null
+++ b/src/pybind_kernels/gpu/io.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include <fstream>
+
+#include "io.hh"
+
+using namespace std;
+
+template <typename T>
+void load_contiguous_slice(T *data,
+        const string filename,
+        const uint64_t offset,
+        const uint64_t size) {
+    throw runtime_error(string("Library doesn't have a gpu implementation of ") + __FUNCTION__);
+}
+
+template <typename T>
+void write_contiguous_slice(const T *data,
+        const string filename,
+        const uint64_t offset,
+        const uint64_t size) {
+    throw runtime_error(string("Library doesn't have a gpu implementation of ") + __FUNCTION__);
+}
diff --git a/src/pybind_kernels/include/datatypes.hh b/src/pybind_kernels/include/datatypes.hh
index f2e121a..88a068d 100644
--- a/src/pybind_kernels/include/datatypes.hh
+++ b/src/pybind_kernels/include/datatypes.hh
@@ -2,6 +2,10 @@
 #include <vector>
 
 typedef uint8_t mask_type;	// TODO: Template + explicit instantiation
+typedef uint16_t voxel_type;
+//typedef float    field_type;
+typedef uint16_t field_type;
+typedef float gauss_type;
 typedef double real_t;
 
 constexpr ssize_t acc_block_size =  1024 * 1024 * 1024/sizeof(mask_type); // 1 GB
diff --git a/src/pybind_kernels/include/io.hh b/src/pybind_kernels/include/io.hh
new file mode 100644
index 0000000..fae2cbf
--- /dev/null
+++ b/src/pybind_kernels/include/io.hh
@@ -0,0 +1,12 @@
+#ifndef io_h
+#define io_h
+
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+
+template <typename T>
+void load_contiguous_slice(T *data, const string filename, const uint64_t offset, const uint64_t size);
+template <typename T>
+void write_contiguous_slice(T *np_data, const string filename, const uint64_t offset, const uint64_t size);
+
+#endif
\ No newline at end of file
diff --git a/src/pybind_kernels/pybind/io-pybind.cc b/src/pybind_kernels/pybind/io-pybind.cc
new file mode 100644
index 0000000..d7c370a
--- /dev/null
+++ b/src/pybind_kernels/pybind/io-pybind.cc
@@ -0,0 +1,48 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+
+using namespace std;
+namespace py = pybind11;
+
+#include "datatypes.hh"
+#include "io.cc"
+
+template <typename T>
+void load_slice(py::array_t<T> &np_data, const string filename,
+                const tuple<uint64_t, uint64_t, uint64_t> offset,
+                const tuple<uint64_t, uint64_t, uint64_t> shape) {
+    auto data_info = np_data.request();
+    T *data = static_cast<T*>(data_info.ptr);
+    auto [Nz, Ny, Nx] = shape;
+    auto [oz, oy, ox] = offset;
+    uint64_t flat_offset = oz*Ny*Nx + oy*Nx + ox;
+    load_contiguous_slice<T>(data, filename, flat_offset, data_info.size);
+}
+
+template <typename T>
+void write_slice(const py::array_t<T> &np_data,
+        const string filename,
+        const tuple<uint64_t, uint64_t, uint64_t> offset,
+        const tuple<uint64_t, uint64_t, uint64_t> shape) {
+    auto data_info = np_data.request();
+    const T *data = static_cast<const T*>(data_info.ptr);
+    auto [Nz, Ny, Nx] = shape;
+    auto [oz, oy, ox] = offset;
+    uint64_t flat_offset = oz*Ny*Nx + oy*Nx + ox;
+    write_contiguous_slice<T>(data, filename, flat_offset, data_info.size);
+}
+
+PYBIND11_MODULE(histograms, m) {
+    m.doc() = "I/O functions for handling flat binary format files."; // optional module docstring
+    m.def("load_slice", &load_slice<mask_type>);
+    m.def("load_slice", &load_slice<voxel_type>);
+    m.def("load_slice", &load_slice<field_type>);
+    m.def("load_slice", &load_slice<gauss_type>);
+    m.def("load_slice", &load_slice<real_t>);
+
+    m.def("write_slice", &write_slice<mask_type>);
+    m.def("write_slice", &write_slice<voxel_type>);
+    m.def("write_slice", &write_slice<field_type>);
+    m.def("write_slice", &write_slice<gauss_type>);
+    m.def("write_slice", &write_slice<real_t>);
+}
\ No newline at end of file

From 616672820a642a86db49697e3c2de64ae2b4539e Mon Sep 17 00:00:00 2001
From: Carl Johnsen <cjjohnsen@nbi.ku.dk>
Date: Sun, 12 Jun 2022 10:03:49 +0200
Subject: [PATCH 003/136] Moved morphology to its own files

---
 src/pybind_kernels/Makefile                   |  17 ++-
 src/pybind_kernels/cpu/histograms.cc          | 116 ------------------
 src/pybind_kernels/cpu/morphology.cc          |  46 +++++++
 src/pybind_kernels/cpu_seq/morphology.cc      |  45 +++++++
 src/pybind_kernels/gpu/morphology.cc          |  52 ++++++++
 src/pybind_kernels/include/datatypes.hh       |   9 +-
 src/pybind_kernels/include/io.hh              |   3 -
 src/pybind_kernels/include/morphology.hh      |  14 +++
 src/pybind_kernels/pybind/io-pybind.cc        |   2 +-
 .../pybind/morphology-pybind.cc               |  33 +++++
 10 files changed, 203 insertions(+), 134 deletions(-)
 create mode 100644 src/pybind_kernels/cpu/morphology.cc
 create mode 100644 src/pybind_kernels/cpu_seq/morphology.cc
 create mode 100644 src/pybind_kernels/gpu/morphology.cc
 create mode 100644 src/pybind_kernels/include/morphology.hh
 create mode 100644 src/pybind_kernels/pybind/morphology-pybind.cc

diff --git a/src/pybind_kernels/Makefile b/src/pybind_kernels/Makefile
index 0de8d95..59c9911 100644
--- a/src/pybind_kernels/Makefile
+++ b/src/pybind_kernels/Makefile
@@ -5,17 +5,17 @@ OPENCV_INCLUDE=$(shell pkg-config opencv4 --cflags)
 OPENCV_LIB=$(shell pkg-config opencv4 --libs)
 
 # Detect if OpenACC can be used
-ifneq (, $(shell which nvc++))
-CXX = nvc++
-CXXFLAGS += -acc=gpu -Minfo=accel -tp=native
-else
-$(info OpenACC compiler nvc++ not found. Compiling without)
-endif
+#ifneq (, $(shell which nvc++))
+#CXX = nvc++
+#CXXFLAGS += -acc=gpu -Minfo=accel -tp=native
+#else
+#$(info OpenACC compiler nvc++ not found. Compiling without)
+#endif
 
 CXXFLAGS += -I../contrib/cpptqdm/ -Iinclude
 
 PLATFORMS=cpu_seq cpu gpu
-LIBS=io
+LIBS=io morphology
 TARGETS = $(foreach PLATFORM, $(PLATFORMS), $(foreach LIB, $(LIBS), $(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)))
 
 all: $(TARGETS)
@@ -48,6 +48,3 @@ $(foreach PLATFORM, $(PLATFORMS), \
 
 clean:
 	rm -f $(TARGETS)
-
-
-
diff --git a/src/pybind_kernels/cpu/histograms.cc b/src/pybind_kernels/cpu/histograms.cc
index 73862ba..b876840 100644
--- a/src/pybind_kernels/cpu/histograms.cc
+++ b/src/pybind_kernels/cpu/histograms.cc
@@ -55,118 +55,6 @@ template <typename T> void convolve1d(const py::array_t<T> np_kernel,
 
 }
 
-template <typename Op, bool neutral> void morphology_3d_sphere_cpu(
-        const py::array_t<mask_type> &np_voxels,
-        const int64_t radius,
-        const py::array_t<mask_type> np_result
-) {
-    auto
-        voxels_info = np_voxels.request(),
-        result_info = np_result.request();
-
-    int32_t Nz = voxels_info.shape[0], Ny = voxels_info.shape[1], Nx = voxels_info.shape[2];
-    int64_t N[3] = {Nz, Ny, Nx};
-    int64_t strides[3] = {Ny*Nx, Nx, 1};
-
-    const mask_type *voxels = static_cast<const mask_type*>(voxels_info.ptr);
-    mask_type *result = static_cast<mask_type*>(result_info.ptr);
-
-    Op op;
-
-    int64_t sqradius = radius * radius;
-
-    #pragma omp parallel for collapse(3)
-    for (int64_t z = 0; z < N[0]; z++) {
-        for (int64_t y = 0; y < N[1]; y++) {
-            for (int64_t x = 0; x < N[2]; x++) {
-                // Compute boundaries
-                int64_t flat_index = z*strides[0] + y*strides[1] + x*strides[2];
-                int64_t X[3] = {z, y, x};
-                int64_t limits[6];
-                for (int axis = 0; axis < 3; axis++) {
-                    limits[(axis*2)] = -min(radius, X[axis]);
-                    limits[(axis*2)+1] = min(radius, N[axis] - X[axis] - 1);
-                }
-
-                // Apply the spherical kernel
-                bool value = neutral;
-                //#pragma omp simd collapse(3) reduction(op:value)
-                for (int64_t pz = limits[0]; pz <= limits[1]; pz++) {
-                    for (int64_t py = limits[2]; py <= limits[3]; py++) {
-                        for (int64_t px = limits[4]; px <= limits[5]; px++) {
-                            // TODO exact match with ndimage
-                            bool within = px*px + py*py + pz*pz <= sqradius; // sphere kernel
-                            int64_t offset = pz*strides[0] + py*strides[1] + px*strides[2];
-                            value = within? op(value, voxels[flat_index+offset]) : value;
-                        }
-                    }
-                }
-
-                // Store the results
-                result[flat_index] = value;
-            }
-        }
-    }
-}
-
-template <typename Op, bool neutral> void morphology_3d_sphere_gpu(
-        const py::array_t<mask_type> &np_voxels,
-        const int64_t radius,
-        const py::array_t<mask_type> np_result) {
-#ifdef _OPENACC
-    auto
-        voxels_info = np_voxels.request(),
-        result_info = np_result.request();
-
-    int32_t Nz = voxels_info.shape[0], Ny = voxels_info.shape[1], Nx = voxels_info.shape[2];
-    int64_t N[3] = {Nz, Ny, Nx};
-    int64_t strides[3] = {Ny*Nx, Nx, 1};
-
-    const mask_type *voxels = static_cast<const mask_type*>(voxels_info.ptr);
-    mask_type *result = static_cast<mask_type*>(result_info.ptr);
-
-    Op op;
-    int64_t sqradius = radius * radius;
-
-    #pragma acc data copyin(voxels[:Nz*Ny*Nx], N[:3], strides[:3], sqradius) copyout(result[:Nz*Ny*Nx])
-    {
-        #pragma acc parallel loop collapse(3)
-        for (int64_t z = 0; z < N[0]; z++) {
-            for (int64_t y = 0; y < N[1]; y++) {
-                for (int64_t x = 0; x < N[2]; x++) {
-                    // Compute boundaries
-                    int64_t flat_index = z*strides[0] + y*strides[1] + x*strides[2];
-                    int64_t X[3] = {z, y, x};
-                    int64_t limits[6];
-                    for (int axis = 0; axis < 3; axis++) {
-                        limits[(axis*2)] = -min(radius, X[axis]);
-                        limits[(axis*2)+1] = min(radius, N[axis] - X[axis] - 1);
-                    }
-
-                    // Apply the spherical kernel
-                    bool value = neutral;
-                    //#pragma omp simd collapse(3) reduction(op:value)
-                    for (int64_t pz = limits[0]; pz <= limits[1]; pz++) {
-                        for (int64_t py = limits[2]; py <= limits[3]; py++) {
-                            for (int64_t px = limits[4]; px <= limits[5]; px++) {
-                                bool within = px*px + py*py + pz*pz <= sqradius; // sphere kernel
-                                int64_t offset = pz*strides[0] + py*strides[1] + px*strides[2];
-                                value = within? op(value, voxels[flat_index+offset]) : value;
-                            }
-                        }
-                    }
-
-                    // Store the results
-                    result[flat_index] = value;
-                }
-            }
-        }
-    }
-#else
-    throw runtime_error("Library wasn't compiled with OpenACC.");
-#endif
-}
-
 void gauss_filter_par_cpu(const py::array_t<mask_type> np_mask,
                           const tuple<uint64_t, uint64_t, uint64_t> shape,
                           const py::array_t<gauss_type> np_kernel,
@@ -1094,9 +982,5 @@ PYBIND11_MODULE(histograms, m) {
     m.def("masked_minmax", &masked_minmax);
     m.def("float_minmax", &float_minmax);
     m.def("gauss_filter_par_cpu", &gauss_filter_par_cpu);
-    m.def("dilate_3d_sphere_cpu", &morphology_3d_sphere_cpu<std::bit_or<mask_type>, false>);
-    m.def("erode_3d_sphere_cpu", &morphology_3d_sphere_cpu<std::bit_and<mask_type>, true>);
-    m.def("dilate_3d_sphere_gpu", &morphology_3d_sphere_gpu<std::bit_or<mask_type>, false>);
-    m.def("erode_3d_sphere_gpu", &morphology_3d_sphere_gpu<std::bit_and<mask_type>, true>);
     m.def("otsu", &otsu);
 }
diff --git a/src/pybind_kernels/cpu/morphology.cc b/src/pybind_kernels/cpu/morphology.cc
new file mode 100644
index 0000000..d706fff
--- /dev/null
+++ b/src/pybind_kernels/cpu/morphology.cc
@@ -0,0 +1,46 @@
+#include "morphology.hh"
+#include "datatypes.hh"
+
+template <typename Op, bool neutral>
+void morphology_3d_sphere(
+        const mask_type *voxels,
+        const int64_t radius,
+        const int64_t N[3],
+        const int64_t strides[3],
+        mask_type *result) {
+    Op op;
+    int64_t sqradius = radius * radius;
+
+    #pragma omp parallel for collapse(3)
+    for (int64_t z = 0; z < N[0]; z++) {
+        for (int64_t y = 0; y < N[1]; y++) {
+            for (int64_t x = 0; x < N[2]; x++) {
+                // Compute boundaries
+                int64_t flat_index = z*strides[0] + y*strides[1] + x*strides[2];
+                int64_t X[3] = {z, y, x};
+                int64_t limits[6];
+                for (int axis = 0; axis < 3; axis++) {
+                    limits[(axis*2)] = -min(radius, X[axis]);
+                    limits[(axis*2)+1] = min(radius, N[axis] - X[axis] - 1);
+                }
+
+                // Apply the spherical kernel
+                bool value = neutral;
+                //#pragma omp simd collapse(3) reduction(op:value)
+                for (int64_t pz = limits[0]; pz <= limits[1]; pz++) {
+                    for (int64_t py = limits[2]; py <= limits[3]; py++) {
+                        for (int64_t px = limits[4]; px <= limits[5]; px++) {
+                            // TODO exact match with ndimage
+                            bool within = px*px + py*py + pz*pz <= sqradius; // sphere kernel
+                            int64_t offset = pz*strides[0] + py*strides[1] + px*strides[2];
+                            value = within? op(value, voxels[flat_index+offset]) : value;
+                        }
+                    }
+                }
+
+                // Store the results
+                result[flat_index] = value;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/pybind_kernels/cpu_seq/morphology.cc b/src/pybind_kernels/cpu_seq/morphology.cc
new file mode 100644
index 0000000..60cea00
--- /dev/null
+++ b/src/pybind_kernels/cpu_seq/morphology.cc
@@ -0,0 +1,45 @@
+#include "morphology.hh"
+#include "datatypes.hh"
+
+template <typename Op, bool neutral>
+void morphology_3d_sphere(
+        const mask_type *voxels,
+        const int64_t radius,
+        const int64_t N[3],
+        const int64_t strides[3],
+        mask_type *result) {
+    Op op;
+    int64_t sqradius = radius * radius;
+
+    for (int64_t z = 0; z < N[0]; z++) {
+        for (int64_t y = 0; y < N[1]; y++) {
+            for (int64_t x = 0; x < N[2]; x++) {
+                // Compute boundaries
+                int64_t flat_index = z*strides[0] + y*strides[1] + x*strides[2];
+                int64_t X[3] = {z, y, x};
+                int64_t limits[6];
+                for (int axis = 0; axis < 3; axis++) {
+                    limits[(axis*2)] = -min(radius, X[axis]);
+                    limits[(axis*2)+1] = min(radius, N[axis] - X[axis] - 1);
+                }
+
+                // Apply the spherical kernel
+                bool value = neutral;
+                //#pragma omp simd collapse(3) reduction(op:value)
+                for (int64_t pz = limits[0]; pz <= limits[1]; pz++) {
+                    for (int64_t py = limits[2]; py <= limits[3]; py++) {
+                        for (int64_t px = limits[4]; px <= limits[5]; px++) {
+                            // TODO exact match with ndimage
+                            bool within = px*px + py*py + pz*pz <= sqradius; // sphere kernel
+                            int64_t offset = pz*strides[0] + py*strides[1] + px*strides[2];
+                            value = within? op(value, voxels[flat_index+offset]) : value;
+                        }
+                    }
+                }
+
+                // Store the results
+                result[flat_index] = value;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/pybind_kernels/gpu/morphology.cc b/src/pybind_kernels/gpu/morphology.cc
new file mode 100644
index 0000000..6230924
--- /dev/null
+++ b/src/pybind_kernels/gpu/morphology.cc
@@ -0,0 +1,52 @@
+#include "morphology.hh"
+#include "datatypes.hh"
+
+template <typename Op, bool neutral>
+void morphology_3d_sphere(
+        const mask_type *voxels,
+        const int64_t radius,
+        const int64_t N[3],
+        const int64_t strides[3],
+        mask_type *result) {
+#ifdef _OPENACC
+    Op op;
+    int64_t sqradius = radius * radius;
+
+    #pragma acc data copyin(voxels[:Nz*Ny*Nx], N[:3], strides[:3], sqradius) copyout(result[:Nz*Ny*Nx])
+    {
+        #pragma acc parallel loop collapse(3)
+        for (int64_t z = 0; z < N[0]; z++) {
+            for (int64_t y = 0; y < N[1]; y++) {
+                for (int64_t x = 0; x < N[2]; x++) {
+                    // Compute boundaries
+                    int64_t flat_index = z*strides[0] + y*strides[1] + x*strides[2];
+                    int64_t X[3] = {z, y, x};
+                    int64_t limits[6];
+                    for (int axis = 0; axis < 3; axis++) {
+                        limits[(axis*2)] = -min(radius, X[axis]);
+                        limits[(axis*2)+1] = min(radius, N[axis] - X[axis] - 1);
+                    }
+
+                    // Apply the spherical kernel
+                    bool value = neutral;
+                    //#pragma omp simd collapse(3) reduction(op:value)
+                    for (int64_t pz = limits[0]; pz <= limits[1]; pz++) {
+                        for (int64_t py = limits[2]; py <= limits[3]; py++) {
+                            for (int64_t px = limits[4]; px <= limits[5]; px++) {
+                                bool within = px*px + py*py + pz*pz <= sqradius; // sphere kernel
+                                int64_t offset = pz*strides[0] + py*strides[1] + px*strides[2];
+                                value = within? op(value, voxels[flat_index+offset]) : value;
+                            }
+                        }
+                    }
+
+                    // Store the results
+                    result[flat_index] = value;
+                }
+            }
+        }
+    }
+#else
+    throw runtime_error("Library wasn't compiled with OpenACC.");
+#endif
+}
\ No newline at end of file
diff --git a/src/pybind_kernels/include/datatypes.hh b/src/pybind_kernels/include/datatypes.hh
index 88a068d..91c7490 100644
--- a/src/pybind_kernels/include/datatypes.hh
+++ b/src/pybind_kernels/include/datatypes.hh
@@ -1,3 +1,5 @@
+#ifndef datatypes_h
+#define datatypes_h
 #include <array>
 #include <vector>
 
@@ -19,7 +21,7 @@ template <typename T> struct input_ndarray {
   const vector<ssize_t> shape;
 
   input_ndarray(const T *data, const vector<ssize_t> &shape): data(data), shape(shape) {}
-  input_ndarray(const void *data, const vector<ssize_t> &shape): data(static_cast<const T*>(data)), shape(shape) {}  
+  input_ndarray(const void *data, const vector<ssize_t> &shape): data(static_cast<const T*>(data)), shape(shape) {}
 };
 
 template <typename T> struct output_ndarray {
@@ -27,8 +29,7 @@ template <typename T> struct output_ndarray {
   const vector<ssize_t> shape;
 
   output_ndarray(T *data, const vector<ssize_t> &shape): data(data), shape(shape) {}
-  output_ndarray(void *data, const vector<ssize_t> &shape): data(static_cast<T*>(data)), shape(shape) {}    
+  output_ndarray(void *data, const vector<ssize_t> &shape): data(static_cast<T*>(data)), shape(shape) {}
 };
 
-
-
+#endif
\ No newline at end of file
diff --git a/src/pybind_kernels/include/io.hh b/src/pybind_kernels/include/io.hh
index fae2cbf..a28da76 100644
--- a/src/pybind_kernels/include/io.hh
+++ b/src/pybind_kernels/include/io.hh
@@ -1,9 +1,6 @@
 #ifndef io_h
 #define io_h
 
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-
 template <typename T>
 void load_contiguous_slice(T *data, const string filename, const uint64_t offset, const uint64_t size);
 template <typename T>
diff --git a/src/pybind_kernels/include/morphology.hh b/src/pybind_kernels/include/morphology.hh
new file mode 100644
index 0000000..66a28e4
--- /dev/null
+++ b/src/pybind_kernels/include/morphology.hh
@@ -0,0 +1,14 @@
+#ifndef morphology_h
+#define morphology_h
+
+#include "datatypes.hh"
+
+template <typename Op, bool neutral>
+void morphology_3d_sphere(
+        const mask_type *voxels,
+        const int64_t radius,
+        const int64_t N[3],
+        const int64_t strides[3],
+        mask_type *result);
+
+#endif
\ No newline at end of file
diff --git a/src/pybind_kernels/pybind/io-pybind.cc b/src/pybind_kernels/pybind/io-pybind.cc
index d7c370a..6d1c4e0 100644
--- a/src/pybind_kernels/pybind/io-pybind.cc
+++ b/src/pybind_kernels/pybind/io-pybind.cc
@@ -32,7 +32,7 @@ void write_slice(const py::array_t<T> &np_data,
     write_contiguous_slice<T>(data, filename, flat_offset, data_info.size);
 }
 
-PYBIND11_MODULE(histograms, m) {
+PYBIND11_MODULE(io, m) {
     m.doc() = "I/O functions for handling flat binary format files."; // optional module docstring
     m.def("load_slice", &load_slice<mask_type>);
     m.def("load_slice", &load_slice<voxel_type>);
diff --git a/src/pybind_kernels/pybind/morphology-pybind.cc b/src/pybind_kernels/pybind/morphology-pybind.cc
new file mode 100644
index 0000000..f9c7891
--- /dev/null
+++ b/src/pybind_kernels/pybind/morphology-pybind.cc
@@ -0,0 +1,33 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+
+using namespace std;
+namespace py = pybind11;
+
+#include "morphology.cc"
+#include "datatypes.hh"
+
+template <typename Op, bool neutral>
+void morphology_3d_sphere_wrapper(
+        const py::array_t<mask_type> &np_voxels,
+        const int64_t radius,
+        py::array_t<mask_type> np_result) {
+    auto
+        voxels_info = np_voxels.request(),
+        result_info = np_result.request();
+
+    int32_t Nz = voxels_info.shape[0], Ny = voxels_info.shape[1], Nx = voxels_info.shape[2];
+    int64_t N[3] = {Nz, Ny, Nx};
+    int64_t strides[3] = {Ny*Nx, Nx, 1};
+
+    const mask_type *voxels = static_cast<const mask_type*>(voxels_info.ptr);
+    mask_type *result = static_cast<mask_type*>(result_info.ptr);
+
+    morphology_3d_sphere<Op, neutral>(voxels, radius, N, strides, result);
+}
+
+PYBIND11_MODULE(morphology, m) {
+    m.doc() = "Morphology operations."; // optional module docstring
+    m.def("dilate_3d_sphere", &morphology_3d_sphere_wrapper<std::bit_or<mask_type>, false>);
+    m.def("erode_3d_sphere", &morphology_3d_sphere_wrapper<std::bit_and<mask_type>, true>);
+}
\ No newline at end of file

From 507725941e56e42d88a83df7331c2214c6b04425 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <cjjohnsen@nbi.ku.dk>
Date: Mon, 13 Jun 2022 18:39:08 +0200
Subject: [PATCH 004/136] #16 Added unit test for io

---
 src/pybind_kernels/Makefile              |  3 ++
 src/pybind_kernels/cpu/morphology.cc     |  2 +-
 src/pybind_kernels/cpu_seq/morphology.cc |  3 +-
 src/pybind_kernels/gpu/morphology.cc     |  4 +-
 src/pybind_kernels/pybind/io-pybind.cc   | 22 ++++----
 src/pybind_kernels/test/test_io.py       | 66 ++++++++++++++++++++++++
 6 files changed, 85 insertions(+), 15 deletions(-)
 create mode 100644 src/pybind_kernels/test/test_io.py

diff --git a/src/pybind_kernels/Makefile b/src/pybind_kernels/Makefile
index 59c9911..7612608 100644
--- a/src/pybind_kernels/Makefile
+++ b/src/pybind_kernels/Makefile
@@ -46,5 +46,8 @@ $(foreach PLATFORM, $(PLATFORMS), \
 	) \
 )
 
+test: all
+	python3 -m pytest test
+
 clean:
 	rm -f $(TARGETS)
diff --git a/src/pybind_kernels/cpu/morphology.cc b/src/pybind_kernels/cpu/morphology.cc
index d706fff..a180aa5 100644
--- a/src/pybind_kernels/cpu/morphology.cc
+++ b/src/pybind_kernels/cpu/morphology.cc
@@ -33,7 +33,7 @@ void morphology_3d_sphere(
                             // TODO exact match with ndimage
                             bool within = px*px + py*py + pz*pz <= sqradius; // sphere kernel
                             int64_t offset = pz*strides[0] + py*strides[1] + px*strides[2];
-                            value = within? op(value, voxels[flat_index+offset]) : value;
+                            value = within ? op(value, voxels[flat_index+offset]) : value;
                         }
                     }
                 }
diff --git a/src/pybind_kernels/cpu_seq/morphology.cc b/src/pybind_kernels/cpu_seq/morphology.cc
index 60cea00..50c16bf 100644
--- a/src/pybind_kernels/cpu_seq/morphology.cc
+++ b/src/pybind_kernels/cpu_seq/morphology.cc
@@ -25,14 +25,13 @@ void morphology_3d_sphere(
 
                 // Apply the spherical kernel
                 bool value = neutral;
-                //#pragma omp simd collapse(3) reduction(op:value)
                 for (int64_t pz = limits[0]; pz <= limits[1]; pz++) {
                     for (int64_t py = limits[2]; py <= limits[3]; py++) {
                         for (int64_t px = limits[4]; px <= limits[5]; px++) {
                             // TODO exact match with ndimage
                             bool within = px*px + py*py + pz*pz <= sqradius; // sphere kernel
                             int64_t offset = pz*strides[0] + py*strides[1] + px*strides[2];
-                            value = within? op(value, voxels[flat_index+offset]) : value;
+                            value = within ? op(value, voxels[flat_index+offset]) : value;
                         }
                     }
                 }
diff --git a/src/pybind_kernels/gpu/morphology.cc b/src/pybind_kernels/gpu/morphology.cc
index 6230924..c42ec13 100644
--- a/src/pybind_kernels/gpu/morphology.cc
+++ b/src/pybind_kernels/gpu/morphology.cc
@@ -12,7 +12,7 @@ void morphology_3d_sphere(
     Op op;
     int64_t sqradius = radius * radius;
 
-    #pragma acc data copyin(voxels[:Nz*Ny*Nx], N[:3], strides[:3], sqradius) copyout(result[:Nz*Ny*Nx])
+    #pragma acc data copyin(voxels[:N[0]*N[1]*N[2]], N[:3], strides[:3], sqradius) copyout(result[:N[0]*N[1]*N[2]])
     {
         #pragma acc parallel loop collapse(3)
         for (int64_t z = 0; z < N[0]; z++) {
@@ -35,7 +35,7 @@ void morphology_3d_sphere(
                             for (int64_t px = limits[4]; px <= limits[5]; px++) {
                                 bool within = px*px + py*py + pz*pz <= sqradius; // sphere kernel
                                 int64_t offset = pz*strides[0] + py*strides[1] + px*strides[2];
-                                value = within? op(value, voxels[flat_index+offset]) : value;
+                                value = within ? op(value, voxels[flat_index+offset]) : value;
                             }
                         }
                     }
diff --git a/src/pybind_kernels/pybind/io-pybind.cc b/src/pybind_kernels/pybind/io-pybind.cc
index 6d1c4e0..496b990 100644
--- a/src/pybind_kernels/pybind/io-pybind.cc
+++ b/src/pybind_kernels/pybind/io-pybind.cc
@@ -34,15 +34,17 @@ void write_slice(const py::array_t<T> &np_data,
 
 PYBIND11_MODULE(io, m) {
     m.doc() = "I/O functions for handling flat binary format files."; // optional module docstring
-    m.def("load_slice", &load_slice<mask_type>);
-    m.def("load_slice", &load_slice<voxel_type>);
-    m.def("load_slice", &load_slice<field_type>);
-    m.def("load_slice", &load_slice<gauss_type>);
-    m.def("load_slice", &load_slice<real_t>);
+    m.def("load_slice", &load_slice<uint8_t>);
+    m.def("load_slice", &load_slice<uint16_t>);
+    m.def("load_slice", &load_slice<uint32_t>);
+    m.def("load_slice", &load_slice<uint64_t>);
+    m.def("load_slice", &load_slice<float>);
+    m.def("load_slice", &load_slice<double>);
 
-    m.def("write_slice", &write_slice<mask_type>);
-    m.def("write_slice", &write_slice<voxel_type>);
-    m.def("write_slice", &write_slice<field_type>);
-    m.def("write_slice", &write_slice<gauss_type>);
-    m.def("write_slice", &write_slice<real_t>);
+    m.def("write_slice", &write_slice<uint8_t>);
+    m.def("write_slice", &write_slice<uint16_t>);
+    m.def("write_slice", &write_slice<uint32_t>);
+    m.def("write_slice", &write_slice<uint64_t>);
+    m.def("write_slice", &write_slice<float>);
+    m.def("write_slice", &write_slice<double>);
 }
\ No newline at end of file
diff --git a/src/pybind_kernels/test/test_io.py b/src/pybind_kernels/test/test_io.py
new file mode 100644
index 0000000..64df23f
--- /dev/null
+++ b/src/pybind_kernels/test/test_io.py
@@ -0,0 +1,66 @@
+'''
+Unittests for the I/O pybind kernels.
+'''
+import sys
+sys.path.append(sys.path[0]+"/../")
+import cpu_seq.io as io
+import numpy as np
+import tempfile
+import os
+import pytest
+
+# TODO np.bool doesn't work. It works when writing, but numpy doesn't recognize that the memory has been updated. It works fine if data_read is a np.uint8 array, even though an np.bool array has been written.
+dtypes_to_test = [np.uint8, np.uint16, np.uint32, np.uint64, np.float32, np.float64]
+tmp_folder = tempfile._get_default_tempdir()
+tmp_filename = next(tempfile._get_candidate_names())
+tmp_file = f'{tmp_folder}/{tmp_filename}'
+dim_size = 16
+dim_shape = (dim_size, dim_size, dim_size)
+partial_factor = 4
+
+def random(shape, dtype):
+    rnds = np.random.random(shape) * 100
+    return rnds > .5 if dtype == np.bool else rnds.astype(dtype)
+
+@pytest.mark.parametrize("dtype", dtypes_to_test)
+def test_dtype(dtype):
+    individual_tmp_file = f'{tmp_file}.{dtype.__name__}'
+    data = random(dim_shape, dtype)
+    data[0,0,1] = False
+    partial = dim_size // partial_factor
+
+    # Write out a new file
+    io.write_slice(data, individual_tmp_file, (0,0,0), dim_shape)
+    assert os.path.getsize(individual_tmp_file) == data.nbytes
+
+    # Read back and verify in chunks
+    read_data = np.zeros((partial, dim_size, dim_size), dtype=dtype)
+    for i in range(partial_factor):
+        io.load_slice(read_data, individual_tmp_file, (i*partial,0,0), read_data.shape)
+        assert np.allclose(data[i*partial:(i+1)*partial], read_data)
+
+    # Append another layer
+    data = np.append(data, random((partial, dim_size, dim_size), dtype), axis=0)
+    io.write_slice(data[dim_size:], individual_tmp_file, (dim_size,0,0), data.shape)
+    assert os.path.getsize(individual_tmp_file) == data.nbytes
+
+    # Read back and verify in chunks
+    for i in range(partial_factor+1):
+        io.load_slice(read_data, individual_tmp_file, (i*partial,0,0), read_data.shape)
+        assert np.allclose(data[i*partial:(i+1)*partial], read_data)
+
+    # Overwrite one of the "middle" chunks
+    data[partial:2*partial] = random((partial, dim_size, dim_size), dtype)
+    io.write_slice(data[partial:partial*2], individual_tmp_file, (partial,0,0), data.shape)
+
+    # Read back and verify in chunks
+    for i in range(partial_factor+1):
+        io.load_slice(read_data, individual_tmp_file, (i*partial,0,0), read_data.shape)
+        assert np.allclose(data[i*partial:(i+1)*partial], read_data)
+
+    os.remove(individual_tmp_file)
+
+if __name__ == '__main__':
+    for dtype in dtypes_to_test:
+        print (f'Testing {dtype.__name__}')
+        test_dtype(dtype)
\ No newline at end of file

From c4c833aff41d27b5d47fd89aec11228d21d4125c Mon Sep 17 00:00:00 2001
From: Carl Johnsen <cjjohnsen@nbi.ku.dk>
Date: Mon, 13 Jun 2022 20:05:23 +0200
Subject: [PATCH 005/136] #16 added unittest for morphology

---
 src/pybind_kernels/Makefile                | 12 +++---
 src/pybind_kernels/test/test_morphology.py | 50 ++++++++++++++++++++++
 2 files changed, 56 insertions(+), 6 deletions(-)
 create mode 100644 src/pybind_kernels/test/test_morphology.py

diff --git a/src/pybind_kernels/Makefile b/src/pybind_kernels/Makefile
index 7612608..dfd0d09 100644
--- a/src/pybind_kernels/Makefile
+++ b/src/pybind_kernels/Makefile
@@ -5,12 +5,12 @@ OPENCV_INCLUDE=$(shell pkg-config opencv4 --cflags)
 OPENCV_LIB=$(shell pkg-config opencv4 --libs)
 
 # Detect if OpenACC can be used
-#ifneq (, $(shell which nvc++))
-#CXX = nvc++
-#CXXFLAGS += -acc=gpu -Minfo=accel -tp=native
-#else
-#$(info OpenACC compiler nvc++ not found. Compiling without)
-#endif
+ifneq (, $(shell which nvc++))
+CXX = nvc++
+CXXFLAGS += -acc=gpu -Minfo=accel -tp=native
+else
+$(info OpenACC compiler nvc++ not found. Compiling without)
+endif
 
 CXXFLAGS += -I../contrib/cpptqdm/ -Iinclude
 
diff --git a/src/pybind_kernels/test/test_morphology.py b/src/pybind_kernels/test/test_morphology.py
new file mode 100644
index 0000000..cb562d5
--- /dev/null
+++ b/src/pybind_kernels/test/test_morphology.py
@@ -0,0 +1,50 @@
+'''
+Unittests for the morphology pybind kernels.
+'''
+import sys
+sys.path.append(sys.path[0]+"/../")
+import cpu_seq.morphology as m_cpu_seq
+import cpu.morphology as m_cpu
+import gpu.morphology as m_gpu
+import numpy as np
+from scipy import ndimage as ndi
+import pytest
+
+# Parameters
+implant_dims = 32
+cross_width = 8
+# TODO if implant_dims doesn't divide by radius, it doesn't work. Except for 2, which also fails.
+rs = [4, 8, 16]
+impls = [m_cpu_seq, m_cpu, m_gpu]
+funcs = [('dilate', ndi.binary_dilation), ('erode', ndi.binary_erosion)]
+
+def sphere(n):
+    xs = np.linspace(-1,1,n)
+    return (xs[:,np.newaxis,np.newaxis]**2 + xs[np.newaxis,:,np.newaxis]**2 + xs[np.newaxis,np.newaxis,:]**2) <= 1
+
+@pytest.mark.parametrize('r', rs)
+@pytest.mark.parametrize('m', impls)
+@pytest.mark.parametrize('op,nd', funcs)
+def test_morphology(r, m, op, nd):
+    implant_mask = np.zeros((implant_dims,implant_dims,implant_dims), dtype=np.uint8)
+    c = implant_dims // 2
+    cross_start, cross_end = c - (cross_width // 2), c + (cross_width // 2)
+
+    implant_mask[:,cross_start:cross_end,cross_start:cross_end] = True
+    implant_mask[cross_start:cross_end,:,cross_start:cross_end] = True
+    implant_mask[cross_start:cross_end,cross_start:cross_end,:] = True
+
+    result = np.empty_like(implant_mask)
+    f = getattr(m, f'{op}_3d_sphere')
+    f(implant_mask, r, result)
+
+    verification = nd(implant_mask, sphere((2*r)+1))
+
+    assert np.allclose(verification, result)
+
+if __name__ == '__main__':
+    for r in rs:
+        for m in impls:
+            for op, nd in funcs:
+                print (f'Testing the {m.__name__} implementation of {op}')
+                test_morphology(r, m, op, nd)
\ No newline at end of file

From c2a7b2f0d0ef737595c733680573c62a94d5a892 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Wed, 21 Dec 2022 11:29:33 +0100
Subject: [PATCH 006/136] Added mac as a target in pybind

---
 src/pybind_kernels/Makefile | 46 +++++++++++++++----------------------
 1 file changed, 18 insertions(+), 28 deletions(-)

diff --git a/src/pybind_kernels/Makefile b/src/pybind_kernels/Makefile
index dfd0d09..aeba001 100644
--- a/src/pybind_kernels/Makefile
+++ b/src/pybind_kernels/Makefile
@@ -1,40 +1,30 @@
-PYBIND_FLAGS += $(shell python3 -m pybind11 --include) -march=native -Wall -shared -fPIC -fopenmp -g -std=c++17
+# Define constants and collections 
+PYBIND_FLAGS += $(shell python3 -m pybind11 --include) -march=native -Wall -shared -fPIC -fopenmp -g -std=c++17 -O3
 PYBIND_SUFFIX = $(shell python3-config --extension-suffix)
-
-OPENCV_INCLUDE=$(shell pkg-config opencv4 --cflags)
-OPENCV_LIB=$(shell pkg-config opencv4 --libs)
+#CXXFLAGS += -I../contrib/cpptqdm/ -Iinclude
+CXXFLAGS += -Iinclude
+PLATFORMS=cpu_seq cpu gpu
+LIBS=io morphology
+TARGETS = $(foreach PLATFORM, $(PLATFORMS), $(foreach LIB, $(LIBS), $(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)))
+CLEANUP = $(TARGETS) $(foreach PLATFORM, $(PLATFORMS), $(PLATFORM)/__pycache__)
 
 # Detect if OpenACC can be used
 ifneq (, $(shell which nvc++))
 CXX = nvc++
 CXXFLAGS += -acc=gpu -Minfo=accel -tp=native
-else
-$(info OpenACC compiler nvc++ not found. Compiling without)
+else 
+$(info OpenACC compiler nvc++ not found. Compiling without.)
 endif
 
-CXXFLAGS += -I../contrib/cpptqdm/ -Iinclude
-
-PLATFORMS=cpu_seq cpu gpu
-LIBS=io morphology
-TARGETS = $(foreach PLATFORM, $(PLATFORMS), $(foreach LIB, $(LIBS), $(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)))
+# Detect OS for OS specific changes
+ifeq ($(shell uname -s), Darwin) # Mac OSX
+CXX = g++-12 # Use homebrew gcc, as system gcc is an alias for clang
+CXXFLAGS += -undefined dynamic_lookup # https://pybind11.readthedocs.io/en/stable/compiling.html#building-manually
+CLEANUP += $(TARGETS) $(foreach TARGET, $(TARGETS), $(TARGET).dSYM) # These are also generated on Mac
+endif
 
 all: $(TARGETS)
 
-histograms$(PYBIND_SUFFIX): histograms.cc
-	$(CXX) $(CXXFLAGS) $(PYBIND_FLAGS) $< -o histograms$(PYBIND_SUFFIX)
-
-geometry$(PYBIND_SUFFIX): geometry-pybind.cc geometry.cc
-	$(CXX) $(CXXFLAGS) $(PYBIND_FLAGS) $< -o geometry$(PYBIND_SUFFIX)
-
-label$(PYBIND_SUFFIX): label.cc
-	$(CXX) $(CXXFLAGS) $(PYBIND_FLAGS) $< -o label$(PYBIND_SUFFIX)
-
-opencv_pybind$(PYBIND_SUFFIX): opencv_pybind.cc
-	$(CXX) $(CXXFLAGS) $(PYBIND_FLAGS) $(OPENCV_INCLUDE) $(OPENCV_LIB) $< -o opencv_pybind$(PYBIND_SUFFIX)
-
-opencv_tester: opencv_tester.cc
-	$(CXX) $(CXXFLAGS) $(OPENCV_INCLUDE) $(OPENCV_LIB) $< -o opencv_tester
-
 define GEN_RULE
 $(PLATFORM)/$(LIB)$(PYBIND_SUFFIX): pybind/$(LIB)-pybind.cc $(PLATFORM)/$(LIB).cc
 	$(CXX) $(CXXFLAGS) $(PYBIND_FLAGS) -I$(PLATFORM) $$< -o $(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)
@@ -47,7 +37,7 @@ $(foreach PLATFORM, $(PLATFORMS), \
 )
 
 test: all
-	python3 -m pytest test
+	python3 -m pytest -n auto test
 
 clean:
-	rm -f $(TARGETS)
+	rm -rf $(CLEANUP) test/__pycache__ .pytest_cache

From f75f53681df05c8e9189466429427f96c003cd2a Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Wed, 21 Dec 2022 11:30:17 +0100
Subject: [PATCH 007/136] Added compiled files to gitignore

---
 .gitignore                         | 4 ++++
 src/pybind_kernels/cpu/__init__.py | 0
 2 files changed, 4 insertions(+)
 delete mode 100644 src/pybind_kernels/cpu/__init__.py

diff --git a/.gitignore b/.gitignore
index b647bd7..6e48464 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,7 @@ src/meow/meow_triggers/*
 src/meow/runner_data/*
 src/meow/runner_output/*
 src/meow/runner_processing/*
+
+# Compiled files
+*.so
+*.so.dSYM
\ No newline at end of file
diff --git a/src/pybind_kernels/cpu/__init__.py b/src/pybind_kernels/cpu/__init__.py
deleted file mode 100644
index e69de29..0000000

From e10541416a0725618dd4ba6780c610cf259afad6 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Wed, 21 Dec 2022 11:31:33 +0100
Subject: [PATCH 008/136] Fixed verification error in erode near the borders

---
 src/pybind_kernels/test/test_morphology.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pybind_kernels/test/test_morphology.py b/src/pybind_kernels/test/test_morphology.py
index cb562d5..a38f5ed 100644
--- a/src/pybind_kernels/test/test_morphology.py
+++ b/src/pybind_kernels/test/test_morphology.py
@@ -9,6 +9,7 @@
 import numpy as np
 from scipy import ndimage as ndi
 import pytest
+from functools import partial
 
 # Parameters
 implant_dims = 32
@@ -16,7 +17,7 @@
 # TODO if implant_dims doesn't divide by radius, it doesn't work. Except for 2, which also fails.
 rs = [4, 8, 16]
 impls = [m_cpu_seq, m_cpu, m_gpu]
-funcs = [('dilate', ndi.binary_dilation), ('erode', ndi.binary_erosion)]
+funcs = [('dilate', ndi.binary_dilation), ('erode', partial(ndi.binary_erosion, border_value=1))]
 
 def sphere(n):
     xs = np.linspace(-1,1,n)
@@ -26,7 +27,7 @@ def sphere(n):
 @pytest.mark.parametrize('m', impls)
 @pytest.mark.parametrize('op,nd', funcs)
 def test_morphology(r, m, op, nd):
-    implant_mask = np.zeros((implant_dims,implant_dims,implant_dims), dtype=np.uint8)
+    implant_mask = np.random.randint(0, 2, (implant_dims, implant_dims, implant_dims), dtype=np.uint8)
     c = implant_dims // 2
     cross_start, cross_end = c - (cross_width // 2), c + (cross_width // 2)
 

From 306f21fb9f185555b2151cb3a810ed2f4ea23388 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Wed, 21 Dec 2022 11:39:42 +0100
Subject: [PATCH 009/136] Remove numpy bool deprecation warning

---
 src/pybind_kernels/test/test_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pybind_kernels/test/test_io.py b/src/pybind_kernels/test/test_io.py
index 64df23f..6b351db 100644
--- a/src/pybind_kernels/test/test_io.py
+++ b/src/pybind_kernels/test/test_io.py
@@ -20,7 +20,7 @@
 
 def random(shape, dtype):
     rnds = np.random.random(shape) * 100
-    return rnds > .5 if dtype == np.bool else rnds.astype(dtype)
+    return rnds > .5 if dtype == bool else rnds.astype(dtype)
 
 @pytest.mark.parametrize("dtype", dtypes_to_test)
 def test_dtype(dtype):

From ea80d3b26492ef3071db69978e262c6933245fa7 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Tue, 7 Feb 2023 15:46:24 +0100
Subject: [PATCH 010/136] Added manual specification of root paths

---
 src/config/threadripper00/paths.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/config/threadripper00/paths.py b/src/config/threadripper00/paths.py
index 199d562..00a8685 100644
--- a/src/config/threadripper00/paths.py
+++ b/src/config/threadripper00/paths.py
@@ -1,5 +1,8 @@
-hdf5_root      = "/data/MAXIBONE/Goats/tomograms"
-hdf5_root_fast = "/mnt/shared/MAXIBONE/Goats/tomograms"
+data_root = "/data"
+fast_root = "/data_fast"
+
+hdf5_root      = f"{data_root}/MAXIBONE/Goats/tomograms"
+hdf5_root_fast = f"{fast_root}/MAXIBONE/Goats/tomograms"
 binary_root    = f"{hdf5_root_fast}/binary"
 
 esrf_data_local= f"{hdf5_root}/ESRF/"

From 55ef3de24d602dca3d2898f13831f08d613d230e Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Tue, 7 Feb 2023 15:46:38 +0100
Subject: [PATCH 011/136] Remove IO test file, if exists, before running test

---
 src/pybind_kernels/test/test_io.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/pybind_kernels/test/test_io.py b/src/pybind_kernels/test/test_io.py
index 6b351db..3ece7e4 100644
--- a/src/pybind_kernels/test/test_io.py
+++ b/src/pybind_kernels/test/test_io.py
@@ -25,6 +25,8 @@ def random(shape, dtype):
 @pytest.mark.parametrize("dtype", dtypes_to_test)
 def test_dtype(dtype):
     individual_tmp_file = f'{tmp_file}.{dtype.__name__}'
+    if os.path.exists(individual_tmp_file):
+        os.remove(individual_tmp_file)
     data = random(dim_shape, dtype)
     data[0,0,1] = False
     partial = dim_size // partial_factor

From 01a0d698549ecafdeb68454057a5462b8804d230 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Tue, 7 Feb 2023 15:47:02 +0100
Subject: [PATCH 012/136] Added option to specify python interpreter for pybind

---
 src/pybind_kernels/Makefile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/pybind_kernels/Makefile b/src/pybind_kernels/Makefile
index aeba001..a8f7f9a 100644
--- a/src/pybind_kernels/Makefile
+++ b/src/pybind_kernels/Makefile
@@ -1,6 +1,7 @@
+PYTHON = python3.11
 # Define constants and collections 
-PYBIND_FLAGS += $(shell python3 -m pybind11 --include) -march=native -Wall -shared -fPIC -fopenmp -g -std=c++17 -O3
-PYBIND_SUFFIX = $(shell python3-config --extension-suffix)
+PYBIND_FLAGS += $(shell $(PYTHON) -m pybind11 --include) -march=native -Wall -shared -fPIC -fopenmp -g -std=c++17 -O3
+PYBIND_SUFFIX = $(shell $(PYTHON)-config --extension-suffix)
 #CXXFLAGS += -I../contrib/cpptqdm/ -Iinclude
 CXXFLAGS += -Iinclude
 PLATFORMS=cpu_seq cpu gpu
@@ -37,7 +38,7 @@ $(foreach PLATFORM, $(PLATFORMS), \
 )
 
 test: all
-	python3 -m pytest -n auto test
+	$(PYTHON) -m pytest -n auto test
 
 clean:
 	rm -rf $(CLEANUP) test/__pycache__ .pytest_cache

From a60cb2dc13e7efa00e615bb319551ac58f725b07 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Tue, 7 Feb 2023 15:47:12 +0100
Subject: [PATCH 013/136] Added notes on how to restructure

---
 src/struktur.md | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 src/struktur.md

diff --git a/src/struktur.md b/src/struktur.md
new file mode 100644
index 0000000..f8e10b1
--- /dev/null
+++ b/src/struktur.md
@@ -0,0 +1,42 @@
+src/
+    __init__.py
+    config/
+        constants.py
+        paths.py
+        threadripper00.json
+    lib/
+        __init__.py
+        cpp/
+            cpu/
+            cpu_seq/
+            gpu/
+            best/
+            include/
+        py/ # TODO tænk over hvordan de vælger implementation -- gerne hvordan det trickler "nedad"
+            Istedet for at loade al data ind i ram og så køre blokvist over på GPU, så udnyt async yield til at lave en generator! 
+            async memmap! 
+            geometry/
+                FoR_me.py
+    debug-explore/
+        *.ipynb
+    processing_steps/ # kun cli ting der kører af sig selv (+rapport ting over hvad der skete)
+        100-.py
+        200-
+    pybind/
+        *-pybind.cc
+    test/
+        pybind-*.py
+        større-test(s).py
+    utils/
+        io/
+        histograms/
+        alternative_processing_steps/
+    doitall.sh
+
+sæt ci op som test lokalt > generer fil > github action tjekker om fil rapporten matcher git commit hash og melder korrekt test kørsel (eller noget i den dur!)
+
+under oprydning, hold til samme argument interface som de andre! (i.e. compute_ridges gør ikke ( ͡° ͜ʖ ͡°) )
+
+gennemgå doitall og hiv de relevante ud i processing_steps. Dertil kør alt igennem! 
+
+doitall skal også lave en rapport tex. (tænk applied ML small assignment rapporten)
\ No newline at end of file

From 91f75bc0e754d02203172c1d36595c1ce1b9737f Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 11:41:47 +0100
Subject: [PATCH 014/136] Ubuntu doesn't have python3.11

---
 src/pybind_kernels/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pybind_kernels/Makefile b/src/pybind_kernels/Makefile
index a8f7f9a..857ce2e 100644
--- a/src/pybind_kernels/Makefile
+++ b/src/pybind_kernels/Makefile
@@ -1,5 +1,5 @@
-PYTHON = python3.11
-# Define constants and collections 
+PYTHON = python3.10
+# Define constants and collections
 PYBIND_FLAGS += $(shell $(PYTHON) -m pybind11 --include) -march=native -Wall -shared -fPIC -fopenmp -g -std=c++17 -O3
 PYBIND_SUFFIX = $(shell $(PYTHON)-config --extension-suffix)
 #CXXFLAGS += -I../contrib/cpptqdm/ -Iinclude
@@ -13,7 +13,7 @@ CLEANUP = $(TARGETS) $(foreach PLATFORM, $(PLATFORMS), $(PLATFORM)/__pycache__)
 ifneq (, $(shell which nvc++))
 CXX = nvc++
 CXXFLAGS += -acc=gpu -Minfo=accel -tp=native
-else 
+else
 $(info OpenACC compiler nvc++ not found. Compiling without.)
 endif
 

From 7e491570d65703132c9d86bc7a613370e4571c84 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 11:42:06 +0100
Subject: [PATCH 015/136] Added lightweight benchmarking to morphology tests

---
 src/pybind_kernels/test/test_morphology.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/pybind_kernels/test/test_morphology.py b/src/pybind_kernels/test/test_morphology.py
index a38f5ed..a28c8a6 100644
--- a/src/pybind_kernels/test/test_morphology.py
+++ b/src/pybind_kernels/test/test_morphology.py
@@ -10,6 +10,7 @@
 from scipy import ndimage as ndi
 import pytest
 from functools import partial
+import datetime
 
 # Parameters
 implant_dims = 32
@@ -38,14 +39,21 @@ def test_morphology(r, m, op, nd):
     result = np.empty_like(implant_mask)
     f = getattr(m, f'{op}_3d_sphere')
     f(implant_mask, r, result)
+    fsta = datetime.datetime.now()
+    f(implant_mask, r, result)
+    fend = datetime.datetime.now()
 
+    vsta = datetime.datetime.now()
     verification = nd(implant_mask, sphere((2*r)+1))
+    vend = datetime.datetime.now()
 
     assert np.allclose(verification, result)
 
+    return fend - fsta, (vend - vsta) / (fend - fsta)
+
 if __name__ == '__main__':
+    # TDOO move the data generation and ndi verification out to speed up running
     for r in rs:
         for m in impls:
             for op, nd in funcs:
-                print (f'Testing the {m.__name__} implementation of {op}')
-                test_morphology(r, m, op, nd)
\ No newline at end of file
+                print (f'Testing the {m.__name__} implementation of {op}', test_morphology(r, m, op, nd))

From 0f586c016c8fe76826a80456aea4c8ea31694479 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 12:55:31 +0100
Subject: [PATCH 016/136] Started restructuring by moving all of the old
 scripts into its own folder

---
 .../analysis/cylinder_surface.py                  |   0
 .../analysis/cylinder_surface2.py                 |   0
 {src => pre-cleanup-src}/analysis/ellipsoids.py   |   0
 {src => pre-cleanup-src}/analysis/getthebone.py   |   0
 {src => pre-cleanup-src}/analysis/orientation.py  |   0
 {src => pre-cleanup-src}/analysis/osteocytes.py   |   0
 .../analysis/video1-segmentation.py               |   0
 .../analysis/video2-implant_contact.py            |   0
 .../bh-kernels/axes_histogram.c                   |   0
 .../bh-kernels/centres-of-mass-3xn.c              |   0
 .../bh-kernels/centres-of-mass.c                  |   0
 .../bh-kernels/collect-labeled.c                  |   0
 .../bh-kernels/collect-nonzero.c                  |   0
 .../bh-kernels/count-labeled.c                    |   0
 {src => pre-cleanup-src}/bh-kernels/fft-cpu.c     |   0
 {src => pre-cleanup-src}/bh-kernels/fft-kernel.c  |   0
 .../bh-kernels/inertia-matrix.c                   |   0
 .../bh-kernels/inertia-matrix.o                   | Bin
 {src => pre-cleanup-src}/bh-kernels/lookup-cpu.c  |   0
 .../bh-kernels/matrix3x3/eigenvalues.c            |   0
 .../bh-kernels/matrix3x3/eigenvalues.o            | Bin
 .../bh-kernels/matrix3x3/eigenvalues.py           |   0
 .../bh-kernels/matrix3x3/eigenvectors.c           |   0
 {src => pre-cleanup-src}/bh-kernels/ndi_label.c   |   0
 {src => pre-cleanup-src}/bh-kernels/ndi_label.py  |   0
 .../bh-kernels/principal-axes.c                   |   0
 {src => pre-cleanup-src}/bh-kernels/reduceat.c    |   0
 {src => pre-cleanup-src}/bh-kernels/reduceat.o    | Bin
 {src => pre-cleanup-src}/bh-kernels/rfft-cpu.c    |   0
 .../bh-kernels/select_segments.c                  |   0
 .../bh-kernels/select_segments.o                  | Bin
 .../bh-kernels/select_segments.py                 |   0
 .../bh-kernels/sliding-kernel-opencl.c            |   0
 .../bh-kernels/sliding-kernel.c                   |   0
 .../bh-kernels/sliding-kernel_2D.c                |   0
 .../bh-kernels/sparse_label.c                     |   0
 {src => pre-cleanup-src}/config/__init__.py       |   0
 {src => pre-cleanup-src}/config/constants.py      |   0
 {src => pre-cleanup-src}/config/kakapo/paths.py   |   0
 {src => pre-cleanup-src}/config/nautilus/paths.py |   0
 {src => pre-cleanup-src}/config/paths.py          |   0
 .../config/threadripper00/paths.py                |   0
 .../config/threadripper00/paths.py~               |   0
 .../config/threadripper01/paths.py                |   0
 {src => pre-cleanup-src}/contrib/cpptqdm/LICENSE  |   0
 .../contrib/cpptqdm/README.md                     |   0
 {src => pre-cleanup-src}/contrib/cpptqdm/tqdm.h   |   0
 {src => pre-cleanup-src}/convert-to-hdf5.py       |   0
 {src => pre-cleanup-src}/doitall.py               |   0
 .../experimental/histogram.cc                     |   0
 {src => pre-cleanup-src}/figures/fig_bic.py       |   0
 {src => pre-cleanup-src}/figures/figures.py       |   0
 {src => pre-cleanup-src}/figures/vedo_blood.py    |   0
 {src => pre-cleanup-src}/generate-Igauss.py       |   0
 .../generate-absorption-classes.py                |   0
 {src => pre-cleanup-src}/generate-byte-files.py   |   0
 {src => pre-cleanup-src}/generate_gauss_c.py      |   0
 {src => pre-cleanup-src}/helper_functions.py      |   0
 .../histogram_processing/compute_distributions.py |   0
 .../histogram_processing/compute_histograms.py    |   0
 .../histogram_processing/compute_probabilities.py |   0
 .../histogram_processing/compute_ridges.py        |   0
 .../histogram_processing/cubic2.py                |   0
 .../histogram_processing/distributions.py         |   0
 .../histogram_processing/material_correction.py   |   0
 .../optimize_distributions_flat.py                |   0
 .../histogram_processing/piecewise_cubic.py       |   0
 .../histogram_processing/piecewise_quadratic.py   |   0
 .../histogram_processing/pybind_kernels           |   0
 .../histogram_processing/test.py                  |   0
 {src => pre-cleanup-src}/imaging/bitmaps.py       |   0
 {src => pre-cleanup-src}/imaging/clustering.py    |   0
 {src => pre-cleanup-src}/imaging/distributions.py |   0
 {src => pre-cleanup-src}/imaging/sparse_labels.py |   0
 {src => pre-cleanup-src}/imaging/sparse_ndi.py    |   0
 {src => pre-cleanup-src}/imaging/uk_ndi.py        |   0
 {src => pre-cleanup-src}/io_modules/blockmap.py   |   0
 .../io_modules/cache_esrf2013.py                  |   0
 {src => pre-cleanup-src}/io_modules/esrf2011.py   |   0
 {src => pre-cleanup-src}/io_modules/esrf_read.py  |   0
 .../io_modules/h5-blockmap.cc                     |   0
 {src => pre-cleanup-src}/io_modules/h5tomo.py     |   0
 .../io_modules/write_video.py                     |   0
 {src => pre-cleanup-src}/limbo/datasources.py     |   0
 .../limbo/rescale-everything.py                   |   0
 {src => pre-cleanup-src}/limbo/volm.py            |   0
 {src => pre-cleanup-src}/meow/config              |   0
 {src => pre-cleanup-src}/meow/meow_variables.py   |   0
 .../meow/notebooks/00_generate_byte_data.ipynb    |   0
 .../meow/notebooks/01_volume_matcher.ipynb        |   0
 .../meow/notebooks/02_generate_scales.ipynb       |   0
 .../meow/notebooks/03_implant_analysis.ipynb      |   0
 .../notebooks/04_generate_implant_diffusion.ipynb |   0
 .../meow/notebooks/05_generate_implant_edt.ipynb  |   0
 .../meow/notebooks/06_compute_histograms.ipynb    |   0
 .../meow/notebooks/07_compute_ridges.ipynb        |   0
 .../meow/notebooks/08_compute_probabilities.ipynb |   0
 .../meow/notebooks/09_compute_segmentation.ipynb  |   0
 .../meow/notebooks/10_compute_bone_area.ipynb     |   0
 .../11_repeat_histogram_with_constraints.ipynb    |   0
 {src => pre-cleanup-src}/meow/notebooks/config    |   0
 {src => pre-cleanup-src}/meow/run_workflow.py     |   0
 .../meow/update_live_runner.py                    |   0
 .../obsolete/generate-histograms-axes.py          |   0
 .../obsolete/generate-radial-histograms.py        |   0
 .../obsolete/generate-y-histograms.py             |   0
 .../preprocess/generate-implant-diffusion.py      |   0
 .../preprocess/generate-implant-edt.py            |   0
 {src => pre-cleanup-src}/preprocess/resample.py   |   0
 .../preprocess/rescale-cupy-bin.py                |   0
 .../preprocess/rescale-cupy.py                    |   0
 .../pybind_kernels/cpu/geometry.cc                |   0
 .../pybind_kernels/cpu/histograms.cc              |   0
 .../pybind_kernels/cpu/label.cc                   |   0
 .../pybind_kernels/include/parallel.hh            |   0
 .../pybind_kernels/pybind/geometry-pybind.cc      |   0
 {src => pre-cleanup-src}/scripts/bin2npy.py       |   0
 .../scripts/closing_mask.ipynb                    |   0
 {src => pre-cleanup-src}/scripts/closing_mask.py  |   0
 {src => pre-cleanup-src}/scripts/config           |   0
 .../scripts/display_partial_segment.py            |   0
 .../scripts/generate-byte-hdf5.py                 |   0
 .../scripts/generate-scales.py                    |   0
 .../scripts/generate_gimp_probabilities.py        |   0
 .../scripts/generate_otsu_probabilities.py        |   0
 {src => pre-cleanup-src}/scripts/h5tobin.py       |   0
 {src => pre-cleanup-src}/scripts/otsu.ipynb       |   0
 .../scripts/segment_from_distributions.py         |   0
 .../scripts/volume_matcher.py                     |   0
 .../segmentation/airandbone-fn.py                 |   0
 .../segmentation/airandbone.py                    |   0
 {src => pre-cleanup-src}/segmentation/bone.py     |   0
 .../segmentation/hiresboneregion.py               |   0
 .../segmentation/implant-FoR.py                   |   0
 .../segmentation/implant-data.py                  |   0
 .../segmentation/segment-air-cc.py                |   0
 .../segmentation/segment-blood-cc.py              |   0
 .../segmentation/segment-blood-cc2.py             |   0
 .../segmentation/segment-implant-cc.py            |   0
 .../segmentation/segment-implant.py               |   0
 {src => pre-cleanup-src}/struktur.md              |   0
 {src => pre-cleanup-src}/test.py                  |   0
 src/Makefile                                      |  14 --------------
 143 files changed, 14 deletions(-)
 rename {src => pre-cleanup-src}/analysis/cylinder_surface.py (100%)
 rename {src => pre-cleanup-src}/analysis/cylinder_surface2.py (100%)
 rename {src => pre-cleanup-src}/analysis/ellipsoids.py (100%)
 rename {src => pre-cleanup-src}/analysis/getthebone.py (100%)
 rename {src => pre-cleanup-src}/analysis/orientation.py (100%)
 rename {src => pre-cleanup-src}/analysis/osteocytes.py (100%)
 rename {src => pre-cleanup-src}/analysis/video1-segmentation.py (100%)
 rename {src => pre-cleanup-src}/analysis/video2-implant_contact.py (100%)
 rename {src => pre-cleanup-src}/bh-kernels/axes_histogram.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/centres-of-mass-3xn.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/centres-of-mass.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/collect-labeled.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/collect-nonzero.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/count-labeled.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/fft-cpu.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/fft-kernel.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/inertia-matrix.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/inertia-matrix.o (100%)
 rename {src => pre-cleanup-src}/bh-kernels/lookup-cpu.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/matrix3x3/eigenvalues.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/matrix3x3/eigenvalues.o (100%)
 rename {src => pre-cleanup-src}/bh-kernels/matrix3x3/eigenvalues.py (100%)
 rename {src => pre-cleanup-src}/bh-kernels/matrix3x3/eigenvectors.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/ndi_label.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/ndi_label.py (100%)
 rename {src => pre-cleanup-src}/bh-kernels/principal-axes.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/reduceat.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/reduceat.o (100%)
 rename {src => pre-cleanup-src}/bh-kernels/rfft-cpu.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/select_segments.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/select_segments.o (100%)
 rename {src => pre-cleanup-src}/bh-kernels/select_segments.py (100%)
 rename {src => pre-cleanup-src}/bh-kernels/sliding-kernel-opencl.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/sliding-kernel.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/sliding-kernel_2D.c (100%)
 rename {src => pre-cleanup-src}/bh-kernels/sparse_label.c (100%)
 rename {src => pre-cleanup-src}/config/__init__.py (100%)
 rename {src => pre-cleanup-src}/config/constants.py (100%)
 rename {src => pre-cleanup-src}/config/kakapo/paths.py (100%)
 rename {src => pre-cleanup-src}/config/nautilus/paths.py (100%)
 rename {src => pre-cleanup-src}/config/paths.py (100%)
 rename {src => pre-cleanup-src}/config/threadripper00/paths.py (100%)
 rename {src => pre-cleanup-src}/config/threadripper00/paths.py~ (100%)
 rename {src => pre-cleanup-src}/config/threadripper01/paths.py (100%)
 rename {src => pre-cleanup-src}/contrib/cpptqdm/LICENSE (100%)
 rename {src => pre-cleanup-src}/contrib/cpptqdm/README.md (100%)
 rename {src => pre-cleanup-src}/contrib/cpptqdm/tqdm.h (100%)
 rename {src => pre-cleanup-src}/convert-to-hdf5.py (100%)
 rename {src => pre-cleanup-src}/doitall.py (100%)
 rename {src => pre-cleanup-src}/experimental/histogram.cc (100%)
 rename {src => pre-cleanup-src}/figures/fig_bic.py (100%)
 rename {src => pre-cleanup-src}/figures/figures.py (100%)
 rename {src => pre-cleanup-src}/figures/vedo_blood.py (100%)
 rename {src => pre-cleanup-src}/generate-Igauss.py (100%)
 rename {src => pre-cleanup-src}/generate-absorption-classes.py (100%)
 rename {src => pre-cleanup-src}/generate-byte-files.py (100%)
 rename {src => pre-cleanup-src}/generate_gauss_c.py (100%)
 rename {src => pre-cleanup-src}/helper_functions.py (100%)
 rename {src => pre-cleanup-src}/histogram_processing/compute_distributions.py (100%)
 rename {src => pre-cleanup-src}/histogram_processing/compute_histograms.py (100%)
 rename {src => pre-cleanup-src}/histogram_processing/compute_probabilities.py (100%)
 rename {src => pre-cleanup-src}/histogram_processing/compute_ridges.py (100%)
 rename {src => pre-cleanup-src}/histogram_processing/cubic2.py (100%)
 rename {src => pre-cleanup-src}/histogram_processing/distributions.py (100%)
 rename {src => pre-cleanup-src}/histogram_processing/material_correction.py (100%)
 rename {src => pre-cleanup-src}/histogram_processing/optimize_distributions_flat.py (100%)
 rename {src => pre-cleanup-src}/histogram_processing/piecewise_cubic.py (100%)
 rename {src => pre-cleanup-src}/histogram_processing/piecewise_quadratic.py (100%)
 rename {src => pre-cleanup-src}/histogram_processing/pybind_kernels (100%)
 rename {src => pre-cleanup-src}/histogram_processing/test.py (100%)
 rename {src => pre-cleanup-src}/imaging/bitmaps.py (100%)
 rename {src => pre-cleanup-src}/imaging/clustering.py (100%)
 rename {src => pre-cleanup-src}/imaging/distributions.py (100%)
 rename {src => pre-cleanup-src}/imaging/sparse_labels.py (100%)
 rename {src => pre-cleanup-src}/imaging/sparse_ndi.py (100%)
 rename {src => pre-cleanup-src}/imaging/uk_ndi.py (100%)
 rename {src => pre-cleanup-src}/io_modules/blockmap.py (100%)
 rename {src => pre-cleanup-src}/io_modules/cache_esrf2013.py (100%)
 rename {src => pre-cleanup-src}/io_modules/esrf2011.py (100%)
 rename {src => pre-cleanup-src}/io_modules/esrf_read.py (100%)
 rename {src => pre-cleanup-src}/io_modules/h5-blockmap.cc (100%)
 rename {src => pre-cleanup-src}/io_modules/h5tomo.py (100%)
 rename {src => pre-cleanup-src}/io_modules/write_video.py (100%)
 rename {src => pre-cleanup-src}/limbo/datasources.py (100%)
 rename {src => pre-cleanup-src}/limbo/rescale-everything.py (100%)
 rename {src => pre-cleanup-src}/limbo/volm.py (100%)
 rename {src => pre-cleanup-src}/meow/config (100%)
 rename {src => pre-cleanup-src}/meow/meow_variables.py (100%)
 rename {src => pre-cleanup-src}/meow/notebooks/00_generate_byte_data.ipynb (100%)
 rename {src => pre-cleanup-src}/meow/notebooks/01_volume_matcher.ipynb (100%)
 rename {src => pre-cleanup-src}/meow/notebooks/02_generate_scales.ipynb (100%)
 rename {src => pre-cleanup-src}/meow/notebooks/03_implant_analysis.ipynb (100%)
 rename {src => pre-cleanup-src}/meow/notebooks/04_generate_implant_diffusion.ipynb (100%)
 rename {src => pre-cleanup-src}/meow/notebooks/05_generate_implant_edt.ipynb (100%)
 rename {src => pre-cleanup-src}/meow/notebooks/06_compute_histograms.ipynb (100%)
 rename {src => pre-cleanup-src}/meow/notebooks/07_compute_ridges.ipynb (100%)
 rename {src => pre-cleanup-src}/meow/notebooks/08_compute_probabilities.ipynb (100%)
 rename {src => pre-cleanup-src}/meow/notebooks/09_compute_segmentation.ipynb (100%)
 rename {src => pre-cleanup-src}/meow/notebooks/10_compute_bone_area.ipynb (100%)
 rename {src => pre-cleanup-src}/meow/notebooks/11_repeat_histogram_with_constraints.ipynb (100%)
 rename {src => pre-cleanup-src}/meow/notebooks/config (100%)
 rename {src => pre-cleanup-src}/meow/run_workflow.py (100%)
 rename {src => pre-cleanup-src}/meow/update_live_runner.py (100%)
 rename {src => pre-cleanup-src}/obsolete/generate-histograms-axes.py (100%)
 rename {src => pre-cleanup-src}/obsolete/generate-radial-histograms.py (100%)
 rename {src => pre-cleanup-src}/obsolete/generate-y-histograms.py (100%)
 rename {src => pre-cleanup-src}/preprocess/generate-implant-diffusion.py (100%)
 rename {src => pre-cleanup-src}/preprocess/generate-implant-edt.py (100%)
 rename {src => pre-cleanup-src}/preprocess/resample.py (100%)
 rename {src => pre-cleanup-src}/preprocess/rescale-cupy-bin.py (100%)
 rename {src => pre-cleanup-src}/preprocess/rescale-cupy.py (100%)
 rename {src => pre-cleanup-src}/pybind_kernels/cpu/geometry.cc (100%)
 rename {src => pre-cleanup-src}/pybind_kernels/cpu/histograms.cc (100%)
 rename {src => pre-cleanup-src}/pybind_kernels/cpu/label.cc (100%)
 rename {src => pre-cleanup-src}/pybind_kernels/include/parallel.hh (100%)
 rename {src => pre-cleanup-src}/pybind_kernels/pybind/geometry-pybind.cc (100%)
 rename {src => pre-cleanup-src}/scripts/bin2npy.py (100%)
 rename {src => pre-cleanup-src}/scripts/closing_mask.ipynb (100%)
 rename {src => pre-cleanup-src}/scripts/closing_mask.py (100%)
 rename {src => pre-cleanup-src}/scripts/config (100%)
 rename {src => pre-cleanup-src}/scripts/display_partial_segment.py (100%)
 rename {src => pre-cleanup-src}/scripts/generate-byte-hdf5.py (100%)
 rename {src => pre-cleanup-src}/scripts/generate-scales.py (100%)
 rename {src => pre-cleanup-src}/scripts/generate_gimp_probabilities.py (100%)
 rename {src => pre-cleanup-src}/scripts/generate_otsu_probabilities.py (100%)
 rename {src => pre-cleanup-src}/scripts/h5tobin.py (100%)
 rename {src => pre-cleanup-src}/scripts/otsu.ipynb (100%)
 rename {src => pre-cleanup-src}/scripts/segment_from_distributions.py (100%)
 rename {src => pre-cleanup-src}/scripts/volume_matcher.py (100%)
 rename {src => pre-cleanup-src}/segmentation/airandbone-fn.py (100%)
 rename {src => pre-cleanup-src}/segmentation/airandbone.py (100%)
 rename {src => pre-cleanup-src}/segmentation/bone.py (100%)
 rename {src => pre-cleanup-src}/segmentation/hiresboneregion.py (100%)
 rename {src => pre-cleanup-src}/segmentation/implant-FoR.py (100%)
 rename {src => pre-cleanup-src}/segmentation/implant-data.py (100%)
 rename {src => pre-cleanup-src}/segmentation/segment-air-cc.py (100%)
 rename {src => pre-cleanup-src}/segmentation/segment-blood-cc.py (100%)
 rename {src => pre-cleanup-src}/segmentation/segment-blood-cc2.py (100%)
 rename {src => pre-cleanup-src}/segmentation/segment-implant-cc.py (100%)
 rename {src => pre-cleanup-src}/segmentation/segment-implant.py (100%)
 rename {src => pre-cleanup-src}/struktur.md (100%)
 rename {src => pre-cleanup-src}/test.py (100%)
 delete mode 100644 src/Makefile

diff --git a/src/analysis/cylinder_surface.py b/pre-cleanup-src/analysis/cylinder_surface.py
similarity index 100%
rename from src/analysis/cylinder_surface.py
rename to pre-cleanup-src/analysis/cylinder_surface.py
diff --git a/src/analysis/cylinder_surface2.py b/pre-cleanup-src/analysis/cylinder_surface2.py
similarity index 100%
rename from src/analysis/cylinder_surface2.py
rename to pre-cleanup-src/analysis/cylinder_surface2.py
diff --git a/src/analysis/ellipsoids.py b/pre-cleanup-src/analysis/ellipsoids.py
similarity index 100%
rename from src/analysis/ellipsoids.py
rename to pre-cleanup-src/analysis/ellipsoids.py
diff --git a/src/analysis/getthebone.py b/pre-cleanup-src/analysis/getthebone.py
similarity index 100%
rename from src/analysis/getthebone.py
rename to pre-cleanup-src/analysis/getthebone.py
diff --git a/src/analysis/orientation.py b/pre-cleanup-src/analysis/orientation.py
similarity index 100%
rename from src/analysis/orientation.py
rename to pre-cleanup-src/analysis/orientation.py
diff --git a/src/analysis/osteocytes.py b/pre-cleanup-src/analysis/osteocytes.py
similarity index 100%
rename from src/analysis/osteocytes.py
rename to pre-cleanup-src/analysis/osteocytes.py
diff --git a/src/analysis/video1-segmentation.py b/pre-cleanup-src/analysis/video1-segmentation.py
similarity index 100%
rename from src/analysis/video1-segmentation.py
rename to pre-cleanup-src/analysis/video1-segmentation.py
diff --git a/src/analysis/video2-implant_contact.py b/pre-cleanup-src/analysis/video2-implant_contact.py
similarity index 100%
rename from src/analysis/video2-implant_contact.py
rename to pre-cleanup-src/analysis/video2-implant_contact.py
diff --git a/src/bh-kernels/axes_histogram.c b/pre-cleanup-src/bh-kernels/axes_histogram.c
similarity index 100%
rename from src/bh-kernels/axes_histogram.c
rename to pre-cleanup-src/bh-kernels/axes_histogram.c
diff --git a/src/bh-kernels/centres-of-mass-3xn.c b/pre-cleanup-src/bh-kernels/centres-of-mass-3xn.c
similarity index 100%
rename from src/bh-kernels/centres-of-mass-3xn.c
rename to pre-cleanup-src/bh-kernels/centres-of-mass-3xn.c
diff --git a/src/bh-kernels/centres-of-mass.c b/pre-cleanup-src/bh-kernels/centres-of-mass.c
similarity index 100%
rename from src/bh-kernels/centres-of-mass.c
rename to pre-cleanup-src/bh-kernels/centres-of-mass.c
diff --git a/src/bh-kernels/collect-labeled.c b/pre-cleanup-src/bh-kernels/collect-labeled.c
similarity index 100%
rename from src/bh-kernels/collect-labeled.c
rename to pre-cleanup-src/bh-kernels/collect-labeled.c
diff --git a/src/bh-kernels/collect-nonzero.c b/pre-cleanup-src/bh-kernels/collect-nonzero.c
similarity index 100%
rename from src/bh-kernels/collect-nonzero.c
rename to pre-cleanup-src/bh-kernels/collect-nonzero.c
diff --git a/src/bh-kernels/count-labeled.c b/pre-cleanup-src/bh-kernels/count-labeled.c
similarity index 100%
rename from src/bh-kernels/count-labeled.c
rename to pre-cleanup-src/bh-kernels/count-labeled.c
diff --git a/src/bh-kernels/fft-cpu.c b/pre-cleanup-src/bh-kernels/fft-cpu.c
similarity index 100%
rename from src/bh-kernels/fft-cpu.c
rename to pre-cleanup-src/bh-kernels/fft-cpu.c
diff --git a/src/bh-kernels/fft-kernel.c b/pre-cleanup-src/bh-kernels/fft-kernel.c
similarity index 100%
rename from src/bh-kernels/fft-kernel.c
rename to pre-cleanup-src/bh-kernels/fft-kernel.c
diff --git a/src/bh-kernels/inertia-matrix.c b/pre-cleanup-src/bh-kernels/inertia-matrix.c
similarity index 100%
rename from src/bh-kernels/inertia-matrix.c
rename to pre-cleanup-src/bh-kernels/inertia-matrix.c
diff --git a/src/bh-kernels/inertia-matrix.o b/pre-cleanup-src/bh-kernels/inertia-matrix.o
similarity index 100%
rename from src/bh-kernels/inertia-matrix.o
rename to pre-cleanup-src/bh-kernels/inertia-matrix.o
diff --git a/src/bh-kernels/lookup-cpu.c b/pre-cleanup-src/bh-kernels/lookup-cpu.c
similarity index 100%
rename from src/bh-kernels/lookup-cpu.c
rename to pre-cleanup-src/bh-kernels/lookup-cpu.c
diff --git a/src/bh-kernels/matrix3x3/eigenvalues.c b/pre-cleanup-src/bh-kernels/matrix3x3/eigenvalues.c
similarity index 100%
rename from src/bh-kernels/matrix3x3/eigenvalues.c
rename to pre-cleanup-src/bh-kernels/matrix3x3/eigenvalues.c
diff --git a/src/bh-kernels/matrix3x3/eigenvalues.o b/pre-cleanup-src/bh-kernels/matrix3x3/eigenvalues.o
similarity index 100%
rename from src/bh-kernels/matrix3x3/eigenvalues.o
rename to pre-cleanup-src/bh-kernels/matrix3x3/eigenvalues.o
diff --git a/src/bh-kernels/matrix3x3/eigenvalues.py b/pre-cleanup-src/bh-kernels/matrix3x3/eigenvalues.py
similarity index 100%
rename from src/bh-kernels/matrix3x3/eigenvalues.py
rename to pre-cleanup-src/bh-kernels/matrix3x3/eigenvalues.py
diff --git a/src/bh-kernels/matrix3x3/eigenvectors.c b/pre-cleanup-src/bh-kernels/matrix3x3/eigenvectors.c
similarity index 100%
rename from src/bh-kernels/matrix3x3/eigenvectors.c
rename to pre-cleanup-src/bh-kernels/matrix3x3/eigenvectors.c
diff --git a/src/bh-kernels/ndi_label.c b/pre-cleanup-src/bh-kernels/ndi_label.c
similarity index 100%
rename from src/bh-kernels/ndi_label.c
rename to pre-cleanup-src/bh-kernels/ndi_label.c
diff --git a/src/bh-kernels/ndi_label.py b/pre-cleanup-src/bh-kernels/ndi_label.py
similarity index 100%
rename from src/bh-kernels/ndi_label.py
rename to pre-cleanup-src/bh-kernels/ndi_label.py
diff --git a/src/bh-kernels/principal-axes.c b/pre-cleanup-src/bh-kernels/principal-axes.c
similarity index 100%
rename from src/bh-kernels/principal-axes.c
rename to pre-cleanup-src/bh-kernels/principal-axes.c
diff --git a/src/bh-kernels/reduceat.c b/pre-cleanup-src/bh-kernels/reduceat.c
similarity index 100%
rename from src/bh-kernels/reduceat.c
rename to pre-cleanup-src/bh-kernels/reduceat.c
diff --git a/src/bh-kernels/reduceat.o b/pre-cleanup-src/bh-kernels/reduceat.o
similarity index 100%
rename from src/bh-kernels/reduceat.o
rename to pre-cleanup-src/bh-kernels/reduceat.o
diff --git a/src/bh-kernels/rfft-cpu.c b/pre-cleanup-src/bh-kernels/rfft-cpu.c
similarity index 100%
rename from src/bh-kernels/rfft-cpu.c
rename to pre-cleanup-src/bh-kernels/rfft-cpu.c
diff --git a/src/bh-kernels/select_segments.c b/pre-cleanup-src/bh-kernels/select_segments.c
similarity index 100%
rename from src/bh-kernels/select_segments.c
rename to pre-cleanup-src/bh-kernels/select_segments.c
diff --git a/src/bh-kernels/select_segments.o b/pre-cleanup-src/bh-kernels/select_segments.o
similarity index 100%
rename from src/bh-kernels/select_segments.o
rename to pre-cleanup-src/bh-kernels/select_segments.o
diff --git a/src/bh-kernels/select_segments.py b/pre-cleanup-src/bh-kernels/select_segments.py
similarity index 100%
rename from src/bh-kernels/select_segments.py
rename to pre-cleanup-src/bh-kernels/select_segments.py
diff --git a/src/bh-kernels/sliding-kernel-opencl.c b/pre-cleanup-src/bh-kernels/sliding-kernel-opencl.c
similarity index 100%
rename from src/bh-kernels/sliding-kernel-opencl.c
rename to pre-cleanup-src/bh-kernels/sliding-kernel-opencl.c
diff --git a/src/bh-kernels/sliding-kernel.c b/pre-cleanup-src/bh-kernels/sliding-kernel.c
similarity index 100%
rename from src/bh-kernels/sliding-kernel.c
rename to pre-cleanup-src/bh-kernels/sliding-kernel.c
diff --git a/src/bh-kernels/sliding-kernel_2D.c b/pre-cleanup-src/bh-kernels/sliding-kernel_2D.c
similarity index 100%
rename from src/bh-kernels/sliding-kernel_2D.c
rename to pre-cleanup-src/bh-kernels/sliding-kernel_2D.c
diff --git a/src/bh-kernels/sparse_label.c b/pre-cleanup-src/bh-kernels/sparse_label.c
similarity index 100%
rename from src/bh-kernels/sparse_label.c
rename to pre-cleanup-src/bh-kernels/sparse_label.c
diff --git a/src/config/__init__.py b/pre-cleanup-src/config/__init__.py
similarity index 100%
rename from src/config/__init__.py
rename to pre-cleanup-src/config/__init__.py
diff --git a/src/config/constants.py b/pre-cleanup-src/config/constants.py
similarity index 100%
rename from src/config/constants.py
rename to pre-cleanup-src/config/constants.py
diff --git a/src/config/kakapo/paths.py b/pre-cleanup-src/config/kakapo/paths.py
similarity index 100%
rename from src/config/kakapo/paths.py
rename to pre-cleanup-src/config/kakapo/paths.py
diff --git a/src/config/nautilus/paths.py b/pre-cleanup-src/config/nautilus/paths.py
similarity index 100%
rename from src/config/nautilus/paths.py
rename to pre-cleanup-src/config/nautilus/paths.py
diff --git a/src/config/paths.py b/pre-cleanup-src/config/paths.py
similarity index 100%
rename from src/config/paths.py
rename to pre-cleanup-src/config/paths.py
diff --git a/src/config/threadripper00/paths.py b/pre-cleanup-src/config/threadripper00/paths.py
similarity index 100%
rename from src/config/threadripper00/paths.py
rename to pre-cleanup-src/config/threadripper00/paths.py
diff --git a/src/config/threadripper00/paths.py~ b/pre-cleanup-src/config/threadripper00/paths.py~
similarity index 100%
rename from src/config/threadripper00/paths.py~
rename to pre-cleanup-src/config/threadripper00/paths.py~
diff --git a/src/config/threadripper01/paths.py b/pre-cleanup-src/config/threadripper01/paths.py
similarity index 100%
rename from src/config/threadripper01/paths.py
rename to pre-cleanup-src/config/threadripper01/paths.py
diff --git a/src/contrib/cpptqdm/LICENSE b/pre-cleanup-src/contrib/cpptqdm/LICENSE
similarity index 100%
rename from src/contrib/cpptqdm/LICENSE
rename to pre-cleanup-src/contrib/cpptqdm/LICENSE
diff --git a/src/contrib/cpptqdm/README.md b/pre-cleanup-src/contrib/cpptqdm/README.md
similarity index 100%
rename from src/contrib/cpptqdm/README.md
rename to pre-cleanup-src/contrib/cpptqdm/README.md
diff --git a/src/contrib/cpptqdm/tqdm.h b/pre-cleanup-src/contrib/cpptqdm/tqdm.h
similarity index 100%
rename from src/contrib/cpptqdm/tqdm.h
rename to pre-cleanup-src/contrib/cpptqdm/tqdm.h
diff --git a/src/convert-to-hdf5.py b/pre-cleanup-src/convert-to-hdf5.py
similarity index 100%
rename from src/convert-to-hdf5.py
rename to pre-cleanup-src/convert-to-hdf5.py
diff --git a/src/doitall.py b/pre-cleanup-src/doitall.py
similarity index 100%
rename from src/doitall.py
rename to pre-cleanup-src/doitall.py
diff --git a/src/experimental/histogram.cc b/pre-cleanup-src/experimental/histogram.cc
similarity index 100%
rename from src/experimental/histogram.cc
rename to pre-cleanup-src/experimental/histogram.cc
diff --git a/src/figures/fig_bic.py b/pre-cleanup-src/figures/fig_bic.py
similarity index 100%
rename from src/figures/fig_bic.py
rename to pre-cleanup-src/figures/fig_bic.py
diff --git a/src/figures/figures.py b/pre-cleanup-src/figures/figures.py
similarity index 100%
rename from src/figures/figures.py
rename to pre-cleanup-src/figures/figures.py
diff --git a/src/figures/vedo_blood.py b/pre-cleanup-src/figures/vedo_blood.py
similarity index 100%
rename from src/figures/vedo_blood.py
rename to pre-cleanup-src/figures/vedo_blood.py
diff --git a/src/generate-Igauss.py b/pre-cleanup-src/generate-Igauss.py
similarity index 100%
rename from src/generate-Igauss.py
rename to pre-cleanup-src/generate-Igauss.py
diff --git a/src/generate-absorption-classes.py b/pre-cleanup-src/generate-absorption-classes.py
similarity index 100%
rename from src/generate-absorption-classes.py
rename to pre-cleanup-src/generate-absorption-classes.py
diff --git a/src/generate-byte-files.py b/pre-cleanup-src/generate-byte-files.py
similarity index 100%
rename from src/generate-byte-files.py
rename to pre-cleanup-src/generate-byte-files.py
diff --git a/src/generate_gauss_c.py b/pre-cleanup-src/generate_gauss_c.py
similarity index 100%
rename from src/generate_gauss_c.py
rename to pre-cleanup-src/generate_gauss_c.py
diff --git a/src/helper_functions.py b/pre-cleanup-src/helper_functions.py
similarity index 100%
rename from src/helper_functions.py
rename to pre-cleanup-src/helper_functions.py
diff --git a/src/histogram_processing/compute_distributions.py b/pre-cleanup-src/histogram_processing/compute_distributions.py
similarity index 100%
rename from src/histogram_processing/compute_distributions.py
rename to pre-cleanup-src/histogram_processing/compute_distributions.py
diff --git a/src/histogram_processing/compute_histograms.py b/pre-cleanup-src/histogram_processing/compute_histograms.py
similarity index 100%
rename from src/histogram_processing/compute_histograms.py
rename to pre-cleanup-src/histogram_processing/compute_histograms.py
diff --git a/src/histogram_processing/compute_probabilities.py b/pre-cleanup-src/histogram_processing/compute_probabilities.py
similarity index 100%
rename from src/histogram_processing/compute_probabilities.py
rename to pre-cleanup-src/histogram_processing/compute_probabilities.py
diff --git a/src/histogram_processing/compute_ridges.py b/pre-cleanup-src/histogram_processing/compute_ridges.py
similarity index 100%
rename from src/histogram_processing/compute_ridges.py
rename to pre-cleanup-src/histogram_processing/compute_ridges.py
diff --git a/src/histogram_processing/cubic2.py b/pre-cleanup-src/histogram_processing/cubic2.py
similarity index 100%
rename from src/histogram_processing/cubic2.py
rename to pre-cleanup-src/histogram_processing/cubic2.py
diff --git a/src/histogram_processing/distributions.py b/pre-cleanup-src/histogram_processing/distributions.py
similarity index 100%
rename from src/histogram_processing/distributions.py
rename to pre-cleanup-src/histogram_processing/distributions.py
diff --git a/src/histogram_processing/material_correction.py b/pre-cleanup-src/histogram_processing/material_correction.py
similarity index 100%
rename from src/histogram_processing/material_correction.py
rename to pre-cleanup-src/histogram_processing/material_correction.py
diff --git a/src/histogram_processing/optimize_distributions_flat.py b/pre-cleanup-src/histogram_processing/optimize_distributions_flat.py
similarity index 100%
rename from src/histogram_processing/optimize_distributions_flat.py
rename to pre-cleanup-src/histogram_processing/optimize_distributions_flat.py
diff --git a/src/histogram_processing/piecewise_cubic.py b/pre-cleanup-src/histogram_processing/piecewise_cubic.py
similarity index 100%
rename from src/histogram_processing/piecewise_cubic.py
rename to pre-cleanup-src/histogram_processing/piecewise_cubic.py
diff --git a/src/histogram_processing/piecewise_quadratic.py b/pre-cleanup-src/histogram_processing/piecewise_quadratic.py
similarity index 100%
rename from src/histogram_processing/piecewise_quadratic.py
rename to pre-cleanup-src/histogram_processing/piecewise_quadratic.py
diff --git a/src/histogram_processing/pybind_kernels b/pre-cleanup-src/histogram_processing/pybind_kernels
similarity index 100%
rename from src/histogram_processing/pybind_kernels
rename to pre-cleanup-src/histogram_processing/pybind_kernels
diff --git a/src/histogram_processing/test.py b/pre-cleanup-src/histogram_processing/test.py
similarity index 100%
rename from src/histogram_processing/test.py
rename to pre-cleanup-src/histogram_processing/test.py
diff --git a/src/imaging/bitmaps.py b/pre-cleanup-src/imaging/bitmaps.py
similarity index 100%
rename from src/imaging/bitmaps.py
rename to pre-cleanup-src/imaging/bitmaps.py
diff --git a/src/imaging/clustering.py b/pre-cleanup-src/imaging/clustering.py
similarity index 100%
rename from src/imaging/clustering.py
rename to pre-cleanup-src/imaging/clustering.py
diff --git a/src/imaging/distributions.py b/pre-cleanup-src/imaging/distributions.py
similarity index 100%
rename from src/imaging/distributions.py
rename to pre-cleanup-src/imaging/distributions.py
diff --git a/src/imaging/sparse_labels.py b/pre-cleanup-src/imaging/sparse_labels.py
similarity index 100%
rename from src/imaging/sparse_labels.py
rename to pre-cleanup-src/imaging/sparse_labels.py
diff --git a/src/imaging/sparse_ndi.py b/pre-cleanup-src/imaging/sparse_ndi.py
similarity index 100%
rename from src/imaging/sparse_ndi.py
rename to pre-cleanup-src/imaging/sparse_ndi.py
diff --git a/src/imaging/uk_ndi.py b/pre-cleanup-src/imaging/uk_ndi.py
similarity index 100%
rename from src/imaging/uk_ndi.py
rename to pre-cleanup-src/imaging/uk_ndi.py
diff --git a/src/io_modules/blockmap.py b/pre-cleanup-src/io_modules/blockmap.py
similarity index 100%
rename from src/io_modules/blockmap.py
rename to pre-cleanup-src/io_modules/blockmap.py
diff --git a/src/io_modules/cache_esrf2013.py b/pre-cleanup-src/io_modules/cache_esrf2013.py
similarity index 100%
rename from src/io_modules/cache_esrf2013.py
rename to pre-cleanup-src/io_modules/cache_esrf2013.py
diff --git a/src/io_modules/esrf2011.py b/pre-cleanup-src/io_modules/esrf2011.py
similarity index 100%
rename from src/io_modules/esrf2011.py
rename to pre-cleanup-src/io_modules/esrf2011.py
diff --git a/src/io_modules/esrf_read.py b/pre-cleanup-src/io_modules/esrf_read.py
similarity index 100%
rename from src/io_modules/esrf_read.py
rename to pre-cleanup-src/io_modules/esrf_read.py
diff --git a/src/io_modules/h5-blockmap.cc b/pre-cleanup-src/io_modules/h5-blockmap.cc
similarity index 100%
rename from src/io_modules/h5-blockmap.cc
rename to pre-cleanup-src/io_modules/h5-blockmap.cc
diff --git a/src/io_modules/h5tomo.py b/pre-cleanup-src/io_modules/h5tomo.py
similarity index 100%
rename from src/io_modules/h5tomo.py
rename to pre-cleanup-src/io_modules/h5tomo.py
diff --git a/src/io_modules/write_video.py b/pre-cleanup-src/io_modules/write_video.py
similarity index 100%
rename from src/io_modules/write_video.py
rename to pre-cleanup-src/io_modules/write_video.py
diff --git a/src/limbo/datasources.py b/pre-cleanup-src/limbo/datasources.py
similarity index 100%
rename from src/limbo/datasources.py
rename to pre-cleanup-src/limbo/datasources.py
diff --git a/src/limbo/rescale-everything.py b/pre-cleanup-src/limbo/rescale-everything.py
similarity index 100%
rename from src/limbo/rescale-everything.py
rename to pre-cleanup-src/limbo/rescale-everything.py
diff --git a/src/limbo/volm.py b/pre-cleanup-src/limbo/volm.py
similarity index 100%
rename from src/limbo/volm.py
rename to pre-cleanup-src/limbo/volm.py
diff --git a/src/meow/config b/pre-cleanup-src/meow/config
similarity index 100%
rename from src/meow/config
rename to pre-cleanup-src/meow/config
diff --git a/src/meow/meow_variables.py b/pre-cleanup-src/meow/meow_variables.py
similarity index 100%
rename from src/meow/meow_variables.py
rename to pre-cleanup-src/meow/meow_variables.py
diff --git a/src/meow/notebooks/00_generate_byte_data.ipynb b/pre-cleanup-src/meow/notebooks/00_generate_byte_data.ipynb
similarity index 100%
rename from src/meow/notebooks/00_generate_byte_data.ipynb
rename to pre-cleanup-src/meow/notebooks/00_generate_byte_data.ipynb
diff --git a/src/meow/notebooks/01_volume_matcher.ipynb b/pre-cleanup-src/meow/notebooks/01_volume_matcher.ipynb
similarity index 100%
rename from src/meow/notebooks/01_volume_matcher.ipynb
rename to pre-cleanup-src/meow/notebooks/01_volume_matcher.ipynb
diff --git a/src/meow/notebooks/02_generate_scales.ipynb b/pre-cleanup-src/meow/notebooks/02_generate_scales.ipynb
similarity index 100%
rename from src/meow/notebooks/02_generate_scales.ipynb
rename to pre-cleanup-src/meow/notebooks/02_generate_scales.ipynb
diff --git a/src/meow/notebooks/03_implant_analysis.ipynb b/pre-cleanup-src/meow/notebooks/03_implant_analysis.ipynb
similarity index 100%
rename from src/meow/notebooks/03_implant_analysis.ipynb
rename to pre-cleanup-src/meow/notebooks/03_implant_analysis.ipynb
diff --git a/src/meow/notebooks/04_generate_implant_diffusion.ipynb b/pre-cleanup-src/meow/notebooks/04_generate_implant_diffusion.ipynb
similarity index 100%
rename from src/meow/notebooks/04_generate_implant_diffusion.ipynb
rename to pre-cleanup-src/meow/notebooks/04_generate_implant_diffusion.ipynb
diff --git a/src/meow/notebooks/05_generate_implant_edt.ipynb b/pre-cleanup-src/meow/notebooks/05_generate_implant_edt.ipynb
similarity index 100%
rename from src/meow/notebooks/05_generate_implant_edt.ipynb
rename to pre-cleanup-src/meow/notebooks/05_generate_implant_edt.ipynb
diff --git a/src/meow/notebooks/06_compute_histograms.ipynb b/pre-cleanup-src/meow/notebooks/06_compute_histograms.ipynb
similarity index 100%
rename from src/meow/notebooks/06_compute_histograms.ipynb
rename to pre-cleanup-src/meow/notebooks/06_compute_histograms.ipynb
diff --git a/src/meow/notebooks/07_compute_ridges.ipynb b/pre-cleanup-src/meow/notebooks/07_compute_ridges.ipynb
similarity index 100%
rename from src/meow/notebooks/07_compute_ridges.ipynb
rename to pre-cleanup-src/meow/notebooks/07_compute_ridges.ipynb
diff --git a/src/meow/notebooks/08_compute_probabilities.ipynb b/pre-cleanup-src/meow/notebooks/08_compute_probabilities.ipynb
similarity index 100%
rename from src/meow/notebooks/08_compute_probabilities.ipynb
rename to pre-cleanup-src/meow/notebooks/08_compute_probabilities.ipynb
diff --git a/src/meow/notebooks/09_compute_segmentation.ipynb b/pre-cleanup-src/meow/notebooks/09_compute_segmentation.ipynb
similarity index 100%
rename from src/meow/notebooks/09_compute_segmentation.ipynb
rename to pre-cleanup-src/meow/notebooks/09_compute_segmentation.ipynb
diff --git a/src/meow/notebooks/10_compute_bone_area.ipynb b/pre-cleanup-src/meow/notebooks/10_compute_bone_area.ipynb
similarity index 100%
rename from src/meow/notebooks/10_compute_bone_area.ipynb
rename to pre-cleanup-src/meow/notebooks/10_compute_bone_area.ipynb
diff --git a/src/meow/notebooks/11_repeat_histogram_with_constraints.ipynb b/pre-cleanup-src/meow/notebooks/11_repeat_histogram_with_constraints.ipynb
similarity index 100%
rename from src/meow/notebooks/11_repeat_histogram_with_constraints.ipynb
rename to pre-cleanup-src/meow/notebooks/11_repeat_histogram_with_constraints.ipynb
diff --git a/src/meow/notebooks/config b/pre-cleanup-src/meow/notebooks/config
similarity index 100%
rename from src/meow/notebooks/config
rename to pre-cleanup-src/meow/notebooks/config
diff --git a/src/meow/run_workflow.py b/pre-cleanup-src/meow/run_workflow.py
similarity index 100%
rename from src/meow/run_workflow.py
rename to pre-cleanup-src/meow/run_workflow.py
diff --git a/src/meow/update_live_runner.py b/pre-cleanup-src/meow/update_live_runner.py
similarity index 100%
rename from src/meow/update_live_runner.py
rename to pre-cleanup-src/meow/update_live_runner.py
diff --git a/src/obsolete/generate-histograms-axes.py b/pre-cleanup-src/obsolete/generate-histograms-axes.py
similarity index 100%
rename from src/obsolete/generate-histograms-axes.py
rename to pre-cleanup-src/obsolete/generate-histograms-axes.py
diff --git a/src/obsolete/generate-radial-histograms.py b/pre-cleanup-src/obsolete/generate-radial-histograms.py
similarity index 100%
rename from src/obsolete/generate-radial-histograms.py
rename to pre-cleanup-src/obsolete/generate-radial-histograms.py
diff --git a/src/obsolete/generate-y-histograms.py b/pre-cleanup-src/obsolete/generate-y-histograms.py
similarity index 100%
rename from src/obsolete/generate-y-histograms.py
rename to pre-cleanup-src/obsolete/generate-y-histograms.py
diff --git a/src/preprocess/generate-implant-diffusion.py b/pre-cleanup-src/preprocess/generate-implant-diffusion.py
similarity index 100%
rename from src/preprocess/generate-implant-diffusion.py
rename to pre-cleanup-src/preprocess/generate-implant-diffusion.py
diff --git a/src/preprocess/generate-implant-edt.py b/pre-cleanup-src/preprocess/generate-implant-edt.py
similarity index 100%
rename from src/preprocess/generate-implant-edt.py
rename to pre-cleanup-src/preprocess/generate-implant-edt.py
diff --git a/src/preprocess/resample.py b/pre-cleanup-src/preprocess/resample.py
similarity index 100%
rename from src/preprocess/resample.py
rename to pre-cleanup-src/preprocess/resample.py
diff --git a/src/preprocess/rescale-cupy-bin.py b/pre-cleanup-src/preprocess/rescale-cupy-bin.py
similarity index 100%
rename from src/preprocess/rescale-cupy-bin.py
rename to pre-cleanup-src/preprocess/rescale-cupy-bin.py
diff --git a/src/preprocess/rescale-cupy.py b/pre-cleanup-src/preprocess/rescale-cupy.py
similarity index 100%
rename from src/preprocess/rescale-cupy.py
rename to pre-cleanup-src/preprocess/rescale-cupy.py
diff --git a/src/pybind_kernels/cpu/geometry.cc b/pre-cleanup-src/pybind_kernels/cpu/geometry.cc
similarity index 100%
rename from src/pybind_kernels/cpu/geometry.cc
rename to pre-cleanup-src/pybind_kernels/cpu/geometry.cc
diff --git a/src/pybind_kernels/cpu/histograms.cc b/pre-cleanup-src/pybind_kernels/cpu/histograms.cc
similarity index 100%
rename from src/pybind_kernels/cpu/histograms.cc
rename to pre-cleanup-src/pybind_kernels/cpu/histograms.cc
diff --git a/src/pybind_kernels/cpu/label.cc b/pre-cleanup-src/pybind_kernels/cpu/label.cc
similarity index 100%
rename from src/pybind_kernels/cpu/label.cc
rename to pre-cleanup-src/pybind_kernels/cpu/label.cc
diff --git a/src/pybind_kernels/include/parallel.hh b/pre-cleanup-src/pybind_kernels/include/parallel.hh
similarity index 100%
rename from src/pybind_kernels/include/parallel.hh
rename to pre-cleanup-src/pybind_kernels/include/parallel.hh
diff --git a/src/pybind_kernels/pybind/geometry-pybind.cc b/pre-cleanup-src/pybind_kernels/pybind/geometry-pybind.cc
similarity index 100%
rename from src/pybind_kernels/pybind/geometry-pybind.cc
rename to pre-cleanup-src/pybind_kernels/pybind/geometry-pybind.cc
diff --git a/src/scripts/bin2npy.py b/pre-cleanup-src/scripts/bin2npy.py
similarity index 100%
rename from src/scripts/bin2npy.py
rename to pre-cleanup-src/scripts/bin2npy.py
diff --git a/src/scripts/closing_mask.ipynb b/pre-cleanup-src/scripts/closing_mask.ipynb
similarity index 100%
rename from src/scripts/closing_mask.ipynb
rename to pre-cleanup-src/scripts/closing_mask.ipynb
diff --git a/src/scripts/closing_mask.py b/pre-cleanup-src/scripts/closing_mask.py
similarity index 100%
rename from src/scripts/closing_mask.py
rename to pre-cleanup-src/scripts/closing_mask.py
diff --git a/src/scripts/config b/pre-cleanup-src/scripts/config
similarity index 100%
rename from src/scripts/config
rename to pre-cleanup-src/scripts/config
diff --git a/src/scripts/display_partial_segment.py b/pre-cleanup-src/scripts/display_partial_segment.py
similarity index 100%
rename from src/scripts/display_partial_segment.py
rename to pre-cleanup-src/scripts/display_partial_segment.py
diff --git a/src/scripts/generate-byte-hdf5.py b/pre-cleanup-src/scripts/generate-byte-hdf5.py
similarity index 100%
rename from src/scripts/generate-byte-hdf5.py
rename to pre-cleanup-src/scripts/generate-byte-hdf5.py
diff --git a/src/scripts/generate-scales.py b/pre-cleanup-src/scripts/generate-scales.py
similarity index 100%
rename from src/scripts/generate-scales.py
rename to pre-cleanup-src/scripts/generate-scales.py
diff --git a/src/scripts/generate_gimp_probabilities.py b/pre-cleanup-src/scripts/generate_gimp_probabilities.py
similarity index 100%
rename from src/scripts/generate_gimp_probabilities.py
rename to pre-cleanup-src/scripts/generate_gimp_probabilities.py
diff --git a/src/scripts/generate_otsu_probabilities.py b/pre-cleanup-src/scripts/generate_otsu_probabilities.py
similarity index 100%
rename from src/scripts/generate_otsu_probabilities.py
rename to pre-cleanup-src/scripts/generate_otsu_probabilities.py
diff --git a/src/scripts/h5tobin.py b/pre-cleanup-src/scripts/h5tobin.py
similarity index 100%
rename from src/scripts/h5tobin.py
rename to pre-cleanup-src/scripts/h5tobin.py
diff --git a/src/scripts/otsu.ipynb b/pre-cleanup-src/scripts/otsu.ipynb
similarity index 100%
rename from src/scripts/otsu.ipynb
rename to pre-cleanup-src/scripts/otsu.ipynb
diff --git a/src/scripts/segment_from_distributions.py b/pre-cleanup-src/scripts/segment_from_distributions.py
similarity index 100%
rename from src/scripts/segment_from_distributions.py
rename to pre-cleanup-src/scripts/segment_from_distributions.py
diff --git a/src/scripts/volume_matcher.py b/pre-cleanup-src/scripts/volume_matcher.py
similarity index 100%
rename from src/scripts/volume_matcher.py
rename to pre-cleanup-src/scripts/volume_matcher.py
diff --git a/src/segmentation/airandbone-fn.py b/pre-cleanup-src/segmentation/airandbone-fn.py
similarity index 100%
rename from src/segmentation/airandbone-fn.py
rename to pre-cleanup-src/segmentation/airandbone-fn.py
diff --git a/src/segmentation/airandbone.py b/pre-cleanup-src/segmentation/airandbone.py
similarity index 100%
rename from src/segmentation/airandbone.py
rename to pre-cleanup-src/segmentation/airandbone.py
diff --git a/src/segmentation/bone.py b/pre-cleanup-src/segmentation/bone.py
similarity index 100%
rename from src/segmentation/bone.py
rename to pre-cleanup-src/segmentation/bone.py
diff --git a/src/segmentation/hiresboneregion.py b/pre-cleanup-src/segmentation/hiresboneregion.py
similarity index 100%
rename from src/segmentation/hiresboneregion.py
rename to pre-cleanup-src/segmentation/hiresboneregion.py
diff --git a/src/segmentation/implant-FoR.py b/pre-cleanup-src/segmentation/implant-FoR.py
similarity index 100%
rename from src/segmentation/implant-FoR.py
rename to pre-cleanup-src/segmentation/implant-FoR.py
diff --git a/src/segmentation/implant-data.py b/pre-cleanup-src/segmentation/implant-data.py
similarity index 100%
rename from src/segmentation/implant-data.py
rename to pre-cleanup-src/segmentation/implant-data.py
diff --git a/src/segmentation/segment-air-cc.py b/pre-cleanup-src/segmentation/segment-air-cc.py
similarity index 100%
rename from src/segmentation/segment-air-cc.py
rename to pre-cleanup-src/segmentation/segment-air-cc.py
diff --git a/src/segmentation/segment-blood-cc.py b/pre-cleanup-src/segmentation/segment-blood-cc.py
similarity index 100%
rename from src/segmentation/segment-blood-cc.py
rename to pre-cleanup-src/segmentation/segment-blood-cc.py
diff --git a/src/segmentation/segment-blood-cc2.py b/pre-cleanup-src/segmentation/segment-blood-cc2.py
similarity index 100%
rename from src/segmentation/segment-blood-cc2.py
rename to pre-cleanup-src/segmentation/segment-blood-cc2.py
diff --git a/src/segmentation/segment-implant-cc.py b/pre-cleanup-src/segmentation/segment-implant-cc.py
similarity index 100%
rename from src/segmentation/segment-implant-cc.py
rename to pre-cleanup-src/segmentation/segment-implant-cc.py
diff --git a/src/segmentation/segment-implant.py b/pre-cleanup-src/segmentation/segment-implant.py
similarity index 100%
rename from src/segmentation/segment-implant.py
rename to pre-cleanup-src/segmentation/segment-implant.py
diff --git a/src/struktur.md b/pre-cleanup-src/struktur.md
similarity index 100%
rename from src/struktur.md
rename to pre-cleanup-src/struktur.md
diff --git a/src/test.py b/pre-cleanup-src/test.py
similarity index 100%
rename from src/test.py
rename to pre-cleanup-src/test.py
diff --git a/src/Makefile b/src/Makefile
deleted file mode 100644
index bc449e1..0000000
--- a/src/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-CXX=icpc
-CXXFLAGS += -std=c++17
-CXXFLAGS += $(shell pkg-config --cflags fmt)
-CXXFLAGS += $(shell pkg-config --cflags hdf5)
-LIBS += $(shell pkg-config --libs fmt)
-LIBS += $(shell pkg-config --libs hdf5)
-
-%.o: %.cc
-	$(CXX) $(CXXFLAGS) -c $< 
-
-h5-blockmap: h5-blockmap.o
-	echo $(LIBS)
-	$(CXX) $(CXXFLAGS) $< $(LIBS) -o $@
-

From e738c63a6e1079695bf3e81fe14c6331c571495e Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 12:56:09 +0100
Subject: [PATCH 017/136] Moved structure notes to github issues

---
 pre-cleanup-src/struktur.md | 42 -------------------------------------
 1 file changed, 42 deletions(-)
 delete mode 100644 pre-cleanup-src/struktur.md

diff --git a/pre-cleanup-src/struktur.md b/pre-cleanup-src/struktur.md
deleted file mode 100644
index f8e10b1..0000000
--- a/pre-cleanup-src/struktur.md
+++ /dev/null
@@ -1,42 +0,0 @@
-src/
-    __init__.py
-    config/
-        constants.py
-        paths.py
-        threadripper00.json
-    lib/
-        __init__.py
-        cpp/
-            cpu/
-            cpu_seq/
-            gpu/
-            best/
-            include/
-        py/ # TODO tænk over hvordan de vælger implementation -- gerne hvordan det trickler "nedad"
-            Istedet for at loade al data ind i ram og så køre blokvist over på GPU, så udnyt async yield til at lave en generator! 
-            async memmap! 
-            geometry/
-                FoR_me.py
-    debug-explore/
-        *.ipynb
-    processing_steps/ # kun cli ting der kører af sig selv (+rapport ting over hvad der skete)
-        100-.py
-        200-
-    pybind/
-        *-pybind.cc
-    test/
-        pybind-*.py
-        større-test(s).py
-    utils/
-        io/
-        histograms/
-        alternative_processing_steps/
-    doitall.sh
-
-sæt ci op som test lokalt > generer fil > github action tjekker om fil rapporten matcher git commit hash og melder korrekt test kørsel (eller noget i den dur!)
-
-under oprydning, hold til samme argument interface som de andre! (i.e. compute_ridges gør ikke ( ͡° ͜ʖ ͡°) )
-
-gennemgå doitall og hiv de relevante ud i processing_steps. Dertil kør alt igennem! 
-
-doitall skal også lave en rapport tex. (tænk applied ML small assignment rapporten)
\ No newline at end of file

From 8bd436761d5da89a4957b942cf4366c96bbb3be7 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 14:46:47 +0100
Subject: [PATCH 018/136] #25 Moved current updated files to match new
 structure

---
 .gitignore                                      |  1 -
 pre-cleanup-src/Makefile                        | 14 ++++++++++++++
 src/{pybind_kernels => }/Makefile               | 17 ++++++++---------
 src/{pybind_kernels => lib/cpp}/cpu/io.cc       |  0
 .../cpp}/cpu/morphology.cc                      |  0
 src/{pybind_kernels => lib/cpp}/cpu_seq/io.cc   |  0
 .../cpp}/cpu_seq/morphology.cc                  |  0
 src/{pybind_kernels => lib/cpp}/gpu/io.cc       |  0
 .../cpp}/gpu/morphology.cc                      |  0
 .../cpp}/include/datatypes.hh                   |  0
 src/{pybind_kernels => lib/cpp}/include/io.hh   |  0
 .../cpp}/include/morphology.hh                  |  0
 src/{pybind_kernels => }/pybind/io-pybind.cc    |  0
 .../pybind/morphology-pybind.cc                 |  0
 src/pybind_kernels/Readme.md                    |  1 -
 src/pybind_kernels/__init__.py                  |  0
 src/{pybind_kernels => }/test/test_io.py        |  2 +-
 .../test/test_morphology.py                     |  2 +-
 18 files changed, 24 insertions(+), 13 deletions(-)
 create mode 100644 pre-cleanup-src/Makefile
 rename src/{pybind_kernels => }/Makefile (69%)
 rename src/{pybind_kernels => lib/cpp}/cpu/io.cc (100%)
 rename src/{pybind_kernels => lib/cpp}/cpu/morphology.cc (100%)
 rename src/{pybind_kernels => lib/cpp}/cpu_seq/io.cc (100%)
 rename src/{pybind_kernels => lib/cpp}/cpu_seq/morphology.cc (100%)
 rename src/{pybind_kernels => lib/cpp}/gpu/io.cc (100%)
 rename src/{pybind_kernels => lib/cpp}/gpu/morphology.cc (100%)
 rename src/{pybind_kernels => lib/cpp}/include/datatypes.hh (100%)
 rename src/{pybind_kernels => lib/cpp}/include/io.hh (100%)
 rename src/{pybind_kernels => lib/cpp}/include/morphology.hh (100%)
 rename src/{pybind_kernels => }/pybind/io-pybind.cc (100%)
 rename src/{pybind_kernels => }/pybind/morphology-pybind.cc (100%)
 delete mode 100644 src/pybind_kernels/Readme.md
 delete mode 100644 src/pybind_kernels/__init__.py
 rename src/{pybind_kernels => }/test/test_io.py (98%)
 rename src/{pybind_kernels => }/test/test_morphology.py (97%)

diff --git a/.gitignore b/.gitignore
index 6e48464..5743ee6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,7 +5,6 @@ CMakeCache.txt
 CMakeFiles
 CMakeScripts
 Testing
-Makefile
 cmake_install.cmake
 install_manifest.txt
 compile_commands.json
diff --git a/pre-cleanup-src/Makefile b/pre-cleanup-src/Makefile
new file mode 100644
index 0000000..bc449e1
--- /dev/null
+++ b/pre-cleanup-src/Makefile
@@ -0,0 +1,14 @@
+CXX=icpc
+CXXFLAGS += -std=c++17
+CXXFLAGS += $(shell pkg-config --cflags fmt)
+CXXFLAGS += $(shell pkg-config --cflags hdf5)
+LIBS += $(shell pkg-config --libs fmt)
+LIBS += $(shell pkg-config --libs hdf5)
+
+%.o: %.cc
+	$(CXX) $(CXXFLAGS) -c $< 
+
+h5-blockmap: h5-blockmap.o
+	echo $(LIBS)
+	$(CXX) $(CXXFLAGS) $< $(LIBS) -o $@
+
diff --git a/src/pybind_kernels/Makefile b/src/Makefile
similarity index 69%
rename from src/pybind_kernels/Makefile
rename to src/Makefile
index 18cd3fb..2a00b42 100644
--- a/src/pybind_kernels/Makefile
+++ b/src/Makefile
@@ -1,13 +1,14 @@
-PYTHON = python3.10
 # Define constants and collections
+PYTHON = python3.10
 PYBIND_FLAGS += $(shell $(PYTHON) -m pybind11 --include) -march=native -Wall -shared -fPIC -fopenmp -g -std=c++17 -O3
 PYBIND_SUFFIX = $(shell $(PYTHON)-config --extension-suffix)
+CPP_FOLDER=lib/cpp
 #CXXFLAGS += -I../contrib/cpptqdm/ -Iinclude
-CXXFLAGS += -Iinclude
+CXXFLAGS += -I$(CPP_FOLDER)/include
 PLATFORMS=cpu_seq cpu gpu
 LIBS=io morphology
-TARGETS = $(foreach PLATFORM, $(PLATFORMS), $(foreach LIB, $(LIBS), $(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)))
-CLEANUP = $(TARGETS) $(foreach PLATFORM, $(PLATFORMS), $(PLATFORM)/__pycache__)
+TARGETS = $(foreach PLATFORM, $(PLATFORMS), $(foreach LIB, $(LIBS), $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)))
+CLEANUP = $(TARGETS) $(foreach PLATFORM, $(PLATFORMS), $(CPP_FOLDER)/$(PLATFORM)/__pycache__)
 
 # Detect if OpenACC can be used
 ifneq (, $(shell which nvc++))
@@ -24,13 +25,11 @@ CXXFLAGS += -undefined dynamic_lookup # https://pybind11.readthedocs.io/en/stabl
 CLEANUP += $(TARGETS) $(foreach TARGET, $(TARGETS), $(TARGET).dSYM) # These are also generated on Mac
 endif
 
-CXXFLAGS += -I../contrib/cpptqdm/ 
-
 all: $(TARGETS)
 
 define GEN_RULE
-$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX): pybind/$(LIB)-pybind.cc $(PLATFORM)/$(LIB).cc
-	$(CXX) $(CXXFLAGS) $(PYBIND_FLAGS) -I$(PLATFORM) $$< -o $(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)
+$(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX): pybind/$(LIB)-pybind.cc $(CPP_FOLDER)/$(PLATFORM)/$(LIB).cc
+	$(CXX) $(CXXFLAGS) $(PYBIND_FLAGS) -I$(CPP_FOLDER)/$(PLATFORM) $$< -o $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)
 endef
 
 $(foreach PLATFORM, $(PLATFORMS), \
@@ -43,4 +42,4 @@ test: all
 	$(PYTHON) -m pytest -n auto test
 
 clean:
-	rm -rf $(CLEANUP) test/__pycache__ .pytest_cache **/*.so
\ No newline at end of file
+	rm -rf $(CLEANUP) __pycache__ test/__pycache__ .pytest_cache lib/cpp/**/*.so
\ No newline at end of file
diff --git a/src/pybind_kernels/cpu/io.cc b/src/lib/cpp/cpu/io.cc
similarity index 100%
rename from src/pybind_kernels/cpu/io.cc
rename to src/lib/cpp/cpu/io.cc
diff --git a/src/pybind_kernels/cpu/morphology.cc b/src/lib/cpp/cpu/morphology.cc
similarity index 100%
rename from src/pybind_kernels/cpu/morphology.cc
rename to src/lib/cpp/cpu/morphology.cc
diff --git a/src/pybind_kernels/cpu_seq/io.cc b/src/lib/cpp/cpu_seq/io.cc
similarity index 100%
rename from src/pybind_kernels/cpu_seq/io.cc
rename to src/lib/cpp/cpu_seq/io.cc
diff --git a/src/pybind_kernels/cpu_seq/morphology.cc b/src/lib/cpp/cpu_seq/morphology.cc
similarity index 100%
rename from src/pybind_kernels/cpu_seq/morphology.cc
rename to src/lib/cpp/cpu_seq/morphology.cc
diff --git a/src/pybind_kernels/gpu/io.cc b/src/lib/cpp/gpu/io.cc
similarity index 100%
rename from src/pybind_kernels/gpu/io.cc
rename to src/lib/cpp/gpu/io.cc
diff --git a/src/pybind_kernels/gpu/morphology.cc b/src/lib/cpp/gpu/morphology.cc
similarity index 100%
rename from src/pybind_kernels/gpu/morphology.cc
rename to src/lib/cpp/gpu/morphology.cc
diff --git a/src/pybind_kernels/include/datatypes.hh b/src/lib/cpp/include/datatypes.hh
similarity index 100%
rename from src/pybind_kernels/include/datatypes.hh
rename to src/lib/cpp/include/datatypes.hh
diff --git a/src/pybind_kernels/include/io.hh b/src/lib/cpp/include/io.hh
similarity index 100%
rename from src/pybind_kernels/include/io.hh
rename to src/lib/cpp/include/io.hh
diff --git a/src/pybind_kernels/include/morphology.hh b/src/lib/cpp/include/morphology.hh
similarity index 100%
rename from src/pybind_kernels/include/morphology.hh
rename to src/lib/cpp/include/morphology.hh
diff --git a/src/pybind_kernels/pybind/io-pybind.cc b/src/pybind/io-pybind.cc
similarity index 100%
rename from src/pybind_kernels/pybind/io-pybind.cc
rename to src/pybind/io-pybind.cc
diff --git a/src/pybind_kernels/pybind/morphology-pybind.cc b/src/pybind/morphology-pybind.cc
similarity index 100%
rename from src/pybind_kernels/pybind/morphology-pybind.cc
rename to src/pybind/morphology-pybind.cc
diff --git a/src/pybind_kernels/Readme.md b/src/pybind_kernels/Readme.md
deleted file mode 100644
index 27d0412..0000000
--- a/src/pybind_kernels/Readme.md
+++ /dev/null
@@ -1 +0,0 @@
-# TODO :)
\ No newline at end of file
diff --git a/src/pybind_kernels/__init__.py b/src/pybind_kernels/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/pybind_kernels/test/test_io.py b/src/test/test_io.py
similarity index 98%
rename from src/pybind_kernels/test/test_io.py
rename to src/test/test_io.py
index 3ece7e4..12ed56d 100644
--- a/src/pybind_kernels/test/test_io.py
+++ b/src/test/test_io.py
@@ -2,7 +2,7 @@
 Unittests for the I/O pybind kernels.
 '''
 import sys
-sys.path.append(sys.path[0]+"/../")
+sys.path.append(sys.path[0]+"/../lib/cpp")
 import cpu_seq.io as io
 import numpy as np
 import tempfile
diff --git a/src/pybind_kernels/test/test_morphology.py b/src/test/test_morphology.py
similarity index 97%
rename from src/pybind_kernels/test/test_morphology.py
rename to src/test/test_morphology.py
index a28c8a6..a608d0d 100644
--- a/src/pybind_kernels/test/test_morphology.py
+++ b/src/test/test_morphology.py
@@ -2,7 +2,7 @@
 Unittests for the morphology pybind kernels.
 '''
 import sys
-sys.path.append(sys.path[0]+"/../")
+sys.path.append(sys.path[0]+"/../lib/cpp")
 import cpu_seq.morphology as m_cpu_seq
 import cpu.morphology as m_cpu
 import gpu.morphology as m_gpu

From e93880b7417777ebbfbc0f41fd450a63f340228f Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 15:05:27 +0100
Subject: [PATCH 019/136] #16 Removed pytest warning

---
 src/test/test_morphology.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test/test_morphology.py b/src/test/test_morphology.py
index a608d0d..d99ccae 100644
--- a/src/test/test_morphology.py
+++ b/src/test/test_morphology.py
@@ -49,11 +49,11 @@ def test_morphology(r, m, op, nd):
 
     assert np.allclose(verification, result)
 
-    return fend - fsta, (vend - vsta) / (fend - fsta)
+    print (f'Testing the {m.__name__} implementation of {op}. Ran in {fend - fsta}, which is {(vend - vsta) / (fend - fsta)} times better than ndi')
 
 if __name__ == '__main__':
     # TDOO move the data generation and ndi verification out to speed up running
     for r in rs:
         for m in impls:
             for op, nd in funcs:
-                print (f'Testing the {m.__name__} implementation of {op}', test_morphology(r, m, op, nd))
+                test_morphology(r, m, op, nd)

From bb412d960bbb3aac334ee48979df7af89b547420 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 15:39:36 +0100
Subject: [PATCH 020/136] #25 moved processing step 1 into new structure, along
 side its dependencies.

---
 src/config/paths.py                               | 12 ++++++++++++
 .../paths.py => src/lib/py/helpers.py             | 15 +--------------
 .../processing_steps/100_cache_esrf2013.py        |  8 ++++----
 3 files changed, 17 insertions(+), 18 deletions(-)
 create mode 100644 src/config/paths.py
 rename pre-cleanup-src/config/threadripper00/paths.py => src/lib/py/helpers.py (65%)
 rename pre-cleanup-src/io_modules/cache_esrf2013.py => src/processing_steps/100_cache_esrf2013.py (83%)

diff --git a/src/config/paths.py b/src/config/paths.py
new file mode 100644
index 0000000..e1ac537
--- /dev/null
+++ b/src/config/paths.py
@@ -0,0 +1,12 @@
+data_root = "/data"
+fast_root = "/data_fast"
+
+hdf5_root        = f"{data_root}/MAXIBONE/Goats/tomograms"
+hdf5_root_fast   = f"{fast_root}/MAXIBONE/Goats/tomograms"
+binary_root      = f"{hdf5_root}/binary"
+binary_root_fast = f"{hdf5_root_fast}/binary"
+
+esrf_data_local     = f"{hdf5_root}/ESRF/"
+esrf_data_sftp      =  "/XNS/XrayImaging/MiG/manjula.esci.nbi.dk.2_localhost/"
+esrf_implants_root  = f"{esrf_data_local}/esrf_dental_implants_april_2013/"
+esrf_granules_root  = f"{esrf_data_local}/esrf_dental_granules_july_2012/"
\ No newline at end of file
diff --git a/pre-cleanup-src/config/threadripper00/paths.py b/src/lib/py/helpers.py
similarity index 65%
rename from pre-cleanup-src/config/threadripper00/paths.py
rename to src/lib/py/helpers.py
index 00a8685..f76df16 100644
--- a/pre-cleanup-src/config/threadripper00/paths.py
+++ b/src/lib/py/helpers.py
@@ -1,17 +1,5 @@
-data_root = "/data"
-fast_root = "/data_fast"
-
-hdf5_root      = f"{data_root}/MAXIBONE/Goats/tomograms"
-hdf5_root_fast = f"{fast_root}/MAXIBONE/Goats/tomograms"
-binary_root    = f"{hdf5_root_fast}/binary"
-
-esrf_data_local= f"{hdf5_root}/ESRF/"
-esrf_data_sftp =  "/XNS/XrayImaging/MiG/manjula.esci.nbi.dk.2_localhost/"
-esrf_implants_root  = f"{esrf_data_local}/esrf_dental_implants_april_2013/"
-esrf_granules_root  = f"{esrf_data_local}/esrf_dental_granules_july_2012/"
-
-# TODO: Hvorhen skal det her hen?
 import sys
+
 def commandline_args(defaults):
     keys = list(defaults.keys())
 
@@ -40,4 +28,3 @@ def commandline_args(defaults):
             args.append(type(default)(sys.argv[i+1]))
 
     return args
-
diff --git a/pre-cleanup-src/io_modules/cache_esrf2013.py b/src/processing_steps/100_cache_esrf2013.py
similarity index 83%
rename from pre-cleanup-src/io_modules/cache_esrf2013.py
rename to src/processing_steps/100_cache_esrf2013.py
index 73dae5e..f2b12c6 100644
--- a/pre-cleanup-src/io_modules/cache_esrf2013.py
+++ b/src/processing_steps/100_cache_esrf2013.py
@@ -1,10 +1,11 @@
 import os, sys, pathlib, tqdm, fabric
 sys.path.append(sys.path[0]+"/../")
-from config.paths import commandline_args, esrf_data_sftp, esrf_data_local
+from lib.py.helpers import commandline_args
+from config.paths import esrf_data_sftp, esrf_data_local
 
 if __name__ == "__main__":
-    sample, experiment = commandline_args({"sample":"<required>",
-                                           "experiment":"esrf_dental_implants_april_2013"})
+    sample, experiment = commandline_args({"sample" : "<required>",
+                                           "experiment" : "esrf_dental_implants_april_2013"})
     
     index_dir  = f"{esrf_data_local}/{experiment}/index/";
     with open(f"{index_dir}/{sample}.txt") as f:
@@ -32,4 +33,3 @@
                     sftp.get(f,f)
 
             connection.close()
-    

From 89f29fa326561bf276bc83d4f77ff2377a933051 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 15:40:36 +0100
Subject: [PATCH 021/136] #29 Added verbose to cache_esrf

---
 src/processing_steps/100_cache_esrf2013.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/processing_steps/100_cache_esrf2013.py b/src/processing_steps/100_cache_esrf2013.py
index f2b12c6..c744dc7 100644
--- a/src/processing_steps/100_cache_esrf2013.py
+++ b/src/processing_steps/100_cache_esrf2013.py
@@ -4,8 +4,9 @@
 from config.paths import esrf_data_sftp, esrf_data_local
 
 if __name__ == "__main__":
-    sample, experiment = commandline_args({"sample" : "<required>",
-                                           "experiment" : "esrf_dental_implants_april_2013"})
+    sample, experiment, verbose = commandline_args({"sample" : "<required>",
+                                                    "experiment" : "esrf_dental_implants_april_2013",
+                                                    "verbose" : 1})
     
     index_dir  = f"{esrf_data_local}/{experiment}/index/";
     with open(f"{index_dir}/{sample}.txt") as f:
@@ -16,19 +17,19 @@
         volume_dir = os.path.dirname(volume_xml)
         local_directory = f"{esrf_data_local}/{experiment}/{volume_dir}"
         sftp_directory  = f"{esrf_data_sftp}/{experiment}/{volume_dir}"
-        print(f"Local: Creating directory {local_directory}")
+        if verbose >= 1: print(f"Local: Creating directory {local_directory}")
         pathlib.Path(local_directory).mkdir(parents=True, exist_ok=True)
 
         with fabric.Connection('erda') as connection:
-            print("Connected to ERDA")
+            if verbose >= 1: print("Connected to ERDA")
             with connection.sftp() as sftp:
-                print(f"SFTP: Attempting to chdir to {sftp_directory}")
+                if verbose >= 1: print(f"SFTP: Attempting to chdir to {sftp_directory}")
                 sftp.chdir(sftp_directory)
-                print(f"SFTP: Reading directory contents")
+                if verbose >= 1: print(f"SFTP: Reading directory contents")
                 files = sftp.listdir()
-                print(f"Local: Attempting to chdir to {local_directory}")                
+                if verbose >= 1: print(f"Local: Attempting to chdir to {local_directory}")                
                 os.chdir(local_directory)
-                print("SFTP: Downloading subvolume contents")
+                if verbose >= 1: print("SFTP: Downloading subvolume contents")
                 for f in tqdm.tqdm(files):
                     sftp.get(f,f)
 

From f0cdf30e4e64f419890b3c793bbead30bae8aa08 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 15:41:00 +0100
Subject: [PATCH 022/136] #25 Started adding pip dependencies

---
 src/requirements.txt | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 src/requirements.txt

diff --git a/src/requirements.txt b/src/requirements.txt
new file mode 100644
index 0000000..b4d4689
--- /dev/null
+++ b/src/requirements.txt
@@ -0,0 +1,2 @@
+fabric==3.0.0
+tqdm==4.64.1
\ No newline at end of file

From 62dc2d65f02bddc984ac83e33d23210bdf7e5754 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 15:51:41 +0100
Subject: [PATCH 023/136] #25 Moved the generate byte hdf 5 script

---
 .../processing_steps/200_generate_byte_hdf5.py                    | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename pre-cleanup-src/scripts/generate-byte-hdf5.py => src/processing_steps/200_generate_byte_hdf5.py (100%)

diff --git a/pre-cleanup-src/scripts/generate-byte-hdf5.py b/src/processing_steps/200_generate_byte_hdf5.py
similarity index 100%
rename from pre-cleanup-src/scripts/generate-byte-hdf5.py
rename to src/processing_steps/200_generate_byte_hdf5.py

From 5d9155fa3654d3b03014569bdd4cf3ee8cf4725c Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 15:52:38 +0100
Subject: [PATCH 024/136] #25 Fixed the import errors of generate_byte_hdf5.
 This included moving teh esrf_read helper script to lib/py/

---
 .../io_modules => src/lib/py}/esrf_read.py    |  0
 .../200_generate_byte_hdf5.py                 | 19 +++++++++++--------
 2 files changed, 11 insertions(+), 8 deletions(-)
 rename {pre-cleanup-src/io_modules => src/lib/py}/esrf_read.py (100%)

diff --git a/pre-cleanup-src/io_modules/esrf_read.py b/src/lib/py/esrf_read.py
similarity index 100%
rename from pre-cleanup-src/io_modules/esrf_read.py
rename to src/lib/py/esrf_read.py
diff --git a/src/processing_steps/200_generate_byte_hdf5.py b/src/processing_steps/200_generate_byte_hdf5.py
index fa64136..a3df603 100755
--- a/src/processing_steps/200_generate_byte_hdf5.py
+++ b/src/processing_steps/200_generate_byte_hdf5.py
@@ -7,16 +7,19 @@
 import h5py, sys, os.path, pathlib, tqdm
 sys.path.append(sys.path[0]+"/../")
 import bohrium as bh # TODO: Get rid of Bohrium dependence without losing too much performance
-from io_modules.esrf_read import *
+from lib.py.esrf_read import *
 import numpy   as np, matplotlib.pyplot as plt
 from config.paths import *
+from lib.py.helpers import commandline_args
 from PIL import Image
 
 
 NA = np.newaxis
 
-sample, chunk_length, use_bohrium, xml_root  = commandline_args({"sample":"<required>","chunk_length":256,
-                                                                 "use_bohrium":True,"xml_root":esrf_implants_root})
+sample, chunk_length, use_bohrium, xml_root  = commandline_args({"sample" : "<required>",
+                                                                 "chunk_length" : 256,
+                                                                 "use_bohrium" : True,
+                                                                 "xml_root" : esrf_implants_root})
 
 
 print(f"data_root={xml_root}")
@@ -97,11 +100,11 @@ def normalize(A,value_range,nbits=16,dtype=np.uint16):
 h5tomo_lsb = h5file_lsb['voxels']
 
 def cylinder_mask(Ny,Nx):
-    ys = bh.linspace(-1,1,Ny)
-    xs = bh.linspace(-1,1,Nx)
+    ys = np.linspace(-1,1,Ny)
+    xs = np.linspace(-1,1,Nx)
     return (xs[NA,:]**2 + ys[:,NA]**2) < 1 
 
-mask = bh.array(cylinder_mask(Ny,Nx))
+mask = np.array(cylinder_mask(Ny,Nx))
 
 for i in tqdm.tqdm(range(len(subvolume_metadata))):
     subvolume_info = subvolume_metadata[i];
@@ -115,7 +118,7 @@ def cylinder_mask(Ny,Nx):
     # print(f"Writing {subvolume_info['experiment']}")    
     # h5tomo[z_offset:z_offset+nz] = tomo[:,sy:ey,sx:ex];
     # del tomo
-    chunk = bh.zeros((chunk_length,Ny,Nx),dtype=np.uint16);
+    chunk = np.zeros((chunk_length,Ny,Nx),dtype=np.uint16);
     for z in range(0,nz,chunk_length):
         chunk_end = min(z+chunk_length,nz);
 
@@ -148,7 +151,7 @@ def cylinder_mask(Ny,Nx):
         chunk_lsb = chunk_lsb.copy2numpy()
         print("chunk_lsb.copy2numpy().max: ", chunk_lsb.max())
         h5tomo_lsb[z_offset+z:z_offset+chunk_end] = chunk_lsb[:]
-        bh.flush()
+        np.flush()
         
     z_offset += nz;
 

From 5d91765e7f9c07589a7fc02aa94dc8195ca33470 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 15:54:58 +0100
Subject: [PATCH 025/136] #29 Added verbose to generate_byte_hdf5

---
 .../200_generate_byte_hdf5.py                 | 49 ++++++++++---------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/src/processing_steps/200_generate_byte_hdf5.py b/src/processing_steps/200_generate_byte_hdf5.py
index a3df603..559bc85 100755
--- a/src/processing_steps/200_generate_byte_hdf5.py
+++ b/src/processing_steps/200_generate_byte_hdf5.py
@@ -16,13 +16,14 @@
 
 NA = np.newaxis
 
-sample, chunk_length, use_bohrium, xml_root  = commandline_args({"sample" : "<required>",
-                                                                 "chunk_length" : 256,
-                                                                 "use_bohrium" : True,
-                                                                 "xml_root" : esrf_implants_root})
+sample, chunk_length, use_bohrium, xml_root, verbose  = commandline_args({"sample" : "<required>",
+                                                                          "chunk_length" : 256,
+                                                                          "use_bohrium" : True,
+                                                                          "xml_root" : esrf_implants_root,
+                                                                          "verbose" : 1})
 
 
-print(f"data_root={xml_root}")
+if verbose >= 1: print(f"data_root={xml_root}")
 
 # Normalize, such that 1,...,2^(nbits)-1 correspond to vmin,...,vmax
 # 0 corresponds to a masked value
@@ -43,10 +44,10 @@ def normalize(A,value_range,nbits=16,dtype=np.uint16):
 (Nz,Ny,Nx)  = (np.sum(subvolume_dimensions[:,0]), np.min(subvolume_dimensions[:,1]&~31), np.min(subvolume_dimensions[:,2]&~31))
 
 for i in range(len(subvolume_metadata)):
-    print(f"{i} {sample}/{subvolume_metadata[i]['experiment']}: {subvolume_range[i]}")
-print((global_vmin, global_vmax), (Nz,Ny,Nx))    
-print(subvolume_dimensions)
-print(subvolume_range)
+    if verbose >= 1: print(f"{i} {sample}/{subvolume_metadata[i]['experiment']}: {subvolume_range[i]}")
+if verbose >= 1: print((global_vmin, global_vmax), (Nz,Ny,Nx))    
+if verbose >= 1: print(subvolume_dimensions)
+if verbose >= 1: print(subvolume_range)
 
 
 #import re
@@ -69,7 +70,7 @@ def normalize(A,value_range,nbits=16,dtype=np.uint16):
 outdir = os.path.dirname(lsb_filename)
 pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)
 
-print(f"Writing {msb_filename} and {lsb_filename}")
+if verbose >= 1: print(f"Writing {msb_filename} and {lsb_filename}")
 h5file_msb = h5py.File(msb_filename,"w");
 h5file_lsb = h5py.File(lsb_filename,"w");
 
@@ -111,11 +112,11 @@ def cylinder_mask(Ny,Nx):
     (nz,ny,nx)     = subvolume_dimensions[i];
     (sy,sx)        = ((ny-Ny)//2+((ny-Ny)%2), (nx-Nx)//2+((nx-Nx)%2))
     (ey,ex)        = (ny-(ny-Ny)//2, nx-(nx-Nx)//2)
-    print((sy,ey),(sx,ex))
+    if verbose >= 1: print((sy,ey),(sx,ex))
     
-    # print(f"Loading {subvolume_info['experiment']}")
+    # if verbose >= 1: print(f"Loading {subvolume_info['experiment']}")
     # tomo = normalize(esrf_full_tomogram_bh(subvolume_info), (global_vmin,global_vmax));
-    # print(f"Writing {subvolume_info['experiment']}")    
+    # if verbose >= 1: print(f"Writing {subvolume_info['experiment']}")    
     # h5tomo[z_offset:z_offset+nz] = tomo[:,sy:ey,sx:ex];
     # del tomo
     chunk = np.zeros((chunk_length,Ny,Nx),dtype=np.uint16);
@@ -123,14 +124,14 @@ def cylinder_mask(Ny,Nx):
         chunk_end = min(z+chunk_length,nz);
 
         region = [[sx,sy,z],[ex,ey,chunk_end]]
-        print(f"Reading chunk {z+z_offset}:{chunk_end+z_offset} ({i}-{z}), region={region}");
+        if verbose >= 1: print(f"Reading chunk {z+z_offset}:{chunk_end+z_offset} ({i}-{z}), region={region}");
         slab_data = esrf_edfrange_to_bh(subvolume_info,region)
-        print(f"Chunk shape: {slab_data.shape}")
-        print("Max value before masking:", slab_data.max())
+        if verbose >= 1: print(f"Chunk shape: {slab_data.shape}")
+        if verbose >= 1: print("Max value before masking:", slab_data.max())
         slab_data *= mask[NA,:,:]
-        print("Max value after masking:", slab_data.max())        
+        if verbose >= 1: print("Max value after masking:", slab_data.max())        
         chunk[:chunk_end-z] = normalize(slab_data,(global_vmin,global_vmax))
-        print("Max value after normalizing:", chunk.max())
+        if verbose >= 1: print("Max value after normalizing:", chunk.max())
 
         # for j in range(0,chunk_end-z):
         #     slice_meta, slice_data = esrf_edf_n_to_npy(subvolume_info,z+j);
@@ -138,18 +139,18 @@ def cylinder_mask(Ny,Nx):
         #     chunk[j] = normalize(slice_data[sy:ey,sx:ex],(global_vmin,global_vmax)) * mask
 
             
-        print(f"Writing {sample} MSB slice {z+z_offset}:{chunk_end+z_offset} ({i}-{z})");
+        if verbose >= 1: print(f"Writing {sample} MSB slice {z+z_offset}:{chunk_end+z_offset} ({i}-{z})");
         chunk_msb = ((chunk[:chunk_end-z]>>8)&0xff).astype(np.uint8)
-        print("chunk_msb.max: ", chunk_msb.max())
+        if verbose >= 1: print("chunk_msb.max: ", chunk_msb.max())
         chunk_msb = chunk_msb.copy2numpy()
-        print("chunk_msb.copy2numpy().max: ", chunk_msb.max())
+        if verbose >= 1: print("chunk_msb.copy2numpy().max: ", chunk_msb.max())
         h5tomo_msb[z_offset+z:z_offset+chunk_end] = chunk_msb[:]
         
-        print(f"Writing {sample} LSB slice {z+z_offset}:{chunk_end+z_offset} ({i}-{z})");
+        if verbose >= 1: print(f"Writing {sample} LSB slice {z+z_offset}:{chunk_end+z_offset} ({i}-{z})");
         chunk_lsb = (chunk[:chunk_end-z]&0xff).astype(np.uint8)
-        print("chunk_lsb.max: ", chunk_lsb.max())
+        if verbose >= 1: print("chunk_lsb.max: ", chunk_lsb.max())
         chunk_lsb = chunk_lsb.copy2numpy()
-        print("chunk_lsb.copy2numpy().max: ", chunk_lsb.max())
+        if verbose >= 1: print("chunk_lsb.copy2numpy().max: ", chunk_lsb.max())
         h5tomo_lsb[z_offset+z:z_offset+chunk_end] = chunk_lsb[:]
         np.flush()
         

From e567284ab4db94b1b997bf32391d7e25e12f948d Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 15:58:09 +0100
Subject: [PATCH 026/136] #25 Moved volume matcher and added jax as a
 dependency

---
 .../processing_steps/300_volume_matcher.py                    | 4 +++-
 src/requirements.txt                                          | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)
 rename pre-cleanup-src/scripts/volume_matcher.py => src/processing_steps/300_volume_matcher.py (98%)

diff --git a/pre-cleanup-src/scripts/volume_matcher.py b/src/processing_steps/300_volume_matcher.py
similarity index 98%
rename from pre-cleanup-src/scripts/volume_matcher.py
rename to src/processing_steps/300_volume_matcher.py
index 2e26ac5..cf8c602 100755
--- a/pre-cleanup-src/scripts/volume_matcher.py
+++ b/src/processing_steps/300_volume_matcher.py
@@ -12,7 +12,9 @@
 import jax.numpy as jp
 import h5py, jax, sys
 from PIL import Image
-from config.paths import hdf5_root, commandline_args
+sys.path.append(sys.path[0]+"/../")
+from config.paths import hdf5_root
+from lib.py.helpers import commandline_args
 
 volume_matched_dir = f"{hdf5_root}/processed/volume_matched"
 
diff --git a/src/requirements.txt b/src/requirements.txt
index b4d4689..945718f 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -1,2 +1,3 @@
 fabric==3.0.0
-tqdm==4.64.1
\ No newline at end of file
+tqdm==4.64.1
+jax==0.4.3
\ No newline at end of file

From 20c7ff3452319ce462e2bb3a6e12fee21323c305 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 16:00:22 +0100
Subject: [PATCH 027/136] #29 Added verbose to volume matcher

---
 src/processing_steps/300_volume_matcher.py | 31 +++++++++++++---------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/processing_steps/300_volume_matcher.py b/src/processing_steps/300_volume_matcher.py
index cf8c602..336660c 100755
--- a/src/processing_steps/300_volume_matcher.py
+++ b/src/processing_steps/300_volume_matcher.py
@@ -16,6 +16,7 @@
 from config.paths import hdf5_root
 from lib.py.helpers import commandline_args
 
+verbose = 1
 volume_matched_dir = f"{hdf5_root}/processed/volume_matched"
 
 def match_region(voxels_top, voxels_bot, overlap, max_shift):
@@ -44,25 +45,25 @@ def match_all_regions(voxels,crossings,write_image_checks=True):
     errors = np.zeros(len(crossings),dtype=np.float32)
     match_region_jit = jax.jit(match_region,static_argnums=(2,3));
     
-    print(f"Crossings at z-indices: {crossings}")
+    if verbose >= 1: print(f"Crossings at z-indices: {crossings}")
     for i in range(len(crossings)):
         crossing = crossings[i]
-        print(f"Processing crossing at z={crossing}:")
-        print(f"Reading top region:    voxels[{crossing-max_shift}:{crossing}]")
+        if verbose >= 1: print(f"Processing crossing at z={crossing}:")
+        if verbose >= 1: print(f"Reading top region:    voxels[{crossing-max_shift}:{crossing}]")
         top_voxels = jp.array(voxels[crossing-max_shift:crossing]).astype(jp.float32)
-        print(f"Reading bottom region: voxels[{crossing}:{crossing+max_shift}]")    
+        if verbose >= 1: print(f"Reading bottom region: voxels[{crossing}:{crossing+max_shift}]")    
         bot_voxels = jp.array(voxels[crossing:crossing+max_shift]).astype(jp.float32)
 
-        print(f"Matching regions (Shapes: {bot_voxels.shape} {top_voxels.shape})")
+        if verbose >= 1: print(f"Matching regions (Shapes: {bot_voxels.shape} {top_voxels.shape})")
         shift, error = match_region_jit(top_voxels,bot_voxels,overlap,max_shift)
         shifts[i] = shift
         errors[i] = error
-        print(f"Optimal shift is {shift} with error {error} per voxel")
+        if verbose >= 1: print(f"Optimal shift is {shift} with error {error} per voxel")
 
         if(write_image_checks):
             image_dir = f"{volume_matched_dir}/verification"
             pathlib.Path(image_dir).mkdir(parents=True, exist_ok=True)                
-            print(f"Writing images of matched slices to {image_dir} to check correctness.")
+            if verbose >= 1: print(f"Writing images of matched slices to {image_dir} to check correctness.")
             merged_zy_slice = np.concatenate([top_voxels[:,:,Nx//2],bot_voxels[shift:,:,Nx//2]])
 #            merged_zy_slice  = np.array(merged_voxels[:,:,Nx//2])
             
@@ -84,11 +85,11 @@ def write_matched(voxels_in, voxels_out, crossings, shifts):
     cum_shifts = [0]+list(np.cumsum(shifts))
     crossings  = list(crossings) + [voxels_in.shape[0]]
 
-    print(f"Cumulative shifts: {cum_shifts}")
-    print(f"Duplicating subvolume 0: 0:{crossings[0]}")
+    if verbose >= 1: print(f"Cumulative shifts: {cum_shifts}")
+    if verbose >= 1: print(f"Duplicating subvolume 0: 0:{crossings[0]}")
     voxels_out[:crossings[0]] = voxels_in[:crossings[0]];
     for i in range(len(crossings)-1):
-        print(f"Duplicating unmatched part of subvolume {i+1}: voxels_out[{crossings[i]-cum_shifts[i]}:{crossings[i+1]-cum_shifts[i]-shifts[i]}] = voxels_in[{crossings[i]+shifts[i]}:{crossings[i+1]}];")
+        if verbose >= 1: print(f"Duplicating unmatched part of subvolume {i+1}: voxels_out[{crossings[i]-cum_shifts[i]}:{crossings[i+1]-cum_shifts[i]-shifts[i]}] = voxels_in[{crossings[i]+shifts[i]}:{crossings[i+1]}];")
         voxels_out[crossings[i]-cum_shifts[i]:crossings[i+1]-cum_shifts[i]-shifts[i]] = voxels_in[crossings[i]+shifts[i]:crossings[i+1]];
 
     
@@ -108,7 +109,11 @@ def write_matched_hdf5(h5_filename_in, h5_filename_out, crossings, shifts, compr
         
 
 if __name__ == "__main__":
-    sample, overlap, max_shift, generate_h5 = commandline_args({"sample":"<required>","overlap":10,"max_shift":150,"generate_h5":False})
+    sample, overlap, max_shift, generate_h5, verbose = commandline_args({"sample" : "<required>",
+                                                                         "overlap" : 10,
+                                                                         "max_shift" : 150,
+                                                                         "generate_h5" : False,
+                                                                         "verbose" : 1})
 
     input_h5name  = f"{hdf5_root}/hdf5-byte/msb/{sample}.h5"
     output_h5name = f"{volume_matched_dir}/1x/{sample}.h5"
@@ -122,7 +127,7 @@ def write_matched_hdf5(h5_filename_in, h5_filename_out, crossings, shifts, compr
     (Nz,Ny,Nx) = h5file['voxels'].shape
     
     crossings = np.cumsum(subvolume_dimensions[:-1,0]).astype(int)
-    print(f"Matching all regions for sample {sample} at crossings {crossings}.")
+    if verbose >= 1: print(f"Matching all regions for sample {sample} at crossings {crossings}.")
     shifts, errors = match_all_regions(voxels,crossings)
     
     np.save(f"{volume_matched_dir}/{sample}-shifts.npy",shifts)
@@ -134,5 +139,5 @@ def write_matched_hdf5(h5_filename_in, h5_filename_out, crossings, shifts, compr
         
     h5file.close()
 
-    print(f"Copying over volume from {input_h5name} shifted by {shifts} to {output_h5name}")
+    if verbose >= 1: print(f"Copying over volume from {input_h5name} shifted by {shifts} to {output_h5name}")
     if(generate_h5): write_matched_hdf5(input_h5name, output_h5name, crossings, shifts)

From 369e3acd773b4e409e5d8e44f09c245571d75b77 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 17:21:57 +0100
Subject: [PATCH 028/136] #25 Moved h5tobin

---
 .../scripts/h5tobin.py => src/processing_steps/400_h5tobin.py     | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename pre-cleanup-src/scripts/h5tobin.py => src/processing_steps/400_h5tobin.py (100%)

diff --git a/pre-cleanup-src/scripts/h5tobin.py b/src/processing_steps/400_h5tobin.py
similarity index 100%
rename from pre-cleanup-src/scripts/h5tobin.py
rename to src/processing_steps/400_h5tobin.py

From 3510a35c3ac7f6709120da5d9ad65460f591a536 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 17:29:50 +0100
Subject: [PATCH 029/136] #25 Moved helper functions

---
 pre-cleanup-src/helper_functions.py | 138 --------------------------
 src/lib/py/helpers.py               | 145 ++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+), 138 deletions(-)
 delete mode 100644 pre-cleanup-src/helper_functions.py

diff --git a/pre-cleanup-src/helper_functions.py b/pre-cleanup-src/helper_functions.py
deleted file mode 100644
index 80d30c9..0000000
--- a/pre-cleanup-src/helper_functions.py
+++ /dev/null
@@ -1,138 +0,0 @@
-#!/usr/bin/env python3
-import os, h5py, pathlib, numpy as np, pybind_kernels.histograms as histograms, matplotlib.pyplot as plt, tqdm
-from config.paths import hdf5_root, binary_root
-from numpy import newaxis as NA
-
-def update_hdf5(filename,group_name,datasets={},attributes={},dimensions=None,
-                compression=None,chunk_shape=None):
-
-    output_dir = os.path.dirname(filename)
-    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)    
-    f = h5py.File(filename,'a')
-
-    if((group_name is not None) and (group_name != "/")):
-        g = f.require_group(group_name)
-    else:
-       g = f
-    
-    for k in datasets:
-        v = datasets[k]
-        if(k in g): del g[k]
-        g.create_dataset(k,shape=v.shape,dtype=v.dtype,
-                          compression=compression, chunks=chunk_shape,maxshape=None)
-        g[k][:] = v[:]
-
-        if dimensions is not None:
-            try:
-                dims = dimensions[k]
-                for i, description in enumerate(dims):
-                    g[k].dims[i] = description
-            except:
-                pass
-
-    for k in attributes:
-        v = attributes[k]
-        g.attrs[k] = v
-
-    f.close()
-
-
-#TODO: Use this for masks, no compression and no chunking default for small metadata datasets
-def update_hdf5_mask(filename,group_name,datasets={},attributes={},dimensions=None,
-                     compression="lzf",chunk_shape=(64,64,64)):
-    update_hdf5(filename,group_name,datasets,attributes,dimensions,compression,chunk_shape)
-
-
-def h5meta_info_volume_matched(sample):
-    with h5py.File(f"{hdf5_root}/hdf5-byte/msb/{sample}.h5","r") as h5meta:
-        vm_shifts  = h5meta["volume_matching_shifts"][:]        
-        Nz, Ny, Nx = h5meta['voxels'].shape
-        Nz -= np.sum(vm_shifts)
-
-        subvolume_dimensions =  h5meta['subvolume_dimensions'][:]                
-        subvolume_nzs = subvolume_dimensions[:,0] - np.append(vm_shifts,0)
-        voxel_size    = h5meta["voxels"].attrs["voxelsize"]
-        
-        return ((Nz,Ny,Nx), subvolume_nzs, voxel_size)
-        
-def block_info(h5meta_filename,block_size=0, n_blocks=0,z_offset=0):
-    print(f"Opening {h5meta_filename}")
-    with h5py.File(h5meta_filename, 'r') as h5meta:
-        vm_shifts  = h5meta["volume_matching_shifts"][:]
-        Nz, Ny, Nx = h5meta['voxels'].shape
-        Nz -= np.sum(vm_shifts)
-        Nr = int(np.sqrt((Nx//2)**2 + (Ny//2)**2))+1
-
-
-        subvolume_dimensions =  h5meta['subvolume_dimensions'][:]                
-        subvolume_nzs = subvolume_dimensions[:,0] - np.append(vm_shifts,0)
-
-        if block_size == 0:
-            # If block_size is 0, let each block be exactly a full subvolume
-            blocks_are_subvolumes = True
-
-            # Do either n_blocks subvolumes, or if n_blocks == 0: all remaining after offset
-            if n_blocks == 0:
-                n_blocks = len(subvolume_nzs)-z_offset 
-                
-        else:
-            blocks_are_subvolumes = False        
-            if n_blocks == 0:
-                n_blocks = Nz // block_size + (Nz % block_size > 0)
-
-
-        return {'dimensions':(Nz,Ny,Nx,Nr),
-                'voxel_size':h5meta["voxels"].attrs["voxelsize"],
-                'n_blocks': n_blocks,
-                'block_size': block_size,
-                'blocks_are_subvolumes': blocks_are_subvolumes,
-                'subvolume_dimensions': subvolume_dimensions,
-                'subvolume_nzs': subvolume_nzs,
-                'subvolume_starts': np.concatenate([[0],np.cumsum(subvolume_nzs)[:-1]]
-                )
-        }
-
-
-def load_block(sample, offset, block_size, mask_name, mask_scale, field_names):
-    '''
-    Loads a block of data from disk into memory.
-    '''
-    Nfields = len(field_names)
-
-    h5meta = h5py.File(f'{hdf5_root}/hdf5-byte/msb/{sample}.h5', 'r')
-    Nz, Ny, Nx = h5meta['voxels'].shape
-    Nz -= np.sum(h5meta["volume_matching_shifts"][:])
-    h5meta.close()
-#    print(block_size,Nz,offset)   
-    block_size       = min(block_size, Nz-offset)
-
-    voxels = np.zeros((block_size,Ny,Nx),    dtype=np.uint16)
-    fields = np.zeros((Nfields,block_size//2,Ny//2,Nx//2), dtype=np.uint16)    
-
-    if mask_name is not None:
-        for i in tqdm.tqdm(range(1),f"Loading {mask_name} mask from {hdf5_root}/masks/{mask_scale}x/{sample}.h5", leave=True):
-            with h5py.File(f"{hdf5_root}/masks/{mask_scale}x/{sample}.h5","r") as h5mask:
-                mask = h5mask[mask_name]["mask"][offset//mask_scale:offset//mask_scale + block_size//mask_scale]
-            
-    #TODO: Make voxel & field scale command line parameters
-    for i in tqdm.tqdm(range(1),f"Loading {voxels.shape} voxels from {binary_root}/voxels/1x/{sample}.uint16", leave=True):    
-        histograms.load_slice(voxels, f'{binary_root}/voxels/1x/{sample}.uint16', (offset, 0, 0), (Nz, Ny, Nx)) # TODO: Don't use 3 different methods for load/store
-
-    for i in tqdm.tqdm(range(Nfields),f"Loading {binary_root}/fields/implant-{field_names}/2x/{sample}.npy",leave=True):
-        fi = np.load(f"{binary_root}/fields/implant-{field_names[i]}/2x/{sample}.npy", mmap_mode='r')
-        fields[i,:] = fi[offset//2:offset//2 + block_size//2]
-
-    if mask_name is not None:
-        nz, ny, nx = (block_size//mask_scale), Ny//mask_scale, Nx//mask_scale
-        mask_1x = np.broadcast_to(mask[:,NA,:,NA,:,NA],(nz,mask_scale, ny,mask_scale, nx,mask_scale))
-        mask_1x = mask_1x.reshape(nz*mask_scale,ny*mask_scale,nx*mask_scale)
-        voxels[:nz*mask_scale] *= mask_1x               # block_size may not be divisible by mask_scale
-        voxels[nz*mask_scale:] *= mask_1x[-1][NA,...]  # Remainder gets last line of mask
-
-#    plt.imshow(voxels[:,voxels.shape[1]//2,:]); plt.show()
-#    plt.imshow(fields[0,:,fields[0].shape[1]//2,:]); plt.show()    
-    return voxels, fields
-
-def row_normalize(A,r):
-    na = np.newaxis
-    return A/(r[:,na]+(r==0)[:,na])
diff --git a/src/lib/py/helpers.py b/src/lib/py/helpers.py
index f76df16..b76ec6a 100644
--- a/src/lib/py/helpers.py
+++ b/src/lib/py/helpers.py
@@ -1,4 +1,13 @@
 import sys
+sys.path.append(sys.path[0]+"/../")
+
+from config.paths import binary_root, hdf5_root
+import h5py
+from lib.cpp.cpu.io import load_slice
+import os
+import numpy as np 
+import pathlib
+import tqdm
 
 def commandline_args(defaults):
     keys = list(defaults.keys())
@@ -28,3 +37,139 @@ def commandline_args(defaults):
             args.append(type(default)(sys.argv[i+1]))
 
     return args
+
+
+def update_hdf5(filename,group_name,datasets={},attributes={},dimensions=None,
+                compression=None,chunk_shape=None):
+
+    output_dir = os.path.dirname(filename)
+    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)    
+    f = h5py.File(filename,'a')
+
+    if((group_name is not None) and (group_name != "/")):
+        g = f.require_group(group_name)
+    else:
+       g = f
+    
+    for k in datasets:
+        v = datasets[k]
+        if(k in g): del g[k]
+        g.create_dataset(k,shape=v.shape,dtype=v.dtype,
+                          compression=compression, chunks=chunk_shape,maxshape=None)
+        g[k][:] = v[:]
+
+        if dimensions is not None:
+            try:
+                dims = dimensions[k]
+                for i, description in enumerate(dims):
+                    g[k].dims[i] = description
+            except:
+                pass
+
+    for k in attributes:
+        v = attributes[k]
+        g.attrs[k] = v
+
+    f.close()
+
+
+#TODO: Use this for masks, no compression and no chunking default for small metadata datasets
+def update_hdf5_mask(filename,group_name,datasets={},attributes={},dimensions=None,
+                     compression="lzf",chunk_shape=(64,64,64)):
+    update_hdf5(filename,group_name,datasets,attributes,dimensions,compression,chunk_shape)
+
+
+def h5meta_info_volume_matched(sample):
+    with h5py.File(f"{hdf5_root}/hdf5-byte/msb/{sample}.h5","r") as h5meta:
+        vm_shifts  = h5meta["volume_matching_shifts"][:]        
+        Nz, Ny, Nx = h5meta['voxels'].shape
+        Nz -= np.sum(vm_shifts)
+
+        subvolume_dimensions =  h5meta['subvolume_dimensions'][:]                
+        subvolume_nzs = subvolume_dimensions[:,0] - np.append(vm_shifts,0)
+        voxel_size    = h5meta["voxels"].attrs["voxelsize"]
+        
+        return ((Nz,Ny,Nx), subvolume_nzs, voxel_size)
+        
+def block_info(h5meta_filename,block_size=0, n_blocks=0,z_offset=0):
+    print(f"Opening {h5meta_filename}")
+    with h5py.File(h5meta_filename, 'r') as h5meta:
+        vm_shifts  = h5meta["volume_matching_shifts"][:]
+        Nz, Ny, Nx = h5meta['voxels'].shape
+        Nz -= np.sum(vm_shifts)
+        Nr = int(np.sqrt((Nx//2)**2 + (Ny//2)**2))+1
+
+
+        subvolume_dimensions =  h5meta['subvolume_dimensions'][:]                
+        subvolume_nzs = subvolume_dimensions[:,0] - np.append(vm_shifts,0)
+
+        if block_size == 0:
+            # If block_size is 0, let each block be exactly a full subvolume
+            blocks_are_subvolumes = True
+
+            # Do either n_blocks subvolumes, or if n_blocks == 0: all remaining after offset
+            if n_blocks == 0:
+                n_blocks = len(subvolume_nzs)-z_offset 
+                
+        else:
+            blocks_are_subvolumes = False        
+            if n_blocks == 0:
+                n_blocks = Nz // block_size + (Nz % block_size > 0)
+
+
+        return {'dimensions':(Nz,Ny,Nx,Nr),
+                'voxel_size':h5meta["voxels"].attrs["voxelsize"],
+                'n_blocks': n_blocks,
+                'block_size': block_size,
+                'blocks_are_subvolumes': blocks_are_subvolumes,
+                'subvolume_dimensions': subvolume_dimensions,
+                'subvolume_nzs': subvolume_nzs,
+                'subvolume_starts': np.concatenate([[0],np.cumsum(subvolume_nzs)[:-1]]
+                )
+        }
+
+
+def load_block(sample, offset, block_size, mask_name, mask_scale, field_names):
+    '''
+    Loads a block of data from disk into memory.
+    '''
+    NA = np.newaxis
+    Nfields = len(field_names)
+
+    h5meta = h5py.File(f'{hdf5_root}/hdf5-byte/msb/{sample}.h5', 'r')
+    Nz, Ny, Nx = h5meta['voxels'].shape
+    Nz -= np.sum(h5meta["volume_matching_shifts"][:])
+    h5meta.close()
+#    print(block_size,Nz,offset)   
+    block_size       = min(block_size, Nz-offset)
+
+    voxels = np.zeros((block_size,Ny,Nx),    dtype=np.uint16)
+    fields = np.zeros((Nfields,block_size//2,Ny//2,Nx//2), dtype=np.uint16)    
+
+    if mask_name is not None:
+        for i in tqdm.tqdm(range(1),f"Loading {mask_name} mask from {hdf5_root}/masks/{mask_scale}x/{sample}.h5", leave=True):
+            with h5py.File(f"{hdf5_root}/masks/{mask_scale}x/{sample}.h5","r") as h5mask:
+                mask = h5mask[mask_name]["mask"][offset//mask_scale:offset//mask_scale + block_size//mask_scale]
+            
+    #TODO: Make voxel & field scale command line parameters
+    for i in tqdm.tqdm(range(1),f"Loading {voxels.shape} voxels from {binary_root}/voxels/1x/{sample}.uint16", leave=True):    
+        load_slice(voxels, f'{binary_root}/voxels/1x/{sample}.uint16', (offset, 0, 0), (Nz, Ny, Nx)) # TODO: Don't use 3 different methods for load/store
+
+    for i in tqdm.tqdm(range(Nfields),f"Loading {binary_root}/fields/implant-{field_names}/2x/{sample}.npy",leave=True):
+        fi = np.load(f"{binary_root}/fields/implant-{field_names[i]}/2x/{sample}.npy", mmap_mode='r')
+        fields[i,:] = fi[offset//2:offset//2 + block_size//2]
+
+    if mask_name is not None:
+        nz, ny, nx = (block_size//mask_scale), Ny//mask_scale, Nx//mask_scale
+        mask_1x = np.broadcast_to(mask[:,NA,:,NA,:,NA],(nz,mask_scale, ny,mask_scale, nx,mask_scale))
+        mask_1x = mask_1x.reshape(nz*mask_scale,ny*mask_scale,nx*mask_scale)
+        voxels[:nz*mask_scale] *= mask_1x               # block_size may not be divisible by mask_scale
+        voxels[nz*mask_scale:] *= mask_1x[-1][NA,...]  # Remainder gets last line of mask
+
+#    plt.imshow(voxels[:,voxels.shape[1]//2,:]); plt.show()
+#    plt.imshow(fields[0,:,fields[0].shape[1]//2,:]); plt.show()    
+    return voxels, fields
+
+def row_normalize(A,r):
+    na = np.newaxis
+    return A/(r[:,na]+(r==0)[:,na])

From 25d0427afa31b2dca78cea8fae61d764c89771ff Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 17:30:04 +0100
Subject: [PATCH 030/136] Updated h5tobin to use new structure

---
 src/processing_steps/400_h5tobin.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/processing_steps/400_h5tobin.py b/src/processing_steps/400_h5tobin.py
index f9ad99f..0457f58 100755
--- a/src/processing_steps/400_h5tobin.py
+++ b/src/processing_steps/400_h5tobin.py
@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
 import sys, pathlib, h5py, numpy as np
 sys.path.append(sys.path[0]+"/../")
-import pybind_kernels.histograms as histograms
-from config.paths import hdf5_root, binary_root, commandline_args
+from config.paths import hdf5_root, binary_root
 from tqdm import tqdm
-from helper_functions import update_hdf5
+from lib.cpp.cpu.io import write_slice
+from lib.py.helpers import commandline_args, update_hdf5
 
 slice_all = slice(None)
 
@@ -65,7 +65,7 @@ def h5tobin(sample,region=(slice_all,slice_all,slice_all),shift_volume_match=1):
         subvolume_msb = dmsb[input_zstarts[i]:input_zends[i],y_range,x_range].astype(np.uint16)
         subvolume_lsb = dlsb[input_zstarts[i]:input_zends[i],y_range,x_range].astype(np.uint16)
 
-        histograms.write_slice((subvolume_msb << 8) | subvolume_lsb, output_zstarts[i]*Ny*Nx, outfile)
+        write_slice((subvolume_msb << 8) | subvolume_lsb, output_zstarts[i]*Ny*Nx, outfile)
 
         del subvolume_msb
         del subvolume_lsb

From 578054527cd2ab52756c130ef3907649113993c0 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 17:31:43 +0100
Subject: [PATCH 031/136] #29 Added verbose to h5tobin

---
 src/processing_steps/400_h5tobin.py | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/processing_steps/400_h5tobin.py b/src/processing_steps/400_h5tobin.py
index 0457f58..53b63ed 100755
--- a/src/processing_steps/400_h5tobin.py
+++ b/src/processing_steps/400_h5tobin.py
@@ -7,6 +7,7 @@
 from lib.py.helpers import commandline_args, update_hdf5
 
 slice_all = slice(None)
+verbose = 1
 
 def slice_length(s,n):
     start = s.start if s.start is not None else 0
@@ -37,23 +38,23 @@ def h5tobin(sample,region=(slice_all,slice_all,slice_all),shift_volume_match=1):
     input_zstarts         = np.concatenate([[0], np.cumsum(Nzs[:-1])]).astype(int)
     input_zends           = (np.cumsum(Nzs) - np.concatenate([vm_shifts,[0]])).astype(int)
     
-    print(f'HDF5 voxel data:')
-    print(f'subvolume_dims =\n{subvolume_dims}')
-    print(f'Nzs = {Nzs}')
-    print(f'vm_shifts = {vm_shifts}')    
-    print(f'input_zstarts  = {input_zstarts}')
-    print(f'input_zends    = {input_zends}')
+    if verbose >= 1: print(f'HDF5 voxel data:')
+    if verbose >= 1: print(f'subvolume_dims =\n{subvolume_dims}')
+    if verbose >= 1: print(f'Nzs = {Nzs}')
+    if verbose >= 1: print(f'vm_shifts = {vm_shifts}')    
+    if verbose >= 1: print(f'input_zstarts  = {input_zstarts}')
+    if verbose >= 1: print(f'input_zends    = {input_zends}')
 
     output_zstarts        = np.concatenate([[0], np.cumsum(Nzs[:-1]) - np.cumsum(vm_shifts)]).astype(int)
     output_zends          = np.concatenate([output_zstarts[1:], [output_zstarts[-1]+Nzs[-1]]]).astype(int)
-    print(f'output_zstarts = {output_zstarts}')
-    print(f'output_zends   = {output_zends}')
+    if verbose >= 1: print(f'output_zstarts = {output_zstarts}')
+    if verbose >= 1: print(f'output_zends   = {output_zends}')
     assert((input_zends - input_zstarts == output_zends - output_zstarts).all())
 
-    print(f'Shape to extract:\n{region}')
+    if verbose >= 1: print(f'Shape to extract:\n{region}')
     
     nzs = input_zends - input_zstarts # Actual number of z-slices per subvolume after vm-correction
-    print(f"Volume matched subvolume nzs = {nzs}")
+    if verbose >= 1: print(f"Volume matched subvolume nzs = {nzs}")
     # TODO: z_range is ignored
     # TODO: Store metadata about region range in json
     # TODO: Come up with appropriate "file format" scheme
@@ -81,9 +82,10 @@ def h5tobin(sample,region=(slice_all,slice_all,slice_all),shift_volume_match=1):
     
         
 if __name__ == "__main__":
-    sample, y_cutoff, shift_volume_match = commandline_args({"sample":"<required>",
-                                                             "y_cutoff": 0,
-                                                             "shift_volume_match":1})
+    sample, y_cutoff, shift_volume_match, verbose = commandline_args({"sample" : "<required>",
+                                                                      "y_cutoff" :  0,
+                                                                      "shift_volume_match" : 1,
+                                                                      "verbose" : 1})
 
     region = (slice_all,slice(y_cutoff,None), slice_all)
     h5tobin(sample,region,shift_volume_match)

From b7fde73a72422f96c98686aa9a36c5aec47084ec Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 17:32:48 +0100
Subject: [PATCH 032/136] #25 Moved rescale_cupy_bin

---
 .../processing_steps/500_rescale_cupy_bin.py                      | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename pre-cleanup-src/preprocess/rescale-cupy-bin.py => src/processing_steps/500_rescale_cupy_bin.py (100%)

diff --git a/pre-cleanup-src/preprocess/rescale-cupy-bin.py b/src/processing_steps/500_rescale_cupy_bin.py
similarity index 100%
rename from pre-cleanup-src/preprocess/rescale-cupy-bin.py
rename to src/processing_steps/500_rescale_cupy_bin.py

From c42fc29b2fe41792ecd71795591ea84ef3d14b47 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 17:46:09 +0100
Subject: [PATCH 033/136] #25 Moved rescale_cupy_bin dependencies

---
 {pre-cleanup-src/preprocess => src/lib/py}/resample.py | 0
 src/processing_steps/500_rescale_cupy_bin.py           | 6 +++---
 src/requirements.txt                                   | 5 +++--
 3 files changed, 6 insertions(+), 5 deletions(-)
 rename {pre-cleanup-src/preprocess => src/lib/py}/resample.py (100%)

diff --git a/pre-cleanup-src/preprocess/resample.py b/src/lib/py/resample.py
similarity index 100%
rename from pre-cleanup-src/preprocess/resample.py
rename to src/lib/py/resample.py
diff --git a/src/processing_steps/500_rescale_cupy_bin.py b/src/processing_steps/500_rescale_cupy_bin.py
index 87257a4..6a40b0a 100644
--- a/src/processing_steps/500_rescale_cupy_bin.py
+++ b/src/processing_steps/500_rescale_cupy_bin.py
@@ -3,9 +3,9 @@
 import numpy as np
 import cupy  as cp
 #import numpy as cp
-from resample import downsample2x, downsample3x
-from config.paths import commandline_args, hdf5_root, binary_root
-from pybind_kernels.histograms import load_slice, write_slice # Rename and place under io_modules 
+from lib.py.helpers import commandline_args
+from lib.py.resample import downsample2x, downsample3x
+from config.paths import hdf5_root, binary_root
 
 mempool = cp.get_default_memory_pool()
 pinned_mempool = cp.get_default_pinned_memory_pool()
diff --git a/src/requirements.txt b/src/requirements.txt
index 945718f..18bdb83 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -1,3 +1,4 @@
+cupy-cuda11x==11.5.0
 fabric==3.0.0
-tqdm==4.64.1
-jax==0.4.3
\ No newline at end of file
+jax==0.4.3
+tqdm==4.64.1
\ No newline at end of file

From bf63dc0030dc037547c9884272f0dd56ead61eba Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Feb 2023 17:47:30 +0100
Subject: [PATCH 034/136] #29 added verbose to rescale_cupy_bin

---
 src/processing_steps/500_rescale_cupy_bin.py | 32 +++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/processing_steps/500_rescale_cupy_bin.py b/src/processing_steps/500_rescale_cupy_bin.py
index 6a40b0a..f7d5f47 100644
--- a/src/processing_steps/500_rescale_cupy_bin.py
+++ b/src/processing_steps/500_rescale_cupy_bin.py
@@ -12,10 +12,12 @@
 mempool.free_all_blocks()
 pinned_mempool.free_all_blocks()
 
-
 if __name__ == "__main__":
-    sample, image, chunk_size, dtype = commandline_args({"sample":"<required>", "image": "voxels",
-                                                         "chunk_size":32*2, "dtype":"uint16"})
+    sample, image, chunk_size, dtype, verbose = commandline_args({"sample" : "<required>", 
+                                                                  "image" :  "voxels",
+                                                                  "chunk_size" : 32*2, 
+                                                                  "dtype" : "uint16",
+                                                                  "verbose" : 1})
 
     scales = [2,4,8,16,32];     # Can do 6, 9, 12, 24, 27, etc. as well, but we currently don't. See old rescaly-cupy.py
     T = np.dtype(dtype)
@@ -23,10 +25,10 @@
     input_meta  = f'{hdf5_root}/hdf5-byte/msb/{sample}.h5'
     input_bin   = f"{binary_root}/{image}/1x/{sample}.{dtype}"
     output_root = f"{binary_root}/{image}"
-    print(f"Generating power-of-twos rescalings for sample {sample}")
-    print(f"Input metadata from {input_meta}")
-    print(f"Input flat binary {dtype} data from {input_bin}")
-    print(f"Output flat binary {dtype} data to {output_root}/[1,2,4,8,16,32]x/{sample}.{dtype}")
+    if verbose >= 1: print(f"Generating power-of-twos rescalings for sample {sample}")
+    if verbose >= 1: print(f"Input metadata from {input_meta}")
+    if verbose >= 1: print(f"Input flat binary {dtype} data from {input_bin}")
+    if verbose >= 1: print(f"Output flat binary {dtype} data to {output_root}/[1,2,4,8,16,32]x/{sample}.{dtype}")
     
     meta_h5    = h5py.File(input_meta, 'r')
     full_Nz, Ny, Nx = meta_h5['voxels'].shape
@@ -34,13 +36,13 @@
     Nz         = full_Nz - np.sum(shifts)
     meta_h5.close()    
     
-    print(f"Downscaling from 1x {(Nz,Ny,Nx)} to 2x {(Nz//2,Ny//2,Nx//2)}")
+    if verbose >= 1: print(f"Downscaling from 1x {(Nz,Ny,Nx)} to 2x {(Nz//2,Ny//2,Nx//2)}")
     if(chunk_size % 32):
-        print(f"Chunk size {chunk_size} is invalid: must be divisible by 32.")
+        if verbose >= 1: print(f"Chunk size {chunk_size} is invalid: must be divisible by 32.")
         sys.exit(-1)
-#        print(f"Used GPU memory: {mempool.used_bytes()//1000000}MB out of {mempool.total_bytes()/1000000}MB. {pinned_mempool.n_free_blocks()} free pinned blocks.")
+#        if verbose >= 1: print(f"Used GPU memory: {mempool.used_bytes()//1000000}MB out of {mempool.total_bytes()/1000000}MB. {pinned_mempool.n_free_blocks()} free pinned blocks.")
 
-#    print(f"Used GPU memory: {mempool.used_bytes()//1000000}MB out of {mempool.total_bytes()/1000000}MB. {pinned_mempool.n_free_blocks()} free pinned blocks.")
+#    if verbose >= 1: print(f"Used GPU memory: {mempool.used_bytes()//1000000}MB out of {mempool.total_bytes()/1000000}MB. {pinned_mempool.n_free_blocks()} free pinned blocks.")
 
     # TODO: Just iterate now we do powers of two
     voxels2x  = np.empty((Nz//2,Ny//2,Nx//2),dtype=T)
@@ -61,10 +63,10 @@
         try:
             voxels1x_chunk = cp.fromfile(input_bin, dtype=T, count=chunk_items, offset=z*Ny*Nx*T.itemsize).reshape(zend-z,Ny,Nx)
         except:
-            print(f"Read failed. chunk_items = {chunk_items} = {(zend-z)*Ny*Nx}, z = {z}, zend-z = {zend-z}")
+            if verbose >= 1: print(f"Read failed. chunk_items = {chunk_items} = {(zend-z)*Ny*Nx}, z = {z}, zend-z = {zend-z}")
             sys.exit(-1)
             
-#        print(f"Used GPU memory: {mempool.used_bytes()//1000000}MB out of {mempool.total_bytes()/1000000}MB. {pinned_mempool.n_free_blocks()} free pinned blocks.")
+#        if verbose >= 1: print(f"Used GPU memory: {mempool.used_bytes()//1000000}MB out of {mempool.total_bytes()/1000000}MB. {pinned_mempool.n_free_blocks()} free pinned blocks.")
         voxels2x_chunk = downsample2x(voxels1x_chunk)
         del voxels1x_chunk
         voxels4x_chunk  = downsample2x(voxels2x_chunk)
@@ -90,10 +92,10 @@
         del voxels16x_chunk
         del voxels32x_chunk
         
-    print(f"Allocating {(Nz//2,Ny//2,Nx//2)}={Nz//2*Ny//2*Nx//2} {dtype} for voxels2x on GPU")
+    if verbose >= 1: print(f"Allocating {(Nz//2,Ny//2,Nx//2)}={Nz//2*Ny//2*Nx//2} {dtype} for voxels2x on GPU")
     
     for i in tqdm.tqdm(range(len(scales)),f"{sample}: Downscaling to all smaller scales: {scales[2:]}"):
         output_dir = f"{output_root}/{scales[i]}x/"
         pathlib.Path(f"{output_dir}").mkdir(parents=True, exist_ok=True)            
-        print(f"Writing out scale {scales[i]}x {(voxels[i].shape)} to {output_dir}/{sample}.uint16")
+        if verbose >= 1: print(f"Writing out scale {scales[i]}x {(voxels[i].shape)} to {output_dir}/{sample}.uint16")
         voxels[i].tofile(f"{output_dir}/{sample}.uint16")

From 3bcb916d7c7fab6f235debddc15acf1e7620ed7a Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 09:15:06 +0100
Subject: [PATCH 035/136] #25 Moved segment_implant_cc

---
 .../processing_steps/600_segment_implant_cc.py                    | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename pre-cleanup-src/segmentation/segment-implant-cc.py => src/processing_steps/600_segment_implant_cc.py (100%)

diff --git a/pre-cleanup-src/segmentation/segment-implant-cc.py b/src/processing_steps/600_segment_implant_cc.py
similarity index 100%
rename from pre-cleanup-src/segmentation/segment-implant-cc.py
rename to src/processing_steps/600_segment_implant_cc.py

From 1c794ef3ede80337b6577e24bc21aabe496a3630 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 09:17:56 +0100
Subject: [PATCH 036/136] #25 Fixed some of the dependencies of
 segment_implant_cc

---
 {pre-cleanup-src => src}/config/constants.py   | 0
 src/processing_steps/600_segment_implant_cc.py | 8 ++++----
 2 files changed, 4 insertions(+), 4 deletions(-)
 rename {pre-cleanup-src => src}/config/constants.py (100%)

diff --git a/pre-cleanup-src/config/constants.py b/src/config/constants.py
similarity index 100%
rename from pre-cleanup-src/config/constants.py
rename to src/config/constants.py
diff --git a/src/processing_steps/600_segment_implant_cc.py b/src/processing_steps/600_segment_implant_cc.py
index 47a9a29..df9d20d 100644
--- a/src/processing_steps/600_segment_implant_cc.py
+++ b/src/processing_steps/600_segment_implant_cc.py
@@ -1,10 +1,10 @@
 import h5py, sys, os.path, pathlib, numpy as np, scipy.ndimage as ndi, tqdm, matplotlib.pyplot as plt
 sys.path.append(sys.path[0]+"/../")
 from config.constants import *
-from config.paths import hdf5_root, binary_root, commandline_args
-from pybind_kernels.geometry import center_of_mass, inertia_matrix, integrate_axes, sample_plane
-from pybind_kernels.histograms import load_slice
-from helper_functions import update_hdf5, update_hdf5_mask
+from config.paths import hdf5_root, binary_root
+from lib.py.helpers import commandline_args, update_hdf5, update_hdf5_mask
+from lib.cpp.cpu_seq.geometry import center_of_mass, inertia_matrix, integrate_axes, sample_plane
+from lib.cpp.cpu.io import load_slice
 
 NA = np.newaxis
 

From 2183b6b53627df62991684416cbb627afe0e9020 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 09:18:43 +0100
Subject: [PATCH 037/136] #29 Added verbose to segment_implant_cc

---
 src/processing_steps/600_segment_implant_cc.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/processing_steps/600_segment_implant_cc.py b/src/processing_steps/600_segment_implant_cc.py
index df9d20d..1f09060 100644
--- a/src/processing_steps/600_segment_implant_cc.py
+++ b/src/processing_steps/600_segment_implant_cc.py
@@ -8,7 +8,10 @@
 
 NA = np.newaxis
 
-sample, scale, chunk_size = commandline_args({"sample":"<required>","scale":8, "chunk_size":256})
+sample, scale, chunk_size, verbose = commandline_args({"sample" : "<required>",
+                                                       "scale" : 8, 
+                                                       "chunk_size" : 256,
+                                                       "verbose" : 1})
 
 # Load metadata. TODO: Clean up, make automatic function.
 meta_filename = f"{hdf5_root}/hdf5-byte/msb/{sample}.h5"
@@ -24,7 +27,7 @@
 values      = np.linspace(global_vmin,global_vmax,2**16)
 implant_threshold_u16 = np.argmin(np.abs(values-implant_threshold))
 
-print(f"Reading metadata from {meta_filename}.\n"+
+if verbose >= 1: print(f"Reading metadata from {meta_filename}.\n"+
       f"volume_matching_shifts = {vm_shifts}\n"+
       f"full_Nz,Ny,Nx = {full_Nz,Ny,Nx}\n"+
       f"Nz            = {Nz}\n"+
@@ -44,9 +47,9 @@
     noisy_implant[z:z+chunk_length] = voxel_chunk[:chunk_length] >= implant_threshold_u16
     
                                                   
-print(f"Computing connected components")
+if verbose >= 1: print(f"Computing connected components")
 label, n_features = ndi.label(noisy_implant)
-print(f"Counting component volumes")
+if verbose >= 1: print(f"Counting component volumes")
 bincnts           = np.bincount(label[label>0],minlength=n_features+1)
 
 largest_cc_ix     = np.argmax(bincnts)
@@ -54,7 +57,7 @@
 
 output_dir = f"{hdf5_root}/masks/{scale}x/"
 pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
-print(f"Writing largest connected component to {output_dir}/{sample}.h5")
+if verbose >= 1: print(f"Writing largest connected component to {output_dir}/{sample}.h5")
 
 update_hdf5_mask(f"{output_dir}/{sample}.h5",
                  group_name="implant",

From a2a4c264a01826407c3e07f21208aa6558df242a Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 09:20:00 +0100
Subject: [PATCH 038/136] #25 Moved implant_FoR

---
 .../implant-FoR.py => src/processing_steps/700_implant_FoR.py     | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename pre-cleanup-src/segmentation/implant-FoR.py => src/processing_steps/700_implant_FoR.py (100%)

diff --git a/pre-cleanup-src/segmentation/implant-FoR.py b/src/processing_steps/700_implant_FoR.py
similarity index 100%
rename from pre-cleanup-src/segmentation/implant-FoR.py
rename to src/processing_steps/700_implant_FoR.py

From 556b3d68e17c9357e2863c42d6bdb8524368b330 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 09:24:27 +0100
Subject: [PATCH 039/136] #25 Fixed some of the dependencies of implant_FoR

---
 src/processing_steps/700_implant_FoR.py | 8 ++++----
 src/requirements.txt                    | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/processing_steps/700_implant_FoR.py b/src/processing_steps/700_implant_FoR.py
index 5df0d2b..21512be 100644
--- a/src/processing_steps/700_implant_FoR.py
+++ b/src/processing_steps/700_implant_FoR.py
@@ -1,14 +1,14 @@
 import h5py, sys, os.path, pathlib, numpy as np, numpy.linalg as la, tqdm
 sys.path.append(sys.path[0]+"/../")
 from config.constants import *
-from config.paths import hdf5_root, binary_root, commandline_args
-from pybind_kernels.geometry import center_of_mass, inertia_matrix, integrate_axes, sample_plane
-from pybind_kernels.histograms import load_slice, erode_3d_sphere_gpu as erode_3d, dilate_3d_sphere_gpu as dilate_3d
+from config.paths import hdf5_root, binary_root
+from lib.cpp.cpu_seq.geometry import center_of_mass, inertia_matrix, integrate_axes, sample_plane
+from lib.cpp.gpu.morphology import erode_3d_sphere as erode_3d, dilate_3d_sphere as dilate_3d
 import matplotlib.pyplot as plt
 from matplotlib.colors import colorConverter
 import scipy as sp, scipy.ndimage as ndi, scipy.interpolate as interpolate, scipy.signal as signal
 import vedo, vedo.pointcloud as pc
-from helper_functions import update_hdf5, update_hdf5_mask
+from lib.py.helpers import update_hdf5, update_hdf5_mask, commandline_args
 from numpy import array, newaxis as NA
 
 # Hvor skal disse hen?
diff --git a/src/requirements.txt b/src/requirements.txt
index 18bdb83..98617bb 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -1,4 +1,5 @@
 cupy-cuda11x==11.5.0
 fabric==3.0.0
 jax==0.4.3
-tqdm==4.64.1
\ No newline at end of file
+tqdm==4.64.1
+vedo==2023.4.3
\ No newline at end of file

From 2d9ad72840c2b68f2027bc7190795f88a50b5921 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 09:27:54 +0100
Subject: [PATCH 040/136] #29 Added verbose to implant_FoR

---
 src/processing_steps/700_implant_FoR.py | 70 +++++++++++++------------
 1 file changed, 37 insertions(+), 33 deletions(-)

diff --git a/src/processing_steps/700_implant_FoR.py b/src/processing_steps/700_implant_FoR.py
index 21512be..0ed5ae1 100644
--- a/src/processing_steps/700_implant_FoR.py
+++ b/src/processing_steps/700_implant_FoR.py
@@ -11,6 +11,8 @@
 from lib.py.helpers import update_hdf5, update_hdf5_mask, commandline_args
 from numpy import array, newaxis as NA
 
+verbose = 1
+
 # Hvor skal disse hen?
 def circle_center(p0,p1,p2):
     m1, m2               = (p0+p1)/2, (p0+p2)/2   # Midpoints 
@@ -56,12 +58,12 @@ def open_3d(image, r):
 
 def coordinate_image(shape):
     Nz,Ny,Nx   = shape
-    print(f"Broadcasting coordinates for {shape} image")
+    if verbose >= 1: print(f"Broadcasting coordinates for {shape} image")
     zs, ys, xs = np.broadcast_to(np.arange(Nz)[:,NA,NA],shape),\
                  np.broadcast_to(np.arange(Ny)[NA,:,NA],shape),\
                  np.broadcast_to(np.arange(Nx)[NA,NA,:],shape);
     zyxs = np.stack([zs,ys,xs],axis=-1)
-    print(f"Done")
+    if verbose >= 1: print(f"Done")
     return zyxs
 
 
@@ -105,7 +107,7 @@ def homogeneous_transform(xs, M):
     hxs[...,:3] = xs;
     hxs[..., 3]  = 1
 
-    print(hxs.shape, M.shape)
+    if verbose >= 1: print(hxs.shape, M.shape)
     return hxs @ M.T
 
 
@@ -273,24 +275,26 @@ def figure_FoR_voxels(name,voxels,debug=True):
 
         
 if __name__ == "__main__":
-    sample, scale, debug = commandline_args({"sample":"<required>","scale":8,"debug":1})
+    sample, scale, verbose = commandline_args({"sample" : "<required>",
+                                               "scale" : 8,
+                                               "verbose" : 1})
     
     if(scale<8):
-        print(f"Selected scale is {scale}x: This should not be run at high resolution, use scale>=8.")
+        if verbose >= 1: print(f"Selected scale is {scale}x: This should not be run at high resolution, use scale>=8.")
         #sys.exit(-1)
 
     ## STEP 0: LOAD MASKS, VOXELS, AND METADATA
     image_output_dir = f"{hdf5_root}/processed/implant-FoR/{sample}/"
-    print(f"Storing all debug-images to {image_output_dir}")    
+    if verbose >= 1: print(f"Storing all debug-images to {image_output_dir}")    
     pathlib.Path(image_output_dir).mkdir(parents=True, exist_ok=True)
     
-    print(f"Loading {scale}x implant mask from {hdf5_root}/masks/{scale}x/{sample}.h5")
+    if verbose >= 1: print(f"Loading {scale}x implant mask from {hdf5_root}/masks/{scale}x/{sample}.h5")
     implant_file = h5py.File(f"{hdf5_root}/masks/{scale}x/{sample}.h5",'r')
     implant      = implant_file["implant/mask"][:]
     voxel_size   = implant_file["implant"].attrs["voxel_size"]
     implant_file.close()
     
-    print(f"Loading {scale}x voxels from {binary_root}/voxels/{scale}x/{sample}.uint16")
+    if verbose >= 1: print(f"Loading {scale}x voxels from {binary_root}/voxels/{scale}x/{sample}.uint16")
     voxels  = np.fromfile(f"{binary_root}/voxels/{scale}x/{sample}.uint16",dtype=np.uint16).reshape(implant.shape)
 
     nz,ny,nx = implant.shape
@@ -298,7 +302,7 @@ def figure_FoR_voxels(name,voxels,debug=True):
     ### STEP 1: COMPUTE IMPLANT PRINCIPAL AXES FRAME OF REFERENCE
     ## STEP1A: DIAGONALIZE MOMENT OF INTERTIA MATRIX TO GET PRINCIPAL AXES
     cm    = np.array(center_of_mass(implant))                  # in downsampled-voxel index coordinates
-    print(f"Center of mass is: {cm}")
+    if verbose >= 1: print(f"Center of mass is: {cm}")
     IM    = np.array(inertia_matrix(implant,cm)).reshape(3,3)  
     ls,E  = la.eigh(IM)
 
@@ -320,7 +324,7 @@ def figure_FoR_voxels(name,voxels,debug=True):
     UVW = E.T
     u_vec,v_vec,w_vec = UVW
 
-    figure_FoR_UVW(debug)
+    figure_FoR_UVW(verbose >= 2)
 
     ### STEP 2: COMPUTE PHANTOM SCREW GEOMETRY
     #
@@ -397,7 +401,7 @@ def UVW2xyz(p):
     implant_length_voxels = implant_length/voxel_size
     implant_radius_voxels = implant_radius/voxel_size
     
-    figure_FoR_cylinder(debug)
+    figure_FoR_cylinder(verbose >= 2)
 
     ### 3: In the cylinder coordinates, find radii and angle ranges to fill in the "holes" in the implant and make it solid
     ###    (More robust than closing operations, as we don't want to effect the screw threads).
@@ -412,7 +416,7 @@ def UVW2xyz(p):
 
     #TODO: Local circle figure (instead of showing global fit on local slice, which isn't snug)
     bbox_uvwp = [Up_min,Up_max,Vp_min,Vp_max,Wp_min,Wp_max]
-    figure_FoR_circle("prime-circle",Cp*voxel_size,v_vec,w_vec,implant_radius,bbox_uvwp,debug)    
+    figure_FoR_circle("prime-circle",Cp*voxel_size,v_vec,w_vec,implant_radius,bbox_uvwp,verbose >= 2)    
 
     ## 3B: Profile of radii and angles
     implant_thetas = np.arctan2(implant_Vps,implant_Wps)
@@ -453,8 +457,8 @@ def UVW2xyz(p):
     solid_implant_UVWps   = ((((np.array(np.nonzero(solid_quarter)).T - cm) @ E) - w0v)*voxel_size - cp) @ UVWp
     Up_integrals, Up_bins = np.histogram(solid_implant_UVWps[:,0],200)
 
-    figure_FoR_profiles(debug)    
-    figure_FoR_voxels("solid_implant",solid_implant,debug)
+    figure_FoR_profiles(verbose >= 2)    
+    figure_FoR_voxels("solid_implant",solid_implant,verbose >= 2)
 
     back_mask  = (Ws<0)
     front_mask = largest_cc_of((Ws>50)*(~solid_implant))#*(thetas>=theta_from)*(thetas<=theta_to)
@@ -462,26 +466,26 @@ def UVW2xyz(p):
     # back_part = voxels*back_mask
    
     front_part = voxels*front_mask
-    figure_FoR_voxels("back_part", voxels*back_mask, debug) 
-    figure_FoR_voxels("front_part",voxels*front_mask, debug) 
+    figure_FoR_voxels("back_part", voxels*back_mask, verbose >= 2) 
+    figure_FoR_voxels("front_part",voxels*front_mask, verbose >= 2) 
 
 
     Cp_zyx = Cp[::-1]*voxel_size
 
     Muvwp = zyx_to_UVWp_transform()
-    print(f"MUvpw = {np.round(Muvwp,2)}")
-    print(f"UVW  = {np.round(UVW,2)}")
-    print(f"UVWp = {np.round(UVWp,2)}")
-    print(f"Cp = {np.round(Cp_zyx,2)}")
-    print(f"cp = {np.round(cp,2)}")
-    print(f"cm = {np.round(cm,2)}")
+    if verbose >= 1: print(f"MUvpw = {np.round(Muvwp,2)}")
+    if verbose >= 1: print(f"UVW  = {np.round(UVW,2)}")
+    if verbose >= 1: print(f"UVWp = {np.round(UVWp,2)}")
+    if verbose >= 1: print(f"Cp = {np.round(Cp_zyx,2)}")
+    if verbose >= 1: print(f"cp = {np.round(cp,2)}")
+    if verbose >= 1: print(f"cm = {np.round(cm,2)}")
 
-    figure_FoR_UVWp(debug)
+    figure_FoR_UVWp(verbose >= 2)
 
-    print(f"Physical Cp = {Cp[::-1]*voxel_size}")
+    if verbose >= 1: print(f"Physical Cp = {Cp[::-1]*voxel_size}")
 
     output_dir = f"{hdf5_root}/hdf5-byte/msb/"
-    print(f"Writing frame-of-reference metadata to {output_dir}/{sample}.h5")
+    if verbose >= 1: print(f"Writing frame-of-reference metadata to {output_dir}/{sample}.h5")
     update_hdf5(f"{output_dir}/{sample}.h5",
                 group_name="implant-FoR",
                 datasets={"UVW":UVW,
@@ -514,32 +518,32 @@ def UVW2xyz(p):
 
     output_dir = f"{hdf5_root}/masks/{scale}x/"
     pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
-    print(f"Saving implant_solid mask to {output_dir}/{sample}.h5")
+    if verbose >= 1: print(f"Saving implant_solid mask to {output_dir}/{sample}.h5")
     update_hdf5_mask(f"{output_dir}/{sample}.h5",
                      group_name="implant_solid",
                      datasets={"mask":solid_implant},
                      attributes={"sample":sample,"scale":scale,"voxel_size":voxel_size})
 
-    print(f"Saving implant_shell mask to {output_dir}/{sample}.h5")
+    if verbose >= 1: print(f"Saving implant_shell mask to {output_dir}/{sample}.h5")
     update_hdf5_mask(f"{output_dir}/{sample}.h5",
                      group_name="implant_shell",
                      datasets={"mask":implant_shell_mask},
                      attributes={"sample":sample,"scale":scale,"voxel_size":voxel_size})
 
-    print(f"Saving cut_cylinder_air mask to {output_dir}/{sample}.h5")
+    if verbose >= 1: print(f"Saving cut_cylinder_air mask to {output_dir}/{sample}.h5")
     update_hdf5_mask(f"{output_dir}/{sample}.h5",
                      group_name="cut_cylinder_air",
                      datasets={"mask":back_mask},
                      attributes={"sample":sample,"scale":scale,"voxel_size":voxel_size})
 
-    print(f"Saving cut_cylinder_bone mask to {output_dir}/{sample}.h5")
+    if verbose >= 1: print(f"Saving cut_cylinder_bone mask to {output_dir}/{sample}.h5")
     update_hdf5_mask(f"{output_dir}/{sample}.h5",
                      group_name="cut_cylinder_bone",
                      datasets={"mask":front_mask},
                      attributes={"sample":sample, "scale":scale, "voxel_size":voxel_size})
 
 
-    print(f"Computing bone region")
+    if verbose >= 1: print(f"Computing bone region")
     hist, bins = np.histogram(front_part, 256)
     hist[0] = 0
     peaks, info = signal.find_peaks(hist,height=0.5*hist.max())
@@ -547,7 +551,7 @@ def UVW2xyz(p):
     try:
         p1, p2 = peaks[np.argsort(info['peak_heights'])[:2]]
         midpoint = int(round((bins[p1]+bins[p2+1])/2)) # p1 is left-edge of p1-bin, p2+1 is right edge of p2-bin
-        print(f"p1, p2 = ({p1,bins[p1]}), ({p2,bins[p2]}); midpoint = {midpoint}")
+        if verbose >= 1: print(f"p1, p2 = ({p1,bins[p1]}), ({p2,bins[p2]}); midpoint = {midpoint}")
         
         bone_mask1 = front_part > midpoint                                                                                                                                                                                                                                       
         closing_diameter, opening_diameter = 400, 300           # micrometers                                                                                                                                                                   
@@ -564,10 +568,10 @@ def UVW2xyz(p):
     
         bone_region_mask = largest_cc_of(bone_region_mask)
     except:
-        print(f"Wasnt able to separate into resin and bone region. Assuming all is bone region.")
+        if verbose >= 1: print(f"Wasnt able to separate into resin and bone region. Assuming all is bone region.")
         bone_region_mask = front_mask
     
-        print(f"Saving bone_region mask to {output_dir}/{sample}.h5")
+        if verbose >= 1: print(f"Saving bone_region mask to {output_dir}/{sample}.h5")
         update_hdf5_mask(f"{output_dir}/{sample}.h5",
                          group_name="bone_region",
                          datasets={"mask":bone_region_mask},

From 9612a357210f2cbd50fc68cfed989df112bf5776 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 09:29:46 +0100
Subject: [PATCH 041/136] #25 Moved implant_data

---
 .../implant-data.py => src/processing_steps/800_implant_data.py   | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename pre-cleanup-src/segmentation/implant-data.py => src/processing_steps/800_implant_data.py (100%)

diff --git a/pre-cleanup-src/segmentation/implant-data.py b/src/processing_steps/800_implant_data.py
similarity index 100%
rename from pre-cleanup-src/segmentation/implant-data.py
rename to src/processing_steps/800_implant_data.py

From 20f2c2f292555ce66d7ac98443e09bca2603808f Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 09:31:20 +0100
Subject: [PATCH 042/136] #25 Fixed dependencies of implant_data

---
 src/processing_steps/800_implant_data.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/processing_steps/800_implant_data.py b/src/processing_steps/800_implant_data.py
index 5c0ac76..a15c8b4 100644
--- a/src/processing_steps/800_implant_data.py
+++ b/src/processing_steps/800_implant_data.py
@@ -1,13 +1,12 @@
 import h5py, sys, os.path, pathlib, numpy as np, numpy.linalg as la, tqdm
 sys.path.append(sys.path[0]+"/../")
 from config.constants import *
-from config.paths import hdf5_root, binary_root, commandline_args
-from pybind_kernels.geometry import center_of_mass, inertia_matrix, integrate_axes, fill_implant_mask, compute_front_mask
-from pybind_kernels.histograms import load_slice, erode_3d_sphere_gpu as erode_3d, dilate_3d_sphere_gpu as dilate_3d
+from config.paths import hdf5_root, binary_root
+from lib.cpp.cpu_seq.geometry import fill_implant_mask, compute_front_mask
 import matplotlib.pyplot as plt
 import scipy as sp, scipy.ndimage as ndi, scipy.interpolate as interpolate, scipy.signal as signal
 import vedo, vedo.pointcloud as pc
-from helper_functions import *
+from lib.py.helpers import commandline_args, update_hdf5, update_hdf5_mask
 from numpy import array, newaxis as NA
 
 

From d31572f357741038c214770c51c14b1ea7392033 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 09:32:55 +0100
Subject: [PATCH 043/136] #29 Added verbose to implant_data

---
 src/processing_steps/800_implant_data.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/processing_steps/800_implant_data.py b/src/processing_steps/800_implant_data.py
index a15c8b4..cff7989 100644
--- a/src/processing_steps/800_implant_data.py
+++ b/src/processing_steps/800_implant_data.py
@@ -10,9 +10,11 @@
 from numpy import array, newaxis as NA
 
 
-sample, scale = commandline_args({"sample":"<required>","scale":2})
+sample, scale, verbose = commandline_args({"sample" : "<required>",
+                                           "scale" : 2,
+                                           "verbose" : 1})
 
-print(f"Loading principal axis and cylinder frame-of-references")
+if verbose >= 1: print(f"Loading principal axis and cylinder frame-of-references")
 h5meta = h5py.File(f"{hdf5_root}/hdf5-byte/msb/{sample}.h5","r")
 try:    
     h5g = h5meta["implant-FoR"]
@@ -26,7 +28,7 @@
     print(f"Make sure you have run implant-FoR.py for {sample} at scale {scale}x")
     sys.exit(-1)
 
-print(f"Loading {scale}x implant mask from {hdf5_root}/masks/{scale}x/{sample}.h5")
+if verbose >= 1: print(f"Loading {scale}x implant mask from {hdf5_root}/masks/{scale}x/{sample}.h5")
 try:
     implant_file = h5py.File(f"{hdf5_root}/masks/{scale}x/{sample}.h5",'r')
     implant      = implant_file["implant/mask"][:]
@@ -49,7 +51,7 @@
 
 bbox_flat  = tuple(bbox.flatten())
 Muvwp_flat = tuple(Muvwp.flatten())
-print(f"Filling implant mask")
+if verbose >= 1: print(f"Filling implant mask")
 fill_implant_mask(implant.astype(np.uint8,copy=False),
                   voxel_size,bbox_flat, rsqr_fraction,
                   Muvwp_flat,

From 0ef3d3f9fd7d3a2285dac4d44d9eadc046c94b70 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 09:35:57 +0100
Subject: [PATCH 044/136] #25 Moved generate_gauss_c and fixed dependencies

---
 .../processing_steps/900_generate_gauss_c.py               | 7 ++++---
 src/requirements.txt                                       | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)
 rename pre-cleanup-src/generate_gauss_c.py => src/processing_steps/900_generate_gauss_c.py (96%)

diff --git a/pre-cleanup-src/generate_gauss_c.py b/src/processing_steps/900_generate_gauss_c.py
similarity index 96%
rename from pre-cleanup-src/generate_gauss_c.py
rename to src/processing_steps/900_generate_gauss_c.py
index da3bdbb..663eccf 100644
--- a/pre-cleanup-src/generate_gauss_c.py
+++ b/src/processing_steps/900_generate_gauss_c.py
@@ -7,8 +7,9 @@
 from math import pi, sqrt, exp
 from scipy import ndimage as ndi
 
-from config.paths import hdf5_root, binary_root, commandline_args
-import pybind_kernels.histograms as histograms
+from config.paths import hdf5_root, binary_root
+from lib.py.helpers import commandline_args
+from lib.cpp.cpu_seq import gauss_filter
 NA = np.newaxis
 
 impl_type = np.float32
@@ -52,7 +53,7 @@ def toint(arr, dtype=np.uint8):
         start = timeit.default_timer()
 
     print(f"Repeated Gauss blurs ({reps} iterations, sigma_voxels={sigma_voxels}, kernel length={radius} coefficients)")
-    histograms.gauss_filter_par_cpu(implant_mask, implant_mask.shape, kernel, reps, result)
+    gauss_filter(implant_mask, implant_mask.shape, kernel, reps, result)
     if verify:
         print (f'Parallel C edition took {timeit.default_timer() - start} seconds')
 
diff --git a/src/requirements.txt b/src/requirements.txt
index 98617bb..5125944 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -1,4 +1,5 @@
 cupy-cuda11x==11.5.0
+edt==2.3.0
 fabric==3.0.0
 jax==0.4.3
 tqdm==4.64.1

From 001fd25664f66c7bff28d2d00c4b9f8a0b66bea7 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 09:38:17 +0100
Subject: [PATCH 045/136] #29 Added verbose to generate_gauss_c

---
 src/processing_steps/900_generate_gauss_c.py | 39 +++++++++++---------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/src/processing_steps/900_generate_gauss_c.py b/src/processing_steps/900_generate_gauss_c.py
index 663eccf..ce827f6 100644
--- a/src/processing_steps/900_generate_gauss_c.py
+++ b/src/processing_steps/900_generate_gauss_c.py
@@ -21,25 +21,30 @@ def toint(arr, dtype=np.uint8):
 
 # sigma is given in physical units, i.e. in micrometers, in order to give scale-invariant results.
 if __name__ == '__main__':
-    sample, sigma, reps, scale, voxel_size_1x, verify, debug = commandline_args({"sample":"<required>","sigma":40.0,"repititions":10,"scale":2,
-                                                                                 "voxel_size_1x":1.85, "verify_against_ndimage":False, "debug_images":True})
-    print(f"Diffusion approximation by repeated Gaussian blurs.\n")
+    sample, sigma, reps, scale, voxel_size_1x, verify, verbose = commandline_args({"sample":"<required>",
+                                                                                   "sigma":40.0,
+                                                                                   "repititions":10,
+                                                                                   "scale":2,
+                                                                                   "voxel_size_1x":1.85, 
+                                                                                   "verify_against_ndimage":False, 
+                                                                                   "verbose":1})
+    if verbose >= 1: print(f"Diffusion approximation by repeated Gaussian blurs.\n")
     voxel_size   = voxel_size_1x*scale
     sigma_voxels = sigma/voxel_size
-    print(f"At scale {scale}x, voxel size is {voxel_size} micrometers.")
-    print(f"Using sigma={sigma} micrometers, sigma_voxels={sigma_voxels}.")
+    if verbose >= 1: print(f"At scale {scale}x, voxel size is {voxel_size} micrometers.")
+    if verbose >= 1: print(f"Using sigma={sigma} micrometers, sigma_voxels={sigma_voxels}.")
 
     output_dir = f"{binary_root}/fields/implant-gauss/{scale}x"
     pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
 
-    print(f"Loading implant_solid mask from {hdf5_root}/masks/{scale}x/{sample}.h5")
+    if verbose >= 1: print(f"Loading implant_solid mask from {hdf5_root}/masks/{scale}x/{sample}.h5")
     with h5py.File(f"{hdf5_root}/masks/{scale}x/{sample}.h5","r") as f:
         implant_mask = f['implant_solid/mask'][:]
 
     nz,ny,nx = implant_mask.shape
-    print(f"Implant mask has shape {implant_mask.shape}")
+    if verbose >= 1: print(f"Implant mask has shape {implant_mask.shape}")
 
-    if debug:
+    if verbose >= 2:
         print(f"Writing PNGs of implant mask slices to {output_dir}")
         Image.fromarray(toint(implant_mask[:,:,nx//2].astype(impl_type))).save(f"{output_dir}/{sample}-mask-yz.png")
         Image.fromarray(toint(implant_mask[:,ny//2,:].astype(impl_type))).save(f"{output_dir}/{sample}-mask-xz.png")
@@ -52,20 +57,20 @@ def toint(arr, dtype=np.uint8):
     if verify:
         start = timeit.default_timer()
 
-    print(f"Repeated Gauss blurs ({reps} iterations, sigma_voxels={sigma_voxels}, kernel length={radius} coefficients)")
+    if verbose >= 1: print(f"Repeated Gauss blurs ({reps} iterations, sigma_voxels={sigma_voxels}, kernel length={radius} coefficients)")
     gauss_filter(implant_mask, implant_mask.shape, kernel, reps, result)
     if verify:
-        print (f'Parallel C edition took {timeit.default_timer() - start} seconds')
+        if verbose >= 1: print (f'Parallel C edition took {timeit.default_timer() - start} seconds')
 
     xs = np.linspace(-1,1,nx)
     rs = np.sqrt(xs[NA,NA,:]**2 + xs[NA,:,NA]**2)
     cylinder_mask = (rs<=1)
 
-    print(f"Writing diffusion-field to {output_dir}/{sample}.npy")
+    if verbose >= 1: print(f"Writing diffusion-field to {output_dir}/{sample}.npy")
     np.save(f'{output_dir}/{sample}.npy', toint(result*cylinder_mask,np.uint16)*cylinder_mask)
 
     
-    if debug:
+    if verbose >= 2:
         print(f"Debug: Writing PNGs of result slices to {output_dir}")
         Image.fromarray(toint(result[nz//2,:,:])).save(f'{output_dir}/{sample}-gauss-xy.png')
         Image.fromarray(toint(result[:,ny//2,:])).save(f'{output_dir}/{sample}-gauss-xz.png')
@@ -82,7 +87,7 @@ def toint(arr, dtype=np.uint8):
             control[implant_mask] = 1
         print (f'ndimage edition took {timeit.default_timer() - start} seconds')
         np.save(f'{output_dir}/{sample}_ndimage.npy',control)
-        if debug:
+        if verbose >= 2:
             Image.fromarray(toint(control[nz//2,:,:])).save(f'{output_dir}/{sample}-control-xy.png')
             Image.fromarray(toint(control[:,ny//2,:])).save(f'{output_dir}/{sample}-control-xz.png')
             Image.fromarray(toint(control[:,:,nx//2])).save(f'{output_dir}/{sample}-control-yz.png')
@@ -103,21 +108,21 @@ def toint(arr, dtype=np.uint8):
                 plt.savefig(f'{output_dir}/{sample}-diff-{name}.png')
 
 
-    print(f"Computing Euclidean distance transform.")
+    if verbose >= 1: print(f"Computing Euclidean distance transform.")
     fedt = edt.edt(~implant_mask,parallel=16)
     del implant_mask
     
     edt_output_dir = f"{binary_root}/fields/implant-edt/{scale}x"
     pathlib.Path(edt_output_dir).mkdir(parents=True, exist_ok=True)
-    print(f"Writing EDT-field to {edt_output_dir}/{sample}.npy")
+    if verbose >= 1: print(f"Writing EDT-field to {edt_output_dir}/{sample}.npy")
     np.save(f'{edt_output_dir}/{sample}.npy', toint(fedt*cylinder_mask,np.uint16)*cylinder_mask)
                 
 
     mixed_output_dir = f"{binary_root}/fields/implant-gauss+edt/{scale}x"    
-    print(f"Writing combined field to {mixed_output_dir}/{sample}.npy")                
+    if verbose >= 1: print(f"Writing combined field to {mixed_output_dir}/{sample}.npy")                
     pathlib.Path(mixed_output_dir).mkdir(parents=True, exist_ok=True)
     result = (result-fedt/(fedt.max()))*cylinder_mask
     result -= result.min()
     result /= result.max()
-    print(f"Result (min,max) = ({result.min(),result.max()})")
+    if verbose >= 1: print(f"Result (min,max) = ({result.min(),result.max()})")
     np.save(f'{mixed_output_dir}/{sample}.npy', toint(result*cylinder_mask,np.uint16)*cylinder_mask)    

From 0ab97a2f79ff9616af6b8d77788e7efb6a91eb35 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 09:40:34 +0100
Subject: [PATCH 046/136] #25 Made room for init.d-like naming scheme

---
 .../{100_cache_esrf2013.py => 0100_cache_esrf2013.py}             | 0
 .../{200_generate_byte_hdf5.py => 0200_generate_byte_hdf5.py}     | 0
 .../{300_volume_matcher.py => 0300_volume_matcher.py}             | 0
 src/processing_steps/{400_h5tobin.py => 0400_h5tobin.py}          | 0
 .../{500_rescale_cupy_bin.py => 0500_rescale_cupy_bin.py}         | 0
 .../{600_segment_implant_cc.py => 0600_segment_implant_cc.py}     | 0
 src/processing_steps/{700_implant_FoR.py => 0700_implant_FoR.py}  | 0
 .../{800_implant_data.py => 0800_implant_data.py}                 | 0
 .../{900_generate_gauss_c.py => 0900_generate_gauss_c.py}         | 0
 9 files changed, 0 insertions(+), 0 deletions(-)
 rename src/processing_steps/{100_cache_esrf2013.py => 0100_cache_esrf2013.py} (100%)
 rename src/processing_steps/{200_generate_byte_hdf5.py => 0200_generate_byte_hdf5.py} (100%)
 rename src/processing_steps/{300_volume_matcher.py => 0300_volume_matcher.py} (100%)
 rename src/processing_steps/{400_h5tobin.py => 0400_h5tobin.py} (100%)
 rename src/processing_steps/{500_rescale_cupy_bin.py => 0500_rescale_cupy_bin.py} (100%)
 rename src/processing_steps/{600_segment_implant_cc.py => 0600_segment_implant_cc.py} (100%)
 rename src/processing_steps/{700_implant_FoR.py => 0700_implant_FoR.py} (100%)
 rename src/processing_steps/{800_implant_data.py => 0800_implant_data.py} (100%)
 rename src/processing_steps/{900_generate_gauss_c.py => 0900_generate_gauss_c.py} (100%)

diff --git a/src/processing_steps/100_cache_esrf2013.py b/src/processing_steps/0100_cache_esrf2013.py
similarity index 100%
rename from src/processing_steps/100_cache_esrf2013.py
rename to src/processing_steps/0100_cache_esrf2013.py
diff --git a/src/processing_steps/200_generate_byte_hdf5.py b/src/processing_steps/0200_generate_byte_hdf5.py
similarity index 100%
rename from src/processing_steps/200_generate_byte_hdf5.py
rename to src/processing_steps/0200_generate_byte_hdf5.py
diff --git a/src/processing_steps/300_volume_matcher.py b/src/processing_steps/0300_volume_matcher.py
similarity index 100%
rename from src/processing_steps/300_volume_matcher.py
rename to src/processing_steps/0300_volume_matcher.py
diff --git a/src/processing_steps/400_h5tobin.py b/src/processing_steps/0400_h5tobin.py
similarity index 100%
rename from src/processing_steps/400_h5tobin.py
rename to src/processing_steps/0400_h5tobin.py
diff --git a/src/processing_steps/500_rescale_cupy_bin.py b/src/processing_steps/0500_rescale_cupy_bin.py
similarity index 100%
rename from src/processing_steps/500_rescale_cupy_bin.py
rename to src/processing_steps/0500_rescale_cupy_bin.py
diff --git a/src/processing_steps/600_segment_implant_cc.py b/src/processing_steps/0600_segment_implant_cc.py
similarity index 100%
rename from src/processing_steps/600_segment_implant_cc.py
rename to src/processing_steps/0600_segment_implant_cc.py
diff --git a/src/processing_steps/700_implant_FoR.py b/src/processing_steps/0700_implant_FoR.py
similarity index 100%
rename from src/processing_steps/700_implant_FoR.py
rename to src/processing_steps/0700_implant_FoR.py
diff --git a/src/processing_steps/800_implant_data.py b/src/processing_steps/0800_implant_data.py
similarity index 100%
rename from src/processing_steps/800_implant_data.py
rename to src/processing_steps/0800_implant_data.py
diff --git a/src/processing_steps/900_generate_gauss_c.py b/src/processing_steps/0900_generate_gauss_c.py
similarity index 100%
rename from src/processing_steps/900_generate_gauss_c.py
rename to src/processing_steps/0900_generate_gauss_c.py

From 3fe7dfd6776bded1056b26187d3143fbca873394 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 09:53:41 +0100
Subject: [PATCH 047/136] #25 Moved compute_histograms

---
 .../processing_steps/1000_compute_histograms.py                   | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename pre-cleanup-src/histogram_processing/compute_histograms.py => src/processing_steps/1000_compute_histograms.py (100%)

diff --git a/pre-cleanup-src/histogram_processing/compute_histograms.py b/src/processing_steps/1000_compute_histograms.py
similarity index 100%
rename from pre-cleanup-src/histogram_processing/compute_histograms.py
rename to src/processing_steps/1000_compute_histograms.py

From 6f2db2a47921ffbcb626239d2a1b15be39e4ddfb Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:08:00 +0100
Subject: [PATCH 048/136] #25 Preliminary fix to compute_histograms
 dependencies

---
 .../1000_compute_histograms.py                | 53 ++++++++++++-------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/src/processing_steps/1000_compute_histograms.py b/src/processing_steps/1000_compute_histograms.py
index 75f8490..529ad79 100755
--- a/src/processing_steps/1000_compute_histograms.py
+++ b/src/processing_steps/1000_compute_histograms.py
@@ -1,21 +1,26 @@
 #!/usr/bin/env python3
 import os, sys, pathlib, copy, scipy.ndimage as ndi
 sys.path.append(sys.path[0]+"/../")
-import pybind_kernels.histograms as histograms
+# TODO Move benchmarking out of this script.
+from lib.cpp.cpu_seq.histograms import axis_histogram as axis_histogram_seq_cpu, field_histogram as field_histogram_seq_cpu
+from lib.cpp.cpu.histograms import axis_histogram as axis_histogram_par_cpu, field_histogram as field_histogram_par_cpu, field_histogram_resample
+from lib.cpp.gpu.histograms import axis_histogram as axis_histogram_par_gpu, field_histogram as field_histogram_par_gpu
+from lib.cpp.cpu_seq.histograms import masked_minmax # TODO is it histogram specific?
 import numpy as np, h5py, timeit
 from datetime import datetime
 from PIL import Image
 from tqdm import tqdm
 from config.paths import *
 from config.constants import implant_threshold
-from helper_functions import block_info, load_block
+from lib.py.helpers import block_info, load_block, commandline_args
 
 NA = np.newaxis
+verbose = 1
 
 # TODO: Currently specialized to uint16_t
-masked_minmax = histograms.masked_minmax
+#masked_minmax = histograms.masked_minmax
 
-def axes_histogram(voxels, func=histograms.axis_histogram_seq_cpu, ranges=None, voxel_bins=256):
+def axes_histogram(voxels, func=axis_histogram_seq_cpu, ranges=None, voxel_bins=256):
     (Nz,Ny,Nx) = voxels.shape
     Nr = int(np.sqrt((Nx//2)**2 + (Ny//2)**2))+1
 
@@ -28,25 +33,25 @@ def axes_histogram(voxels, func=histograms.axis_histogram_seq_cpu, ranges=None,
         vmin, vmax = masked_minmax(voxels)
     else:
         vmin, vmax = ranges
-    print ("Entering call", datetime.now())
-    func(voxels, x_bins, y_bins, z_bins, r_bins, vmin, vmax, True)
-    print ("Exited call", datetime.now())
+    if verbose >= 1: print ("Entering call", datetime.now())
+    func(voxels, x_bins, y_bins, z_bins, r_bins, vmin, vmax, verbose >= 1)
+    if verbose >= 1: print ("Exited call", datetime.now())
     return x_bins, y_bins, z_bins, r_bins
 
 def field_histogram(voxels, field, field_bins, voxel_bins, ranges):
     bins = np.zeros((field_bins, voxel_bins), dtype=np.uint64)
     vmin, vmax = ranges
     # python3 histograms_tester.py 770c_pag  1849.98s user 170.42s system 512% cpu 6:33.95 total
-    histograms.field_histogram_par_cpu(voxels, field, bins, vmin, vmax)
+    field_histogram_par_cpu(voxels, field, bins, vmin, vmax)
     # python3 histograms_tester.py 770c_pag  1095.49s user 141.76s system 104% cpu 19:44.64 total
-    #histograms.field_histogram_seq_cpu(voxels, field, bins, vmin, vmax)
+    #field_histogram_seq_cpu(voxels, field, bins, vmin, vmax)
 
     return bins
 
 def verify_axes_histogram(voxels, ranges=(1,4095), voxel_bins=256):
     tolerance = 1e-5
-    schx, schy, schz, schr = axes_histogram(voxels, func=histograms.axis_histogram_seq_cpu, ranges=ranges, voxel_bins=voxel_bins)
-    pchx, pchy, pchz, pchr = axes_histogram(voxels, func=histograms.axis_histogram_par_cpu, ranges=ranges, voxel_bins=voxel_bins)
+    schx, schy, schz, schr = axes_histogram(voxels, func=axis_histogram_seq_cpu, ranges=ranges, voxel_bins=voxel_bins)
+    pchx, pchy, pchz, pchr = axes_histogram(voxels, func=axis_histogram_par_cpu, ranges=ranges, voxel_bins=voxel_bins)
 
     dx = np.abs(schx - pchx).sum()
     dy = np.abs(schy - pchy).sum()
@@ -62,7 +67,7 @@ def verify_axes_histogram(voxels, ranges=(1,4095), voxel_bins=256):
         print (f'diff z = {dz}')
         print (f'diff r = {dr}')
 
-    pghx, pghy, pghz, pghr = axes_histogram(voxels, func=histograms.axis_histogram_par_gpu, ranges=ranges, voxel_bins=voxel_bins)
+    pghx, pghy, pghz, pghr = axes_histogram(voxels, func=axis_histogram_par_gpu, ranges=ranges, voxel_bins=voxel_bins)
 
     dx = np.abs(schx - pghx).sum()
     dy = np.abs(schy - pghy).sum()
@@ -87,9 +92,9 @@ def benchmark_axes_histograms(voxels, ranges=(1,4095), voxel_bins=256, runs=10):
     print()
     print('----- Benchmarking -----')
     print()
-    seq_cpu = timeit.timeit(lambda: axes_histogram(voxels, func=histograms.axis_histogram_seq_cpu, ranges=ranges, voxel_bins=voxel_bins), number=runs)
-    par_cpu = timeit.timeit(lambda: axes_histogram(voxels, func=histograms.axis_histogram_par_cpu, ranges=ranges, voxel_bins=voxel_bins), number=runs)
-    par_gpu = timeit.timeit(lambda: axes_histogram(voxels, func=histograms.axis_histogram_par_gpu, ranges=ranges, voxel_bins=voxel_bins), number=runs)
+    seq_cpu = timeit.timeit(lambda: axes_histogram(voxels, func=axis_histogram_seq_cpu, ranges=ranges, voxel_bins=voxel_bins), number=runs)
+    par_cpu = timeit.timeit(lambda: axes_histogram(voxels, func=axis_histogram_par_cpu, ranges=ranges, voxel_bins=voxel_bins), number=runs)
+    par_gpu = timeit.timeit(lambda: axes_histogram(voxels, func=axis_histogram_par_gpu, ranges=ranges, voxel_bins=voxel_bins), number=runs)
     print (f'Average of {runs} runs:')
     print (f'Seq CPU: {seq_cpu / runs:9.04f}')
     print (f'Par CPU: {par_cpu / runs:9.04f}')
@@ -142,9 +147,9 @@ def run_out_of_core(sample, block_size=128, z_offset=0, n_blocks=0,
             
         voxels, fields = load_block(sample, zstart, block_size, mask, mask_scale, field_names)
         for i in tqdm(range(1),"Histogramming over x,y,z axes and radius", leave=True):
-            histograms.axis_histogram_par_gpu(voxels, (zstart, 0, 0), voxels.shape[0], x_bins, y_bins, z_bins, r_bins, center, (vmin, vmax), False)
+            axis_histogram_par_gpu(voxels, (zstart, 0, 0), voxels.shape[0], x_bins, y_bins, z_bins, r_bins, center, (vmin, vmax), False)
         for i in tqdm(range(Nfields),f"Histogramming w.r.t. fields {field_names}", leave=True):
-            histograms.field_histogram_resample_par_cpu(voxels, fields[i], (zstart, 0, 0), (Nz, Ny, Nx), (Nz//2,Ny//2,Nx//2), voxels.shape[0], f_bins[i], (vmin, vmax), (fmin, fmax))
+            field_histogram_resample(voxels, fields[i], (zstart, 0, 0), (Nz, Ny, Nx), (Nz//2,Ny//2,Nx//2), voxels.shape[0], f_bins[i], (vmin, vmax), (fmin, fmax))
 
     f_bins[:, 0,:] = 0 # TODO EDT mask hack            
     f_bins[:,-1,:] = 0 # TODO "bright" mask hack
@@ -165,10 +170,18 @@ def run_out_of_core(sample, block_size=128, z_offset=0, n_blocks=0,
     # Special parameter values:
     # - block_size == 0 means "do one full subvolume at the time, interpret z_offset as start-at-subvolume-number"
     # - n_blocks   == 0 means "all blocks"
+    # TODO move some of the constants / parameters out into the configuration
     sample, block_size, z_offset, n_blocks, suffix, \
-    mask, mask_scale, voxel_bins, field_bins = commandline_args({"sample":"<required>",
-                                                                 "block_size":256, "z_offset": 0, "n_blocks":0, "suffix":"",
-                                                                 "mask":"None", "mask_scale": 8, "voxel_bins":4096, "field_bins":2048})
+    mask, mask_scale, voxel_bins, field_bins, verbose = commandline_args({"sample" : "<required>",
+                                                                          "block_size" : 256, 
+                                                                          "z_offset" :  0, 
+                                                                          "n_blocks" : 0, 
+                                                                          "suffix" : "",
+                                                                          "mask" : "None", 
+                                                                          "mask_scale" :  8, 
+                                                                          "voxel_bins" : 4096, 
+                                                                          "field_bins" : 2048,
+                                                                          "verbose" : 1})
 
     implant_threshold_u16 = 32000 # TODO: use config.constants
     (vmin,vmax),(fmin,fmax) = ((1e4,3e4),(1,2**16-1)) # TODO: Compute from total voxel histogram resp. total field histogram

From c29fb87c81af61230f87755bdeb1dfa0ef895a8f Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:09:38 +0100
Subject: [PATCH 049/136] #25 Moved optimize_distributions_flat

---
 pre-cleanup-src/doitall.py                             | 10 +++++++---
 .../1100_optimize_distributions_flat.py                |  0
 2 files changed, 7 insertions(+), 3 deletions(-)
 rename pre-cleanup-src/histogram_processing/optimize_distributions_flat.py => src/processing_steps/1100_optimize_distributions_flat.py (100%)

diff --git a/pre-cleanup-src/doitall.py b/pre-cleanup-src/doitall.py
index 47648ee..341ac0e 100644
--- a/pre-cleanup-src/doitall.py
+++ b/pre-cleanup-src/doitall.py
@@ -60,17 +60,21 @@
 
     11)
 	for b $(seq 0 $nblocks); do python3 histogram_processing/optimize_distributions_flat.py $sample bone_region$b edt 4 0; done
+	;;
+
+    12)
 	for b $(seq 0 $nblocks); do python3 histogram_processing/compute_probabilities_flat.py $sample bone_region$b edt 10 0; done
 	;;    
 
-    12)
+    13)
 	python3 scripts/segment-from-distributions.py $sample 0 0 bone_region optimized_distributions
 	;;
 
-    13) for m in 0 1; do python3 preprocess/rescale-cupy-bin.py $sample segmented/P$m ; done
+    14) 
+	for m in 0 1; do python3 preprocess/rescale-cupy-bin.py $sample segmented/P$m ; done
 	;;
 
-    14)
+    15)
 	python3 segmentation/segment-blod-cc.py $sample
 	;;
 
diff --git a/pre-cleanup-src/histogram_processing/optimize_distributions_flat.py b/src/processing_steps/1100_optimize_distributions_flat.py
similarity index 100%
rename from pre-cleanup-src/histogram_processing/optimize_distributions_flat.py
rename to src/processing_steps/1100_optimize_distributions_flat.py

From 01933f7df3f12093d182420352b98ef63c49853b Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:18:52 +0100
Subject: [PATCH 050/136] #25 Fixed some of the dependencies of
 optimize_distributions_flat

---
 .../histogram_processing => src/lib/py}/distributions.py  | 0
 .../lib/py}/piecewise_cubic.py                            | 0
 src/processing_steps/1100_optimize_distributions_flat.py  | 8 ++++----
 3 files changed, 4 insertions(+), 4 deletions(-)
 rename {pre-cleanup-src/histogram_processing => src/lib/py}/distributions.py (100%)
 rename {pre-cleanup-src/histogram_processing => src/lib/py}/piecewise_cubic.py (100%)

diff --git a/pre-cleanup-src/histogram_processing/distributions.py b/src/lib/py/distributions.py
similarity index 100%
rename from pre-cleanup-src/histogram_processing/distributions.py
rename to src/lib/py/distributions.py
diff --git a/pre-cleanup-src/histogram_processing/piecewise_cubic.py b/src/lib/py/piecewise_cubic.py
similarity index 100%
rename from pre-cleanup-src/histogram_processing/piecewise_cubic.py
rename to src/lib/py/piecewise_cubic.py
diff --git a/src/processing_steps/1100_optimize_distributions_flat.py b/src/processing_steps/1100_optimize_distributions_flat.py
index a5225d3..3deb5a6 100644
--- a/src/processing_steps/1100_optimize_distributions_flat.py
+++ b/src/processing_steps/1100_optimize_distributions_flat.py
@@ -1,9 +1,9 @@
 import os, sys, tqdm, numpy as np, matplotlib.pyplot as plt, numpy.linalg as la, scipy.ndimage as ndi, scipy.optimize as opt, time
 sys.path.append(sys.path[0]+"/../")
-from piecewise_cubic import piecewisecubic_matrix, piecewisecubic, smooth_fun
-from config.paths import commandline_args, hdf5_root as hdf5_root
-from distributions import *
-from helper_functions import *
+from lib.py.piecewise_cubic import piecewisecubic_matrix, piecewisecubic, smooth_fun
+from config.paths import hdf5_root
+from lib.py.distributions import powers
+from lib.py.helpers import commandline_args, row_normalize, update_hdf5
 na = np.newaxis
 
 hist_path = f"{hdf5_root}/processed/histograms/"

From 9f7cd297bb297c730df96e4634ba4b791725f203 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:22:20 +0100
Subject: [PATCH 051/136] #29 added verbose to optimize_distributions_flat

---
 .../1100_optimize_distributions_flat.py       | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/processing_steps/1100_optimize_distributions_flat.py b/src/processing_steps/1100_optimize_distributions_flat.py
index 3deb5a6..c349ff1 100644
--- a/src/processing_steps/1100_optimize_distributions_flat.py
+++ b/src/processing_steps/1100_optimize_distributions_flat.py
@@ -7,11 +7,11 @@
 na = np.newaxis
 
 hist_path = f"{hdf5_root}/processed/histograms/"
-sample, region_mask, field, stride, debug = commandline_args({"sample":"<required>",
-                                                              "region_mask":"<required>",
-                                                              "field":"edt",
-                                                              "stride": 4,
-                                                              "debug":8
+sample, region_mask, field, stride, verbose = commandline_args({"sample":"<required>",
+                                                                "region_mask":"<required>",
+                                                                "field":"edt",
+                                                                "stride": 4,
+                                                                "verbose":8
 })
 
 f_hist   = np.load(f"{hist_path}/{sample}/bins-{region_mask}.npz")
@@ -32,7 +32,7 @@ def material_points(labs,material_id):
 #hist = hist/(sums + (sums==0))
 lab  = f_labels[field][::stride,::stride]
 
-if debug==1:
+if verbose >= 2:
     plt.imshow(lab)
     plt.show()
     
@@ -61,7 +61,7 @@ def material_points(labs,material_id):
 
 print(f"Optimizing distributions for {field} with {lab.max()} materials")
 
-if (debug&7):
+if (verbose >= 3):
     plt.ion()
     fig = plt.figure(figsize=(15,15))
     ax = fig.add_subplot(111)
@@ -105,7 +105,7 @@ def opt_all(abcd,*args):
     Ecloseness = np.sum(1/(np.abs(C[1:]-C[:-1])+0.001))
     
 #    print(np.round(E1,2), np.round(1e2*Ecloseness,2))
-    if(debug==2):
+    if(verbose >= 3):
         line1.set_ydata(model)
         ax.set_title(f"{x}: a = {np.round(A*A,1)}, b = {np.round(B*B,1)}, c = {np.round(C,1)}, d = {np.round(D*D,1)}")
         ax.relim()
@@ -132,7 +132,7 @@ def opt_all(abcd,*args):
     if(n>0):
         abcd0 = np.array([amx[ms,i], bmx[ms,i], cmx[ms,i], dmx[ms,i]]).flatten()
 
-        if (debug==1):
+        if (verbose == 2):
             model = powers(vs,abcd0)        
             line1.set_ydata(np.sum(model,axis=0))
             line2.set_ydata(hist[i])
@@ -141,7 +141,7 @@ def opt_all(abcd,*args):
             fig.canvas.draw()
             fig.canvas.flush_events()
 
-        if(debug==2):
+        if(verbose == 3):
             ax.set_title(f"x = {x}")
             line2.set_ydata(hist[i])
 
@@ -180,7 +180,7 @@ def opt_all(abcd,*args):
 #            print(f"ABCDm = {ABCDm}")
             
         
-        if(debug==4):
+        if(verbose == 5):
             colors = ['r','orange']
             lines  = [line3,line4]
             model = powers(vs,abcd)
@@ -213,7 +213,7 @@ def opt_all(abcd,*args):
     hist_modeled[gi] = np.sum(model,axis=0)
     hist_m[ms,gi] = model
 
-if (debug&8):
+if (verbose == 6):
     fig = plt.figure(figsize=(10,10))
     axarr = fig.subplots(2,2)
     fig.suptitle(f'{sample} {region_mask}') # or plt.suptitle('Main title')

From fd0a7a21959076b6fd87fb1b92c8dac75779d3fc Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:26:11 +0100
Subject: [PATCH 052/136] Moved compute_probabilities_flat

---
 .../processing_steps/1200_compute_probabilities_flat.py           | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename pre-cleanup-src/histogram_processing/compute_probabilities.py => src/processing_steps/1200_compute_probabilities_flat.py (100%)

diff --git a/pre-cleanup-src/histogram_processing/compute_probabilities.py b/src/processing_steps/1200_compute_probabilities_flat.py
similarity index 100%
rename from pre-cleanup-src/histogram_processing/compute_probabilities.py
rename to src/processing_steps/1200_compute_probabilities_flat.py

From 747a3af596c800170d02edca8f681510bebcd494 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:29:52 +0100
Subject: [PATCH 053/136] #25 Updated the dependencies of
 compute_probabilities_flat

---
 src/processing_steps/1200_compute_probabilities_flat.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/processing_steps/1200_compute_probabilities_flat.py b/src/processing_steps/1200_compute_probabilities_flat.py
index f30d028..7e3988b 100644
--- a/src/processing_steps/1200_compute_probabilities_flat.py
+++ b/src/processing_steps/1200_compute_probabilities_flat.py
@@ -1,10 +1,11 @@
 import os, sys, tqdm, numpy as np, matplotlib.pyplot as plt, numpy.linalg as la, scipy.ndimage as ndi, scipy.optimize as opt, time
+import h5py
+import pathlib
 sys.path.append(sys.path[0]+"/../")
 #from piecewise_linear import piecewiselinear_matrix, piecewiselinear, smooth_fun as smooth_fun_l
-from piecewise_cubic import piecewisecubic_matrix, piecewisecubic, smooth_fun as smooth_fun_c
+from lib.py.piecewise_cubic import piecewisecubic_matrix, piecewisecubic, smooth_fun as smooth_fun_c
 from config.paths import commandline_args, hdf5_root as hdf5_root
-from distributions import *
-from helper_functions import *
+from lib.py.helpers import update_hdf5, row_normalize
 na = np.newaxis
 
 

From 29a950c751fb51284cb2a63a9a1498e6249c12a9 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:33:35 +0100
Subject: [PATCH 054/136] #29 Added verbose to compute_probabilties_flat

---
 .../1200_compute_probabilities_flat.py        | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/processing_steps/1200_compute_probabilities_flat.py b/src/processing_steps/1200_compute_probabilities_flat.py
index 7e3988b..931a502 100644
--- a/src/processing_steps/1200_compute_probabilities_flat.py
+++ b/src/processing_steps/1200_compute_probabilities_flat.py
@@ -4,16 +4,16 @@
 sys.path.append(sys.path[0]+"/../")
 #from piecewise_linear import piecewiselinear_matrix, piecewiselinear, smooth_fun as smooth_fun_l
 from lib.py.piecewise_cubic import piecewisecubic_matrix, piecewisecubic, smooth_fun as smooth_fun_c
-from config.paths import commandline_args, hdf5_root as hdf5_root
-from lib.py.helpers import update_hdf5, row_normalize
+from config.paths import hdf5_root
+from lib.py.helpers import commandline_args, row_normalize, update_hdf5
 na = np.newaxis
-
+verbose = 1
 
 # TODO: Til fælles fil.
 def save_probabilities(Ps,sample, region_mask,field_name, value_ranges, prob_method):
     output_path = f'{hdf5_root}/processed/probabilities/{sample}.h5'
-    print(f"output_path = {output_path}")
-    print(f"group_name1 = {prob_method}/{region_mask}\n" +
+    if verbose >= 1: print(f"output_path = {output_path}")
+    if verbose >= 1: print(f"group_name1 = {prob_method}/{region_mask}\n" +
           f"group_name2 = {prob_method}/{region_mask}/{field_name}")
     update_hdf5(
         output_path,
@@ -22,7 +22,7 @@ def save_probabilities(Ps,sample, region_mask,field_name, value_ranges, prob_met
         attributes = {}
     )
     for m,P in enumerate(Ps):
-        print(f"Storing {P.shape} probabilities P{m}")
+        if verbose >= 1: print(f"Storing {P.shape} probabilities P{m}")
         update_hdf5(
             output_path,
             group_name = f'{prob_method}/{region_mask}/{field_name}',
@@ -47,11 +47,11 @@ def evaluate_2d(G, xs, vs):
 
 
 hist_path = f"{hdf5_root}/processed/histograms/"
-sample, region_mask, field_name, n_segments_c, debug = commandline_args({"sample":"<required>",
-                                                                         "region_mask":"<required>",
-                                                                         "field_name":"edt",
-                                                                         "n_segments": 4,
-                                                                         "debug":8
+sample, region_mask, field_name, n_segments_c, verbose = commandline_args({"sample" : "<required>",
+                                                                           "region_mask" : "<required>",
+                                                                           "field_name" : "edt",
+                                                                           "n_segments" :  4,
+                                                                           "verbose" : 8
 })
 
 hist_path = f"{hdf5_root}/processed/histograms/"
@@ -150,7 +150,7 @@ def evaluate_2d(G, xs, vs):
 
     
 ##---- TODO: STICK THE DEBUG-PLOTTING FUNCTIONS SOMEWHERE CENTRAL
-if (debug&7):
+if (verbose & 7):
     plt.ion()
     fig = plt.figure(figsize=(15,15))
     ax = fig.add_subplot(111)
@@ -162,7 +162,7 @@ def evaluate_2d(G, xs, vs):
     plt.show()
 
     
-if(debug==4):
+if(verbose == 4):
     colors = ['b','r']
     lines  = [line3,line4]
 
@@ -182,7 +182,7 @@ def evaluate_2d(G, xs, vs):
         fig.canvas.flush_events()
 
 
-if (debug==8):
+if (verbose == 8):
     fig = plt.figure(figsize=(10,10))
     axarr = fig.subplots(3,2)
     fig.suptitle(f'{sample} {region_mask}') 
@@ -209,7 +209,7 @@ def evaluate_2d(G, xs, vs):
 
 
 
-if (debug==10):
+if (verbose == 10):
     fig = plt.figure(figsize=(15,15))
     axarr = fig.subplots(2,2)
     fig.suptitle(f'{sample} {region_mask}') 

From 95ef0e094ca31aec500358c0d3346536be207e7c Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:35:02 +0100
Subject: [PATCH 055/136] #25 Moved segment from distributions

---
 .../processing_steps/1300_segment_from_distributions.py           | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename pre-cleanup-src/scripts/segment_from_distributions.py => src/processing_steps/1300_segment_from_distributions.py (100%)

diff --git a/pre-cleanup-src/scripts/segment_from_distributions.py b/src/processing_steps/1300_segment_from_distributions.py
similarity index 100%
rename from pre-cleanup-src/scripts/segment_from_distributions.py
rename to src/processing_steps/1300_segment_from_distributions.py

From 4d7a57a1242dc2467c8462c24665244ca3fea5f1 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:38:57 +0100
Subject: [PATCH 056/136] #25 Fixed dependencies of segment_from_distributions

---
 .../1300_segment_from_distributions.py           | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/processing_steps/1300_segment_from_distributions.py b/src/processing_steps/1300_segment_from_distributions.py
index 837644f..5554e9f 100644
--- a/src/processing_steps/1300_segment_from_distributions.py
+++ b/src/processing_steps/1300_segment_from_distributions.py
@@ -1,12 +1,14 @@
 import os, sys, pathlib, h5py, numpy as np, scipy.ndimage as ndi
 sys.path.append(sys.path[0]+"/../")
-import pybind_kernels.histograms as histograms
-import pybind_kernels.label as label
-from config.paths import binary_root, hdf5_root_fast as hdf5_root, commandline_args
+#import pybind_kernels.histograms as histograms
+#import pybind_kernels.label as label
+from lib.cpp.gpu.label import material_prob_justonefieldthx
+from lib.cpp.cpu.io import write_slice
+from config.paths import binary_root, hdf5_root_fast as hdf5_root
 from tqdm import tqdm
 import matplotlib.pyplot as plt
 from PIL import Image
-from helper_functions import block_info, load_block
+from lib.py.helpers import block_info, commandline_args, load_block
 na = np.newaxis
 
 debug = True
@@ -20,7 +22,7 @@ def load_probabilities(path, group, axes_names, field_names, m):
         prob_file.close()
         return P_axes, P_fields
     except Exception as e:
-        print(f"Couldn't load {group}/{name}/P{m} from {path}: {e}")
+        print(f"Couldn't load {group}/{axes_names}|{field_names}/P{m} from {path}: {e}")
         sys.exit(-1)
 
 def load_value_ranges(path, group):
@@ -97,7 +99,7 @@ def nblocks(size, block_size):
             result = np.zeros((zend-zstart,Ny,Nx), dtype=np.uint16)
 
 
-            label.material_prob_justonefieldthx(voxels,fields[0],P_fields[0],result,
+            material_prob_justonefieldthx(voxels,fields[0],P_fields[0],result,
                                                 (vmin,vmax),(fmin,fmax),
                                                 (zstart,0,0), (zend,Ny,Nx));
 
@@ -115,5 +117,5 @@ def nblocks(size, block_size):
                 print (f'Segmentation has min {result.min()} and max {result.max()}')
 
             print(f"Writing results from block {b}")
-            histograms.write_slice(result, zstart*Ny*Nx, output_file)
+            write_slice(result, zstart*Ny*Nx, output_file)
 

From 96447e64531e1e5531de1984ea1e829575900f6e Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:42:39 +0100
Subject: [PATCH 057/136] #29 Added verbose to segment from distributions

---
 .../1300_segment_from_distributions.py        | 27 +++++++++----------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/src/processing_steps/1300_segment_from_distributions.py b/src/processing_steps/1300_segment_from_distributions.py
index 5554e9f..6d9e29f 100644
--- a/src/processing_steps/1300_segment_from_distributions.py
+++ b/src/processing_steps/1300_segment_from_distributions.py
@@ -10,9 +10,7 @@
 from PIL import Image
 from lib.py.helpers import block_info, commandline_args, load_block
 na = np.newaxis
-
-debug = True
-
+verbose = 1
 
 def load_probabilities(path, group, axes_names, field_names, m):
     try:
@@ -26,7 +24,7 @@ def load_probabilities(path, group, axes_names, field_names, m):
         sys.exit(-1)
 
 def load_value_ranges(path, group):
-    print(f"Reading value_ranges from {group} in {path}\n")
+    if verbose >= 1: print(f"Reading value_ranges from {group} in {path}\n")
     try:
         f = h5py.File(path, 'r')
         return f[group]['value_ranges'][:].astype(int)
@@ -39,14 +37,14 @@ def nblocks(size, block_size):
     return (size // block_size) + (1 if size % block_size > 0 else 0)
 
 if __name__ == '__main__': 
-    sample, block_start, n_blocks, region_mask, group, mask_scale, scheme, debug_output = commandline_args({'sample':'<required>',
-                                                                                   "block_start":0,
-                                                                                   "n_blocks":0,
-                                                                                   'region_mask': 'bone_region',
-                                                                                   'group': 'otsu_separation',
-                                                                                   'mask_scale': 8,
-                                                                                   'scheme':"edt", #MIDLERTIDIG
-                                                                                   'debug_output': None})
+    sample, block_start, n_blocks, region_mask, group, mask_scale, scheme, verbose = commandline_args({'sample' : '<required>',
+                                                                                                       "block_start" : 0,
+                                                                                                       "n_blocks" : 0,
+                                                                                                       'region_mask' :  'bone_region',
+                                                                                                       'group' :  'otsu_separation',
+                                                                                                       'mask_scale' :  8,
+                                                                                                       'scheme' : "edt", #MIDLERTIDIG
+                                                                                                       'verbose' : 1})
 
     # Iterate over all subvolumes
     bi = block_info(f'{hdf5_root}/hdf5-byte/msb/{sample}.h5', block_size=0, n_blocks=n_blocks, z_offset=block_start)
@@ -113,9 +111,8 @@ def nblocks(size, block_size):
             #     (zstart, 0, 0), (zend, sy, sx)
             # )
 
-            if debug:
-                print (f'Segmentation has min {result.min()} and max {result.max()}')
+            if verbose >= 2: print (f'Segmentation has min {result.min()} and max {result.max()}')
 
-            print(f"Writing results from block {b}")
+            if verbose >= 1: print(f"Writing results from block {b}")
             write_slice(result, zstart*Ny*Nx, output_file)
 

From b53638e1301ed1efb8caeca01340c1639a250079 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:45:21 +0100
Subject: [PATCH 058/136] #25 Added processing step 14 as a symlink

---
 src/processing_steps/1400_rescale_cupy_bin.py | 1 +
 1 file changed, 1 insertion(+)
 create mode 120000 src/processing_steps/1400_rescale_cupy_bin.py

diff --git a/src/processing_steps/1400_rescale_cupy_bin.py b/src/processing_steps/1400_rescale_cupy_bin.py
new file mode 120000
index 0000000..ec15bd3
--- /dev/null
+++ b/src/processing_steps/1400_rescale_cupy_bin.py
@@ -0,0 +1 @@
+processing_steps/0500_rescale_cupy_bin.py
\ No newline at end of file

From 0ef89f2dad8dcc89c6c32a671c67a190a3e01057 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:46:24 +0100
Subject: [PATCH 059/136] #25 Moved segment_blood_cc

---
 .../processing_steps/1500_segment_blood_cc.py                     | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename pre-cleanup-src/segmentation/segment-blood-cc.py => src/processing_steps/1500_segment_blood_cc.py (100%)

diff --git a/pre-cleanup-src/segmentation/segment-blood-cc.py b/src/processing_steps/1500_segment_blood_cc.py
similarity index 100%
rename from pre-cleanup-src/segmentation/segment-blood-cc.py
rename to src/processing_steps/1500_segment_blood_cc.py

From e63def628c709a316e6964485cdadc6fea4a01ad Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:49:03 +0100
Subject: [PATCH 060/136] #25 Fixed the dependencies of segment_blood_cc

---
 src/processing_steps/1500_segment_blood_cc.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/processing_steps/1500_segment_blood_cc.py b/src/processing_steps/1500_segment_blood_cc.py
index 5925347..ad31808 100644
--- a/src/processing_steps/1500_segment_blood_cc.py
+++ b/src/processing_steps/1500_segment_blood_cc.py
@@ -1,10 +1,10 @@
 import h5py, sys, os.path, pathlib, numpy as np, scipy.ndimage as ndi, tqdm, matplotlib.pyplot as plt
 sys.path.append(sys.path[0]+"/../")
 from config.constants import *
-from config.paths import hdf5_root, hdf5_root_fast, binary_root, commandline_args
-from pybind_kernels.histograms import load_slice
+from config.paths import hdf5_root, hdf5_root_fast, binary_root
+from lib.cpp.cpu import load_slice
 from scipy import ndimage as ndi
-from helper_functions import *
+from lib.py.helpers import block_info, commandline_args, update_hdf5
 
 sample, m, scheme, chunk_size = commandline_args({"sample":"<required>", "material":0, "scheme":"edt","chunk_size":256})
 

From 5857d92a2ab9ebd662a45606b5d9086550f5dc0a Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 10:50:13 +0100
Subject: [PATCH 061/136] #29 Added verbose to segment_blood_cc

---
 src/processing_steps/1500_segment_blood_cc.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/processing_steps/1500_segment_blood_cc.py b/src/processing_steps/1500_segment_blood_cc.py
index ad31808..a0e1ea2 100644
--- a/src/processing_steps/1500_segment_blood_cc.py
+++ b/src/processing_steps/1500_segment_blood_cc.py
@@ -6,7 +6,11 @@
 from scipy import ndimage as ndi
 from lib.py.helpers import block_info, commandline_args, update_hdf5
 
-sample, m, scheme, chunk_size = commandline_args({"sample":"<required>", "material":0, "scheme":"edt","chunk_size":256})
+sample, m, scheme, chunk_size, verbose = commandline_args({"sample" : "<required>",
+                                                           "material" : 0,
+                                                           "scheme" : "edt",
+                                                           "chunk_size" : 256,
+                                                           "verbose" : 1})
 
 scales = [32, 16, 8, 4, 2]
 

From 6ff9582819cb39e6542b89ad01d151ab260fc478 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Feb 2023 13:30:21 +0100
Subject: [PATCH 062/136] #25 Verified that 0300 works

---
 .gitignore                                  | 5 ++++-
 src/processing_steps/0300_volume_matcher.py | 5 +++--
 src/requirements.txt                        | 1 +
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5743ee6..224e2b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,4 +18,7 @@ src/meow/runner_processing/*
 
 # Compiled files
 *.so
-*.so.dSYM
\ No newline at end of file
+*.so.dSYM
+
+# Ignore the $BONE_DATA symlinks, as they're only there for convinience in vscode
+data_*
\ No newline at end of file
diff --git a/src/processing_steps/0300_volume_matcher.py b/src/processing_steps/0300_volume_matcher.py
index 336660c..374aea8 100755
--- a/src/processing_steps/0300_volume_matcher.py
+++ b/src/processing_steps/0300_volume_matcher.py
@@ -139,5 +139,6 @@ def write_matched_hdf5(h5_filename_in, h5_filename_out, crossings, shifts, compr
         
     h5file.close()
 
-    if verbose >= 1: print(f"Copying over volume from {input_h5name} shifted by {shifts} to {output_h5name}")
-    if(generate_h5): write_matched_hdf5(input_h5name, output_h5name, crossings, shifts)
+    if(generate_h5): 
+        if verbose >= 1: print(f"Copying over volume from {input_h5name} shifted by {shifts} to {output_h5name}")
+        write_matched_hdf5(input_h5name, output_h5name, crossings, shifts)
diff --git a/src/requirements.txt b/src/requirements.txt
index 5125944..9146494 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -2,5 +2,6 @@ cupy-cuda11x==11.5.0
 edt==2.3.0
 fabric==3.0.0
 jax==0.4.3
+jaxlib==0.4.3
 tqdm==4.64.1
 vedo==2023.4.3
\ No newline at end of file

From 1407958db4dcf1a767960c05fde3486ed460edaf Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 09:57:21 +0100
Subject: [PATCH 063/136] #25 step 400 works

---
 src/lib/cpp/cpu_seq/io.cc            |  1 +
 src/processing_steps/0400_h5tobin.py | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/lib/cpp/cpu_seq/io.cc b/src/lib/cpp/cpu_seq/io.cc
index 01cf2f8..2d30477 100644
--- a/src/lib/cpp/cpu_seq/io.cc
+++ b/src/lib/cpp/cpu_seq/io.cc
@@ -34,6 +34,7 @@ void write_contiguous_slice(const T *data,
     }
     file.seekp(offset * sizeof(T), ios::beg);
     file.write((char*) data, size * sizeof(T));
+    file.flush(); // Should have flushed, but just in case
     file.close();
 }
 
diff --git a/src/processing_steps/0400_h5tobin.py b/src/processing_steps/0400_h5tobin.py
index 53b63ed..0c4def6 100755
--- a/src/processing_steps/0400_h5tobin.py
+++ b/src/processing_steps/0400_h5tobin.py
@@ -3,7 +3,7 @@
 sys.path.append(sys.path[0]+"/../")
 from config.paths import hdf5_root, binary_root
 from tqdm import tqdm
-from lib.cpp.cpu.io import write_slice
+from lib.cpp.cpu_seq.io import write_slice
 from lib.py.helpers import commandline_args, update_hdf5
 
 slice_all = slice(None)
@@ -65,12 +65,16 @@ def h5tobin(sample,region=(slice_all,slice_all,slice_all),shift_volume_match=1):
     for i in tqdm(range(Nvols), desc=f'Loading {sample} from HDF5 and writing binary'):
         subvolume_msb = dmsb[input_zstarts[i]:input_zends[i],y_range,x_range].astype(np.uint16)
         subvolume_lsb = dlsb[input_zstarts[i]:input_zends[i],y_range,x_range].astype(np.uint16)
-
-        write_slice((subvolume_msb << 8) | subvolume_lsb, output_zstarts[i]*Ny*Nx, outfile)
+        combined = (subvolume_msb << 8) | subvolume_lsb
 
         del subvolume_msb
         del subvolume_lsb
 
+        # TODO For some reason, when 'output_zstarts' is a numpy type, 'combined' gets interpreted as an uint8 array through pybind. It is therefore important that it is converted to a python integer. This should be investigated, as it doesn't make sense that arguments should affect each other in this manner! Especially since it's only the first argument that's templated. Note: it's not due to mixed types in the tuple, as giving it three numpy values also breaks it.
+        write_slice(combined, outfile, (int(output_zstarts[i]), 0, 0), combined.shape)
+
+        del combined
+
     msb_file.close()
     lsb_file.close()
 

From 58707e583664dc72721f805c99c85ce87efa8bf8 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 09:58:08 +0100
Subject: [PATCH 064/136] #16 Added the option to only run a subset of tests

---
 src/Makefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Makefile b/src/Makefile
index 2a00b42..e26cb0c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -41,5 +41,8 @@ $(foreach PLATFORM, $(PLATFORMS), \
 test: all
 	$(PYTHON) -m pytest -n auto test
 
+test_%: test/test_%.py all
+	$(PYTHON) -m pytest -n auto $<
+
 clean:
 	rm -rf $(CLEANUP) __pycache__ test/__pycache__ .pytest_cache lib/cpp/**/*.so
\ No newline at end of file

From eb4acf3a567b7f9a34dfcb341f1d4a8e9e89937f Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 09:59:01 +0100
Subject: [PATCH 065/136] #16 Extended io test with the seek past file end test

---
 src/test/test_io.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/test/test_io.py b/src/test/test_io.py
index 12ed56d..4256fbe 100644
--- a/src/test/test_io.py
+++ b/src/test/test_io.py
@@ -59,6 +59,19 @@ def test_dtype(dtype):
     for i in range(partial_factor+1):
         io.load_slice(read_data, individual_tmp_file, (i*partial,0,0), read_data.shape)
         assert np.allclose(data[i*partial:(i+1)*partial], read_data)
+    
+    # Write past where the file ends
+    impl.write_slice(data, individual_tmp_file, (data.shape[0]*2,0,0), data.shape)
+    assert os.path.getsize(individual_tmp_file) == 3 * data.nbytes
+
+    # Check that the old data remains, the middle data is zeros, and that the new data is the same
+    read_data = np.empty_like(data)
+    for i in range(3):
+        impl.load_slice(read_data, individual_tmp_file, (i*data.shape[0],0,0), data.shape)
+        if i != 1:
+            assert np.allclose(data, read_data)
+        else:
+            assert np.allclose(np.zeros_like(data), read_data)
 
     os.remove(individual_tmp_file)
 

From 262a4a2f8a07ae030c297dabea1a61ab8fe8babf Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 09:59:22 +0100
Subject: [PATCH 066/136] #16 extended io test to test different
 implementations

---
 src/test/test_io.py | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/test/test_io.py b/src/test/test_io.py
index 4256fbe..199257d 100644
--- a/src/test/test_io.py
+++ b/src/test/test_io.py
@@ -3,7 +3,9 @@
 '''
 import sys
 sys.path.append(sys.path[0]+"/../lib/cpp")
-import cpu_seq.io as io
+import cpu_seq.io as io_cpu_seq
+import cpu.io as io_cpu
+import gpu.io as io_gpu
 import numpy as np
 import tempfile
 import os
@@ -14,50 +16,52 @@
 tmp_folder = tempfile._get_default_tempdir()
 tmp_filename = next(tempfile._get_candidate_names())
 tmp_file = f'{tmp_folder}/{tmp_filename}'
-dim_size = 16
+dim_size = 128
 dim_shape = (dim_size, dim_size, dim_size)
 partial_factor = 4
+impls = [io_cpu_seq] #, io_cpu, io_gpu]
 
 def random(shape, dtype):
     rnds = np.random.random(shape) * 100
     return rnds > .5 if dtype == bool else rnds.astype(dtype)
 
+@pytest.mark.parametrize("impl", impls)
 @pytest.mark.parametrize("dtype", dtypes_to_test)
-def test_dtype(dtype):
+def test_dtype(impl, dtype):
     individual_tmp_file = f'{tmp_file}.{dtype.__name__}'
     if os.path.exists(individual_tmp_file):
         os.remove(individual_tmp_file)
     data = random(dim_shape, dtype)
-    data[0,0,1] = False
     partial = dim_size // partial_factor
 
     # Write out a new file
-    io.write_slice(data, individual_tmp_file, (0,0,0), dim_shape)
+    impl.write_slice(data, individual_tmp_file, (0,0,0), dim_shape)
     assert os.path.getsize(individual_tmp_file) == data.nbytes
 
     # Read back and verify in chunks
-    read_data = np.zeros((partial, dim_size, dim_size), dtype=dtype)
+    read_data = np.empty((partial, dim_size, dim_size), dtype=dtype)
     for i in range(partial_factor):
-        io.load_slice(read_data, individual_tmp_file, (i*partial,0,0), read_data.shape)
+        impl.load_slice(read_data, individual_tmp_file, (i*partial,0,0), read_data.shape)
         assert np.allclose(data[i*partial:(i+1)*partial], read_data)
 
     # Append another layer
     data = np.append(data, random((partial, dim_size, dim_size), dtype), axis=0)
-    io.write_slice(data[dim_size:], individual_tmp_file, (dim_size,0,0), data.shape)
+    impl.write_slice(data[dim_size:], individual_tmp_file, (dim_size,0,0), data.shape)
     assert os.path.getsize(individual_tmp_file) == data.nbytes
 
     # Read back and verify in chunks
     for i in range(partial_factor+1):
-        io.load_slice(read_data, individual_tmp_file, (i*partial,0,0), read_data.shape)
+        impl.load_slice(read_data, individual_tmp_file, (i*partial,0,0), read_data.shape)
         assert np.allclose(data[i*partial:(i+1)*partial], read_data)
 
     # Overwrite one of the "middle" chunks
     data[partial:2*partial] = random((partial, dim_size, dim_size), dtype)
-    io.write_slice(data[partial:partial*2], individual_tmp_file, (partial,0,0), data.shape)
+    impl.write_slice(data[partial:partial*2], individual_tmp_file, (partial,0,0), data.shape)
+    assert os.path.getsize(individual_tmp_file) == data.nbytes
 
     # Read back and verify in chunks
     for i in range(partial_factor+1):
-        io.load_slice(read_data, individual_tmp_file, (i*partial,0,0), read_data.shape)
+        impl.load_slice(read_data, individual_tmp_file, (i*partial,0,0), read_data.shape)
         assert np.allclose(data[i*partial:(i+1)*partial], read_data)
     
     # Write past where the file ends
@@ -76,6 +80,7 @@ def test_dtype(dtype):
     os.remove(individual_tmp_file)
 
 if __name__ == '__main__':
-    for dtype in dtypes_to_test:
-        print (f'Testing {dtype.__name__}')
-        test_dtype(dtype)
\ No newline at end of file
+    for impl in impls:
+        for dtype in dtypes_to_test:
+            print (f'Testing {impl.__name__} on {dtype.__name__}')
+            test_dtype(impl, dtype)
\ No newline at end of file

From c23bc53f6c77a762c537ca786c5f260f72054e7e Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 10:04:47 +0100
Subject: [PATCH 067/136] Added file for printing system type ids

---
 src/exploration/print_cpp_type_ids.cpp | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 src/exploration/print_cpp_type_ids.cpp

diff --git a/src/exploration/print_cpp_type_ids.cpp b/src/exploration/print_cpp_type_ids.cpp
new file mode 100644
index 0000000..a0191c8
--- /dev/null
+++ b/src/exploration/print_cpp_type_ids.cpp
@@ -0,0 +1,24 @@
+#include<iostream>
+#include<stdint.h>
+
+int main() {
+    /*
+    This class is used to print out the code of the type.
+    This is handy when debugging the templated type at runtime.
+    */
+
+    std::cout << "int8 " << typeid(int8_t).name() << std::endl;
+    std::cout << "int16 " << typeid(int16_t).name() << std::endl;
+    std::cout << "int32 " << typeid(int32_t).name() << std::endl;
+    std::cout << "int64 " << typeid(int64_t).name() << std::endl;
+    std::cout << "int128 " << typeid(__int128_t).name() << std::endl;
+    
+    std::cout << "uint8 " << typeid(uint8_t).name() << std::endl;
+    std::cout << "uint16 " << typeid(uint16_t).name() << std::endl;
+    std::cout << "uint32 " << typeid(uint32_t).name() << std::endl;
+    std::cout << "uint64 " << typeid(uint64_t).name() << std::endl;
+    std::cout << "uint128 " << typeid(__uint128_t).name() << std::endl;
+    
+    std::cout << "float " << typeid(float).name() << std::endl;
+    std::cout << "double " << typeid(double).name() << std::endl;
+}
\ No newline at end of file

From 61df36171fa2b38e830fe3c37c3f2fa735eb4aac Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 10:05:48 +0100
Subject: [PATCH 068/136] Added a.out to gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 224e2b6..a724c3c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@ src/meow/runner_processing/*
 # Compiled files
 *.so
 *.so.dSYM
+a.out
 
 # Ignore the $BONE_DATA symlinks, as they're only there for convinience in vscode
 data_*
\ No newline at end of file

From fd47621a2a1b216fe30f61837e629b86bd69a12a Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 10:06:20 +0100
Subject: [PATCH 069/136] Added launch file for vscode python debugging

---
 .vscode/launch.json | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 .vscode/launch.json

diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..7b0145f
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,17 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: 0400_h5tobin",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/src/processing_steps/0400_h5tobin.py",
+            "console": "integratedTerminal",
+            "args": ["770c_pag"],
+            "justMyCode": false
+        }
+    ]
+}
\ No newline at end of file

From f739668946b8738b6eac671f7b9179f55fb34b0a Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 10:09:28 +0100
Subject: [PATCH 070/136] #25 step 0500 runs

---
 .vscode/launch.json | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index 7b0145f..aae8ffc 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -12,6 +12,15 @@
             "console": "integratedTerminal",
             "args": ["770c_pag"],
             "justMyCode": false
-        }
+        },
+        {
+            "name": "Python: 0500_rescale_cupy_bin",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/src/processing_steps/0500_rescale_cupy_bin.py",
+            "console": "integratedTerminal",
+            "args": ["770c_pag"],
+            "justMyCode": false
+        },
     ]
 }
\ No newline at end of file

From 76eb769c1bfd1a136ee5fb26558c4bcd9d15ddb9 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 10:15:30 +0100
Subject: [PATCH 071/136] #25 Moved geometry files

---
 .../pybind_kernels/cpu => src/lib/cpp/cpu_seq}/geometry.cc      | 0
 .../pybind_kernels => src}/pybind/geometry-pybind.cc            | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename {pre-cleanup-src/pybind_kernels/cpu => src/lib/cpp/cpu_seq}/geometry.cc (100%)
 rename {pre-cleanup-src/pybind_kernels => src}/pybind/geometry-pybind.cc (98%)

diff --git a/pre-cleanup-src/pybind_kernels/cpu/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
similarity index 100%
rename from pre-cleanup-src/pybind_kernels/cpu/geometry.cc
rename to src/lib/cpp/cpu_seq/geometry.cc
diff --git a/pre-cleanup-src/pybind_kernels/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
similarity index 98%
rename from pre-cleanup-src/pybind_kernels/pybind/geometry-pybind.cc
rename to src/pybind/geometry-pybind.cc
index cbf19f4..a7c1f0d 100644
--- a/pre-cleanup-src/pybind_kernels/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -160,5 +160,5 @@ PYBIND11_MODULE(geometry, m) {
     m.def("cylinder_projection",  &python_api::cylinder_projection);
     m.def("sample_plane",         &python_api::sample_plane<uint16_t>);
     m.def("sample_plane",         &python_api::sample_plane<uint8_t>);
-    m.def("compute_front_mask", &python_api::compute_front_mask);
+    m.def("compute_front_mask",   &python_api::compute_front_mask);
 }

From 2e67d269d01a3e37e553b36a02178cb444846084 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 10:41:35 +0100
Subject: [PATCH 072/136] #25 Made the geometry file more consistent with the
 other cpp files

---
 src/lib/cpp/cpu_seq/geometry.cc | 931 ++++++++++++++++----------------
 1 file changed, 453 insertions(+), 478 deletions(-)

diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 4154c5a..2a155aa 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -11,554 +11,529 @@ using namespace std;
 
 #define dot(a,b) (a[0]*b[0] + a[1]*b[1] + a[2]*b[2])
 
-void print_timestamp(string message)
-{
-  auto now = chrono::system_clock::to_time_t(chrono::system_clock::now());
-  tm local_tm = *localtime(&now);
-  fprintf(stderr,"%s at %02d:%02d:%02d\n", message.c_str(), local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec);    
+void print_timestamp(string message) {
+    auto now = chrono::system_clock::to_time_t(chrono::system_clock::now());
+    tm local_tm = *localtime(&now);
+    fprintf(stderr,"%s at %02d:%02d:%02d\n", message.c_str(), local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec);    
 }
 
-
 // TODO: Fix OpenACC copies & re-enable GPU
 array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
-  // nvc++ doesn't support OpenACC 2.7 array reductions yet.  
-  real_t cmx = 0, cmy = 0, cmz = 0;
-  size_t  Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-  int64_t image_length = Nx*Ny*Nz;
-
-  print_timestamp("center_of_mass start");
-  real_t total_mass = 0;  
-  for(int64_t block_start=0;block_start<image_length;block_start+=acc_block_size){
-
-    const mask_type *buffer = voxels.data + block_start;
-    ssize_t this_block_length = min(acc_block_size,image_length-block_start);
-
-    //#pragma acc parallel loop reduction(+:cmx,cmy,cmz,total_mass) copyin(buffer[:this_block_length])
-    reduction_loop((+:cmx,cmy,cmz,total_mass),())
-    for(int64_t k = 0; k<this_block_length;k++){
-      real_t          m = buffer[k];      
-
-      int64_t flat_idx = block_start + k;
-      int64_t x = flat_idx / (Ny*Nz);
-      int64_t y = (flat_idx / Nz) % Ny;
-      int64_t z = flat_idx % Nz;
-
-      total_mass += m;
-      cmx += m*x; cmy += m*y; cmz += m*z;
+    // nvc++ doesn't support OpenACC 2.7 array reductions yet.  
+    real_t  cmx = 0, cmy = 0, cmz = 0;
+    size_t  Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
+    int64_t image_length = Nx*Ny*Nz;
+
+    print_timestamp("center_of_mass start");
+    real_t total_mass = 0;  
+    for (int64_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
+
+        const mask_type *buffer = voxels.data + block_start;
+        ssize_t this_block_length = min(acc_block_size, image_length-block_start);
+
+        //#pragma acc parallel loop reduction(+:cmx,cmy,cmz,total_mass) copyin(buffer[:this_block_length])
+        //reduction_loop((+:cmx,cmy,cmz,total_mass),())
+        for (int64_t k = 0; k < this_block_length; k++) {
+            real_t          m = buffer[k];      
+
+            int64_t flat_idx = block_start + k;
+            int64_t x = flat_idx / (Ny*Nz);
+            int64_t y = (flat_idx / Nz) % Ny;
+            int64_t z = flat_idx % Nz;
+
+            total_mass += m;
+            cmx += m*x; cmy += m*y; cmz += m*z;
+        }
     }
-  }
-  cmx /= total_mass; cmy /= total_mass; cmz /= total_mass;
+    cmx /= total_mass; cmy /= total_mass; cmz /= total_mass;
   
-  print_timestamp("center_of_mass end");  
+    print_timestamp("center_of_mass end");  
 
-  return array<real_t,3>{cmx,cmy,cmz};
+    return array<real_t,3>{cmx,cmy,cmz};
 }
 
-
-array<real_t,9> inertia_matrix_serial(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm)
-{
-  real_t
-    Ixx = 0, Ixy = 0, Ixz = 0,
-             Iyy = 0, Iyz = 0,
-                      Izz = 0;
+array<real_t,9> inertia_matrix_serial(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
+    real_t
+        Ixx = 0, Ixy = 0, Ixz = 0,
+                 Iyy = 0, Iyz = 0,
+                          Izz = 0;
   
-  ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-
-  print_timestamp("inertia_matrix_serial start");
-  for(int64_t X=0,k=0;X<Nx;X++)
-    for(int64_t Y=0;Y<Ny;Y++)
-      for(int64_t Z=0;Z<Nz;Z++,k++){
-	real_t x = X-cm[0], y = Y-cm[1], z = Z-cm[2];
-	
-	real_t m = voxels.data[k];
-	Ixx += m*(y*y+z*z);
-	Iyy += m*(x*x+z*z);
-	Izz += m*(x*x+y*y);	
-	Ixy -= m * x*y;
-	Ixz -= m * x*z;
-	Iyz -= m * y*z;
-      }
+    ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
+
+    print_timestamp("inertia_matrix_serial start");
+    for (int64_t X=0,k=0;X<Nx;X++) {
+        for (int64_t Y=0;Y<Ny;Y++) {
+            for (int64_t Z=0;Z<Nz;Z++,k++) {
+                real_t x = X-cm[0], y = Y-cm[1], z = Z-cm[2];
+                
+                real_t m = voxels.data[k];
+                Ixx += m*(y*y+z*z);
+                Iyy += m*(x*x+z*z);
+                Izz += m*(x*x+y*y);    
+                Ixy -= m * x*y;
+                Ixz -= m * x*z;
+                Iyz -= m * y*z;
+            }
+        }
+    }
   
-  print_timestamp("inertia_matrix_serial end");      
-  return array<real_t,9> {
-    Ixx, Ixy, Ixz,
-    Ixy, Iyy, Iyz,
-    Ixz, Iyz, Izz
-  };
+    print_timestamp("inertia_matrix_serial end");      
+    return array<real_t,9> {
+        Ixx, Ixy, Ixz,
+        Ixy, Iyy, Iyz,
+        Ixz, Iyz, Izz
+    };
 }
 
-
-array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm)
-{
-  // nvc++ doesn't support OpenACC 2.7 array reductions yet, so must name each element.
-  real_t
-    M00 = 0, M01 = 0, M02 = 0,
-             M11 = 0, M12 = 0,
-                      M22 = 0;
+array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
+    // nvc++ doesn't support OpenACC 2.7 array reductions yet, so must name each element.
+    real_t
+        M00 = 0, M01 = 0, M02 = 0,
+                 M11 = 0, M12 = 0,
+                          M22 = 0;
   
-  size_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-  ssize_t image_length = Nx*Ny*Nz;
-
-  print_timestamp("inertia_matrix start");    
-  for(ssize_t block_start=0;block_start<image_length;block_start+=acc_block_size){
-    const mask_type *buffer  = voxels.data + block_start;
-    ssize_t block_length = min(acc_block_size,image_length-block_start);
-
-    reduction_loop((+:M00,M01,M02,M11,M12,M22),())
-    for(int64_t k = 0; k<block_length;k++) {    //\if(buffer[k] != 0)
-	int64_t flat_idx = block_start + k;
-	real_t xs[3] = {(flat_idx  / (Ny*Nz))  - cm[0],  // x
-			((flat_idx / Nz) % Ny) - cm[1],  // y
-			(flat_idx  % Nz)       - cm[2]}; // z
-
-	real_t m = buffer[k];
-	real_t diag = dot(xs,xs);
-	M00 += m*(diag - xs[0] * xs[0]);
-	M11 += m*(diag - xs[1] * xs[1]);
-	M22 += m*(diag - xs[2] * xs[2]);	
-	M01 -= m * xs[0] * xs[1];
-	M02 -= m * xs[0] * xs[2];
-	M12 -= m * xs[1] * xs[2];
-      }
-  }
-  print_timestamp("inertia_matrix end");      
-  return array<real_t,9> {
-    M00,M01,M02,
-    M01,M11,M12,
-    M02,M12,M22};
+    size_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
+    ssize_t image_length = Nx*Ny*Nz;
+
+    print_timestamp("inertia_matrix start");    
+    for (ssize_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
+        const mask_type *buffer = voxels.data + block_start;
+        ssize_t block_length    = min(acc_block_size, image_length-block_start);
+
+        reduction_loop((+:M00,M01,M02,M11,M12,M22),())
+        for (int64_t k = 0; k < block_length; k++) {    //\if (buffer[k] != 0)
+            int64_t flat_idx = block_start + k;
+            real_t xs[3] = {
+                (flat_idx  / (Ny*Nz))  - cm[0],   // x
+                ((flat_idx / Nz) % Ny) - cm[1],   // y
+                (flat_idx  % Nz)       - cm[2] }; // z
+
+            real_t m = buffer[k];
+            real_t diag = dot(xs,xs);
+            M00 += m*(diag - xs[0] * xs[0]);
+            M11 += m*(diag - xs[1] * xs[1]);
+            M22 += m*(diag - xs[2] * xs[2]);    
+            M01 -= m * xs[0] * xs[1];
+            M02 -= m * xs[0] * xs[2];
+            M12 -= m * xs[1] * xs[2];
+        }
+    }
+    print_timestamp("inertia_matrix end");      
+    return array<real_t,9> {
+        M00, M01, M02,
+        M01, M11, M12,
+        M02, M12, M22 };
 }
 
-
 void integrate_axes(const input_ndarray<mask_type> &voxels,
-		    const array<real_t,3> &x0,		    
-		    const array<real_t,3> &v_axis,
-		    const array<real_t,3> &w_axis,
-		    const real_t v_min, const real_t w_min,
-		    output_ndarray<real_t> output)
-{
-  ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-  ssize_t Nv = output.shape[0], Nw = output.shape[1]; 
-  int64_t image_length = Nx*Ny*Nz;
-  real_t *output_data = output.data;
-
-  // TODO: Check v_axis & w_axis projections to certify bounds and get rid of runtime check
+            const array<real_t,3> &x0,            
+            const array<real_t,3> &v_axis,
+            const array<real_t,3> &w_axis,
+            const real_t v_min, const real_t w_min,
+            output_ndarray<real_t> output) {
+    ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
+    ssize_t Nv = output.shape[0], Nw = output.shape[1]; 
+    int64_t image_length = Nx*Ny*Nz;
+    real_t *output_data = output.data;
+
+    // TODO: Check v_axis & w_axis projections to certify bounds and get rid of runtime check
   
-  for(ssize_t block_start=0;block_start<image_length;block_start += acc_block_size){
-    const mask_type *buffer  = voxels.data + block_start;
-    int block_length = min(acc_block_size,image_length-block_start);
-
-    //#pragma acc parallel loop copy(output_data[:Nv*Nw]) copyin(buffer[:block_length], x0, v_axis, w_axis)
-    parallel_loop((output_data[:Nv*Nw]))
-    for(int64_t k = 0; k<block_length;k++) if(buffer[k] != 0) {
-	int64_t flat_idx = block_start + k;
-	real_t xs[3] = {(flat_idx  / (Ny*Nz))  - x0[0],  // x
-			((flat_idx / Nz) % Ny) - x0[1],  // y
-			(flat_idx  % Nz)       - x0[2]}; // z
-
-	mask_type voxel = buffer[k];
-	real_t v = dot(xs,v_axis), w = dot(xs,w_axis);
-	int64_t i_v = round(v-v_min), j_w = round(w-w_min);
-
-	if(i_v >= 0 && j_w >= 0 && i_v < Nv && j_w < Nw){
-	  atomic_statement()
-	  output_data[i_v*Nw + j_w] += voxel;
-	}
-      }
-  }
+    for (ssize_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
+        const mask_type *buffer  = voxels.data + block_start;
+        int block_length = min(acc_block_size,image_length-block_start);
+
+        //#pragma acc parallel loop copy(output_data[:Nv*Nw]) copyin(buffer[:block_length], x0, v_axis, w_axis)
+        parallel_loop((output_data[:Nv*Nw]))
+        for (int64_t k = 0; k < block_length; k++) {
+            if (buffer[k] != 0) {
+                int64_t flat_idx = block_start + k;
+                real_t xs[3] = {
+                    (flat_idx  / (Ny*Nz))  - x0[0],   // x
+                    ((flat_idx / Nz) % Ny) - x0[1],   // y
+                    (flat_idx  % Nz)       - x0[2] }; // z
+
+                mask_type voxel = buffer[k];
+                real_t v = dot(xs,v_axis), w = dot(xs,w_axis);
+                int64_t i_v = round(v-v_min), j_w = round(w-w_min);
+
+                if (i_v >= 0 && j_w >= 0 && i_v < Nv && j_w < Nw) {
+                    atomic_statement()
+                    output_data[i_v*Nw + j_w] += voxel;
+                }
+            }
+        }
+    }
 }
 
+bool in_bbox(float U, float V, float W, const std::array<float,6> bbox) {
+    const auto& [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
 
-bool in_bbox(float U, float V, float W, const std::array<float,6> bbox)
-{
-  const auto& [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
-
-  bool inside = U>=U_min && U<=U_max && V>=V_min && V<=V_max && W>=W_min && W<=W_max;
+    bool inside = U>=U_min && U<=U_max && V>=V_min && V<=V_max && W>=W_min && W<=W_max;
 
-  // printf("in_bbox: (%.1f,%.1f,%.1f) \in ([%.1f,%.1f],[%.1f,%.1f],[%.1f,%.1f]) == %d\n",
-  // 	 U,V,W,U_min,U_max,V_min,V_max,U_min,U_max,inside);
-  return inside;
+    // printf("in_bbox: (%.1f,%.1f,%.1f) \in ([%.1f,%.1f],[%.1f,%.1f],[%.1f,%.1f]) == %d\n",
+    //      U,V,W,U_min,U_max,V_min,V_max,U_min,U_max,inside);
+    return inside;
 }
 
+template<typename field_type> float resample2x2x2(const field_type *voxels,
+                                                  const array<ssize_t,3> &shape,
+                                                  const array<float,3>   &X) {
+    auto  [Nx,Ny,Nz] = shape;    // Eller omvendt?
+    if (!in_bbox(X[0],X[1],X[2], {0.5,Nx-1.5, 0.5,Ny-1.5, 0.5,Nz-1.5})) {
+        uint64_t voxel_index = floor(X[0])*Ny*Nz+floor(X[1])*Ny+floor(X[2]);      
+        return voxels[voxel_index];
+    }
+    float   Xfrac[2][3]; // {Xminus[3], Xplus[3]}
+    int64_t Xint[2][3];     // {Iminus[3], Iplus[3]}
+    float   value = 0;
+
+    for (int i = 0; i < 3; i++) {
+        double Iminus, Iplus;
+        Xfrac[0][i] = 1-modf(X[i]-0.5, &Iminus); // 1-{X[i]-1/2}, floor(X[i]-1/2)
+        Xfrac[1][i] =   modf(X[i]+0.5, &Iplus);  // {X[i]+1/2}, floor(X[i]+1/2)
 
-template<typename field_type> float resample2x2x2(const field_type      *voxels,
-						  const array<ssize_t,3> &shape,
-						  const array<float,3>   &X)
-{
-  auto  [Nx,Ny,Nz] = shape;	// Eller omvendt?
-  if(!in_bbox(X[0],X[1],X[2], {0.5,Nx-1.5, 0.5,Ny-1.5, 0.5,Nz-1.5})){
-    uint64_t voxel_index = floor(X[0])*Ny*Nz+floor(X[1])*Ny+floor(X[2]);      
-    return voxels[voxel_index];
-  }
-  float   Xfrac[2][3];	// {Xminus[3], Xplus[3]}
-  int64_t  Xint[2][3];	// {Iminus[3], Iplus[3]}
-  float   value = 0;
-
-  for(int i=0;i<3;i++){
-    double Iminus, Iplus;
-    Xfrac[0][i] = 1-modf(X[i]-0.5, &Iminus); // 1-{X[i]-1/2}, floor(X[i]-1/2)
-    Xfrac[1][i] =   modf(X[i]+0.5, &Iplus);  // {X[i]+1/2}, floor(X[i]+1/2)
-
-    Xint[0][i] = Iminus;
-    Xint[1][i] = Iplus;
-  }
-
-
-  for(int ijk=0; ijk<=7; ijk++) {
-    float  weight = 1;
-    int64_t IJK[3] = {0,0,0};
-
-    for(int axis=0;axis<3;axis++){ // x-1/2 or x+1/2
-      int pm = (ijk>>axis) & 1;
-      IJK[axis] = Xint[pm][axis];
-      weight   *= Xfrac[pm][axis];
+        Xint[0][i] = Iminus;
+        Xint[1][i] = Iplus;
     }
 
-    auto [I,J,K] = IJK;
-    // if(I<0 || J<0 || K<0){
-    //   printf("(I,J,K) = (%ld,%ld,%ld)\n",I,J,K);
-    //   abort();
-    // }
-    // if(I>=int(Nx) || J>=int(Ny) || K>=int(Nz)){
-    //   printf("(I,J,K) = (%ld,%ld,%ld), (Nx,Ny,Nz) = (%ld,%ld,%ld)\n",I,J,K,Nx,Ny,Nz);
-    //   abort();
-    // }
-    uint64_t voxel_index = I*Ny*Nz+J*Ny+K;
-    assert(I>=0 && J>=0 && K>=0);
-    assert(I<Nx && J<Ny && K<Nz);    
-    field_type voxel = voxels[voxel_index];
-    value += voxel*weight;
-  }
-  return value;
+
+    for (int ijk = 0; ijk <= 7; ijk++) {
+        float  weight = 1;
+        int64_t IJK[3] = {0,0,0};
+
+        for (int axis = 0; axis < 3; axis++) { // x-1/2 or x+1/2
+            int pm = (ijk>>axis) & 1;
+            IJK[axis] = Xint[pm][axis];
+            weight   *= Xfrac[pm][axis];
+        }
+
+        auto [I,J,K] = IJK;
+        // if (I<0 || J<0 || K<0) {
+        //   printf("(I,J,K) = (%ld,%ld,%ld)\n",I,J,K);
+        //   abort();
+        // }
+        // if (I>=int(Nx) || J>=int(Ny) || K>=int(Nz)) {
+        //   printf("(I,J,K) = (%ld,%ld,%ld), (Nx,Ny,Nz) = (%ld,%ld,%ld)\n",I,J,K,Nx,Ny,Nz);
+        //   abort();
+        // }
+        uint64_t voxel_index = I*Ny*Nz+J*Ny+K;
+        assert(I>=0 && J>=0 && K>=0);
+        assert(I<Nx && J<Ny && K<Nz);    
+        field_type voxel = voxels[voxel_index];
+        value += voxel*weight;
+    }
+    return value;
 }
 
 template <typename voxel_type> void sample_plane(const input_ndarray<voxel_type> &voxels,
-						 const real_t voxel_size, // In micrometers
-						 const array<real_t,3> cm,
-						 const array<real_t,3> u_axis,
-						 const array<real_t,3> v_axis,		  
-						 const array<real_t,4>  bbox,    // [umin,umax,vmin,vmax] in micrometers
-						 output_ndarray<real_t> plane_samples)
-{
-  const auto& [umin,umax,vmin,vmax] = bbox; // In micrometers
-  ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-  ssize_t nu = plane_samples.shape[0], nv = plane_samples.shape[1];
-  real_t  du = (umax-umin)/nu, dv = (vmax-vmin)/nv;
-
-  #pragma omp parallel for collapse(2)
-  for(ssize_t ui=0;ui<nu;ui++)
-    for(ssize_t vj=0;vj<nv;vj++){
-      const real_t u = umin + ui*du, v = vmin + vj*dv;
-
-      // X,Y,Z in micrometers;  x,y,z in voxel index space      
-      const real_t		
-	X = cm[0] + u*u_axis[0] + v*v_axis[0],
-	Y = cm[1] + u*u_axis[1] + v*v_axis[1],
-	Z = cm[2] + u*u_axis[2] + v*v_axis[2];
-
-      const real_t x = X/voxel_size, y = Y/voxel_size, z = Z/voxel_size;
-
-      //      printf("u,v = %g,%g -> %.1f,%.1f,%.1f -> %d, %d, %d\n",u,v,X,Y,Z,int(round(x)),int(round(y)),int(round(z)));
-      
-      voxel_type value = 0;
-      if(in_bbox(x,y,z,{0.5,Nx-0.5, 0.5,Ny-0.5, 0.5,Nz-0.5}))
-	value = resample2x2x2<voxel_type>(voxels.data,{Nx,Ny,Nz},{x,y,z});
-      // else
-      // 	fprintf(stderr,"Sampling outside image: x,y,z = %.1f,%.1f,%.1f, Nx,Ny,Nz = %ld,%ld,%ld\n",x,y,z,Nx,Ny,Nz);
-
-      plane_samples.data[ui*nv + vj] = value;
+                         const real_t voxel_size, // In micrometers
+                         const array<real_t,3> cm,
+                         const array<real_t,3> u_axis,
+                         const array<real_t,3> v_axis,          
+                         const array<real_t,4>  bbox,    // [umin,umax,vmin,vmax] in micrometers
+                         output_ndarray<real_t> plane_samples) {
+    const auto& [umin,umax,vmin,vmax] = bbox; // In micrometers
+    ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
+    ssize_t nu = plane_samples.shape[0], nv = plane_samples.shape[1];
+    real_t  du = (umax-umin)/nu, dv = (vmax-vmin)/nv;
+
+    #pragma omp parallel for collapse(2)
+    for (ssize_t ui=0;ui<nu;ui++) {
+        for (ssize_t vj=0;vj<nv;vj++) {
+            const real_t u = umin + ui*du, v = vmin + vj*dv;
+
+            // X,Y,Z in micrometers;  x,y,z in voxel index space      
+            const real_t        
+                X = cm[0] + u*u_axis[0] + v*v_axis[0],
+                Y = cm[1] + u*u_axis[1] + v*v_axis[1],
+                Z = cm[2] + u*u_axis[2] + v*v_axis[2];
+
+            const real_t x = X/voxel_size, y = Y/voxel_size, z = Z/voxel_size;
+
+            //      printf("u,v = %g,%g -> %.1f,%.1f,%.1f -> %d, %d, %d\n",u,v,X,Y,Z,int(round(x)),int(round(y)),int(round(z)));
+            
+            voxel_type value = 0;
+            if (in_bbox(x,y,z,{0.5,Nx-0.5, 0.5,Ny-0.5, 0.5,Nz-0.5}))
+                value = resample2x2x2<voxel_type>(voxels.data,{Nx,Ny,Nz},{x,y,z});
+            // else
+            //     fprintf(stderr,"Sampling outside image: x,y,z = %.1f,%.1f,%.1f, Nx,Ny,Nz = %ld,%ld,%ld\n",x,y,z,Nx,Ny,Nz);
+
+            plane_samples.data[ui*nv + vj] = value;
+        }
     }
 }
 
 // NB: xyz are in indices, not micrometers
 void zero_outside_bbox(const array<real_t,9> &principal_axes,
-		       const array<real_t,6> &parameter_ranges,
-		       const array<real_t,3> &cm,
-		       output_ndarray<mask_type> voxels)
-{
-  size_t  Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-  int64_t image_length = Nx*Ny*Nz;
+               const array<real_t,6> &parameter_ranges,
+               const array<real_t,3> &cm,
+               output_ndarray<mask_type> voxels) {
+    size_t  Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
+    int64_t image_length = Nx*Ny*Nz;
 
-  printf("(Nx,Ny,Nz) = (%ld,%ld,%ld), image_length = %ld",Nx,Ny,Nz, image_length);
-  for(int64_t block_start=0;block_start<image_length;block_start+=acc_block_size){
+    printf("(Nx,Ny,Nz) = (%ld,%ld,%ld), image_length = %ld",Nx,Ny,Nz, image_length);
 
-    mask_type *buffer = voxels.data + block_start;
-    ssize_t this_block_length = min(acc_block_size,image_length-block_start);
+    for (int64_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
+        mask_type *buffer = voxels.data + block_start;
+        ssize_t this_block_length = min(acc_block_size, image_length-block_start);
 
-    parallel_loop((buffer[:this_block_length]))
-    for(int64_t k = 0; k<this_block_length;k++){
-      int64_t flat_idx = block_start + k;
-      int64_t x = flat_idx  / (Ny*Nz);
-      int64_t y = (flat_idx / Nz) % Ny;
-      int64_t z = flat_idx  % Nz;
-      // Boilerplate until here. TODO: macroize or lambda out!
-      
-      real_t xs[3] = {x-cm[0], y-cm[1], z-cm[2]};
+        parallel_loop((buffer[:this_block_length]))
+        for (int64_t k = 0; k < this_block_length; k++) {
+            int64_t flat_idx = block_start + k;
+            int64_t x = flat_idx  / (Ny*Nz);
+            int64_t y = (flat_idx / Nz) % Ny;
+            int64_t z = flat_idx  % Nz;
+            // Boilerplate until here. TODO: macroize or lambda out!
+            
+            real_t xs[3] = {x-cm[0], y-cm[1], z-cm[2]};
 
-      real_t params[3] = {0,0,0};
+            real_t params[3] = {0,0,0};
 
-      for(int uvw=0;uvw<3;uvw++)
-	for(int xyz=0;xyz<3;xyz++)
-	  params[uvw] += xs[xyz]*principal_axes[uvw*3+xyz]; // u = dot(xs,u_axis), v = dot(xs,v_axis), w = dot(xs,w_axis)
+            for (int uvw = 0; uvw < 3; uvw++) 
+                for (int xyz = 0; xyz < 3; xyz++) 
+                    params[uvw] += xs[xyz] * principal_axes[uvw*3+xyz]; // u = dot(xs,u_axis), v = dot(xs,v_axis), w = dot(xs,w_axis)
 
-      bool p = false;
+            bool p = false;
 
-      for(int uvw=0;uvw<3;uvw++){
-	real_t param_min = parameter_ranges[uvw*2], param_max = parameter_ranges[uvw*2+1];
-	p |= (params[uvw] < param_min) | (params[uvw] > param_max);
-      }
+            for (int uvw = 0; uvw < 3; uvw++) {
+                real_t param_min = parameter_ranges[uvw*2], param_max = parameter_ranges[uvw*2+1];
+                p |= (params[uvw] < param_min) | (params[uvw] > param_max);
+            }
 
-      if(p) buffer[k] = 0;
+            if (p) buffer[k] = 0;
 
+        }
     }
-  }
 }
 
-inline vector4 hom_transform(const vector4 &x, const matrix4x4 &M)
-{
-  vector4 c{{0,0,0,0}};
+inline vector4 hom_transform(const vector4 &x, const matrix4x4 &M) {
+    vector4 c{{0,0,0,0}};
 
-  for(int i=0;i<4;i++){
-    real_t sum = 0;
-#pragma simd parallel for reduction(+:sum)    
-    for(int j=0;j<4;j++)
-      sum += M[i*4+j]*x[j];
-    c[i] = sum;
-  }
-  return c;
-
-  
+    for (int i = 0; i < 4; i++) {
+        real_t sum = 0;
+        #pragma simd parallel for reduction(+:sum)    
+        for (int j=0;j<4;j++)
+            sum += M[i*4+j]*x[j];
+        c[i] = sum;
+    }
+    return c;
 }
 
-
-
-#define loop_mask_start(mask_in,mask_out,COPY) {		               \
-  ssize_t Mx = mask_in.shape[0], My = mask_in.shape[1], Mz = mask_in.shape[2]; \
-  ssize_t mask_length = Mx*My*Mz;                                              \
-                                                                               \
-for(ssize_t block_start=0;block_start<mask_length;block_start+=acc_block_size){\
-  const mask_type *maskin_buffer  = mask_in.data + block_start;                \
-        mask_type *maskout_buffer = mask_out.data + block_start;               \
-  ssize_t  this_block_length = min(acc_block_size,mask_length-block_start);    \
-  									\
-  _Pragma(STR(acc parallel loop copy(maskin_buffer[:this_block_length], maskout_buffer[:this_block_length]) copy COPY)) \
-  for(int64_t k = 0; k<this_block_length;k++){                                 \
-    int64_t flat_idx = block_start + k;                                        \
-    int64_t X = (flat_idx  / (My*Mz)), Y = (flat_idx / Mz) % My, Z = flat_idx  % Mz; \
-    std::array<real_t,4> Xs = {X*voxel_size, Y*voxel_size, Z*voxel_size, 1}; \
-    bool mask_value = maskin_buffer[k];
+#define loop_mask_start(mask_in,mask_out,COPY) {                                                                                \
+    ssize_t Mx = mask_in.shape[0], My = mask_in.shape[1], Mz = mask_in.shape[2];                                                \
+    ssize_t mask_length = Mx*My*Mz;                                                                                             \
+                                                                                                                                \
+    for (ssize_t block_start = 0; block_start < mask_length; block_start += acc_block_size) {                                   \
+        const mask_type *maskin_buffer  = mask_in.data + block_start;                                                           \
+            mask_type *maskout_buffer = mask_out.data + block_start;                                                            \
+        ssize_t this_block_length = min(acc_block_size, mask_length-block_start);                                               \
+                                                                                                                                \
+        _Pragma(STR(acc parallel loop copy(maskin_buffer[:this_block_length], maskout_buffer[:this_block_length]) copy COPY))   \
+        for (int64_t k = 0; k < this_block_length; k++) {                                                                       \
+            int64_t flat_idx = block_start + k;                                                                                 \
+            int64_t X = (flat_idx  / (My*Mz)), Y = (flat_idx / Mz) % My, Z = flat_idx  % Mz;                                    \
+            std::array<real_t,4> Xs = { X*voxel_size, Y*voxel_size, Z*voxel_size, 1 };                                          \
+            bool mask_value = maskin_buffer[k];
 
 #define loop_mask_end(mask) }}} 
 
 
 void fill_implant_mask(const input_ndarray<mask_type> implant_mask,
-		       float voxel_size,
-		       const array<float,6> &bbox,
-		       float r_fraction,
-		       const matrix4x4 &Muvw,
-		       output_ndarray<mask_type> solid_implant_mask,
-		       output_ndarray<float> rsqr_maxs,
-		       output_ndarray<float> profile
-		       )
-{
-  real_t theta_min = M_PI, theta_max = -M_PI;  
-  ssize_t n_segments = rsqr_maxs.shape[0];
-  const auto [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
-  
-  printf("implant_mask.shape = %ld,%ld,%ld\n",implant_mask.shape[0],implant_mask.shape[1],implant_mask.shape[2]);
-  printf("solid_implant_mask.shape = %ld,%ld,%ld\n",solid_implant_mask.shape[0],solid_implant_mask.shape[1],solid_implant_mask.shape[2]);
-  
-  fprintf(stderr,"voxel_size = %g, U_min = %g, U_max = %g, r_frac = %g, n_segments = %ld\n",
-	 voxel_size, U_min, U_max, r_fraction, n_segments);
-
-  float     *rsqr_maxs_d     = rsqr_maxs.data;
-  float     *profile_d       = profile.data;
-  
-  // First pass computes some bounds -- possibly separate out to avoid repeating
-  loop_mask_start(implant_mask, solid_implant_mask,
-		  (maskin_buffer[:this_block_length], rsqr_maxs_d[:n_segments], Muvw[:16], bbox[:6]) );
-  if(mask_value){
-    auto [U,V,W,c] = hom_transform(Xs,Muvw);
+               float voxel_size,
+               const array<float,6> &bbox,
+               float r_fraction,
+               const matrix4x4 &Muvw,
+               output_ndarray<mask_type> solid_implant_mask,
+               output_ndarray<float> rsqr_maxs,
+               output_ndarray<float> profile) {
+    real_t theta_min = M_PI, theta_max = -M_PI;  
+    ssize_t n_segments = rsqr_maxs.shape[0];
+    const auto [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
     
-    real_t r_sqr = V*V+W*W;
-    real_t theta = atan2(V,W);
-
-    int U_i = floor((U-U_min)*(n_segments-1)/(U_max-U_min));
+    printf("implant_mask.shape = %ld,%ld,%ld\n",implant_mask.shape[0],implant_mask.shape[1],implant_mask.shape[2]);
+    printf("solid_implant_mask.shape = %ld,%ld,%ld\n",solid_implant_mask.shape[0],solid_implant_mask.shape[1],solid_implant_mask.shape[2]);
+    
+    fprintf(stderr,"voxel_size = %g, U_min = %g, U_max = %g, r_frac = %g, n_segments = %ld\n",
+        voxel_size, U_min, U_max, r_fraction, n_segments);
 
-    //    if(U_i >= 0 && U_i < n_segments){
-    if( in_bbox(U,V,W,bbox) ){
-      rsqr_maxs_d[U_i] = max(rsqr_maxs_d[U_i], float(r_sqr));
-      theta_min = min(theta_min, theta);
-      theta_max = max(theta_max, theta);
-      //      W_min     = min(W_min,     W);
-    } else {
-      // Otherwise we've calculated it wrong!
-      //  fprintf(stderr,"U-coordinate out of bounds: U_i = %ld, U = %g, U_min = %g, U_max = %g\n",U_i,U,U_min,U_max);
+    float     *rsqr_maxs_d     = rsqr_maxs.data;
+    float     *profile_d       = profile.data;
+    
+    // First pass computes some bounds -- possibly separate out to avoid repeating
+    loop_mask_start(implant_mask, solid_implant_mask,
+            (maskin_buffer[:this_block_length], rsqr_maxs_d[:n_segments], Muvw[:16], bbox[:6]) );
+    if (mask_value) {
+        auto [U,V,W,c] = hom_transform(Xs,Muvw);
+        
+        real_t r_sqr = V*V+W*W;
+        real_t theta = atan2(V,W);
+
+        int U_i = floor((U-U_min)*(n_segments-1)/(U_max-U_min));
+
+        //    if (U_i >= 0 && U_i < n_segments) {
+        if ( in_bbox(U,V,W,bbox) ) {
+            rsqr_maxs_d[U_i] = max(rsqr_maxs_d[U_i], float(r_sqr));
+            theta_min = min(theta_min, theta);
+            theta_max = max(theta_max, theta);
+            //      W_min     = min(W_min,     W);
+        } else {
+            // Otherwise we've calculated it wrong!
+            //  fprintf(stderr,"U-coordinate out of bounds: U_i = %ld, U = %g, U_min = %g, U_max = %g\n",U_i,U,U_min,U_max);
+        }
     }
-  }
-  loop_mask_end(implant_mask);
+    loop_mask_end(implant_mask);
 
-  double theta_center = (theta_max+theta_min)/2;
+    double theta_center = (theta_max+theta_min)/2;
 
-  fprintf(stderr,"theta_min, theta_center, theta_max = %g,%g,%g\n", theta_min, theta_center, theta_max);
+    fprintf(stderr,"theta_min, theta_center, theta_max = %g,%g,%g\n", theta_min, theta_center, theta_max);
 
-  // Second pass does the actual work
-  loop_mask_start(implant_mask, solid_implant_mask,
-		  (rsqr_maxs_d[:n_segments], profile_d[:n_segments]) );
-  auto [U,V,W,c] = hom_transform(Xs,Muvw);
-  float r_sqr = V*V+W*W;
-  float theta = atan2(V,W);
-  int U_i = floor((U-U_min)*(n_segments-1)/(U_max-U_min));
+    // Second pass does the actual work
+    loop_mask_start(implant_mask, solid_implant_mask,
+            (rsqr_maxs_d[:n_segments], profile_d[:n_segments]) );
+    auto [U,V,W,c] = hom_transform(Xs,Muvw);
+    float r_sqr = V*V+W*W;
+    float theta = atan2(V,W);
+    int U_i = floor((U-U_min)*(n_segments-1)/(U_max-U_min));
 
-  bool solid_mask_value = false;
-  if(U_i >= 0 && U_i < n_segments && W>=W_min){ // TODO: Full bounding box check?
-    solid_mask_value = mask_value | (r_sqr <= r_fraction*rsqr_maxs_d[U_i]);
+    bool solid_mask_value = false;
+    if (U_i >= 0 && U_i < n_segments && W >= W_min) { // TODO: Full bounding box check?
+        solid_mask_value = mask_value | (r_sqr <= r_fraction*rsqr_maxs_d[U_i]);
 
-    if(theta >= theta_min && theta <= theta_center && r_sqr <= rsqr_maxs_d[U_i]){
-      atomic_statement()
-	profile_d[U_i] += solid_mask_value;
+        if (theta >= theta_min && theta <= theta_center && r_sqr <= rsqr_maxs_d[U_i]) {
+            atomic_statement()
+            profile_d[U_i] += solid_mask_value;
+        }
     }
-  }
-  maskout_buffer[k] = solid_mask_value;
-  
-  loop_mask_end(implant_mask);
+    maskout_buffer[k] = solid_mask_value;
+    
+    loop_mask_end(implant_mask);
 }
-		       
+               
 void compute_front_mask(const input_ndarray<mask_type> solid_implant,
-		const float voxel_size,
-		const matrix4x4 &Muvw,		
-		std::array<float,6> bbox,
-		output_ndarray<mask_type> front_mask)
-{
-  const auto [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
-
-  loop_mask_start(solid_implant, front_mask,
-		  () );  
-
-  if(!mask_value){
-    auto [U,V,W,c] = hom_transform(Xs,Muvw);
-
-    maskout_buffer[k] = W>W_min;
-  } else
-    maskout_buffer[k] = 0;
-  
-  loop_mask_end(solid_implant)
+        const float voxel_size,
+        const matrix4x4 &Muvw,        
+        std::array<float,6> bbox,
+        output_ndarray<mask_type> front_mask) {
+    const auto [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
+
+    loop_mask_start(solid_implant, front_mask, () );  
+
+    if (!mask_value) {
+        auto [U,V,W,c] = hom_transform(Xs,Muvw);
+        maskout_buffer[k] = W>W_min;
+    } else
+        maskout_buffer[k] = 0;
+    
+    loop_mask_end(solid_implant)
 }
 
-
 void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance Transform in um, should be low-resolution (will be interpolated)
-			 const input_ndarray<uint8_t> C,  // Material classification images (probability per voxel, 0..1 -> 0..255)
-			 float voxel_size,		   // Voxel size for Cs
-			 float d_min, float d_max,	   // Distance shell to map to cylinder
-			 float theta_min, float theta_max, // Angle range (wrt cylinder center)
-			 std::array<float,6> bbox,
-			 const matrix4x4 &Muvw,		   // Transform from zyx (in um) to U'V'W' cylinder FoR (in um)
-			 output_ndarray<float>    image,  // Probability-weighted volume of (class,theta,U)-voxels
-			 output_ndarray<int64_t>  count   // Number of (class,theta,U)-voxels
-			 )
-{
-  ssize_t n_theta = image.shape[0], n_U = image.shape[1];
-
-  const auto& [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
-
-  ssize_t ex = edt.shape[0], ey = edt.shape[1], ez = edt.shape[2];
-  ssize_t Cx = C.shape[0],   Cy = C.shape[1],   Cz = C.shape[2];
-
-  real_t edx = ex/real_t(Cx), edy = ey/real_t(Cy), edz = ex/real_t(Cz);
-  
-  ssize_t edt_length       = ex*ey*ez;
-  ssize_t C_length         = Cx*Cy*Cz;  
+             const input_ndarray<uint8_t> C,  // Material classification images (probability per voxel, 0..1 -> 0..255)
+             float voxel_size,           // Voxel size for Cs
+             float d_min, float d_max,       // Distance shell to map to cylinder
+             float theta_min, float theta_max, // Angle range (wrt cylinder center)
+             std::array<float,6> bbox,
+             const matrix4x4 &Muvw,           // Transform from zyx (in um) to U'V'W' cylinder FoR (in um)
+             output_ndarray<float>    image,  // Probability-weighted volume of (class,theta,U)-voxels
+             output_ndarray<int64_t>  count   // Number of (class,theta,U)-voxels
+             ){
+    ssize_t n_theta = image.shape[0], n_U = image.shape[1];
+
+    const auto& [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
+
+    ssize_t ex = edt.shape[0], ey = edt.shape[1], ez = edt.shape[2];
+    ssize_t Cx = C.shape[0],   Cy = C.shape[1],   Cz = C.shape[2];
+
+    real_t edx = ex/real_t(Cx), edy = ey/real_t(Cy), edz = ex/real_t(Cz);
+    
+    ssize_t edt_length       = ex*ey*ez;
+    ssize_t C_length         = Cx*Cy*Cz;  
 
-  printf("Segmenting from %g to %g micrometers distance of implant.\n",d_min,d_max);
+    printf("Segmenting from %g to %g micrometers distance of implant.\n",d_min,d_max);
 
-  printf("Bounding box is [U_min,U_max,V_min,V_max,W_min,W_max] = [[%g,%g],[%g,%g],[%g,%g]]\n",
-	 U_min,U_max,V_min,V_max,W_min,W_max);
-  printf("EDT field is (%ld,%ld,%ld)\n",ex,ey,ez);
-  
-  real_t th_min = 1234, th_max = -1234;
-  ssize_t n_shell = 0;
-  ssize_t n_shell_bbox = 0;
+    printf("Bounding box is [U_min,U_max,V_min,V_max,W_min,W_max] = [[%g,%g],[%g,%g],[%g,%g]]\n",
+        U_min,U_max,V_min,V_max,W_min,W_max);
+    printf("EDT field is (%ld,%ld,%ld)\n",ex,ey,ez);
+    
+    real_t th_min = 1234, th_max = -1234;
+    ssize_t n_shell = 0;
+    ssize_t n_shell_bbox = 0;
 
-  ssize_t block_height = 64;
-  
-  //TODO: new acc/openmp macro in parallel.hh
-  {    
-      float   *image_d = image.data;
-      int64_t *count_d = count.data;
-
-      for(ssize_t block_start=0, edt_block_start=0;
-	  block_start<C_length;
-	  block_start+=block_height*Cy*Cz, edt_block_start+=block_height*ey*ez){
-	const uint8_t *C_buffer = C.data + block_start;
-	const float  *edt_block = edt.data + max(block_start-ey*ez,0L);
-
-	ssize_t  this_block_length = min(block_height*Cy*Cz,C_length-block_start);
-	ssize_t  this_edt_length   = min((block_height+2)*ey*ez,edt_length-block_start);
-
-	//#pragma acc parallel loop copy(C_buffer[:this_block_length], image_d[:n_theta*n_U], count_d[:n_theta*n_U], bbox[:6], Muvw[:16], edt_block[:this_edt_length]) reduction(+:n_shell,n_shell_bbox)
-	#pragma omp parallel for reduction(+:n_shell,n_shell_bbox)	
-	for(int64_t k = 0; k<this_block_length;k++){	
-	  const int64_t flat_idx = block_start + k;
-	  const int64_t X = (flat_idx  / (Cy*Cz)), Y = (flat_idx / Cz) % Cy, Z = flat_idx  % Cz; // Integer indices: Cs[c,X,Y,Z]
-	  // Index into local block
-	  const int64_t Xl = (k  / (Cy*Cz)), Yl = (k / Cz) % Cy, Zl = k  % Cz;
-	  // Index into local edt block. Note EDT has 1-slice padding top+bottom
-	  const float  x = (Xl+1)*edx, y = Yl*edy, z = Zl*edy;
-
-	  if(x>block_height){
-	    printf("Block number k=%ld.\nX,Y,Z=%ld,%ld,%ld\nXl,Yl,Zl=%ld,%ld,%ld\nx,y,z=%.2f, %.2f, %.2f\n",k,X,Y,Z,Xl,Yl,Zl,x,y,z);
-	    abort();
-	  }
-	  
-	  //****** MEAT OF THE IMPLEMENTATION IS HERE ******
-	  real_t distance = resample2x2x2<float>(edt_block,{this_edt_length/(ey*ez),ey,ez},
-						 {x,y,z});
-	  
-	  if(distance > d_min && distance <= d_max){ // TODO: and W>w_min
-	    array<real_t,4> Xs = {X*voxel_size, Y*voxel_size, Z*voxel_size, 1};
-	    auto [U,V,W,c] = hom_transform(Xs,Muvw);
-	    n_shell ++;
-
-	    //	    printf("distance = %.1f, U,V,W = %.2f,%.2f,%.2f\n",distance,U,V,W);
-	    if(in_bbox(U,V,W,bbox) ){
-
-	      real_t theta    = atan2(V,W);
-
-	      if(theta>=theta_min && theta<=theta_max){
-		n_shell_bbox++;
-		
-		
-		ssize_t theta_i = floor( (theta-theta_min) * (n_theta-1)/(theta_max-theta_min) );
-		ssize_t U_i     = floor( (U    -    U_min) * (n_U    -1)/(    U_max-    U_min) );
-		
-		real_t p = C_buffer[k]/255.;
-		
-		assert(theta >= theta_min);
-		assert(theta <= theta_max);
-		assert(U >= U_min);
-		assert(U <= U_max);	      
-		assert(theta_i >= 0);
-		assert(theta_i < n_theta);
-		assert(U_i >= 0);
-		assert(U_i < n_U);	      
-		
-		if(p>0){
-		  th_min = min(theta,th_min);
-		  th_max = max(theta,th_max);	      
-		  
-		  atomic_statement()
-		    image_d[theta_i*n_U + U_i] += p;
-		  
-		  atomic_statement()	  
-		    count_d[theta_i*n_U + U_i] += 1;
-		}
-	      }
-	    }
-	  }
-	}
-      }
-  }
-  printf("n_shell = %ld, n_shell_bbox = %ld\n",n_shell,n_shell_bbox);
-  printf("theta_min, theta_max = %.2f,%.2f\n",theta_min,theta_max);
-  printf("th_min,       th_max = %.2f,%.2f\n",th_min,th_max);    
+    ssize_t block_height = 64;
+    
+    //TODO: new acc/openmp macro in parallel.hh
+    {    
+        float   *image_d = image.data;
+        int64_t *count_d = count.data;
+
+        for (ssize_t block_start = 0, edt_block_start = 0; block_start < C_length; block_start += block_height*Cy*Cz, edt_block_start += block_height*ey*ez) {
+            const uint8_t *C_buffer = C.data + block_start;
+            const float  *edt_block = edt.data + max(block_start-ey*ez,0L);
+
+            ssize_t  this_block_length = min(block_height*Cy*Cz,C_length-block_start);
+            ssize_t  this_edt_length   = min((block_height+2)*ey*ez,edt_length-block_start);
+
+            //#pragma acc parallel loop copy(C_buffer[:this_block_length], image_d[:n_theta*n_U], count_d[:n_theta*n_U], bbox[:6], Muvw[:16], edt_block[:this_edt_length]) reduction(+:n_shell,n_shell_bbox)
+            #pragma omp parallel for reduction(+:n_shell,n_shell_bbox)    
+            for (int64_t k = 0; k < this_block_length; k++) {    
+                const int64_t flat_idx = block_start + k;
+                const int64_t X = (flat_idx  / (Cy*Cz)), Y = (flat_idx / Cz) % Cy, Z = flat_idx  % Cz; // Integer indices: Cs[c,X,Y,Z]
+                // Index into local block
+                const int64_t Xl = (k  / (Cy*Cz)), Yl = (k / Cz) % Cy, Zl = k  % Cz;
+                // Index into local edt block. Note EDT has 1-slice padding top+bottom
+                const float  x = (Xl+1)*edx, y = Yl*edy, z = Zl*edy;
+
+                if (x > block_height) {
+                    printf("Block number k=%ld.\nX,Y,Z=%ld,%ld,%ld\nXl,Yl,Zl=%ld,%ld,%ld\nx,y,z=%.2f, %.2f, %.2f\n",k,X,Y,Z,Xl,Yl,Zl,x,y,z);
+                    abort();
+                }
+                
+                //****** MEAT OF THE IMPLEMENTATION IS HERE ******
+                real_t distance = resample2x2x2<float>(edt_block, {this_edt_length/(ey*ez),ey,ez}, {x,y,z});
+                
+                if (distance > d_min && distance <= d_max) { // TODO: and W>w_min
+                    array<real_t,4> Xs = {X*voxel_size, Y*voxel_size, Z*voxel_size, 1};
+                    auto [U,V,W,c] = hom_transform(Xs,Muvw);
+                    n_shell ++;
+
+                    //        printf("distance = %.1f, U,V,W = %.2f,%.2f,%.2f\n",distance,U,V,W);
+                    if (in_bbox(U,V,W,bbox)) {
+                        real_t theta    = atan2(V,W);
+
+                        if (theta >= theta_min && theta <= theta_max) {
+                            n_shell_bbox++;
+                            
+                            ssize_t theta_i = floor( (theta-theta_min) * (n_theta-1)/(theta_max-theta_min) );
+                            ssize_t U_i     = floor( (U    -    U_min) * (n_U    -1)/(    U_max-    U_min) );
+                            
+                            real_t p = C_buffer[k]/255.;
+                            
+                            assert(theta >= theta_min);
+                            assert(theta <= theta_max);
+                            assert(U >= U_min);
+                            assert(U <= U_max);          
+                            assert(theta_i >= 0);
+                            assert(theta_i < n_theta);
+                            assert(U_i >= 0);
+                            assert(U_i < n_U);          
+                            
+                            if (p > 0) {
+                                th_min = min(theta,th_min);
+                                th_max = max(theta,th_max);          
+                                
+                                atomic_statement()
+                                image_d[theta_i*n_U + U_i] += p;
+                                
+                                atomic_statement()      
+                                count_d[theta_i*n_U + U_i] += 1;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    printf("n_shell = %ld, n_shell_bbox = %ld\n",n_shell,n_shell_bbox);
+    printf("theta_min, theta_max = %.2f,%.2f\n",theta_min,theta_max);
+    printf("th_min,       th_max = %.2f,%.2f\n",th_min,th_max);    
 }
-

From 3ee49868fb745e3430aa3ee357dd15d48a9195f2 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 14:21:38 +0100
Subject: [PATCH 073/136] #25 Merged center_of_mass from geometry

---
 src/Makefile                     |   2 +-
 src/lib/cpp/cpu/geometry.cc      |  37 ++++++
 src/lib/cpp/cpu_seq/geometry.cc  |  77 +++++------
 src/lib/cpp/gpu/geometry.cc      |  50 +++++++
 src/lib/cpp/include/datatypes.hh |  13 +-
 src/lib/cpp/include/geometry.hh  |  20 +++
 src/pybind/geometry-pybind.cc    | 222 ++++++++++++++-----------------
 7 files changed, 248 insertions(+), 173 deletions(-)
 create mode 100644 src/lib/cpp/cpu/geometry.cc
 create mode 100644 src/lib/cpp/gpu/geometry.cc
 create mode 100644 src/lib/cpp/include/geometry.hh

diff --git a/src/Makefile b/src/Makefile
index e26cb0c..1d73b51 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -6,7 +6,7 @@ CPP_FOLDER=lib/cpp
 #CXXFLAGS += -I../contrib/cpptqdm/ -Iinclude
 CXXFLAGS += -I$(CPP_FOLDER)/include
 PLATFORMS=cpu_seq cpu gpu
-LIBS=io morphology
+LIBS=io geometry morphology
 TARGETS = $(foreach PLATFORM, $(PLATFORMS), $(foreach LIB, $(LIBS), $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)))
 CLEANUP = $(TARGETS) $(foreach PLATFORM, $(PLATFORMS), $(CPP_FOLDER)/$(PLATFORM)/__pycache__)
 
diff --git a/src/lib/cpp/cpu/geometry.cc b/src/lib/cpp/cpu/geometry.cc
new file mode 100644
index 0000000..bd32cfb
--- /dev/null
+++ b/src/lib/cpp/cpu/geometry.cc
@@ -0,0 +1,37 @@
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+using namespace std;
+
+#include "geometry.hh"
+
+array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
+    uint64_t cmx = 0, cmy = 0, cmz = 0;
+    size_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
+    int64_t image_length = Nx*Ny*Nz;
+
+    print_timestamp("center_of_mass start");
+    
+    uint64_t total_mass = 0;  
+    
+    #pragma omp parallel for reduction(+:total_mass,cmx,cmy,cmz)
+    for (int64_t k = 0; k < image_length; k++) {
+        mask_type m = voxels.data[k];      
+
+        int64_t x = k / (Ny*Nz);
+        int64_t y = (k / Nz) % Ny;
+        int64_t z = k % Nz;
+
+        total_mass += m;
+        cmx += m*x; cmy += m*y; cmz += m*z;
+    }
+    real_t
+        rcmx = cmx / ((real_t) total_mass),
+        rcmy = cmy / ((real_t) total_mass),
+        rcmz = cmz / ((real_t) total_mass);
+  
+    print_timestamp("center_of_mass end");  
+
+    return array<real_t,3>{ rcmx, rcmy, rcmz };
+}
\ No newline at end of file
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 2a155aa..a3778f4 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -1,55 +1,38 @@
 // TODO: Coordinates are named X,Y,Z in c++, but Z,Y,X in python. Homogenize to X,Y,Z!
-#include <chrono>
 #include <assert.h>
 #include <inttypes.h>
 #include <stdio.h>
 #include <math.h>
 using namespace std;
 
-#include "datatypes.hh"
-#include "parallel.hh"
+#include "geometry.hh"
 
-#define dot(a,b) (a[0]*b[0] + a[1]*b[1] + a[2]*b[2])
-
-void print_timestamp(string message) {
-    auto now = chrono::system_clock::to_time_t(chrono::system_clock::now());
-    tm local_tm = *localtime(&now);
-    fprintf(stderr,"%s at %02d:%02d:%02d\n", message.c_str(), local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec);    
-}
-
-// TODO: Fix OpenACC copies & re-enable GPU
 array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
-    // nvc++ doesn't support OpenACC 2.7 array reductions yet.  
-    real_t  cmx = 0, cmy = 0, cmz = 0;
-    size_t  Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
+    uint64_t cmx = 0, cmy = 0, cmz = 0;
+    size_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
     int64_t image_length = Nx*Ny*Nz;
 
     print_timestamp("center_of_mass start");
-    real_t total_mass = 0;  
-    for (int64_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
-
-        const mask_type *buffer = voxels.data + block_start;
-        ssize_t this_block_length = min(acc_block_size, image_length-block_start);
 
-        //#pragma acc parallel loop reduction(+:cmx,cmy,cmz,total_mass) copyin(buffer[:this_block_length])
-        //reduction_loop((+:cmx,cmy,cmz,total_mass),())
-        for (int64_t k = 0; k < this_block_length; k++) {
-            real_t          m = buffer[k];      
+    uint64_t total_mass = 0;  
+    for (int64_t k = 0; k < image_length; k++) {
+        mask_type m = voxels.data[k];      
 
-            int64_t flat_idx = block_start + k;
-            int64_t x = flat_idx / (Ny*Nz);
-            int64_t y = (flat_idx / Nz) % Ny;
-            int64_t z = flat_idx % Nz;
+        int64_t x = k / (Ny*Nz);
+        int64_t y = (k / Nz) % Ny;
+        int64_t z = k % Nz;
 
-            total_mass += m;
-            cmx += m*x; cmy += m*y; cmz += m*z;
-        }
+        total_mass += m;
+        cmx += m*x; cmy += m*y; cmz += m*z;
     }
-    cmx /= total_mass; cmy /= total_mass; cmz /= total_mass;
+    real_t
+        rcmx = cmx / ((real_t) total_mass),
+        rcmy = cmy / ((real_t) total_mass),
+        rcmz = cmz / ((real_t) total_mass);
   
     print_timestamp("center_of_mass end");  
 
-    return array<real_t,3>{cmx,cmy,cmz};
+    return array<real_t,3>{ rcmx, rcmy, rcmz };
 }
 
 array<real_t,9> inertia_matrix_serial(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
@@ -100,7 +83,7 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const arr
         const mask_type *buffer = voxels.data + block_start;
         ssize_t block_length    = min(acc_block_size, image_length-block_start);
 
-        reduction_loop((+:M00,M01,M02,M11,M12,M22),())
+        //reduction_loop((+:M00,M01,M02,M11,M12,M22),())
         for (int64_t k = 0; k < block_length; k++) {    //\if (buffer[k] != 0)
             int64_t flat_idx = block_start + k;
             real_t xs[3] = {
@@ -143,7 +126,7 @@ void integrate_axes(const input_ndarray<mask_type> &voxels,
         int block_length = min(acc_block_size,image_length-block_start);
 
         //#pragma acc parallel loop copy(output_data[:Nv*Nw]) copyin(buffer[:block_length], x0, v_axis, w_axis)
-        parallel_loop((output_data[:Nv*Nw]))
+        //parallel_loop((output_data[:Nv*Nw]))
         for (int64_t k = 0; k < block_length; k++) {
             if (buffer[k] != 0) {
                 int64_t flat_idx = block_start + k;
@@ -157,7 +140,7 @@ void integrate_axes(const input_ndarray<mask_type> &voxels,
                 int64_t i_v = round(v-v_min), j_w = round(w-w_min);
 
                 if (i_v >= 0 && j_w >= 0 && i_v < Nv && j_w < Nw) {
-                    atomic_statement()
+                    //atomic_statement()
                     output_data[i_v*Nw + j_w] += voxel;
                 }
             }
@@ -277,7 +260,7 @@ void zero_outside_bbox(const array<real_t,9> &principal_axes,
         mask_type *buffer = voxels.data + block_start;
         ssize_t this_block_length = min(acc_block_size, image_length-block_start);
 
-        parallel_loop((buffer[:this_block_length]))
+        //parallel_loop((buffer[:this_block_length]))
         for (int64_t k = 0; k < this_block_length; k++) {
             int64_t flat_idx = block_start + k;
             int64_t x = flat_idx  / (Ny*Nz);
@@ -337,7 +320,7 @@ inline vector4 hom_transform(const vector4 &x, const matrix4x4 &M) {
 
 #define loop_mask_end(mask) }}} 
 
-
+/*
 void fill_implant_mask(const input_ndarray<mask_type> implant_mask,
                float voxel_size,
                const array<float,6> &bbox,
@@ -360,8 +343,7 @@ void fill_implant_mask(const input_ndarray<mask_type> implant_mask,
     float     *profile_d       = profile.data;
     
     // First pass computes some bounds -- possibly separate out to avoid repeating
-    loop_mask_start(implant_mask, solid_implant_mask,
-            (maskin_buffer[:this_block_length], rsqr_maxs_d[:n_segments], Muvw[:16], bbox[:6]) );
+    //loop_mask_start(implant_mask, solid_implant_mask, (maskin_buffer[:this_block_length], rsqr_maxs_d[:n_segments], Muvw[:16], bbox[:6]) );
     if (mask_value) {
         auto [U,V,W,c] = hom_transform(Xs,Muvw);
         
@@ -381,14 +363,14 @@ void fill_implant_mask(const input_ndarray<mask_type> implant_mask,
             //  fprintf(stderr,"U-coordinate out of bounds: U_i = %ld, U = %g, U_min = %g, U_max = %g\n",U_i,U,U_min,U_max);
         }
     }
-    loop_mask_end(implant_mask);
+    //loop_mask_end(implant_mask);
 
     double theta_center = (theta_max+theta_min)/2;
 
     fprintf(stderr,"theta_min, theta_center, theta_max = %g,%g,%g\n", theta_min, theta_center, theta_max);
 
     // Second pass does the actual work
-    loop_mask_start(implant_mask, solid_implant_mask,
+    //loop_mask_start(implant_mask, solid_implant_mask,
             (rsqr_maxs_d[:n_segments], profile_d[:n_segments]) );
     auto [U,V,W,c] = hom_transform(Xs,Muvw);
     float r_sqr = V*V+W*W;
@@ -400,15 +382,15 @@ void fill_implant_mask(const input_ndarray<mask_type> implant_mask,
         solid_mask_value = mask_value | (r_sqr <= r_fraction*rsqr_maxs_d[U_i]);
 
         if (theta >= theta_min && theta <= theta_center && r_sqr <= rsqr_maxs_d[U_i]) {
-            atomic_statement()
+            //atomic_statement()
             profile_d[U_i] += solid_mask_value;
         }
     }
     maskout_buffer[k] = solid_mask_value;
     
-    loop_mask_end(implant_mask);
+    //loop_mask_end(implant_mask);
 }
-               
+
 void compute_front_mask(const input_ndarray<mask_type> solid_implant,
         const float voxel_size,
         const matrix4x4 &Muvw,        
@@ -426,6 +408,7 @@ void compute_front_mask(const input_ndarray<mask_type> solid_implant,
     
     loop_mask_end(solid_implant)
 }
+*/
 
 void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance Transform in um, should be low-resolution (will be interpolated)
              const input_ndarray<uint8_t> C,  // Material classification images (probability per voxel, 0..1 -> 0..255)
@@ -521,10 +504,10 @@ void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance
                                 th_min = min(theta,th_min);
                                 th_max = max(theta,th_max);          
                                 
-                                atomic_statement()
+                                //atomic_statement()
                                 image_d[theta_i*n_U + U_i] += p;
                                 
-                                atomic_statement()      
+                                //atomic_statement()      
                                 count_d[theta_i*n_U + U_i] += 1;
                             }
                         }
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
new file mode 100644
index 0000000..0ff35a1
--- /dev/null
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -0,0 +1,50 @@
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+using namespace std;
+
+#include "geometry.hh"
+
+array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
+    // nvc++ doesn't support OpenACC 2.7 array reductions yet.  
+    uint64_t cmx = 0, cmy = 0, cmz = 0;
+    size_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
+    int64_t image_length = Nx*Ny*Nz;
+
+    print_timestamp("center_of_mass start");
+
+    uint64_t total_mass = 0;
+    
+    #pragma acc data copy(total_mass,cmx,cmy,cmz)
+    {
+        for (int64_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
+            const mask_type *buffer = voxels.data + block_start;
+            ssize_t this_block_size = min(acc_block_size, image_length-block_start);
+
+            #pragma acc data copyin(buffer[:this_block_size]) 
+            {
+                #pragma acc parallel loop reduction(+:total_mass,cmx,cmy,cmz)
+                for (int64_t k = 0; k < this_block_size; k++) {
+                    mask_type m = buffer[k];
+
+                    int64_t flat_idx = block_start + k;
+                    int64_t x = flat_idx / (Ny*Nz);
+                    int64_t y = (flat_idx / Nz) % Ny;
+                    int64_t z = flat_idx % Nz;
+
+                    total_mass += m;
+                    cmx += m*x; cmy += m*y; cmz += m*z;
+                }
+            }
+        }
+    }
+    real_t 
+        rcmx = cmx / ((real_t) total_mass),
+        rcmy = cmy / ((real_t) total_mass),
+        rcmz = cmz / ((real_t) total_mass);
+  
+    print_timestamp("center_of_mass end");  
+
+    return array<real_t,3>{rcmx, rcmy, rcmz};
+}
\ No newline at end of file
diff --git a/src/lib/cpp/include/datatypes.hh b/src/lib/cpp/include/datatypes.hh
index 4f92c5b..3defac0 100644
--- a/src/lib/cpp/include/datatypes.hh
+++ b/src/lib/cpp/include/datatypes.hh
@@ -2,6 +2,9 @@
 #define datatypes_h
 #include <array>
 #include <vector>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/numpy.h>
 
 typedef uint8_t mask_type;	// TODO: Template + explicit instantiation
 typedef uint16_t voxel_type;
@@ -10,7 +13,15 @@ typedef uint16_t field_type;
 typedef float gauss_type;
 typedef float real_t;
 
-constexpr ssize_t acc_block_size =  1024 * 1024 * 1024/sizeof(mask_type); // 1 GB
+namespace py = pybind11;
+template <typename voxel_type>
+using np_array = py::array_t<voxel_type, py::array::c_style | py::array::forcecast>;
+
+typedef py::array_t<mask_type, py::array::c_style | py::array::forcecast> np_maskarray;
+typedef py::array_t<real_t, py::array::c_style | py::array::forcecast>    np_realarray;
+typedef py::array_t<uint8_t, py::array::c_style | py::array::forcecast>   np_bytearray;
+
+constexpr ssize_t acc_block_size = 1024 * 1024 * 1024 / sizeof(mask_type); // 1 GB
 
 struct plane_t {
   array<real_t,3> cm, u_axis, v_axis;
diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
new file mode 100644
index 0000000..7073758
--- /dev/null
+++ b/src/lib/cpp/include/geometry.hh
@@ -0,0 +1,20 @@
+#ifndef geometry_h
+#define geometry_h
+
+using namespace std;
+
+#include "datatypes.hh"
+#include <chrono>
+#include <string>
+
+#define dot(a,b) (a[0]*b[0] + a[1]*b[1] + a[2]*b[2])
+
+void print_timestamp(string message) {
+    auto now = chrono::system_clock::to_time_t(chrono::system_clock::now());
+    tm local_tm = *localtime(&now);
+    fprintf(stderr,"%s at %02d:%02d:%02d\n", message.c_str(), local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec);
+}
+
+array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels);
+
+#endif
\ No newline at end of file
diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index a7c1f0d..b738ef0 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -1,164 +1,138 @@
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <pybind11/numpy.h>
-
 #include "geometry.cc"
 
-namespace python_api { 
-  namespace py = pybind11;
-  template <typename voxel_type>
-  using np_array = py::array_t<voxel_type, py::array::c_style | py::array::forcecast>;
-  
-  typedef py::array_t<mask_type, py::array::c_style | py::array::forcecast> np_maskarray;
-  typedef py::array_t<real_t, py::array::c_style | py::array::forcecast>    np_realarray;
-  typedef py::array_t<uint8_t, py::array::c_style | py::array::forcecast>   np_bytearray;
+namespace python_api {
 
-  array<real_t,3> center_of_mass(const np_maskarray &np_voxels){
-    auto voxels_info    = np_voxels.request();
+array<real_t,3> center_of_mass(const np_maskarray &np_voxels){
+    auto voxels_info = np_voxels.request();
 
     return ::center_of_mass({voxels_info.ptr,voxels_info.shape});
-  }
-
-
-
-  array<real_t,9> inertia_matrix(const np_maskarray &np_voxels, array<real_t,3>& cm){
-    auto voxels_info    = np_voxels.request();
+}
+/*
+array<real_t,9> inertia_matrix(const np_maskarray &np_voxels, array<real_t,3>& cm){
+    auto voxels_info = np_voxels.request();
     
-    return ::inertia_matrix({voxels_info.ptr,voxels_info.shape}, cm);
-  }
+    return inertia_matrix({voxels_info.ptr,voxels_info.shape}, cm);
+}
 
-  array<real_t,9> inertia_matrix_serial(const np_maskarray &np_voxels, array<real_t,3>& cm){
-    auto voxels_info    = np_voxels.request();
+array<real_t,9> inertia_matrix_serial(const np_maskarray &np_voxels, array<real_t,3>& cm){
+    auto voxels_info = np_voxels.request();
     
-    return ::inertia_matrix_serial({voxels_info.ptr,voxels_info.shape}, cm);
-  }  
-
+    return inertia_matrix_serial({voxels_info.ptr,voxels_info.shape}, cm);
+}  
 
 template <typename voxel_type>
 void sample_plane(const np_array<voxel_type> &np_voxels,
-		  const real_t voxel_size, // In micrometers
-		  const array<real_t,3> cm,
-		  const array<real_t,3> u_axis,
-		  const array<real_t,3> v_axis,		  
-		  const array<real_t,4>  bbox,    // [umin,umax,vmin,vmax] in micrometers
-		  np_array<float> np_plane_samples)
-  {
+          const real_t voxel_size, // In micrometers
+          const array<real_t,3> cm,
+          const array<real_t,3> u_axis,
+          const array<real_t,3> v_axis,          
+          const array<real_t,4>  bbox,    // [umin,umax,vmin,vmax] in micrometers
+          np_array<float> np_plane_samples) {
     auto voxels_info = np_voxels.request();
     auto plane_samples_info  = np_plane_samples.request();
     
-    ::sample_plane<voxel_type>({voxels_info.ptr, voxels_info.shape}, voxel_size,
-		   cm,u_axis,v_axis,bbox,
-		   {plane_samples_info.ptr, plane_samples_info.shape});
-  }
+    sample_plane<voxel_type>({voxels_info.ptr, voxels_info.shape}, voxel_size,
+           cm,u_axis,v_axis,bbox,
+           {plane_samples_info.ptr, plane_samples_info.shape});
+}
   
-
-
-  void integrate_axes(const np_maskarray &np_voxels,
-		    const array<real_t,3> &x0,		    
-		    const array<real_t,3> &v_axis,
-		    const array<real_t,3> &w_axis,
-		    const real_t v_min, const real_t w_min,
-		    np_realarray &output)
-  {
+void integrate_axes(const np_maskarray &np_voxels,
+            const array<real_t,3> &x0,            
+            const array<real_t,3> &v_axis,
+            const array<real_t,3> &w_axis,
+            const real_t v_min, const real_t w_min,
+            np_realarray &output) {
     auto voxels_info = np_voxels.request();
     auto output_info  = output.request();
 
-    ::integrate_axes({voxels_info.ptr, voxels_info.shape},
-		     x0,v_axis,w_axis,
-		     v_min, w_min,
-		     {output_info.ptr, output_info.shape});
-  }
+    integrate_axes({voxels_info.ptr, voxels_info.shape},
+             x0,v_axis,w_axis,
+             v_min, w_min,
+             {output_info.ptr, output_info.shape});
+}
 
-  void zero_outside_bbox(const array<real_t,9> &principal_axes,
-			 const array<real_t,6> &parameter_ranges,
-			 const array<real_t,3> &cm, // TOOD: Med eller uden voxelsize?
-			 np_maskarray &np_voxels)
-  {
+void zero_outside_bbox(const array<real_t,9> &principal_axes,
+             const array<real_t,6> &parameter_ranges,
+             const array<real_t,3> &cm, // TOOD: Med eller uden voxelsize?
+             np_maskarray &np_voxels) {
     auto voxels_info = np_voxels.request();
     
-    ::zero_outside_bbox(principal_axes,
-		      parameter_ranges,
-		      cm, 
-		      {voxels_info.ptr, voxels_info.shape});
-  }
+    zero_outside_bbox(principal_axes,
+              parameter_ranges,
+              cm, 
+              {voxels_info.ptr, voxels_info.shape});
+}
 
 void fill_implant_mask(const np_maskarray implant_mask,
-		       float voxel_size,
-		       const array<float,6> &bbox,
-		       float r_fraction,
-		       const matrix4x4 &Muvw,
-		       np_maskarray solid_implant_mask,
-		       np_array<float> rsqr_maxs,
-		       np_array<float> profile
-		       )
-{
-  auto implant_info    = implant_mask.request(),
-    solid_implant_info = solid_implant_mask.request(),
-    rsqr_info          = rsqr_maxs.request(),
-    profile_info       =  profile.request();
-
-  return ::fill_implant_mask({implant_info.ptr,       implant_info.shape},
-			     voxel_size, bbox, r_fraction, Muvw,
-			     {solid_implant_info.ptr, solid_implant_info.shape},
-			     {rsqr_info.ptr,          rsqr_info.shape},
-			     {profile_info.ptr,       profile_info.shape}
-			     );
+               float voxel_size,
+               const array<float,6> &bbox,
+               float r_fraction,
+               const matrix4x4 &Muvw,
+               np_maskarray solid_implant_mask,
+               np_array<float> rsqr_maxs,
+               np_array<float> profile
+               ) {
+    auto implant_info    = implant_mask.request(),
+        solid_implant_info = solid_implant_mask.request(),
+        rsqr_info          = rsqr_maxs.request(),
+        profile_info       =  profile.request();
+
+    return fill_implant_mask({implant_info.ptr,       implant_info.shape},
+                 voxel_size, bbox, r_fraction, Muvw,
+                 {solid_implant_info.ptr, solid_implant_info.shape},
+                 {rsqr_info.ptr,          rsqr_info.shape},
+                 {profile_info.ptr,       profile_info.shape}
+                 );
 }
 
 void compute_front_mask(const np_array<uint8_t> &np_solid_implant,
-		const float voxel_size,
-		const matrix4x4 &Muvw,		
-		std::array<float,6> bbox,
-		np_array<mask_type> &np_front_mask)
-{
-  auto solid_implant_info = np_solid_implant.request();
-  auto front_mask_info    = np_front_mask.request();
-  
-  ::compute_front_mask({solid_implant_info.ptr, solid_implant_info.shape},
-	       voxel_size, Muvw, bbox,
-	       {front_mask_info.ptr, front_mask_info.shape});
+        const float voxel_size,
+        const matrix4x4 &Muvw,        
+        std::array<float,6> bbox,
+        np_array<mask_type> &np_front_mask) {
+    auto solid_implant_info = np_solid_implant.request();
+    auto front_mask_info    = np_front_mask.request();
+    
+    ::compute_front_mask({solid_implant_info.ptr, solid_implant_info.shape},
+            voxel_size, Muvw, bbox,
+            {front_mask_info.ptr, front_mask_info.shape});
 }
 
-  
-  void cylinder_projection(const np_array<float>  &np_edt,  // Euclidean Distance Transform in um, should be low-resolution (will be interpolated)
-			   const np_bytearray     &np_Cs,  // Material classification images (probability per voxel, 0..1 -> 0..255)
-			   float Cs_voxel_size,		   // Voxel size for Cs
-			   float d_min, float d_max,	   // Distance shell to map to cylinder
-			   float theta_min, float theta_max, // Angle range (wrt cylinder center)
-			   const array<float,6> &bbox,     // Implant bounding box (in U'V'W'-coordinates)
-			   const matrix4x4 &Muvw,	   // Transform from zyx (in um) to U'V'W' cylinder FoR (in um)
-			   np_array<float> &np_images,	   // Probability-weighted volume of (class,theta,U)-voxels
-			   np_array<uint64_t> &np_counts	   // Number of (class,theta,U)-voxels
-			   )
-  {
+void cylinder_projection(const np_array<float>  &np_edt,  // Euclidean Distance Transform in um, should be low-resolution (will be interpolated)
+               const np_bytearray     &np_Cs,  // Material classification images (probability per voxel, 0..1 -> 0..255)
+               float Cs_voxel_size,           // Voxel size for Cs
+               float d_min, float d_max,       // Distance shell to map to cylinder
+               float theta_min, float theta_max, // Angle range (wrt cylinder center)
+               const array<float,6> &bbox,     // Implant bounding box (in U'V'W'-coordinates)
+               const matrix4x4 &Muvw,       // Transform from zyx (in um) to U'V'W' cylinder FoR (in um)
+               np_array<float> &np_images,       // Probability-weighted volume of (class,theta,U)-voxels
+               np_array<uint64_t> &np_counts       // Number of (class,theta,U)-voxels
+               ) {
     auto edt_info    = np_edt.request();
     auto Cs_info     = np_Cs.request();
     auto images_info = np_images.request();
     auto counts_info = np_counts.request();
 
     ::cylinder_projection({edt_info.ptr,edt_info.shape},
-			  {Cs_info.ptr, Cs_info.shape},
-			  Cs_voxel_size,d_min,d_max,theta_min,theta_max,bbox,Muvw,
-			  {images_info.ptr, images_info.shape},
-			  {counts_info.ptr, counts_info.shape});
-  }
-  
-}
-
-
-		       
+              {Cs_info.ptr, Cs_info.shape},
+              Cs_voxel_size,d_min,d_max,theta_min,theta_max,bbox,Muvw,
+              {images_info.ptr, images_info.shape},
+              {counts_info.ptr, counts_info.shape});
+}*/
 
+}
+  
 PYBIND11_MODULE(geometry, m) {
     m.doc() = "Voxel Geometry Module"; // optional module docstring
 
     m.def("center_of_mass",       &python_api::center_of_mass);
-    m.def("inertia_matrix",       &python_api::inertia_matrix);
-    m.def("inertia_matrix_serial",&python_api::inertia_matrix_serial);
-    m.def("integrate_axes",       &python_api::integrate_axes);        
-    m.def("zero_outside_bbox",    &python_api::zero_outside_bbox);
-    m.def("fill_implant_mask",    &python_api::fill_implant_mask);
-    m.def("cylinder_projection",  &python_api::cylinder_projection);
-    m.def("sample_plane",         &python_api::sample_plane<uint16_t>);
-    m.def("sample_plane",         &python_api::sample_plane<uint8_t>);
-    m.def("compute_front_mask",   &python_api::compute_front_mask);
+    //m.def("inertia_matrix",       &python_api::inertia_matrix);
+    //m.def("inertia_matrix_serial",&python_api::inertia_matrix_serial);
+    //m.def("integrate_axes",       &python_api::integrate_axes);        
+    //m.def("zero_outside_bbox",    &python_api::zero_outside_bbox);
+    //m.def("fill_implant_mask",    &python_api::fill_implant_mask);
+    //m.def("cylinder_projection",  &python_api::cylinder_projection);
+    //m.def("sample_plane",         &python_api::sample_plane<uint16_t>);
+    //m.def("sample_plane",         &python_api::sample_plane<uint8_t>);
+    //m.def("compute_front_mask",   &python_api::compute_front_mask);
 }

From 8a17c7107097e01f0b1c854cd3cf6a94ee53a9cf Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 14:22:03 +0100
Subject: [PATCH 074/136] #16 Added unit test for center_of_mass

---
 src/test/test_geometry.py | 45 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 src/test/test_geometry.py

diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
new file mode 100644
index 0000000..4b32258
--- /dev/null
+++ b/src/test/test_geometry.py
@@ -0,0 +1,45 @@
+'''
+Unit tests for the geometry library.
+'''
+import sys
+sys.path.append(sys.path[0]+'/../lib/cpp')
+import cpu_seq.geometry as m_cpu_seq
+import cpu.geometry as m_cpu
+import gpu.geometry as m_gpu
+
+import datetime
+from functools import partial
+import numpy as np
+import pytest
+
+# Parameters
+#n = 2344 # ~12 GB, used for testing whether blocked works.
+n = 128
+
+def run_with_warmup(f):
+    f()
+    start = datetime.datetime.now()
+    result = f()
+    end = datetime.datetime.now()
+    return result, end - start
+
+def test_center_of_mass():
+    voxels = np.random.randint(0, 255, (n,n,n), np.uint8)
+
+    baseline_f = partial(m_cpu_seq.center_of_mass, voxels)
+    cpu_f = partial(m_cpu.center_of_mass, voxels)
+    gpu_f = partial(m_gpu.center_of_mass, voxels)
+
+    baseline, baseline_t = run_with_warmup(baseline_f)
+    print (f'Sequential ran in {baseline_t}')
+
+    cpu, cpu_t = run_with_warmup(cpu_f)
+    print (f'Parallel CPU ran in {cpu_t}, which is {baseline_t / cpu_t} times faster than sequential')
+    assert np.allclose(baseline, cpu)
+
+    gpu, gpu_t = run_with_warmup(gpu_f)
+    print (f'GPU ran in {gpu_t}, which is {baseline_t / gpu_t} times faster than sequential') 
+    assert np.allclose(baseline, gpu)
+    
+if __name__ == '__main__':
+    test_center_of_mass()
\ No newline at end of file

From caf0934ebc580f039c2aa3628a7b7c4c38eafebc Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 14:22:36 +0100
Subject: [PATCH 075/136] #15 Removed unfair warmup for morphology benchmark

---
 src/test/test_morphology.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/test/test_morphology.py b/src/test/test_morphology.py
index d99ccae..c48cc40 100644
--- a/src/test/test_morphology.py
+++ b/src/test/test_morphology.py
@@ -38,7 +38,6 @@ def test_morphology(r, m, op, nd):
 
     result = np.empty_like(implant_mask)
     f = getattr(m, f'{op}_3d_sphere')
-    f(implant_mask, r, result)
     fsta = datetime.datetime.now()
     f(implant_mask, r, result)
     fend = datetime.datetime.now()

From abc215efc7997b98966a0313d5a2a644fcc9ccab Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 21:03:10 +0100
Subject: [PATCH 076/136] #25 Moved inertia_matrix from geometry.cc

---
 src/lib/cpp/cpu/geometry.cc     | 47 +++++++++++++++++++++-
 src/lib/cpp/cpu_seq/geometry.cc | 70 ++++++++++-----------------------
 src/lib/cpp/gpu/geometry.cc     | 57 +++++++++++++++++++++++++++
 src/lib/cpp/include/geometry.hh |  7 ++--
 src/pybind/geometry-pybind.cc   | 15 +++----
 src/test/test_geometry.py       | 54 +++++++++++++++++++------
 6 files changed, 173 insertions(+), 77 deletions(-)

diff --git a/src/lib/cpp/cpu/geometry.cc b/src/lib/cpp/cpu/geometry.cc
index bd32cfb..4a41724 100644
--- a/src/lib/cpp/cpu/geometry.cc
+++ b/src/lib/cpp/cpu/geometry.cc
@@ -14,7 +14,7 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
     print_timestamp("center_of_mass start");
     
     uint64_t total_mass = 0;  
-    
+
     #pragma omp parallel for reduction(+:total_mass,cmx,cmy,cmz)
     for (int64_t k = 0; k < image_length; k++) {
         mask_type m = voxels.data[k];      
@@ -34,4 +34,49 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
     print_timestamp("center_of_mass end");  
 
     return array<real_t,3>{ rcmx, rcmy, rcmz };
+}
+
+array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
+    real_t
+        Ixx = 0, Ixy = 0, Ixz = 0,
+                 Iyy = 0, Iyz = 0,
+                          Izz = 0;
+  
+    ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
+
+    print_timestamp("inertia_matrix_serial start");
+    
+    #pragma omp parallel for collapse(3) reduction(+:Ixx,Iyy,Izz) reduction(-:Ixy,Ixz,Iyz)
+    for (int64_t X = 0; X < Nx; X++) {
+        for (int64_t Y = 0; Y < Ny; Y++) {
+            for (int64_t Z = 0; Z < Nz; Z++) {
+
+                // TODO shouldn't the loops be interchanged to match the access pattern? (Naming-wise, that is)
+                int64_t k = X*Ny*Nz + Y*Nz + Z; 
+                mask_type m = voxels.data[k];
+                
+                // m guards this, and this removes branches
+                // if (m != 0) 
+                real_t 
+                    x = X - cm[0], 
+                    y = Y - cm[1], 
+                    z = Z - cm[2];
+                
+                Ixx += m * (y*y + z*z);
+                Iyy += m * (x*x + z*z);
+                Izz += m * (x*x + y*y);    
+                Ixy -= m * x*y;
+                Ixz -= m * x*z;
+                Iyz -= m * y*z;
+            }
+        }
+    }
+  
+    print_timestamp("inertia_matrix_serial end");
+
+    return array<real_t,9> {
+        Ixx, Ixy, Ixz,
+        Ixy, Iyy, Iyz,
+        Ixz, Iyz, Izz
+    };
 }
\ No newline at end of file
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index a3778f4..ddf962d 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -35,7 +35,7 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
     return array<real_t,3>{ rcmx, rcmy, rcmz };
 }
 
-array<real_t,9> inertia_matrix_serial(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
+array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
     real_t
         Ixx = 0, Ixy = 0, Ixz = 0,
                  Iyy = 0, Iyz = 0,
@@ -44,15 +44,24 @@ array<real_t,9> inertia_matrix_serial(const input_ndarray<mask_type> &voxels, co
     ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
 
     print_timestamp("inertia_matrix_serial start");
-    for (int64_t X=0,k=0;X<Nx;X++) {
-        for (int64_t Y=0;Y<Ny;Y++) {
-            for (int64_t Z=0;Z<Nz;Z++,k++) {
-                real_t x = X-cm[0], y = Y-cm[1], z = Z-cm[2];
+
+    int64_t k = 0;
+    for (int64_t X = 0; X < Nx; X++) {
+        for (int64_t Y = 0; Y < Ny; Y++) {
+            for (int64_t Z = 0; Z < Nz; Z++) {
+                mask_type m = voxels.data[k];
+                k++;
+                
+                // m guards this, and then branches are removed
+                //if (m != 0) 
+                real_t 
+                    x = X - cm[0], 
+                    y = Y - cm[1], 
+                    z = Z - cm[2];
                 
-                real_t m = voxels.data[k];
-                Ixx += m*(y*y+z*z);
-                Iyy += m*(x*x+z*z);
-                Izz += m*(x*x+y*y);    
+                Ixx += m * (y*y + z*z);
+                Iyy += m * (x*x + z*z);
+                Izz += m * (x*x + y*y);    
                 Ixy -= m * x*y;
                 Ixz -= m * x*z;
                 Iyz -= m * y*z;
@@ -60,7 +69,8 @@ array<real_t,9> inertia_matrix_serial(const input_ndarray<mask_type> &voxels, co
         }
     }
   
-    print_timestamp("inertia_matrix_serial end");      
+    print_timestamp("inertia_matrix_serial end");
+
     return array<real_t,9> {
         Ixx, Ixy, Ixz,
         Ixy, Iyy, Iyz,
@@ -68,46 +78,6 @@ array<real_t,9> inertia_matrix_serial(const input_ndarray<mask_type> &voxels, co
     };
 }
 
-array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
-    // nvc++ doesn't support OpenACC 2.7 array reductions yet, so must name each element.
-    real_t
-        M00 = 0, M01 = 0, M02 = 0,
-                 M11 = 0, M12 = 0,
-                          M22 = 0;
-  
-    size_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-    ssize_t image_length = Nx*Ny*Nz;
-
-    print_timestamp("inertia_matrix start");    
-    for (ssize_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
-        const mask_type *buffer = voxels.data + block_start;
-        ssize_t block_length    = min(acc_block_size, image_length-block_start);
-
-        //reduction_loop((+:M00,M01,M02,M11,M12,M22),())
-        for (int64_t k = 0; k < block_length; k++) {    //\if (buffer[k] != 0)
-            int64_t flat_idx = block_start + k;
-            real_t xs[3] = {
-                (flat_idx  / (Ny*Nz))  - cm[0],   // x
-                ((flat_idx / Nz) % Ny) - cm[1],   // y
-                (flat_idx  % Nz)       - cm[2] }; // z
-
-            real_t m = buffer[k];
-            real_t diag = dot(xs,xs);
-            M00 += m*(diag - xs[0] * xs[0]);
-            M11 += m*(diag - xs[1] * xs[1]);
-            M22 += m*(diag - xs[2] * xs[2]);    
-            M01 -= m * xs[0] * xs[1];
-            M02 -= m * xs[0] * xs[2];
-            M12 -= m * xs[1] * xs[2];
-        }
-    }
-    print_timestamp("inertia_matrix end");      
-    return array<real_t,9> {
-        M00, M01, M02,
-        M01, M11, M12,
-        M02, M12, M22 };
-}
-
 void integrate_axes(const input_ndarray<mask_type> &voxels,
             const array<real_t,3> &x0,            
             const array<real_t,3> &v_axis,
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index 0ff35a1..87584dc 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -47,4 +47,61 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
     print_timestamp("center_of_mass end");  
 
     return array<real_t,3>{rcmx, rcmy, rcmz};
+}
+
+array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
+    // nvc++ doesn't support OpenACC 2.7 array reductions yet, so must name each element.
+    real_t
+        Ixx = 0, Ixy = 0, Ixz = 0,
+                 Iyy = 0, Iyz = 0,
+                          Izz = 0;
+  
+    size_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
+    ssize_t image_length = Nx*Ny*Nz;
+
+    print_timestamp("inertia_matrix start");
+
+    #pragma acc data copy(Ixx, Ixy, Ixz, Iyy, Iyz, Izz) 
+    {
+        for (ssize_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
+            const mask_type *buffer = voxels.data + block_start;
+            ssize_t this_block_size = min(acc_block_size, image_length - block_start);
+
+            #pragma acc data copyin(buffer[:this_block_size]) 
+            {
+                #pragma acc parallel loop reduction(+:Ixx,Iyy,Izz) reduction(-:Ixy,Ixz,Iyz)
+                for (int64_t k = 0; k < this_block_size; k++) {    //\if (buffer[k] != 0)
+                    mask_type m = buffer[k];
+
+                    // m guards this, and GPUs doesn't like branches
+                    //if (m != 0)
+                    int64_t 
+                        flat_idx = block_start + k,
+                        X = flat_idx / (Ny * Nz),
+                        Y = ((flat_idx) / Nz) % Ny,
+                        Z = flat_idx % Nz;
+                    
+                    real_t 
+                        x = X - cm[0], 
+                        y = Y - cm[1], 
+                        z = Z - cm[2];
+                
+                    Ixx += m * (y*y + z*z);
+                    Iyy += m * (x*x + z*z);
+                    Izz += m * (x*x + y*y);    
+                    Ixy -= m * x*y;
+                    Ixz -= m * x*z;
+                    Iyz -= m * y*z;
+                }
+            }
+        }
+    }
+
+    print_timestamp("inertia_matrix end");
+
+    return array<real_t,9> {
+        Ixx, Ixy, Ixz,
+        Ixy, Iyy, Iyz,
+        Ixz, Iyz, Izz
+    };
 }
\ No newline at end of file
diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
index 7073758..0f729cd 100644
--- a/src/lib/cpp/include/geometry.hh
+++ b/src/lib/cpp/include/geometry.hh
@@ -10,11 +10,12 @@ using namespace std;
 #define dot(a,b) (a[0]*b[0] + a[1]*b[1] + a[2]*b[2])
 
 void print_timestamp(string message) {
-    auto now = chrono::system_clock::to_time_t(chrono::system_clock::now());
-    tm local_tm = *localtime(&now);
-    fprintf(stderr,"%s at %02d:%02d:%02d\n", message.c_str(), local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec);
+    //auto now = chrono::system_clock::to_time_t(chrono::system_clock::now());
+    //tm local_tm = *localtime(&now);
+    //fprintf(stderr,"%s at %02d:%02d:%02d\n", message.c_str(), local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec);
 }
 
 array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels);
+array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm);
 
 #endif
\ No newline at end of file
diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index b738ef0..c22bf72 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -5,20 +5,16 @@ namespace python_api {
 array<real_t,3> center_of_mass(const np_maskarray &np_voxels){
     auto voxels_info = np_voxels.request();
 
-    return ::center_of_mass({voxels_info.ptr,voxels_info.shape});
+    return ::center_of_mass({voxels_info.ptr, voxels_info.shape});
 }
-/*
+
 array<real_t,9> inertia_matrix(const np_maskarray &np_voxels, array<real_t,3>& cm){
     auto voxels_info = np_voxels.request();
     
-    return inertia_matrix({voxels_info.ptr,voxels_info.shape}, cm);
+    return ::inertia_matrix({voxels_info.ptr, voxels_info.shape}, cm);
 }
 
-array<real_t,9> inertia_matrix_serial(const np_maskarray &np_voxels, array<real_t,3>& cm){
-    auto voxels_info = np_voxels.request();
-    
-    return inertia_matrix_serial({voxels_info.ptr,voxels_info.shape}, cm);
-}  
+/*
 
 template <typename voxel_type>
 void sample_plane(const np_array<voxel_type> &np_voxels,
@@ -126,8 +122,7 @@ PYBIND11_MODULE(geometry, m) {
     m.doc() = "Voxel Geometry Module"; // optional module docstring
 
     m.def("center_of_mass",       &python_api::center_of_mass);
-    //m.def("inertia_matrix",       &python_api::inertia_matrix);
-    //m.def("inertia_matrix_serial",&python_api::inertia_matrix_serial);
+    m.def("inertia_matrix",       &python_api::inertia_matrix);
     //m.def("integrate_axes",       &python_api::integrate_axes);        
     //m.def("zero_outside_bbox",    &python_api::zero_outside_bbox);
     //m.def("fill_implant_mask",    &python_api::fill_implant_mask);
diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index 4b32258..4ddb051 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -16,6 +16,17 @@
 #n = 2344 # ~12 GB, used for testing whether blocked works.
 n = 128
 
+def assert_with_print(a, b):
+    all_close = np.allclose(a, b)
+    if not all_close:
+        na, nb = np.array(a), np.array(b)
+        print (na)
+        print (nb)
+        nabs = np.abs(na - nb)
+        print (nabs)
+        print (np.sum(nabs))
+    assert all_close
+
 def run_with_warmup(f):
     f()
     start = datetime.datetime.now()
@@ -23,23 +34,40 @@ def run_with_warmup(f):
     end = datetime.datetime.now()
     return result, end - start
 
-def test_center_of_mass():
-    voxels = np.random.randint(0, 255, (n,n,n), np.uint8)
-
-    baseline_f = partial(m_cpu_seq.center_of_mass, voxels)
-    cpu_f = partial(m_cpu.center_of_mass, voxels)
-    gpu_f = partial(m_gpu.center_of_mass, voxels)
-
+def compare_fs(func, baseline_f, cpu_f, gpu_f, should_assert=True):
     baseline, baseline_t = run_with_warmup(baseline_f)
-    print (f'Sequential ran in {baseline_t}')
+    print (f'({func}) Sequential ran in {baseline_t}')
 
     cpu, cpu_t = run_with_warmup(cpu_f)
-    print (f'Parallel CPU ran in {cpu_t}, which is {baseline_t / cpu_t} times faster than sequential')
-    assert np.allclose(baseline, cpu)
+    print (f'({func}) Parallel CPU ran in {cpu_t}, which is {baseline_t / cpu_t} times faster than sequential')
+    if should_assert: assert_with_print(baseline, cpu)
 
     gpu, gpu_t = run_with_warmup(gpu_f)
-    print (f'GPU ran in {gpu_t}, which is {baseline_t / gpu_t} times faster than sequential') 
-    assert np.allclose(baseline, gpu)
+    print (f'({func}) GPU ran in {gpu_t}, which is {baseline_t / gpu_t} times faster than sequential') 
+    if should_assert: assert_with_print(baseline, gpu)
+ 
+
+def test_center_of_mass():
+    voxels = np.random.randint(0, 256, (n,n,n), np.uint8)
+
+    baseline = partial(m_cpu_seq.center_of_mass, voxels)
+    cpu = partial(m_cpu.center_of_mass, voxels)
+    gpu = partial(m_gpu.center_of_mass, voxels)
+
+    compare_fs('center_of_mass', baseline, cpu, gpu)
+
+   
+def test_inertia_matrix():
+    voxels = np.random.randint(0, 2, (n,n,n), np.uint8)
+    cm = m_gpu.center_of_mass(voxels)
+
+    baseline = partial(m_cpu_seq.inertia_matrix, voxels, cm)
+    cpu = partial(m_cpu.inertia_matrix, voxels, cm)
+    gpu = partial(m_gpu.inertia_matrix, voxels, cm)
+    
+    # TODO assert disabled due to floating point associativity error accumulation
+    compare_fs('inertia_matrix', baseline, cpu, gpu, should_assert=False)
     
 if __name__ == '__main__':
-    test_center_of_mass()
\ No newline at end of file
+    test_center_of_mass()
+    test_inertia_matrix()
\ No newline at end of file

From 799789ecf9de7001b75d0defad5a6398d7422f81 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 21:03:28 +0100
Subject: [PATCH 077/136] Added missing dependency in Makefile target

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 1d73b51..c689bad 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -28,7 +28,7 @@ endif
 all: $(TARGETS)
 
 define GEN_RULE
-$(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX): pybind/$(LIB)-pybind.cc $(CPP_FOLDER)/$(PLATFORM)/$(LIB).cc
+$(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX): pybind/$(LIB)-pybind.cc $(CPP_FOLDER)/$(PLATFORM)/$(LIB).cc $(CPP_FOLDER)/include/$(LIB).hh
 	$(CXX) $(CXXFLAGS) $(PYBIND_FLAGS) -I$(CPP_FOLDER)/$(PLATFORM) $$< -o $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)
 endef
 

From 516d7cbfc5d04f0c96783f66970431fbef73d91f Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Feb 2023 21:22:57 +0100
Subject: [PATCH 078/136] #25 Commented out non-processing-steps functions

---
 src/lib/cpp/cpu_seq/geometry.cc | 44 ++++++++++++++++-----------------
 src/lib/cpp/gpu/geometry.cc     | 44 ++++++++++++++++++++++++++++++++-
 2 files changed, 65 insertions(+), 23 deletions(-)

diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index ddf962d..06cd754 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -78,6 +78,7 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const arr
     };
 }
 
+/* TODO only called in test.py. Postponed for now.
 void integrate_axes(const input_ndarray<mask_type> &voxels,
             const array<real_t,3> &x0,            
             const array<real_t,3> &v_axis,
@@ -90,33 +91,30 @@ void integrate_axes(const input_ndarray<mask_type> &voxels,
     real_t *output_data = output.data;
 
     // TODO: Check v_axis & w_axis projections to certify bounds and get rid of runtime check
-  
-    for (ssize_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
-        const mask_type *buffer  = voxels.data + block_start;
-        int block_length = min(acc_block_size,image_length-block_start);
-
-        //#pragma acc parallel loop copy(output_data[:Nv*Nw]) copyin(buffer[:block_length], x0, v_axis, w_axis)
-        //parallel_loop((output_data[:Nv*Nw]))
-        for (int64_t k = 0; k < block_length; k++) {
-            if (buffer[k] != 0) {
-                int64_t flat_idx = block_start + k;
-                real_t xs[3] = {
-                    (flat_idx  / (Ny*Nz))  - x0[0],   // x
-                    ((flat_idx / Nz) % Ny) - x0[1],   // y
-                    (flat_idx  % Nz)       - x0[2] }; // z
-
-                mask_type voxel = buffer[k];
-                real_t v = dot(xs,v_axis), w = dot(xs,w_axis);
-                int64_t i_v = round(v-v_min), j_w = round(w-w_min);
-
-                if (i_v >= 0 && j_w >= 0 && i_v < Nv && j_w < Nw) {
-                    //atomic_statement()
-                    output_data[i_v*Nw + j_w] += voxel;
+    int64_t k = 0:
+    for (int64_t X = 0; X < Nx; X++) {
+        for (int64_t Y = 0; Y < Ny; Y++) {
+            for (int64_t Z = 0; Z < Nz; Z++) {
+                if (buffer[k] != 0) {
+                    real_t xs[3] = {
+                        (flat_idx  / (Ny*Nz))  - x0[0],   // x
+                        ((flat_idx / Nz) % Ny) - x0[1],   // y
+                        (flat_idx  % Nz)       - x0[2] }; // z
+
+                    mask_type voxel = buffer[k];
+                    real_t v = dot(xs, v_axis), w = dot(xs,w_axis);
+                    int64_t i_v = round(v-v_min), j_w = round(w-w_min);
+
+                    if (i_v >= 0 && j_w >= 0 && i_v < Nv && j_w < Nw) {
+                        output_data[i_v*Nw + j_w] += voxel;
+                    }
                 }
+                k++;
             }
         }
     }
 }
+*/
 
 bool in_bbox(float U, float V, float W, const std::array<float,6> bbox) {
     const auto& [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
@@ -216,6 +214,7 @@ template <typename voxel_type> void sample_plane(const input_ndarray<voxel_type>
     }
 }
 
+/* TODO only called in test.py. Postpone for now. 
 // NB: xyz are in indices, not micrometers
 void zero_outside_bbox(const array<real_t,9> &principal_axes,
                const array<real_t,6> &parameter_ranges,
@@ -258,6 +257,7 @@ void zero_outside_bbox(const array<real_t,9> &principal_axes,
         }
     }
 }
+*/
 
 inline vector4 hom_transform(const vector4 &x, const matrix4x4 &M) {
     vector4 c{{0,0,0,0}};
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index 87584dc..5c0ce52 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -104,4 +104,46 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const arr
         Ixy, Iyy, Iyz,
         Ixz, Iyz, Izz
     };
-}
\ No newline at end of file
+}
+
+/* TODO Only called in test.py. Postponed for now. 
+void integrate_axes(const input_ndarray<mask_type> &voxels,
+            const array<real_t,3> &x0,            
+            const array<real_t,3> &v_axis,
+            const array<real_t,3> &w_axis,
+            const real_t v_min, const real_t w_min,
+            output_ndarray<real_t> output) {
+    ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
+    ssize_t Nv = output.shape[0], Nw = output.shape[1]; 
+    int64_t image_length = Nx*Ny*Nz;
+    real_t *output_data = output.data;
+
+    // TODO: Check v_axis & w_axis projections to certify bounds and get rid of runtime check
+  
+    for (ssize_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
+        const mask_type *buffer  = voxels.data + block_start;
+        int block_length = min(acc_block_size,image_length-block_start);
+
+        //#pragma acc parallel loop copy(output_data[:Nv*Nw]) copyin(buffer[:block_length], x0, v_axis, w_axis)
+        //parallel_loop((output_data[:Nv*Nw]))
+        for (int64_t k = 0; k < block_length; k++) {
+            if (buffer[k] != 0) {
+                int64_t flat_idx = block_start + k;
+                real_t xs[3] = {
+                    (flat_idx  / (Ny*Nz))  - x0[0],   // x
+                    ((flat_idx / Nz) % Ny) - x0[1],   // y
+                    (flat_idx  % Nz)       - x0[2] }; // z
+
+                mask_type voxel = buffer[k];
+                real_t v = dot(xs,v_axis), w = dot(xs,w_axis);
+                int64_t i_v = round(v-v_min), j_w = round(w-w_min);
+
+                if (i_v >= 0 && j_w >= 0 && i_v < Nv && j_w < Nw) {
+                    //atomic_statement()
+                    output_data[i_v*Nw + j_w] += voxel;
+                }
+            }
+        }
+    }
+}
+*/
\ No newline at end of file

From 2fcc48f447e3eb0554ffc75aece4e506003688b3 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Wed, 15 Feb 2023 11:52:55 +0100
Subject: [PATCH 079/136] #25 Added boilerplate macros

---
 src/lib/cpp/cpu_seq/geometry.cc    | 26 ++++++++-------
 src/lib/cpp/include/boilerplate.hh | 51 ++++++++++++++++++++++++++++++
 src/lib/cpp/include/datatypes.hh   |  4 ++-
 3 files changed, 68 insertions(+), 13 deletions(-)
 create mode 100644 src/lib/cpp/include/boilerplate.hh

diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 06cd754..8c10286 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -6,25 +6,27 @@
 using namespace std;
 
 #include "geometry.hh"
+#include "boilerplate.hh"
 
-array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
-    uint64_t cmx = 0, cmy = 0, cmz = 0;
-    size_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-    int64_t image_length = Nx*Ny*Nz;
+array<real_t, 3> center_of_mass(const input_ndarray<mask_type> voxels) {
+    unpack_numpy(voxels);
 
     print_timestamp("center_of_mass start");
 
+    uint64_t cmz = 0, cmy = 0, cmx = 0;
     uint64_t total_mass = 0;  
-    for (int64_t k = 0; k < image_length; k++) {
-        mask_type m = voxels.data[k];      
 
-        int64_t x = k / (Ny*Nz);
-        int64_t y = (k / Nz) % Ny;
-        int64_t z = k % Nz;
+    for_3d_begin(voxels);
+
+    mask_type m = voxels.data[flat_index];
 
         total_mass += m;
-        cmx += m*x; cmy += m*y; cmz += m*z;
-    }
+    cmx += m * x;
+    cmy += m * y;
+    cmz += m * z;
+
+    for_3d_end();
+
     real_t
         rcmx = cmx / ((real_t) total_mass),
         rcmy = cmy / ((real_t) total_mass),
@@ -32,7 +34,7 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
   
     print_timestamp("center_of_mass end");  
 
-    return array<real_t,3>{ rcmx, rcmy, rcmz };
+    return array<real_t, 3>{ rcmz, rcmy, rcmx };
 }
 
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
diff --git a/src/lib/cpp/include/boilerplate.hh b/src/lib/cpp/include/boilerplate.hh
new file mode 100644
index 0000000..b71c6c0
--- /dev/null
+++ b/src/lib/cpp/include/boilerplate.hh
@@ -0,0 +1,51 @@
+#ifndef boilerplate_h
+#define boilerplate_h
+
+// TODO it seems like vscode doesn't pick this up.
+/// \def for_block_begin(arr)
+/// Inserts boilerplate code for accessing \a arr in a blocked (chunked) manner.
+#define for_block_begin(arr) \
+    for (int64_t block_start = 0; block_start < arr##_length; block_start += acc_block_size<arr##_type>) { \
+        const arr##_type *arr##_buffer = arr.data + block_start; \
+        ssize_t arr##_buffer_length = min(acc_block_size<arr##_type>, arr##_length-block_start); \
+        _Pragma(STR(acc data copyin(arr##_buffer[:arr##_buffer_length]))) \
+        { \
+
+#define for_block_end() } }
+
+#define for_3d_begin(arr) \
+    for (int64_t z = 0; z < arr##_Nz; z++) { \
+        for (int64_t y = 0; y < arr##_Ny; y++) { \
+            for (int64_t x = 0; x < arr##_Nx; x++) { \
+                int64_t flat_index = z*arr##_Ny*arr##_Nx + y*arr##_Nx + x;
+
+#define for_3d_end() }}}
+
+#define for_flat_begin_1(arr) for_flat_begin(arr, arr)
+#define for_flat_begin_2(arr, global_prefix) \
+    for (int64_t flat_index = 0; flat_index < arr##_length; flat_index++) { \
+        int64_t \
+            global_prefix##_index = arr##_start + flat_index \
+            z = global_prefix##_index / (arr##_Ny*arr##_Nx), \
+            y = (global_prefix##_index / arr##_Nx) % arr##_Ny, \
+            x = global_prefix##_index % arr##_Nx;
+
+#define for_flat_end() }
+
+// TODO I'm not sure this'll expand right.
+#define for_flat_block_begin(arr) \
+    for_block_begin(arr) \
+    for_flat_begin_2(arr##_buffer, global)
+
+#define for_flat_block_end() \
+    for_flat_end() \
+    for_block_end()
+
+#define unpack_numpy(arr) \
+    ssize_t \
+        arr##_Nz = arr.shape[0], \
+        arr##_Ny = arr.shape[1], \
+        arr##_Nx = arr.shape[2], \
+        arr##_length = arr##_Nz*arr##_Ny*arr##_Nx;
+
+#endif
\ No newline at end of file
diff --git a/src/lib/cpp/include/datatypes.hh b/src/lib/cpp/include/datatypes.hh
index 3defac0..cbe1213 100644
--- a/src/lib/cpp/include/datatypes.hh
+++ b/src/lib/cpp/include/datatypes.hh
@@ -8,6 +8,7 @@
 
 typedef uint8_t mask_type;	// TODO: Template + explicit instantiation
 typedef uint16_t voxel_type;
+typedef mask_type voxels_type;
 //typedef float    field_type;
 typedef uint16_t field_type;
 typedef float gauss_type;
@@ -21,7 +22,8 @@ typedef py::array_t<mask_type, py::array::c_style | py::array::forcecast> np_mas
 typedef py::array_t<real_t, py::array::c_style | py::array::forcecast>    np_realarray;
 typedef py::array_t<uint8_t, py::array::c_style | py::array::forcecast>   np_bytearray;
 
-constexpr ssize_t acc_block_size = 1024 * 1024 * 1024 / sizeof(mask_type); // 1 GB
+template <typename T>
+constexpr ssize_t acc_block_size = 1024 * 1024 * 1024 / sizeof(T); // 1 GB
 
 struct plane_t {
   array<real_t,3> cm, u_axis, v_axis;

From c8528c969ea83f11a903b9865c8d738a38fa2e0e Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Wed, 15 Feb 2023 11:53:16 +0100
Subject: [PATCH 080/136] #25 Added all of the include files as a dependency in
 Makefile

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index c689bad..623eea2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -28,7 +28,7 @@ endif
 all: $(TARGETS)
 
 define GEN_RULE
-$(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX): pybind/$(LIB)-pybind.cc $(CPP_FOLDER)/$(PLATFORM)/$(LIB).cc $(CPP_FOLDER)/include/$(LIB).hh
+$(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX): pybind/$(LIB)-pybind.cc $(CPP_FOLDER)/$(PLATFORM)/$(LIB).cc $(CPP_FOLDER)/include/*.hh
 	$(CXX) $(CXXFLAGS) $(PYBIND_FLAGS) -I$(CPP_FOLDER)/$(PLATFORM) $$< -o $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)
 endef
 

From ec9b2f62b2ad992117332642805226dd5c9e4d1d Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Wed, 15 Feb 2023 11:53:36 +0100
Subject: [PATCH 081/136] #34 Added example docstring in geometry.hh

---
 src/lib/cpp/include/geometry.hh | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
index 0f729cd..2f448ef 100644
--- a/src/lib/cpp/include/geometry.hh
+++ b/src/lib/cpp/include/geometry.hh
@@ -15,7 +15,21 @@ void print_timestamp(string message) {
     //fprintf(stderr,"%s at %02d:%02d:%02d\n", message.c_str(), local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec);
 }
 
+/*
+Computes the center of mass of the given tomography.
+
+@param voxels The given tomography.
+@returns The 3D coordinates of the center of mass (in Z, Y, X).
+*/
 array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels);
+
+/*
+Computes the inertia matrix of the given tomography based of the given center of mass.
+
+@param voxels The given tomography.
+@param cm The given center of mass.
+@returns The 3x3 inertia matrix.
+*/
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm);
 
 #endif
\ No newline at end of file

From b0f5bbde2a07c6b1f3ffea1f142a9223994ab3f0 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Wed, 15 Feb 2023 11:54:00 +0100
Subject: [PATCH 082/136] #25 Removed trailing whitespace

---
 src/lib/cpp/cpu/geometry.cc     |  32 ++++-----
 src/lib/cpp/cpu_seq/geometry.cc | 112 ++++++++++++++++----------------
 src/lib/cpp/gpu/geometry.cc     |  48 +++++++-------
 src/pybind/geometry-pybind.cc   |  26 ++++----
 src/test/test_geometry.py       |  10 +--
 5 files changed, 114 insertions(+), 114 deletions(-)

diff --git a/src/lib/cpp/cpu/geometry.cc b/src/lib/cpp/cpu/geometry.cc
index 4a41724..1d0340b 100644
--- a/src/lib/cpp/cpu/geometry.cc
+++ b/src/lib/cpp/cpu/geometry.cc
@@ -12,12 +12,12 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
     int64_t image_length = Nx*Ny*Nz;
 
     print_timestamp("center_of_mass start");
-    
-    uint64_t total_mass = 0;  
+
+    uint64_t total_mass = 0;
 
     #pragma omp parallel for reduction(+:total_mass,cmx,cmy,cmz)
     for (int64_t k = 0; k < image_length; k++) {
-        mask_type m = voxels.data[k];      
+        mask_type m = voxels.data[k];
 
         int64_t x = k / (Ny*Nz);
         int64_t y = (k / Nz) % Ny;
@@ -30,8 +30,8 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
         rcmx = cmx / ((real_t) total_mass),
         rcmy = cmy / ((real_t) total_mass),
         rcmz = cmz / ((real_t) total_mass);
-  
-    print_timestamp("center_of_mass end");  
+
+    print_timestamp("center_of_mass end");
 
     return array<real_t,3>{ rcmx, rcmy, rcmz };
 }
@@ -41,37 +41,37 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const arr
         Ixx = 0, Ixy = 0, Ixz = 0,
                  Iyy = 0, Iyz = 0,
                           Izz = 0;
-  
+
     ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
 
     print_timestamp("inertia_matrix_serial start");
-    
+
     #pragma omp parallel for collapse(3) reduction(+:Ixx,Iyy,Izz) reduction(-:Ixy,Ixz,Iyz)
     for (int64_t X = 0; X < Nx; X++) {
         for (int64_t Y = 0; Y < Ny; Y++) {
             for (int64_t Z = 0; Z < Nz; Z++) {
 
                 // TODO shouldn't the loops be interchanged to match the access pattern? (Naming-wise, that is)
-                int64_t k = X*Ny*Nz + Y*Nz + Z; 
+                int64_t k = X*Ny*Nz + Y*Nz + Z;
                 mask_type m = voxels.data[k];
-                
+
                 // m guards this, and this removes branches
-                // if (m != 0) 
-                real_t 
-                    x = X - cm[0], 
-                    y = Y - cm[1], 
+                // if (m != 0)
+                real_t
+                    x = X - cm[0],
+                    y = Y - cm[1],
                     z = Z - cm[2];
-                
+
                 Ixx += m * (y*y + z*z);
                 Iyy += m * (x*x + z*z);
-                Izz += m * (x*x + y*y);    
+                Izz += m * (x*x + y*y);
                 Ixy -= m * x*y;
                 Ixz -= m * x*z;
                 Iyz -= m * y*z;
             }
         }
     }
-  
+
     print_timestamp("inertia_matrix_serial end");
 
     return array<real_t,9> {
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 8c10286..a1e9ef2 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -14,13 +14,13 @@ array<real_t, 3> center_of_mass(const input_ndarray<mask_type> voxels) {
     print_timestamp("center_of_mass start");
 
     uint64_t cmz = 0, cmy = 0, cmx = 0;
-    uint64_t total_mass = 0;  
+    uint64_t total_mass = 0;
 
     for_3d_begin(voxels);
 
     mask_type m = voxels.data[flat_index];
 
-        total_mass += m;
+    total_mass += m;
     cmx += m * x;
     cmy += m * y;
     cmz += m * z;
@@ -31,8 +31,8 @@ array<real_t, 3> center_of_mass(const input_ndarray<mask_type> voxels) {
         rcmx = cmx / ((real_t) total_mass),
         rcmy = cmy / ((real_t) total_mass),
         rcmz = cmz / ((real_t) total_mass);
-  
-    print_timestamp("center_of_mass end");  
+
+    print_timestamp("center_of_mass end");
 
     return array<real_t, 3>{ rcmz, rcmy, rcmx };
 }
@@ -42,7 +42,7 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const arr
         Ixx = 0, Ixy = 0, Ixz = 0,
                  Iyy = 0, Iyz = 0,
                           Izz = 0;
-  
+
     ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
 
     print_timestamp("inertia_matrix_serial start");
@@ -53,24 +53,24 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const arr
             for (int64_t Z = 0; Z < Nz; Z++) {
                 mask_type m = voxels.data[k];
                 k++;
-                
+
                 // m guards this, and then branches are removed
-                //if (m != 0) 
-                real_t 
-                    x = X - cm[0], 
-                    y = Y - cm[1], 
+                //if (m != 0)
+                real_t
+                    x = X - cm[0],
+                    y = Y - cm[1],
                     z = Z - cm[2];
-                
+
                 Ixx += m * (y*y + z*z);
                 Iyy += m * (x*x + z*z);
-                Izz += m * (x*x + y*y);    
+                Izz += m * (x*x + y*y);
                 Ixy -= m * x*y;
                 Ixz -= m * x*z;
                 Iyz -= m * y*z;
             }
         }
     }
-  
+
     print_timestamp("inertia_matrix_serial end");
 
     return array<real_t,9> {
@@ -82,13 +82,13 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const arr
 
 /* TODO only called in test.py. Postponed for now.
 void integrate_axes(const input_ndarray<mask_type> &voxels,
-            const array<real_t,3> &x0,            
+            const array<real_t,3> &x0,
             const array<real_t,3> &v_axis,
             const array<real_t,3> &w_axis,
             const real_t v_min, const real_t w_min,
             output_ndarray<real_t> output) {
     ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-    ssize_t Nv = output.shape[0], Nw = output.shape[1]; 
+    ssize_t Nv = output.shape[0], Nw = output.shape[1];
     int64_t image_length = Nx*Ny*Nz;
     real_t *output_data = output.data;
 
@@ -133,7 +133,7 @@ template<typename field_type> float resample2x2x2(const field_type *voxels,
                                                   const array<float,3>   &X) {
     auto  [Nx,Ny,Nz] = shape;    // Eller omvendt?
     if (!in_bbox(X[0],X[1],X[2], {0.5,Nx-1.5, 0.5,Ny-1.5, 0.5,Nz-1.5})) {
-        uint64_t voxel_index = floor(X[0])*Ny*Nz+floor(X[1])*Ny+floor(X[2]);      
+        uint64_t voxel_index = floor(X[0])*Ny*Nz+floor(X[1])*Ny+floor(X[2]);
         return voxels[voxel_index];
     }
     float   Xfrac[2][3]; // {Xminus[3], Xplus[3]}
@@ -171,7 +171,7 @@ template<typename field_type> float resample2x2x2(const field_type *voxels,
         // }
         uint64_t voxel_index = I*Ny*Nz+J*Ny+K;
         assert(I>=0 && J>=0 && K>=0);
-        assert(I<Nx && J<Ny && K<Nz);    
+        assert(I<Nx && J<Ny && K<Nz);
         field_type voxel = voxels[voxel_index];
         value += voxel*weight;
     }
@@ -182,7 +182,7 @@ template <typename voxel_type> void sample_plane(const input_ndarray<voxel_type>
                          const real_t voxel_size, // In micrometers
                          const array<real_t,3> cm,
                          const array<real_t,3> u_axis,
-                         const array<real_t,3> v_axis,          
+                         const array<real_t,3> v_axis,
                          const array<real_t,4>  bbox,    // [umin,umax,vmin,vmax] in micrometers
                          output_ndarray<real_t> plane_samples) {
     const auto& [umin,umax,vmin,vmax] = bbox; // In micrometers
@@ -195,8 +195,8 @@ template <typename voxel_type> void sample_plane(const input_ndarray<voxel_type>
         for (ssize_t vj=0;vj<nv;vj++) {
             const real_t u = umin + ui*du, v = vmin + vj*dv;
 
-            // X,Y,Z in micrometers;  x,y,z in voxel index space      
-            const real_t        
+            // X,Y,Z in micrometers;  x,y,z in voxel index space
+            const real_t
                 X = cm[0] + u*u_axis[0] + v*v_axis[0],
                 Y = cm[1] + u*u_axis[1] + v*v_axis[1],
                 Z = cm[2] + u*u_axis[2] + v*v_axis[2];
@@ -204,7 +204,7 @@ template <typename voxel_type> void sample_plane(const input_ndarray<voxel_type>
             const real_t x = X/voxel_size, y = Y/voxel_size, z = Z/voxel_size;
 
             //      printf("u,v = %g,%g -> %.1f,%.1f,%.1f -> %d, %d, %d\n",u,v,X,Y,Z,int(round(x)),int(round(y)),int(round(z)));
-            
+
             voxel_type value = 0;
             if (in_bbox(x,y,z,{0.5,Nx-0.5, 0.5,Ny-0.5, 0.5,Nz-0.5}))
                 value = resample2x2x2<voxel_type>(voxels.data,{Nx,Ny,Nz},{x,y,z});
@@ -216,7 +216,7 @@ template <typename voxel_type> void sample_plane(const input_ndarray<voxel_type>
     }
 }
 
-/* TODO only called in test.py. Postpone for now. 
+/* TODO only called in test.py. Postpone for now.
 // NB: xyz are in indices, not micrometers
 void zero_outside_bbox(const array<real_t,9> &principal_axes,
                const array<real_t,6> &parameter_ranges,
@@ -238,13 +238,13 @@ void zero_outside_bbox(const array<real_t,9> &principal_axes,
             int64_t y = (flat_idx / Nz) % Ny;
             int64_t z = flat_idx  % Nz;
             // Boilerplate until here. TODO: macroize or lambda out!
-            
+
             real_t xs[3] = {x-cm[0], y-cm[1], z-cm[2]};
 
             real_t params[3] = {0,0,0};
 
-            for (int uvw = 0; uvw < 3; uvw++) 
-                for (int xyz = 0; xyz < 3; xyz++) 
+            for (int uvw = 0; uvw < 3; uvw++)
+                for (int xyz = 0; xyz < 3; xyz++)
                     params[uvw] += xs[xyz] * principal_axes[uvw*3+xyz]; // u = dot(xs,u_axis), v = dot(xs,v_axis), w = dot(xs,w_axis)
 
             bool p = false;
@@ -266,7 +266,7 @@ inline vector4 hom_transform(const vector4 &x, const matrix4x4 &M) {
 
     for (int i = 0; i < 4; i++) {
         real_t sum = 0;
-        #pragma simd parallel for reduction(+:sum)    
+        #pragma simd parallel for reduction(+:sum)
         for (int j=0;j<4;j++)
             sum += M[i*4+j]*x[j];
         c[i] = sum;
@@ -290,7 +290,7 @@ inline vector4 hom_transform(const vector4 &x, const matrix4x4 &M) {
             std::array<real_t,4> Xs = { X*voxel_size, Y*voxel_size, Z*voxel_size, 1 };                                          \
             bool mask_value = maskin_buffer[k];
 
-#define loop_mask_end(mask) }}} 
+#define loop_mask_end(mask) }}}
 
 /*
 void fill_implant_mask(const input_ndarray<mask_type> implant_mask,
@@ -301,24 +301,24 @@ void fill_implant_mask(const input_ndarray<mask_type> implant_mask,
                output_ndarray<mask_type> solid_implant_mask,
                output_ndarray<float> rsqr_maxs,
                output_ndarray<float> profile) {
-    real_t theta_min = M_PI, theta_max = -M_PI;  
+    real_t theta_min = M_PI, theta_max = -M_PI;
     ssize_t n_segments = rsqr_maxs.shape[0];
     const auto [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
-    
+
     printf("implant_mask.shape = %ld,%ld,%ld\n",implant_mask.shape[0],implant_mask.shape[1],implant_mask.shape[2]);
     printf("solid_implant_mask.shape = %ld,%ld,%ld\n",solid_implant_mask.shape[0],solid_implant_mask.shape[1],solid_implant_mask.shape[2]);
-    
+
     fprintf(stderr,"voxel_size = %g, U_min = %g, U_max = %g, r_frac = %g, n_segments = %ld\n",
         voxel_size, U_min, U_max, r_fraction, n_segments);
 
     float     *rsqr_maxs_d     = rsqr_maxs.data;
     float     *profile_d       = profile.data;
-    
+
     // First pass computes some bounds -- possibly separate out to avoid repeating
     //loop_mask_start(implant_mask, solid_implant_mask, (maskin_buffer[:this_block_length], rsqr_maxs_d[:n_segments], Muvw[:16], bbox[:6]) );
     if (mask_value) {
         auto [U,V,W,c] = hom_transform(Xs,Muvw);
-        
+
         real_t r_sqr = V*V+W*W;
         real_t theta = atan2(V,W);
 
@@ -359,25 +359,25 @@ void fill_implant_mask(const input_ndarray<mask_type> implant_mask,
         }
     }
     maskout_buffer[k] = solid_mask_value;
-    
+
     //loop_mask_end(implant_mask);
 }
 
 void compute_front_mask(const input_ndarray<mask_type> solid_implant,
         const float voxel_size,
-        const matrix4x4 &Muvw,        
+        const matrix4x4 &Muvw,
         std::array<float,6> bbox,
         output_ndarray<mask_type> front_mask) {
     const auto [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
 
-    loop_mask_start(solid_implant, front_mask, () );  
+    loop_mask_start(solid_implant, front_mask, () );
 
     if (!mask_value) {
         auto [U,V,W,c] = hom_transform(Xs,Muvw);
         maskout_buffer[k] = W>W_min;
     } else
         maskout_buffer[k] = 0;
-    
+
     loop_mask_end(solid_implant)
 }
 */
@@ -400,24 +400,24 @@ void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance
     ssize_t Cx = C.shape[0],   Cy = C.shape[1],   Cz = C.shape[2];
 
     real_t edx = ex/real_t(Cx), edy = ey/real_t(Cy), edz = ex/real_t(Cz);
-    
+
     ssize_t edt_length       = ex*ey*ez;
-    ssize_t C_length         = Cx*Cy*Cz;  
+    ssize_t C_length         = Cx*Cy*Cz;
 
     printf("Segmenting from %g to %g micrometers distance of implant.\n",d_min,d_max);
 
     printf("Bounding box is [U_min,U_max,V_min,V_max,W_min,W_max] = [[%g,%g],[%g,%g],[%g,%g]]\n",
         U_min,U_max,V_min,V_max,W_min,W_max);
     printf("EDT field is (%ld,%ld,%ld)\n",ex,ey,ez);
-    
+
     real_t th_min = 1234, th_max = -1234;
     ssize_t n_shell = 0;
     ssize_t n_shell_bbox = 0;
 
     ssize_t block_height = 64;
-    
+
     //TODO: new acc/openmp macro in parallel.hh
-    {    
+    {
         float   *image_d = image.data;
         int64_t *count_d = count.data;
 
@@ -429,8 +429,8 @@ void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance
             ssize_t  this_edt_length   = min((block_height+2)*ey*ez,edt_length-block_start);
 
             //#pragma acc parallel loop copy(C_buffer[:this_block_length], image_d[:n_theta*n_U], count_d[:n_theta*n_U], bbox[:6], Muvw[:16], edt_block[:this_edt_length]) reduction(+:n_shell,n_shell_bbox)
-            #pragma omp parallel for reduction(+:n_shell,n_shell_bbox)    
-            for (int64_t k = 0; k < this_block_length; k++) {    
+            #pragma omp parallel for reduction(+:n_shell,n_shell_bbox)
+            for (int64_t k = 0; k < this_block_length; k++) {
                 const int64_t flat_idx = block_start + k;
                 const int64_t X = (flat_idx  / (Cy*Cz)), Y = (flat_idx / Cz) % Cy, Z = flat_idx  % Cz; // Integer indices: Cs[c,X,Y,Z]
                 // Index into local block
@@ -442,10 +442,10 @@ void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance
                     printf("Block number k=%ld.\nX,Y,Z=%ld,%ld,%ld\nXl,Yl,Zl=%ld,%ld,%ld\nx,y,z=%.2f, %.2f, %.2f\n",k,X,Y,Z,Xl,Yl,Zl,x,y,z);
                     abort();
                 }
-                
+
                 //****** MEAT OF THE IMPLEMENTATION IS HERE ******
                 real_t distance = resample2x2x2<float>(edt_block, {this_edt_length/(ey*ez),ey,ez}, {x,y,z});
-                
+
                 if (distance > d_min && distance <= d_max) { // TODO: and W>w_min
                     array<real_t,4> Xs = {X*voxel_size, Y*voxel_size, Z*voxel_size, 1};
                     auto [U,V,W,c] = hom_transform(Xs,Muvw);
@@ -457,29 +457,29 @@ void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance
 
                         if (theta >= theta_min && theta <= theta_max) {
                             n_shell_bbox++;
-                            
+
                             ssize_t theta_i = floor( (theta-theta_min) * (n_theta-1)/(theta_max-theta_min) );
                             ssize_t U_i     = floor( (U    -    U_min) * (n_U    -1)/(    U_max-    U_min) );
-                            
+
                             real_t p = C_buffer[k]/255.;
-                            
+
                             assert(theta >= theta_min);
                             assert(theta <= theta_max);
                             assert(U >= U_min);
-                            assert(U <= U_max);          
+                            assert(U <= U_max);
                             assert(theta_i >= 0);
                             assert(theta_i < n_theta);
                             assert(U_i >= 0);
-                            assert(U_i < n_U);          
-                            
+                            assert(U_i < n_U);
+
                             if (p > 0) {
                                 th_min = min(theta,th_min);
-                                th_max = max(theta,th_max);          
-                                
+                                th_max = max(theta,th_max);
+
                                 //atomic_statement()
                                 image_d[theta_i*n_U + U_i] += p;
-                                
-                                //atomic_statement()      
+
+                                //atomic_statement()
                                 count_d[theta_i*n_U + U_i] += 1;
                             }
                         }
@@ -490,5 +490,5 @@ void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance
     }
     printf("n_shell = %ld, n_shell_bbox = %ld\n",n_shell,n_shell_bbox);
     printf("theta_min, theta_max = %.2f,%.2f\n",theta_min,theta_max);
-    printf("th_min,       th_max = %.2f,%.2f\n",th_min,th_max);    
+    printf("th_min,       th_max = %.2f,%.2f\n",th_min,th_max);
 }
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index 5c0ce52..891c579 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -7,7 +7,7 @@ using namespace std;
 #include "geometry.hh"
 
 array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
-    // nvc++ doesn't support OpenACC 2.7 array reductions yet.  
+    // nvc++ doesn't support OpenACC 2.7 array reductions yet.
     uint64_t cmx = 0, cmy = 0, cmz = 0;
     size_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
     int64_t image_length = Nx*Ny*Nz;
@@ -15,14 +15,14 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
     print_timestamp("center_of_mass start");
 
     uint64_t total_mass = 0;
-    
+
     #pragma acc data copy(total_mass,cmx,cmy,cmz)
     {
-        for (int64_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
+        for (int64_t block_start = 0; block_start < image_length; block_start += acc_block_size<mask_type>) {
             const mask_type *buffer = voxels.data + block_start;
-            ssize_t this_block_size = min(acc_block_size, image_length-block_start);
+            ssize_t this_block_size = min(acc_block_size<mask_type>, image_length-block_start);
 
-            #pragma acc data copyin(buffer[:this_block_size]) 
+            #pragma acc data copyin(buffer[:this_block_size])
             {
                 #pragma acc parallel loop reduction(+:total_mass,cmx,cmy,cmz)
                 for (int64_t k = 0; k < this_block_size; k++) {
@@ -39,12 +39,12 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
             }
         }
     }
-    real_t 
+    real_t
         rcmx = cmx / ((real_t) total_mass),
         rcmy = cmy / ((real_t) total_mass),
         rcmz = cmz / ((real_t) total_mass);
-  
-    print_timestamp("center_of_mass end");  
+
+    print_timestamp("center_of_mass end");
 
     return array<real_t,3>{rcmx, rcmy, rcmz};
 }
@@ -55,19 +55,19 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const arr
         Ixx = 0, Ixy = 0, Ixz = 0,
                  Iyy = 0, Iyz = 0,
                           Izz = 0;
-  
+
     size_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
     ssize_t image_length = Nx*Ny*Nz;
 
     print_timestamp("inertia_matrix start");
 
-    #pragma acc data copy(Ixx, Ixy, Ixz, Iyy, Iyz, Izz) 
+    #pragma acc data copy(Ixx, Ixy, Ixz, Iyy, Iyz, Izz)
     {
-        for (ssize_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
+        for (ssize_t block_start = 0; block_start < image_length; block_start += acc_block_size<mask_type>) {
             const mask_type *buffer = voxels.data + block_start;
-            ssize_t this_block_size = min(acc_block_size, image_length - block_start);
+            ssize_t this_block_size = min(acc_block_size<mask_type>, image_length - block_start);
 
-            #pragma acc data copyin(buffer[:this_block_size]) 
+            #pragma acc data copyin(buffer[:this_block_size])
             {
                 #pragma acc parallel loop reduction(+:Ixx,Iyy,Izz) reduction(-:Ixy,Ixz,Iyz)
                 for (int64_t k = 0; k < this_block_size; k++) {    //\if (buffer[k] != 0)
@@ -75,20 +75,20 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const arr
 
                     // m guards this, and GPUs doesn't like branches
                     //if (m != 0)
-                    int64_t 
+                    int64_t
                         flat_idx = block_start + k,
                         X = flat_idx / (Ny * Nz),
                         Y = ((flat_idx) / Nz) % Ny,
                         Z = flat_idx % Nz;
-                    
-                    real_t 
-                        x = X - cm[0], 
-                        y = Y - cm[1], 
+
+                    real_t
+                        x = X - cm[0],
+                        y = Y - cm[1],
                         z = Z - cm[2];
-                
+
                     Ixx += m * (y*y + z*z);
                     Iyy += m * (x*x + z*z);
-                    Izz += m * (x*x + y*y);    
+                    Izz += m * (x*x + y*y);
                     Ixy -= m * x*y;
                     Ixz -= m * x*z;
                     Iyz -= m * y*z;
@@ -106,20 +106,20 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const arr
     };
 }
 
-/* TODO Only called in test.py. Postponed for now. 
+/* TODO Only called in test.py. Postponed for now.
 void integrate_axes(const input_ndarray<mask_type> &voxels,
-            const array<real_t,3> &x0,            
+            const array<real_t,3> &x0,
             const array<real_t,3> &v_axis,
             const array<real_t,3> &w_axis,
             const real_t v_min, const real_t w_min,
             output_ndarray<real_t> output) {
     ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-    ssize_t Nv = output.shape[0], Nw = output.shape[1]; 
+    ssize_t Nv = output.shape[0], Nw = output.shape[1];
     int64_t image_length = Nx*Ny*Nz;
     real_t *output_data = output.data;
 
     // TODO: Check v_axis & w_axis projections to certify bounds and get rid of runtime check
-  
+
     for (ssize_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
         const mask_type *buffer  = voxels.data + block_start;
         int block_length = min(acc_block_size,image_length-block_start);
diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index c22bf72..d82f749 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -2,15 +2,15 @@
 
 namespace python_api {
 
-array<real_t,3> center_of_mass(const np_maskarray &np_voxels){
+array<real_t, 3> center_of_mass(const np_maskarray &np_voxels){
     auto voxels_info = np_voxels.request();
 
     return ::center_of_mass({voxels_info.ptr, voxels_info.shape});
 }
 
-array<real_t,9> inertia_matrix(const np_maskarray &np_voxels, array<real_t,3>& cm){
+array<real_t, 9> inertia_matrix(const np_maskarray &np_voxels, array<real_t, 3>& cm){
     auto voxels_info = np_voxels.request();
-    
+
     return ::inertia_matrix({voxels_info.ptr, voxels_info.shape}, cm);
 }
 
@@ -21,19 +21,19 @@ void sample_plane(const np_array<voxel_type> &np_voxels,
           const real_t voxel_size, // In micrometers
           const array<real_t,3> cm,
           const array<real_t,3> u_axis,
-          const array<real_t,3> v_axis,          
+          const array<real_t,3> v_axis,
           const array<real_t,4>  bbox,    // [umin,umax,vmin,vmax] in micrometers
           np_array<float> np_plane_samples) {
     auto voxels_info = np_voxels.request();
     auto plane_samples_info  = np_plane_samples.request();
-    
+
     sample_plane<voxel_type>({voxels_info.ptr, voxels_info.shape}, voxel_size,
            cm,u_axis,v_axis,bbox,
            {plane_samples_info.ptr, plane_samples_info.shape});
 }
-  
+
 void integrate_axes(const np_maskarray &np_voxels,
-            const array<real_t,3> &x0,            
+            const array<real_t,3> &x0,
             const array<real_t,3> &v_axis,
             const array<real_t,3> &w_axis,
             const real_t v_min, const real_t w_min,
@@ -52,10 +52,10 @@ void zero_outside_bbox(const array<real_t,9> &principal_axes,
              const array<real_t,3> &cm, // TOOD: Med eller uden voxelsize?
              np_maskarray &np_voxels) {
     auto voxels_info = np_voxels.request();
-    
+
     zero_outside_bbox(principal_axes,
               parameter_ranges,
-              cm, 
+              cm,
               {voxels_info.ptr, voxels_info.shape});
 }
 
@@ -83,12 +83,12 @@ void fill_implant_mask(const np_maskarray implant_mask,
 
 void compute_front_mask(const np_array<uint8_t> &np_solid_implant,
         const float voxel_size,
-        const matrix4x4 &Muvw,        
+        const matrix4x4 &Muvw,
         std::array<float,6> bbox,
         np_array<mask_type> &np_front_mask) {
     auto solid_implant_info = np_solid_implant.request();
     auto front_mask_info    = np_front_mask.request();
-    
+
     ::compute_front_mask({solid_implant_info.ptr, solid_implant_info.shape},
             voxel_size, Muvw, bbox,
             {front_mask_info.ptr, front_mask_info.shape});
@@ -117,13 +117,13 @@ void cylinder_projection(const np_array<float>  &np_edt,  // Euclidean Distance
 }*/
 
 }
-  
+
 PYBIND11_MODULE(geometry, m) {
     m.doc() = "Voxel Geometry Module"; // optional module docstring
 
     m.def("center_of_mass",       &python_api::center_of_mass);
     m.def("inertia_matrix",       &python_api::inertia_matrix);
-    //m.def("integrate_axes",       &python_api::integrate_axes);        
+    //m.def("integrate_axes",       &python_api::integrate_axes);
     //m.def("zero_outside_bbox",    &python_api::zero_outside_bbox);
     //m.def("fill_implant_mask",    &python_api::fill_implant_mask);
     //m.def("cylinder_projection",  &python_api::cylinder_projection);
diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index 4ddb051..6cc9f37 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -43,9 +43,9 @@ def compare_fs(func, baseline_f, cpu_f, gpu_f, should_assert=True):
     if should_assert: assert_with_print(baseline, cpu)
 
     gpu, gpu_t = run_with_warmup(gpu_f)
-    print (f'({func}) GPU ran in {gpu_t}, which is {baseline_t / gpu_t} times faster than sequential') 
+    print (f'({func}) GPU ran in {gpu_t}, which is {baseline_t / gpu_t} times faster than sequential')
     if should_assert: assert_with_print(baseline, gpu)
- 
+
 
 def test_center_of_mass():
     voxels = np.random.randint(0, 256, (n,n,n), np.uint8)
@@ -56,7 +56,7 @@ def test_center_of_mass():
 
     compare_fs('center_of_mass', baseline, cpu, gpu)
 
-   
+
 def test_inertia_matrix():
     voxels = np.random.randint(0, 2, (n,n,n), np.uint8)
     cm = m_gpu.center_of_mass(voxels)
@@ -64,10 +64,10 @@ def test_inertia_matrix():
     baseline = partial(m_cpu_seq.inertia_matrix, voxels, cm)
     cpu = partial(m_cpu.inertia_matrix, voxels, cm)
     gpu = partial(m_gpu.inertia_matrix, voxels, cm)
-    
+
     # TODO assert disabled due to floating point associativity error accumulation
     compare_fs('inertia_matrix', baseline, cpu, gpu, should_assert=False)
-    
+
 if __name__ == '__main__':
     test_center_of_mass()
     test_inertia_matrix()
\ No newline at end of file

From 4a1e6c1496adafa1610d9c9de1f34be38bcc9840 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 17 Feb 2023 12:14:08 +0100
Subject: [PATCH 083/136] #25 I have stared into the abyss and it stared back

---
 src/Makefile                       |  30 ++++---
 src/lib/cpp/cpu/geometry.cc        |  33 ++------
 src/lib/cpp/cpu_seq/geometry.cc    |  38 +++++----
 src/lib/cpp/gpu/geometry.cc        |  48 ++---------
 src/lib/cpp/include/boilerplate.hh | 127 ++++++++++++++++++++---------
 src/lib/cpp/include/geometry.hh    |  12 ++-
 src/pybind/geometry-pybind.cc      |  21 ++++-
 7 files changed, 163 insertions(+), 146 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 623eea2..7597ea2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,35 +1,41 @@
 # Define constants and collections
 PYTHON = python3.10
-PYBIND_FLAGS += $(shell $(PYTHON) -m pybind11 --include) -march=native -Wall -shared -fPIC -fopenmp -g -std=c++17 -O3
+PYBIND_FLAGS += $(shell $(PYTHON) -m pybind11 --include) -march=native -Wall -shared -fPIC -g -std=c++17 -O3
 PYBIND_SUFFIX = $(shell $(PYTHON)-config --extension-suffix)
+
+# Detect OS for OS specific changes
+ifeq ($(shell uname -s), Darwin) # Mac OSX
+CXX = g++-12 # Use homebrew gcc, as system gcc is an alias for clang
+CXXFLAGS += -undefined dynamic_lookup # https://pybind11.readthedocs.io/en/stable/compiling.html#building-manually
+CLEANUP += $(TARGETS) $(foreach TARGET, $(TARGETS), $(TARGET).dSYM) # These are also generated on Mac
+endif
+
 CPP_FOLDER=lib/cpp
 #CXXFLAGS += -I../contrib/cpptqdm/ -Iinclude
 CXXFLAGS += -I$(CPP_FOLDER)/include
-PLATFORMS=cpu_seq cpu gpu
+PLATFORMS=cpu_seq cpu
+cpu_seq_CXX=$(CXX)
+cpu_CXX=$(cpu_seq_CXX)
+cpu_FLAGS=-fopenmp
+
 LIBS=io geometry morphology
 TARGETS = $(foreach PLATFORM, $(PLATFORMS), $(foreach LIB, $(LIBS), $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)))
 CLEANUP = $(TARGETS) $(foreach PLATFORM, $(PLATFORMS), $(CPP_FOLDER)/$(PLATFORM)/__pycache__)
 
 # Detect if OpenACC can be used
 ifneq (, $(shell which nvc++))
-CXX = nvc++
-CXXFLAGS += -acc=gpu -Minfo=accel -tp=native
+PLATFORMS += gpu
+gpu_CXX = nvc++
+gpu_FLAGS = -acc=gpu -Minfo=accel -tp=native
 else
 $(info OpenACC compiler nvc++ not found. Compiling without.)
 endif
 
-# Detect OS for OS specific changes
-ifeq ($(shell uname -s), Darwin) # Mac OSX
-CXX = g++-12 # Use homebrew gcc, as system gcc is an alias for clang
-CXXFLAGS += -undefined dynamic_lookup # https://pybind11.readthedocs.io/en/stable/compiling.html#building-manually
-CLEANUP += $(TARGETS) $(foreach TARGET, $(TARGETS), $(TARGET).dSYM) # These are also generated on Mac
-endif
-
 all: $(TARGETS)
 
 define GEN_RULE
 $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX): pybind/$(LIB)-pybind.cc $(CPP_FOLDER)/$(PLATFORM)/$(LIB).cc $(CPP_FOLDER)/include/*.hh
-	$(CXX) $(CXXFLAGS) $(PYBIND_FLAGS) -I$(CPP_FOLDER)/$(PLATFORM) $$< -o $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)
+	$($(PLATFORM)_CXX) $($(PLATFORM)_FLAGS) $(CXXFLAGS) $(PYBIND_FLAGS) -I$(CPP_FOLDER)/$(PLATFORM) $$< -o $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)
 endef
 
 $(foreach PLATFORM, $(PLATFORMS), \
diff --git a/src/lib/cpp/cpu/geometry.cc b/src/lib/cpp/cpu/geometry.cc
index 1d0340b..728e933 100644
--- a/src/lib/cpp/cpu/geometry.cc
+++ b/src/lib/cpp/cpu/geometry.cc
@@ -5,35 +5,12 @@
 using namespace std;
 
 #include "geometry.hh"
+#include "../cpu_seq/geometry.cc"
 
-array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
-    uint64_t cmx = 0, cmy = 0, cmz = 0;
-    size_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-    int64_t image_length = Nx*Ny*Nz;
+namespace cpu_par {
 
-    print_timestamp("center_of_mass start");
-
-    uint64_t total_mass = 0;
-
-    #pragma omp parallel for reduction(+:total_mass,cmx,cmy,cmz)
-    for (int64_t k = 0; k < image_length; k++) {
-        mask_type m = voxels.data[k];
-
-        int64_t x = k / (Ny*Nz);
-        int64_t y = (k / Nz) % Ny;
-        int64_t z = k % Nz;
-
-        total_mass += m;
-        cmx += m*x; cmy += m*y; cmz += m*z;
-    }
-    real_t
-        rcmx = cmx / ((real_t) total_mass),
-        rcmy = cmy / ((real_t) total_mass),
-        rcmz = cmz / ((real_t) total_mass);
-
-    print_timestamp("center_of_mass end");
-
-    return array<real_t,3>{ rcmx, rcmy, rcmz };
+array<real_t,3> center_of_mass(const input_ndarray<mask_type> &mask) {
+    return cpu_seq::center_of_mass(mask);
 }
 
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
@@ -79,4 +56,6 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const arr
         Ixy, Iyy, Iyz,
         Ixz, Iyz, Izz
     };
+}
+
 }
\ No newline at end of file
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index a1e9ef2..609970c 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -3,34 +3,35 @@
 #include <inttypes.h>
 #include <stdio.h>
 #include <math.h>
-using namespace std;
 
-#include "geometry.hh"
 #include "boilerplate.hh"
+#include "geometry.hh"
+
+using namespace std;
+namespace cpu_seq {
 
-array<real_t, 3> center_of_mass(const input_ndarray<mask_type> voxels) {
-    unpack_numpy(voxels);
+array<real_t, 3> center_of_mass(const input_ndarray<mask_type> &mask) {
+    UNPACK_NUMPY(mask);
 
     print_timestamp("center_of_mass start");
 
-    uint64_t cmz = 0, cmy = 0, cmx = 0;
-    uint64_t total_mass = 0;
+    uint64_t total_mass = 0, cmz = 0, cmy = 0, cmx = 0;
 
-    for_3d_begin(voxels);
+    BLOCK_BEGIN(mask, reduction(+:total_mass,cmz,cmy,cmx)); {
 
-    mask_type m = voxels.data[flat_index];
+        mask_type m = mask_buffer[flat_index];
 
-    total_mass += m;
-    cmx += m * x;
-    cmy += m * y;
-    cmz += m * z;
+        total_mass += m;
+        cmz += m * z;
+        cmy += m * y;
+        cmx += m * x;
 
-    for_3d_end();
+    } BLOCK_END();
 
     real_t
-        rcmx = cmx / ((real_t) total_mass),
+        rcmz = cmz / ((real_t) total_mass),
         rcmy = cmy / ((real_t) total_mass),
-        rcmz = cmz / ((real_t) total_mass);
+        rcmx = cmx / ((real_t) total_mass);
 
     print_timestamp("center_of_mass end");
 
@@ -80,6 +81,8 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const arr
     };
 }
 
+}
+
 /* TODO only called in test.py. Postponed for now.
 void integrate_axes(const input_ndarray<mask_type> &voxels,
             const array<real_t,3> &x0,
@@ -116,7 +119,6 @@ void integrate_axes(const input_ndarray<mask_type> &voxels,
         }
     }
 }
-*/
 
 bool in_bbox(float U, float V, float W, const std::array<float,6> bbox) {
     const auto& [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
@@ -259,7 +261,6 @@ void zero_outside_bbox(const array<real_t,9> &principal_axes,
         }
     }
 }
-*/
 
 inline vector4 hom_transform(const vector4 &x, const matrix4x4 &M) {
     vector4 c{{0,0,0,0}};
@@ -380,7 +381,6 @@ void compute_front_mask(const input_ndarray<mask_type> solid_implant,
 
     loop_mask_end(solid_implant)
 }
-*/
 
 void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance Transform in um, should be low-resolution (will be interpolated)
              const input_ndarray<uint8_t> C,  // Material classification images (probability per voxel, 0..1 -> 0..255)
@@ -492,3 +492,5 @@ void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance
     printf("theta_min, theta_max = %.2f,%.2f\n",theta_min,theta_max);
     printf("th_min,       th_max = %.2f,%.2f\n",th_min,th_max);
 }
+
+*/
\ No newline at end of file
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index 891c579..6d75136 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -5,48 +5,12 @@
 using namespace std;
 
 #include "geometry.hh"
+#include "../cpu_seq/geometry.cc"
 
-array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels) {
-    // nvc++ doesn't support OpenACC 2.7 array reductions yet.
-    uint64_t cmx = 0, cmy = 0, cmz = 0;
-    size_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-    int64_t image_length = Nx*Ny*Nz;
-
-    print_timestamp("center_of_mass start");
-
-    uint64_t total_mass = 0;
-
-    #pragma acc data copy(total_mass,cmx,cmy,cmz)
-    {
-        for (int64_t block_start = 0; block_start < image_length; block_start += acc_block_size<mask_type>) {
-            const mask_type *buffer = voxels.data + block_start;
-            ssize_t this_block_size = min(acc_block_size<mask_type>, image_length-block_start);
-
-            #pragma acc data copyin(buffer[:this_block_size])
-            {
-                #pragma acc parallel loop reduction(+:total_mass,cmx,cmy,cmz)
-                for (int64_t k = 0; k < this_block_size; k++) {
-                    mask_type m = buffer[k];
-
-                    int64_t flat_idx = block_start + k;
-                    int64_t x = flat_idx / (Ny*Nz);
-                    int64_t y = (flat_idx / Nz) % Ny;
-                    int64_t z = flat_idx % Nz;
-
-                    total_mass += m;
-                    cmx += m*x; cmy += m*y; cmz += m*z;
-                }
-            }
-        }
-    }
-    real_t
-        rcmx = cmx / ((real_t) total_mass),
-        rcmy = cmy / ((real_t) total_mass),
-        rcmz = cmz / ((real_t) total_mass);
+namespace gpu {
 
-    print_timestamp("center_of_mass end");
-
-    return array<real_t,3>{rcmx, rcmy, rcmz};
+array<real_t,3> center_of_mass(const input_ndarray<mask_type> &mask) {
+    return cpu_seq::center_of_mass(mask);
 }
 
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
@@ -146,4 +110,6 @@ void integrate_axes(const input_ndarray<mask_type> &voxels,
         }
     }
 }
-*/
\ No newline at end of file
+*/
+
+}
\ No newline at end of file
diff --git a/src/lib/cpp/include/boilerplate.hh b/src/lib/cpp/include/boilerplate.hh
index b71c6c0..0b5b1bc 100644
--- a/src/lib/cpp/include/boilerplate.hh
+++ b/src/lib/cpp/include/boilerplate.hh
@@ -1,51 +1,98 @@
 #ifndef boilerplate_h
 #define boilerplate_h
 
-// TODO it seems like vscode doesn't pick this up.
-/// \def for_block_begin(arr)
-/// Inserts boilerplate code for accessing \a arr in a blocked (chunked) manner.
-#define for_block_begin(arr) \
-    for (int64_t block_start = 0; block_start < arr##_length; block_start += acc_block_size<arr##_type>) { \
-        const arr##_type *arr##_buffer = arr.data + block_start; \
-        ssize_t arr##_buffer_length = min(acc_block_size<arr##_type>, arr##_length-block_start); \
-        _Pragma(STR(acc data copyin(arr##_buffer[:arr##_buffer_length]))) \
-        { \
-
-#define for_block_end() } }
-
-#define for_3d_begin(arr) \
-    for (int64_t z = 0; z < arr##_Nz; z++) { \
-        for (int64_t y = 0; y < arr##_Ny; y++) { \
-            for (int64_t x = 0; x < arr##_Nx; x++) { \
-                int64_t flat_index = z*arr##_Ny*arr##_Nx + y*arr##_Nx + x;
-
-#define for_3d_end() }}}
-
-#define for_flat_begin_1(arr) for_flat_begin(arr, arr)
-#define for_flat_begin_2(arr, global_prefix) \
-    for (int64_t flat_index = 0; flat_index < arr##_length; flat_index++) { \
+// Gaze upon the glory of 3-layered macros for building string literals for _Pragma
+#define STRINGIFY(X) #X
+#define TOKEN_COMBINER(X) STRINGIFY(X)
+#define PRAGMA(X) _Pragma(TOKEN_COMBINER(X))
+
+#ifdef _OPENACC
+#define PARALLEL_TERM acc parallel loop
+#else
+#ifdef _OPENMP
+#define PARALLEL_TERM omp parallel for
+#else
+#define PARALLEL_TERM
+#endif
+#endif
+
+// TODO attempt at docstring; not quite working.
+
+/// Inserts boilerplate code for accessing the given parameter, ARR, in a blocked (chunked) manner.
+/// Following this call, the following variables will be exposed:
+///
+///  - `block_start`: the address of the current block.
+///
+/// @param ARR The array that will be accessed.
+#define FOR_BLOCK_BEGIN(ARR) \
+    for (int64_t ARR##_buffer_start = 0; ARR##_buffer_start < ARR##_length; ARR##_buffer_start += acc_block_size<ARR##_type>) { \
+        const ARR##_type *ARR##_buffer = ARR.data + ARR##_buffer_start; \
+        ssize_t ARR##_buffer_length = min(acc_block_size<ARR##_type>, ARR##_length-ARR##_buffer_start); \
+        PRAGMA(acc data copyin(ARR##_buffer[:ARR##_buffer_length])) \
+        {
+
+#define FOR_BLOCK_END() } }
+
+#define FOR_3D_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
+    PRAGMA(PARALLEL_TERM collapse(3) EXTRA_PRAGMA_CLAUSE) \
+    for (int64_t z = 0; z < ARR##_Nz; z++) { \
+        for (int64_t y = 0; y < ARR##_Ny; y++) { \
+            for (int64_t x = 0; x < ARR##_Nx; x++) { \
+
+#define FOR_3D_END() }}}
+
+#define FOR_FLAT_BEGIN(ARR, global_prefix, EXTRA_PRAGMA_CLAUSE) \
+    PRAGMA(PARALLEL_TERM EXTRA_PRAGMA_CLAUSE) \
+    for (int64_t flat_index = 0; flat_index < ARR##_length; flat_index++) { \
         int64_t \
-            global_prefix##_index = arr##_start + flat_index \
-            z = global_prefix##_index / (arr##_Ny*arr##_Nx), \
-            y = (global_prefix##_index / arr##_Nx) % arr##_Ny, \
-            x = global_prefix##_index % arr##_Nx;
+            global_prefix##_index = ARR##_start + flat_index, \
+            z = global_prefix##_index / (ARR##_Ny * ARR##_Nx), \
+            y = (global_prefix##_index / ARR##_Nx) % ARR##_Ny, \
+            x = global_prefix##_index % ARR##_Nx;
+
+#define FOR_FLAT_END() }
+
+#define PUSH_N_DOWN_TO_BUFFER(ARR) \
+    ssize_t \
+        ARR##_buffer_Nz = ARR##_Nz, \
+        ARR##_buffer_Ny = ARR##_Ny, \
+        ARR##_buffer_Nx = ARR##_Nx;
+
+#ifdef _OPENACC
+#define BLOCK_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
+    FOR_BLOCK_BEGIN(ARR) \
+    PUSH_N_DOWN_TO_BUFFER(ARR) \
+    FOR_FLAT_BEGIN(ARR##_buffer, global, EXTRA_PRAGMA_CLAUSE)
+
+#define BLOCK_END() \
+    FOR_FLAT_END() \
+    FOR_BLOCK_END()
+#else
+#ifdef _OPENMP // Should also capture OpenACC, which is why it's second.
+#define BLOCK_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
+    const ARR##_type *ARR##_buffer = ARR.data; \
+    FOR_3D_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
+    int64_t flat_index = z*ARR##_Ny*ARR##_Nx + y*ARR##_Nx + x;
 
-#define for_flat_end() }
+#define BLOCK_END() FOR_3D_END()
+#else
+#define BLOCK_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
+    int64_t flat_index = 0; \
+    const ARR##_type *ARR##_buffer = ARR.data; \
+    FOR_3D_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE)
 
-// TODO I'm not sure this'll expand right.
-#define for_flat_block_begin(arr) \
-    for_block_begin(arr) \
-    for_flat_begin_2(arr##_buffer, global)
+#define BLOCK_END() \
+    flat_index++; \
+    FOR_3D_END()
 
-#define for_flat_block_end() \
-    for_flat_end() \
-    for_block_end()
+#endif
+#endif
 
-#define unpack_numpy(arr) \
+#define UNPACK_NUMPY(ARR) \
     ssize_t \
-        arr##_Nz = arr.shape[0], \
-        arr##_Ny = arr.shape[1], \
-        arr##_Nx = arr.shape[2], \
-        arr##_length = arr##_Nz*arr##_Ny*arr##_Nx;
+        ARR##_Nz = ARR.shape[0], \
+        ARR##_Ny = ARR.shape[1], \
+        ARR##_Nx = ARR.shape[2], \
+        ARR##_length = ARR##_Nz*ARR##_Ny*ARR##_Nx;
 
 #endif
\ No newline at end of file
diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
index 2f448ef..25bef7e 100644
--- a/src/lib/cpp/include/geometry.hh
+++ b/src/lib/cpp/include/geometry.hh
@@ -10,18 +10,20 @@ using namespace std;
 #define dot(a,b) (a[0]*b[0] + a[1]*b[1] + a[2]*b[2])
 
 void print_timestamp(string message) {
-    //auto now = chrono::system_clock::to_time_t(chrono::system_clock::now());
-    //tm local_tm = *localtime(&now);
-    //fprintf(stderr,"%s at %02d:%02d:%02d\n", message.c_str(), local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec);
+    auto now = chrono::system_clock::to_time_t(chrono::system_clock::now());
+    tm local_tm = *localtime(&now);
+    fprintf(stderr,"%s at %02d:%02d:%02d\n", message.c_str(), local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec);
 }
 
+namespace NS {
+
 /*
 Computes the center of mass of the given tomography.
 
 @param voxels The given tomography.
 @returns The 3D coordinates of the center of mass (in Z, Y, X).
 */
-array<real_t,3> center_of_mass(const input_ndarray<mask_type> voxels);
+array<real_t,3> center_of_mass(const input_ndarray<mask_type> &voxels);
 
 /*
 Computes the inertia matrix of the given tomography based of the given center of mass.
@@ -32,4 +34,6 @@ Computes the inertia matrix of the given tomography based of the given center of
 */
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm);
 
+}
+
 #endif
\ No newline at end of file
diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index d82f749..d52c83c 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -1,17 +1,30 @@
+#ifdef _OPENACC
+#warning "Using GPU"
+#define NS gpu
+#else
+#ifdef _OPENMP
+#warning "Using OpenMP"
+#define NS cpu_par
+#else
+#warning "Using sequential"
+#define NS cpu_seq
+#endif
+#endif
+
 #include "geometry.cc"
 
 namespace python_api {
 
-array<real_t, 3> center_of_mass(const np_maskarray &np_voxels){
+array<real_t, 3> center_of_mass(const np_maskarray &np_voxels) {
     auto voxels_info = np_voxels.request();
 
-    return ::center_of_mass({voxels_info.ptr, voxels_info.shape});
+    return NS::center_of_mass({voxels_info.ptr, voxels_info.shape});
 }
 
-array<real_t, 9> inertia_matrix(const np_maskarray &np_voxels, array<real_t, 3>& cm){
+array<real_t, 9> inertia_matrix(const np_maskarray &np_voxels, array<real_t, 3> &cm) {
     auto voxels_info = np_voxels.request();
 
-    return ::inertia_matrix({voxels_info.ptr, voxels_info.shape}, cm);
+    return NS::inertia_matrix({voxels_info.ptr, voxels_info.shape}, cm);
 }
 
 /*

From 51943cb9a2621c8931a5656a81552ddd48e37b5d Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 17 Feb 2023 16:41:58 +0100
Subject: [PATCH 084/136] #25 implemented single source inertia matrix

---
 src/lib/cpp/cpu/geometry.cc     | 45 ++------------------------
 src/lib/cpp/cpu_seq/geometry.cc | 48 +++++++++++++--------------
 src/lib/cpp/gpu/geometry.cc     | 57 ++-------------------------------
 3 files changed, 27 insertions(+), 123 deletions(-)

diff --git a/src/lib/cpp/cpu/geometry.cc b/src/lib/cpp/cpu/geometry.cc
index 728e933..8da364d 100644
--- a/src/lib/cpp/cpu/geometry.cc
+++ b/src/lib/cpp/cpu/geometry.cc
@@ -13,49 +13,8 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> &mask) {
     return cpu_seq::center_of_mass(mask);
 }
 
-array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
-    real_t
-        Ixx = 0, Ixy = 0, Ixz = 0,
-                 Iyy = 0, Iyz = 0,
-                          Izz = 0;
-
-    ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-
-    print_timestamp("inertia_matrix_serial start");
-
-    #pragma omp parallel for collapse(3) reduction(+:Ixx,Iyy,Izz) reduction(-:Ixy,Ixz,Iyz)
-    for (int64_t X = 0; X < Nx; X++) {
-        for (int64_t Y = 0; Y < Ny; Y++) {
-            for (int64_t Z = 0; Z < Nz; Z++) {
-
-                // TODO shouldn't the loops be interchanged to match the access pattern? (Naming-wise, that is)
-                int64_t k = X*Ny*Nz + Y*Nz + Z;
-                mask_type m = voxels.data[k];
-
-                // m guards this, and this removes branches
-                // if (m != 0)
-                real_t
-                    x = X - cm[0],
-                    y = Y - cm[1],
-                    z = Z - cm[2];
-
-                Ixx += m * (y*y + z*z);
-                Iyy += m * (x*x + z*z);
-                Izz += m * (x*x + y*y);
-                Ixy -= m * x*y;
-                Ixz -= m * x*z;
-                Iyz -= m * y*z;
-            }
-        }
-    }
-
-    print_timestamp("inertia_matrix_serial end");
-
-    return array<real_t,9> {
-        Ixx, Ixy, Ixz,
-        Ixy, Iyy, Iyz,
-        Ixz, Iyz, Izz
-    };
+array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array<real_t,3> &cm) {
+    return cpu_seq::inertia_matrix(mask, cm);
 }
 
 }
\ No newline at end of file
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 609970c..2901829 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -18,6 +18,7 @@ array<real_t, 3> center_of_mass(const input_ndarray<mask_type> &mask) {
     uint64_t total_mass = 0, cmz = 0, cmy = 0, cmx = 0;
 
     BLOCK_BEGIN(mask, reduction(+:total_mass,cmz,cmy,cmx)); {
+    // TODO James approves; now RUN!
 
         mask_type m = mask_buffer[flat_index];
 
@@ -38,39 +39,35 @@ array<real_t, 3> center_of_mass(const input_ndarray<mask_type> &mask) {
     return array<real_t, 3>{ rcmz, rcmy, rcmx };
 }
 
-array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
+array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array<real_t,3> &cm) {
+    UNPACK_NUMPY(mask);
+
     real_t
         Ixx = 0, Ixy = 0, Ixz = 0,
                  Iyy = 0, Iyz = 0,
                           Izz = 0;
 
-    ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-
     print_timestamp("inertia_matrix_serial start");
 
-    int64_t k = 0;
-    for (int64_t X = 0; X < Nx; X++) {
-        for (int64_t Y = 0; Y < Ny; Y++) {
-            for (int64_t Z = 0; Z < Nz; Z++) {
-                mask_type m = voxels.data[k];
-                k++;
+    BLOCK_BEGIN(mask, reduction(+:Ixx, Iyy, Izz) reduction(-:Ixy,Ixz,Iyz)) {
 
-                // m guards this, and then branches are removed
-                //if (m != 0)
-                real_t
-                    x = X - cm[0],
-                    y = Y - cm[1],
-                    z = Z - cm[2];
-
-                Ixx += m * (y*y + z*z);
-                Iyy += m * (x*x + z*z);
-                Izz += m * (x*x + y*y);
-                Ixy -= m * x*y;
-                Ixz -= m * x*z;
-                Iyz -= m * y*z;
-            }
-        }
-    }
+        mask_type m = mask_buffer[flat_index];
+
+        // m guards this, and then branches are removed
+        //if (m != 0)
+        real_t
+            X = x - cm[0],
+            Y = y - cm[1],
+            Z = z - cm[2];
+
+        Ixx += m * (Y*Y + Z*Z);
+        Iyy += m * (X*X + Z*Z);
+        Izz += m * (X*X + Y*Y);
+        Ixy -= m * X*Y;
+        Ixz -= m * X*Z;
+        Iyz -= m * Y*Z;
+
+    } BLOCK_END();
 
     print_timestamp("inertia_matrix_serial end");
 
@@ -79,6 +76,7 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const arr
         Ixy, Iyy, Iyz,
         Ixz, Iyz, Izz
     };
+
 }
 
 }
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index 6d75136..5d54e79 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -13,61 +13,8 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> &mask) {
     return cpu_seq::center_of_mass(mask);
 }
 
-array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm) {
-    // nvc++ doesn't support OpenACC 2.7 array reductions yet, so must name each element.
-    real_t
-        Ixx = 0, Ixy = 0, Ixz = 0,
-                 Iyy = 0, Iyz = 0,
-                          Izz = 0;
-
-    size_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-    ssize_t image_length = Nx*Ny*Nz;
-
-    print_timestamp("inertia_matrix start");
-
-    #pragma acc data copy(Ixx, Ixy, Ixz, Iyy, Iyz, Izz)
-    {
-        for (ssize_t block_start = 0; block_start < image_length; block_start += acc_block_size<mask_type>) {
-            const mask_type *buffer = voxels.data + block_start;
-            ssize_t this_block_size = min(acc_block_size<mask_type>, image_length - block_start);
-
-            #pragma acc data copyin(buffer[:this_block_size])
-            {
-                #pragma acc parallel loop reduction(+:Ixx,Iyy,Izz) reduction(-:Ixy,Ixz,Iyz)
-                for (int64_t k = 0; k < this_block_size; k++) {    //\if (buffer[k] != 0)
-                    mask_type m = buffer[k];
-
-                    // m guards this, and GPUs doesn't like branches
-                    //if (m != 0)
-                    int64_t
-                        flat_idx = block_start + k,
-                        X = flat_idx / (Ny * Nz),
-                        Y = ((flat_idx) / Nz) % Ny,
-                        Z = flat_idx % Nz;
-
-                    real_t
-                        x = X - cm[0],
-                        y = Y - cm[1],
-                        z = Z - cm[2];
-
-                    Ixx += m * (y*y + z*z);
-                    Iyy += m * (x*x + z*z);
-                    Izz += m * (x*x + y*y);
-                    Ixy -= m * x*y;
-                    Ixz -= m * x*z;
-                    Iyz -= m * y*z;
-                }
-            }
-        }
-    }
-
-    print_timestamp("inertia_matrix end");
-
-    return array<real_t,9> {
-        Ixx, Ixy, Ixz,
-        Ixy, Iyy, Iyz,
-        Ixz, Iyz, Izz
-    };
+array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array<real_t,3> &cm) {
+    return cpu_seq::inertia_matrix(mask, cm);
 }
 
 /* TODO Only called in test.py. Postponed for now.

From 0f39d2719fa2f5dd027709d083a11c4f851c4d58 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Sun, 19 Feb 2023 14:09:32 +0100
Subject: [PATCH 085/136] #25 Moved geometry::in_bbox

---
 src/lib/cpp/cpu/geometry.cc     |  4 +++
 src/lib/cpp/cpu_seq/geometry.cc | 64 ++++++++++-----------------------
 src/lib/cpp/gpu/geometry.cc     |  4 +++
 src/lib/cpp/include/geometry.hh |  2 ++
 4 files changed, 28 insertions(+), 46 deletions(-)

diff --git a/src/lib/cpp/cpu/geometry.cc b/src/lib/cpp/cpu/geometry.cc
index 8da364d..6a6e615 100644
--- a/src/lib/cpp/cpu/geometry.cc
+++ b/src/lib/cpp/cpu/geometry.cc
@@ -13,6 +13,10 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> &mask) {
     return cpu_seq::center_of_mass(mask);
 }
 
+bool in_bbox(float U, float V, float W, const std::array<float, 6> &bbox) {
+    return cpu_seq::in_bbox(U, V, W, bbox);
+}
+
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array<real_t,3> &cm) {
     return cpu_seq::inertia_matrix(mask, cm);
 }
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 2901829..08dfbea 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -39,6 +39,23 @@ array<real_t, 3> center_of_mass(const input_ndarray<mask_type> &mask) {
     return array<real_t, 3>{ rcmz, rcmy, rcmx };
 }
 
+bool in_bbox(float U, float V, float W, const std::array<float, 6> &bbox) {
+    const auto& [U_min, U_max, V_min, V_max, W_min, W_max] = bbox;
+
+    bool inside =
+        U >= U_min &&
+        U <= U_max &&
+        V >= V_min &&
+        V <= V_max &&
+        W >= W_min &&
+        W <= W_max;
+
+    // printf("in_bbox: (%.1f,%.1f,%.1f) \in ([%.1f,%.1f],[%.1f,%.1f],[%.1f,%.1f]) == %d\n",
+    //      U,V,W,U_min,U_max,V_min,V_max,U_min,U_max,inside);
+
+    return inside;
+}
+
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array<real_t,3> &cm) {
     UNPACK_NUMPY(mask);
 
@@ -81,52 +98,7 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array
 
 }
 
-/* TODO only called in test.py. Postponed for now.
-void integrate_axes(const input_ndarray<mask_type> &voxels,
-            const array<real_t,3> &x0,
-            const array<real_t,3> &v_axis,
-            const array<real_t,3> &w_axis,
-            const real_t v_min, const real_t w_min,
-            output_ndarray<real_t> output) {
-    ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-    ssize_t Nv = output.shape[0], Nw = output.shape[1];
-    int64_t image_length = Nx*Ny*Nz;
-    real_t *output_data = output.data;
-
-    // TODO: Check v_axis & w_axis projections to certify bounds and get rid of runtime check
-    int64_t k = 0:
-    for (int64_t X = 0; X < Nx; X++) {
-        for (int64_t Y = 0; Y < Ny; Y++) {
-            for (int64_t Z = 0; Z < Nz; Z++) {
-                if (buffer[k] != 0) {
-                    real_t xs[3] = {
-                        (flat_idx  / (Ny*Nz))  - x0[0],   // x
-                        ((flat_idx / Nz) % Ny) - x0[1],   // y
-                        (flat_idx  % Nz)       - x0[2] }; // z
-
-                    mask_type voxel = buffer[k];
-                    real_t v = dot(xs, v_axis), w = dot(xs,w_axis);
-                    int64_t i_v = round(v-v_min), j_w = round(w-w_min);
-
-                    if (i_v >= 0 && j_w >= 0 && i_v < Nv && j_w < Nw) {
-                        output_data[i_v*Nw + j_w] += voxel;
-                    }
-                }
-                k++;
-            }
-        }
-    }
-}
-
-bool in_bbox(float U, float V, float W, const std::array<float,6> bbox) {
-    const auto& [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
-
-    bool inside = U>=U_min && U<=U_max && V>=V_min && V<=V_max && W>=W_min && W<=W_max;
-
-    // printf("in_bbox: (%.1f,%.1f,%.1f) \in ([%.1f,%.1f],[%.1f,%.1f],[%.1f,%.1f]) == %d\n",
-    //      U,V,W,U_min,U_max,V_min,V_max,U_min,U_max,inside);
-    return inside;
-}
+/*
 
 template<typename field_type> float resample2x2x2(const field_type *voxels,
                                                   const array<ssize_t,3> &shape,
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index 5d54e79..2192c7f 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -13,6 +13,10 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> &mask) {
     return cpu_seq::center_of_mass(mask);
 }
 
+bool in_bbox(float U, float V, float W, const std::array<float, 6> &bbox) {
+    return cpu_seq::in_bbox(U, V, W, bbox);
+}
+
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array<real_t,3> &cm) {
     return cpu_seq::inertia_matrix(mask, cm);
 }
diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
index 25bef7e..07cb1dd 100644
--- a/src/lib/cpp/include/geometry.hh
+++ b/src/lib/cpp/include/geometry.hh
@@ -25,6 +25,8 @@ Computes the center of mass of the given tomography.
 */
 array<real_t,3> center_of_mass(const input_ndarray<mask_type> &voxels);
 
+bool in_bbox(float U, float V, float W, const std::array<float, 6> &bbox);
+
 /*
 Computes the inertia matrix of the given tomography based of the given center of mass.
 

From 4080522092c109fbe46ac3879f93a05148acf98a Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 3 Mar 2023 13:52:14 +0100
Subject: [PATCH 086/136] #25 Moved geometry::sample_plane and resample2x2x2

---
 src/lib/cpp/cpu/geometry.cc              | 19 ++++++
 src/lib/cpp/cpu_seq/geometry.cc          | 82 ++++++++++++++----------
 src/lib/cpp/gpu/geometry.cc              | 18 ++++++
 src/lib/cpp/include/geometry.hh          | 14 ++++
 src/processing_steps/0700_implant_FoR.py |  2 +-
 src/pybind/geometry-pybind.cc            | 20 +++---
 6 files changed, 111 insertions(+), 44 deletions(-)

diff --git a/src/lib/cpp/cpu/geometry.cc b/src/lib/cpp/cpu/geometry.cc
index 6a6e615..4548dd7 100644
--- a/src/lib/cpp/cpu/geometry.cc
+++ b/src/lib/cpp/cpu/geometry.cc
@@ -9,6 +9,7 @@ using namespace std;
 
 namespace cpu_par {
 
+// TODO look at function aliasing. Currently doesn't work, as it clashes with the header file prototype.
 array<real_t,3> center_of_mass(const input_ndarray<mask_type> &mask) {
     return cpu_seq::center_of_mass(mask);
 }
@@ -21,4 +22,22 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array
     return cpu_seq::inertia_matrix(mask, cm);
 }
 
+template <typename T>
+float resample2x2x2(const T        *voxels,
+                    const array<ssize_t, 3> &shape,
+                    const array<float, 3>   &X) {
+    return cpu_seq::resample2x2x2(voxels, shape, X);
+}
+
+template <typename T>
+void sample_plane(const input_ndarray<T> &voxels,
+                  const real_t voxel_size, // In micrometers
+                  const array<real_t, 3> cm,
+                  const array<real_t, 3> u_axis,
+                  const array<real_t, 3> v_axis,
+                  const array<real_t, 4> bbox,    // [umin,umax,vmin,vmax] in micrometers
+                  output_ndarray<real_t> plane_samples) {
+    return cpu_seq::sample_plane(voxels, voxel_size, cm, u_axis, v_axis, bbox, plane_samples);
+}
+
 }
\ No newline at end of file
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 08dfbea..c79ca2d 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -93,41 +93,38 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array
         Ixy, Iyy, Iyz,
         Ixz, Iyz, Izz
     };
-
-}
-
 }
 
-/*
+template <typename T>
+float resample2x2x2(const T             *voxels,
+                    const array<ssize_t, 3> &shape,
+                    const array<float, 3>   &X) {
+    auto  [Nz,Ny,Nx] = shape;
 
-template<typename field_type> float resample2x2x2(const field_type *voxels,
-                                                  const array<ssize_t,3> &shape,
-                                                  const array<float,3>   &X) {
-    auto  [Nx,Ny,Nz] = shape;    // Eller omvendt?
-    if (!in_bbox(X[0],X[1],X[2], {0.5,Nx-1.5, 0.5,Ny-1.5, 0.5,Nz-1.5})) {
-        uint64_t voxel_index = floor(X[0])*Ny*Nz+floor(X[1])*Ny+floor(X[2]);
+    if (!in_bbox(X[0], X[1], X[2], {0.5f, Nx-0.5f, 0.5f, Ny-0.5f, 0.5f, Nz-0.5f})) {
+        uint64_t voxel_index = floor(X[0])*Ny*Nz + floor(X[1])*Ny + floor(X[2]);
         return voxels[voxel_index];
     }
+
     float   Xfrac[2][3]; // {Xminus[3], Xplus[3]}
-    int64_t Xint[2][3];     // {Iminus[3], Iplus[3]}
+    int64_t Xint[2][3];  // {Iminus[3], Iplus[3]}
     float   value = 0;
 
     for (int i = 0; i < 3; i++) {
-        double Iminus, Iplus;
-        Xfrac[0][i] = 1-modf(X[i]-0.5, &Iminus); // 1-{X[i]-1/2}, floor(X[i]-1/2)
-        Xfrac[1][i] =   modf(X[i]+0.5, &Iplus);  // {X[i]+1/2}, floor(X[i]+1/2)
+        float Iminus, Iplus;
+        Xfrac[0][i] = 1-modf(X[i]-0.5f, &Iminus); // 1-{X[i]-1/2}, floor(X[i]-1/2)
+        Xfrac[1][i] =   modf(X[i]+0.5f, &Iplus);  // {X[i]+1/2}, floor(X[i]+1/2)
 
         Xint[0][i] = Iminus;
         Xint[1][i] = Iplus;
     }
 
-
     for (int ijk = 0; ijk <= 7; ijk++) {
         float  weight = 1;
         int64_t IJK[3] = {0,0,0};
 
         for (int axis = 0; axis < 3; axis++) { // x-1/2 or x+1/2
-            int pm = (ijk>>axis) & 1;
+            int pm    = (ijk >> axis) & 1;
             IJK[axis] = Xint[pm][axis];
             weight   *= Xfrac[pm][axis];
         }
@@ -144,28 +141,36 @@ template<typename field_type> float resample2x2x2(const field_type *voxels,
         uint64_t voxel_index = I*Ny*Nz+J*Ny+K;
         assert(I>=0 && J>=0 && K>=0);
         assert(I<Nx && J<Ny && K<Nz);
-        field_type voxel = voxels[voxel_index];
+        float voxel = (float) voxels[voxel_index];
         value += voxel*weight;
     }
+
     return value;
 }
 
-template <typename voxel_type> void sample_plane(const input_ndarray<voxel_type> &voxels,
-                         const real_t voxel_size, // In micrometers
-                         const array<real_t,3> cm,
-                         const array<real_t,3> u_axis,
-                         const array<real_t,3> v_axis,
-                         const array<real_t,4>  bbox,    // [umin,umax,vmin,vmax] in micrometers
-                         output_ndarray<real_t> plane_samples) {
+template <typename T>
+void sample_plane(const input_ndarray<T> &voxels,
+                  const real_t voxel_size, // In micrometers
+                  const array<real_t, 3> cm,
+                  const array<real_t, 3> u_axis,
+                  const array<real_t, 3> v_axis,
+                  const array<real_t, 4> bbox,    // [umin,umax,vmin,vmax] in micrometers
+                  output_ndarray<real_t> plane_samples) {
     const auto& [umin,umax,vmin,vmax] = bbox; // In micrometers
-    ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-    ssize_t nu = plane_samples.shape[0], nv = plane_samples.shape[1];
-    real_t  du = (umax-umin)/nu, dv = (vmax-vmin)/nv;
+    UNPACK_NUMPY(voxels);
+    ssize_t
+        nu = plane_samples.shape[0],
+        nv = plane_samples.shape[1];
+    real_t
+        du = (umax - umin) / nu,
+        dv = (vmax - vmin) / nv;
 
-    #pragma omp parallel for collapse(2)
-    for (ssize_t ui=0;ui<nu;ui++) {
-        for (ssize_t vj=0;vj<nv;vj++) {
-            const real_t u = umin + ui*du, v = vmin + vj*dv;
+    //#pragma omp parallel for collapse(2)
+    for (ssize_t ui = 0; ui < nu; ui++) {
+        for (ssize_t vj = 0; vj < nv; vj++) {
+            const real_t
+                u = umin + ui*du,
+                v = vmin + vj*dv;
 
             // X,Y,Z in micrometers;  x,y,z in voxel index space
             const real_t
@@ -173,13 +178,17 @@ template <typename voxel_type> void sample_plane(const input_ndarray<voxel_type>
                 Y = cm[1] + u*u_axis[1] + v*v_axis[1],
                 Z = cm[2] + u*u_axis[2] + v*v_axis[2];
 
-            const real_t x = X/voxel_size, y = Y/voxel_size, z = Z/voxel_size;
+            const real_t
+                x = X / voxel_size,
+                y = Y / voxel_size,
+                z = Z / voxel_size;
 
             //      printf("u,v = %g,%g -> %.1f,%.1f,%.1f -> %d, %d, %d\n",u,v,X,Y,Z,int(round(x)),int(round(y)),int(round(z)));
 
-            voxel_type value = 0;
-            if (in_bbox(x,y,z,{0.5,Nx-0.5, 0.5,Ny-0.5, 0.5,Nz-0.5}))
-                value = resample2x2x2<voxel_type>(voxels.data,{Nx,Ny,Nz},{x,y,z});
+            T value = 0;
+            std::array<float, 6> bbox = {0.5f, voxels_Nx-0.5f, 0.5f, voxels_Ny-0.5f, 0.5f, voxels_Nz-0.5f};
+            if (in_bbox(x,y,z, bbox))
+                value = (T) floor(resample2x2x2<T>(voxels.data, {voxels_Nx, voxels_Ny, voxels_Nz}, {x, y, z}));
             // else
             //     fprintf(stderr,"Sampling outside image: x,y,z = %.1f,%.1f,%.1f, Nx,Ny,Nz = %ld,%ld,%ld\n",x,y,z,Nx,Ny,Nz);
 
@@ -188,6 +197,9 @@ template <typename voxel_type> void sample_plane(const input_ndarray<voxel_type>
     }
 }
 
+}
+
+/*
 /* TODO only called in test.py. Postpone for now.
 // NB: xyz are in indices, not micrometers
 void zero_outside_bbox(const array<real_t,9> &principal_axes,
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index 2192c7f..dc6448d 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -21,6 +21,24 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array
     return cpu_seq::inertia_matrix(mask, cm);
 }
 
+template <typename T>
+float resample2x2x2(const T        *voxels,
+                    const array<ssize_t, 3> &shape,
+                    const array<float, 3>   &X) {
+    return cpu_seq::resample2x2x2(voxels, shape, X);
+}
+
+template <typename T>
+void sample_plane(const input_ndarray<T> &voxels,
+                  const real_t voxel_size, // In micrometers
+                  const array<real_t, 3> cm,
+                  const array<real_t, 3> u_axis,
+                  const array<real_t, 3> v_axis,
+                  const array<real_t, 4> bbox,    // [umin,umax,vmin,vmax] in micrometers
+                  output_ndarray<real_t> plane_samples) {
+    return cpu_seq::sample_plane(voxels, voxel_size, cm, u_axis, v_axis, bbox, plane_samples);
+}
+
 /* TODO Only called in test.py. Postponed for now.
 void integrate_axes(const input_ndarray<mask_type> &voxels,
             const array<real_t,3> &x0,
diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
index 07cb1dd..237844b 100644
--- a/src/lib/cpp/include/geometry.hh
+++ b/src/lib/cpp/include/geometry.hh
@@ -36,6 +36,20 @@ Computes the inertia matrix of the given tomography based of the given center of
 */
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm);
 
+template <typename T>
+float resample2x2x2(const T *voxels,
+                    const array<ssize_t,3> &shape,
+                    const array<float,3> &X);
+
+template <typename T>
+void sample_plane(const input_ndarray<T> &voxels,
+                  const real_t voxel_size, // In micrometers
+                  const array<real_t, 3> cm,
+                  const array<real_t, 3> u_axis,
+                  const array<real_t, 3> v_axis,
+                  const array<real_t, 4>  bbox,    // [umin,umax,vmin,vmax] in micrometers
+                  output_ndarray<real_t> plane_samples);
+
 }
 
 #endif
\ No newline at end of file
diff --git a/src/processing_steps/0700_implant_FoR.py b/src/processing_steps/0700_implant_FoR.py
index 0ed5ae1..dd65424 100644
--- a/src/processing_steps/0700_implant_FoR.py
+++ b/src/processing_steps/0700_implant_FoR.py
@@ -2,7 +2,7 @@
 sys.path.append(sys.path[0]+"/../")
 from config.constants import *
 from config.paths import hdf5_root, binary_root
-from lib.cpp.cpu_seq.geometry import center_of_mass, inertia_matrix, integrate_axes, sample_plane
+from lib.cpp.cpu_seq.geometry import center_of_mass, inertia_matrix, sample_plane
 from lib.cpp.gpu.morphology import erode_3d_sphere as erode_3d, dilate_3d_sphere as dilate_3d
 import matplotlib.pyplot as plt
 from matplotlib.colors import colorConverter
diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index d52c83c..3e9be7c 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -27,24 +27,28 @@ array<real_t, 9> inertia_matrix(const np_maskarray &np_voxels, array<real_t, 3>
     return NS::inertia_matrix({voxels_info.ptr, voxels_info.shape}, cm);
 }
 
-/*
-
-template <typename voxel_type>
-void sample_plane(const np_array<voxel_type> &np_voxels,
+template <typename T>
+void sample_plane(const np_array<T> &np_voxels,
           const real_t voxel_size, // In micrometers
           const array<real_t,3> cm,
           const array<real_t,3> u_axis,
           const array<real_t,3> v_axis,
           const array<real_t,4>  bbox,    // [umin,umax,vmin,vmax] in micrometers
-          np_array<float> np_plane_samples) {
+          np_array<real_t> np_plane_samples) {
     auto voxels_info = np_voxels.request();
     auto plane_samples_info  = np_plane_samples.request();
 
-    sample_plane<voxel_type>({voxels_info.ptr, voxels_info.shape}, voxel_size,
+    NS::sample_plane<T>({voxels_info.ptr, voxels_info.shape}, voxel_size,
            cm,u_axis,v_axis,bbox,
            {plane_samples_info.ptr, plane_samples_info.shape});
 }
 
+real_t resample2x2x2(const np_array<uint8_t> &np_voxels) {
+    auto voxels_info = np_voxels.request();
+    return 0.0f;
+}
+
+/*
 void integrate_axes(const np_maskarray &np_voxels,
             const array<real_t,3> &x0,
             const array<real_t,3> &v_axis,
@@ -140,7 +144,7 @@ PYBIND11_MODULE(geometry, m) {
     //m.def("zero_outside_bbox",    &python_api::zero_outside_bbox);
     //m.def("fill_implant_mask",    &python_api::fill_implant_mask);
     //m.def("cylinder_projection",  &python_api::cylinder_projection);
-    //m.def("sample_plane",         &python_api::sample_plane<uint16_t>);
-    //m.def("sample_plane",         &python_api::sample_plane<uint8_t>);
+    m.def("sample_plane",         &python_api::sample_plane<uint16_t>);
+    m.def("sample_plane",         &python_api::sample_plane<uint8_t>);
     //m.def("compute_front_mask",   &python_api::compute_front_mask);
 }

From 1405c48e4302cabbab27a3670e140828eda85933 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 3 Mar 2023 13:53:09 +0100
Subject: [PATCH 087/136] #25 Added functionality to the geometry test helper
 functions

---
 src/test/test_geometry.py | 52 +++++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 18 deletions(-)

diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index 6cc9f37..6871ce2 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -16,36 +16,52 @@
 #n = 2344 # ~12 GB, used for testing whether blocked works.
 n = 128
 
-def assert_with_print(a, b):
-    all_close = np.allclose(a, b)
+def assert_with_print(a, b, tolerance=1e-7, names=None):
+    na, nb = np.array(a), np.array(b)
+    nabs = np.abs(na - nb)
+    all_close = np.alltrue(nabs < tolerance)
     if not all_close:
-        na, nb = np.array(a), np.array(b)
-        print (na)
-        print (nb)
-        nabs = np.abs(na - nb)
-        print (nabs)
-        print (np.sum(nabs))
+        print ('a', na)
+        print ('b', nb)
+        print ('absolute error (AE) (abs(a-b))', nabs)
+        print ('AE sum', np.sum(nabs))
+        diffs = np.argwhere(nabs > tolerance)
+        print (f'differing on {diffs.shape} elements')
+        for i in diffs[:5]: # Only print 5 first
+            print ('differing index (i), a[i], b[i] =', i, a[i[0], i[1]], b[i[0], i[1]])
+        if not names is None:
+            print (names)
     assert all_close
 
-def run_with_warmup(f):
-    f()
+def run_with_warmup(f, allocate_result=None):
+    '''
+    Runs the given function and returns the result and how long time it took to run.
+
+    @param allocate_result Defines whether the memory for the result should be allocated before running. If it should, it should be a tuple of the shape and the dtype of the array. None otherwise.
+    '''
+    alloc = lambda x: np.zeros(x[0], x[1])
+    f() if allocate_result is None else f(alloc(allocate_result))
+    result = alloc(allocate_result)
     start = datetime.datetime.now()
-    result = f()
+    if allocate_result is None:
+        result = f()
+    else:
+        f(result)
     end = datetime.datetime.now()
     return result, end - start
 
-def compare_fs(func, baseline_f, cpu_f, gpu_f, should_assert=True):
-    baseline, baseline_t = run_with_warmup(baseline_f)
+def compare_fs(func, baseline_f, cpu_f, gpu_f, should_assert=True, tolerance=1e-7,
+               allocate_result: tuple[tuple[int],np.dtype]=None):
+    baseline, baseline_t = run_with_warmup(baseline_f, allocate_result)
     print (f'({func}) Sequential ran in {baseline_t}')
 
-    cpu, cpu_t = run_with_warmup(cpu_f)
+    cpu, cpu_t = run_with_warmup(cpu_f, allocate_result)
     print (f'({func}) Parallel CPU ran in {cpu_t}, which is {baseline_t / cpu_t} times faster than sequential')
-    if should_assert: assert_with_print(baseline, cpu)
+    if should_assert: assert_with_print(baseline, cpu, tolerance, 'cpu_seq vs cpu')
 
-    gpu, gpu_t = run_with_warmup(gpu_f)
+    gpu, gpu_t = run_with_warmup(gpu_f, allocate_result)
     print (f'({func}) GPU ran in {gpu_t}, which is {baseline_t / gpu_t} times faster than sequential')
-    if should_assert: assert_with_print(baseline, gpu)
-
+    if should_assert: assert_with_print(baseline, gpu, tolerance, 'cpu_seq vs gpu')
 
 def test_center_of_mass():
     voxels = np.random.randint(0, 256, (n,n,n), np.uint8)

From 1cedce1459552845c50360f0189e59a73d59bc70 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 3 Mar 2023 14:06:58 +0100
Subject: [PATCH 088/136] #25 Added launch configuration for geometry test

---
 .vscode/launch.json | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index aae8ffc..b48c6cc 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -4,6 +4,15 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
+        {
+            "name": "Python: Test geometry",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/src/test/test_geometry.py",
+            "console": "integratedTerminal",
+            "args": [],
+            "justMyCode": false
+        },
         {
             "name": "Python: 0400_h5tobin",
             "type": "python",

From 3fb1ae6ff92e61b91cdb97f8d1cae8b41ff73ff3 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 3 Mar 2023 14:07:29 +0100
Subject: [PATCH 089/136] #25 The geometry tests wasn't as flexible as first
 assumed.

---
 src/test/test_geometry.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index 6871ce2..a3e6530 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -41,11 +41,12 @@ def run_with_warmup(f, allocate_result=None):
     '''
     alloc = lambda x: np.zeros(x[0], x[1])
     f() if allocate_result is None else f(alloc(allocate_result))
-    result = alloc(allocate_result)
-    start = datetime.datetime.now()
     if allocate_result is None:
+        start = datetime.datetime.now()
         result = f()
     else:
+        result = alloc(allocate_result)
+        start = datetime.datetime.now()
         f(result)
     end = datetime.datetime.now()
     return result, end - start
@@ -66,20 +67,21 @@ def compare_fs(func, baseline_f, cpu_f, gpu_f, should_assert=True, tolerance=1e-
 def test_center_of_mass():
     voxels = np.random.randint(0, 256, (n,n,n), np.uint8)
 
-    baseline = partial(m_cpu_seq.center_of_mass, voxels)
-    cpu = partial(m_cpu.center_of_mass, voxels)
-    gpu = partial(m_gpu.center_of_mass, voxels)
-
-    compare_fs('center_of_mass', baseline, cpu, gpu)
+    baseline, cpu, gpu = [
+        partial(impl.center_of_mass, voxels)
+        for impl in [m_cpu_seq, m_cpu, m_gpu]
+    ]
 
+    compare_fs('center_of_mass', baseline, cpu, gpu, tolerance=1e-5)
 
 def test_inertia_matrix():
     voxels = np.random.randint(0, 2, (n,n,n), np.uint8)
     cm = m_gpu.center_of_mass(voxels)
 
-    baseline = partial(m_cpu_seq.inertia_matrix, voxels, cm)
-    cpu = partial(m_cpu.inertia_matrix, voxels, cm)
-    gpu = partial(m_gpu.inertia_matrix, voxels, cm)
+    baseline, cpu, gpu = [
+        partial(impl.inertia_matrix, voxels, cm)
+        for impl in [m_cpu_seq, m_cpu, m_gpu]
+    ]
 
     # TODO assert disabled due to floating point associativity error accumulation
     compare_fs('inertia_matrix', baseline, cpu, gpu, should_assert=False)

From 530cde9476e37bb136704ce4f2cf5eb1f783191b Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 3 Mar 2023 14:07:53 +0100
Subject: [PATCH 090/136] #25 Added test for geometry::sample_plane

---
 src/test/test_geometry.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index a3e6530..3d0a3ae 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -86,6 +86,29 @@ def test_inertia_matrix():
     # TODO assert disabled due to floating point associativity error accumulation
     compare_fs('inertia_matrix', baseline, cpu, gpu, should_assert=False)
 
+@pytest.mark.parametrize("dtype", [np.uint8, np.uint16])
+def test_sample_plane(dtype):
+    # TODO something that isn't just random data?
+    n = 128
+    voxels = np.random.randint(0, np.iinfo(dtype).max, (n,n,n), dtype)
+    voxel_size = 42
+    cm = m_cpu.center_of_mass(voxels)
+    im = np.array(m_cpu.inertia_matrix(voxels, cm)).reshape((3,3))
+    ls,E  = np.linalg.eigh(im)
+    E[:,0] *= -1
+    ix = np.argsort(np.abs(ls))
+    ls, E = ls[ix], E[:,ix]
+    UVW = E.T
+    _, v_vec, w_vec = UVW
+    cpu_seq, cpu, gpu = [
+        partial(impl.sample_plane, voxels, voxel_size, cm, v_vec, w_vec, [0, 1024, 0, 1024])
+        for impl in [m_cpu_seq, m_cpu, m_gpu]
+    ]
+
+    # TODO the function is unstable, even when they're all calling the sequential implementation, t least when comparing gcc against nvcc, but it differs at most with 1. Hence the higher tolerance for this test. Can be tested with something like for i in range(10000):
+    compare_fs('sample_plane', cpu_seq, cpu, gpu, True, 1.1, ((800,800), np.float32))
+
 if __name__ == '__main__':
     test_center_of_mass()
-    test_inertia_matrix()
\ No newline at end of file
+    test_inertia_matrix()
+    test_sample_plane(np.uint8)
\ No newline at end of file

From 701614a7a4d3ddb30d938539782e2ca520d19027 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 3 Mar 2023 15:05:05 +0100
Subject: [PATCH 091/136] #25 Added additional C++ warnings. Removed the ones
 from python / pybind

---
 src/Makefile                  | 10 ++++++----
 src/pybind/geometry-pybind.cc |  6 +++---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 7597ea2..71ba57c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,6 +1,7 @@
 # Define constants and collections
 PYTHON = python3.10
-PYBIND_FLAGS += $(shell $(PYTHON) -m pybind11 --include) -march=native -Wall -shared -fPIC -g -std=c++17 -O3
+PYBIND_FLAGS = $(shell $(PYTHON) -m pybind11 --include)
+CXXFLAGS += $(subst -I,-isystem ,$(PYBIND_FLAGS)) # We don't care about warnings from the python headers
 PYBIND_SUFFIX = $(shell $(PYTHON)-config --extension-suffix)
 
 # Detect OS for OS specific changes
@@ -12,7 +13,7 @@ endif
 
 CPP_FOLDER=lib/cpp
 #CXXFLAGS += -I../contrib/cpptqdm/ -Iinclude
-CXXFLAGS += -I$(CPP_FOLDER)/include
+CXXFLAGS += -I$(CPP_FOLDER)/include -march=native -Wall -Wextra -Wfloat-equal -Wundef -Wshadow -shared -fPIC -g -std=c++17 -O3
 PLATFORMS=cpu_seq cpu
 cpu_seq_CXX=$(CXX)
 cpu_CXX=$(cpu_seq_CXX)
@@ -26,7 +27,8 @@ CLEANUP = $(TARGETS) $(foreach PLATFORM, $(PLATFORMS), $(CPP_FOLDER)/$(PLATFORM)
 ifneq (, $(shell which nvc++))
 PLATFORMS += gpu
 gpu_CXX = nvc++
-gpu_FLAGS = -acc=gpu -Minfo=accel -tp=native
+gpu_FLAGS = -acc=gpu -tp=native -Xcudafe --display_error_number #-Minfo=accel
+gpu_FLAGS += --diag_suppress 1626 # Remove the annoying pybind warning that routine is both inline and noinline
 else
 $(info OpenACC compiler nvc++ not found. Compiling without.)
 endif
@@ -35,7 +37,7 @@ all: $(TARGETS)
 
 define GEN_RULE
 $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX): pybind/$(LIB)-pybind.cc $(CPP_FOLDER)/$(PLATFORM)/$(LIB).cc $(CPP_FOLDER)/include/*.hh
-	$($(PLATFORM)_CXX) $($(PLATFORM)_FLAGS) $(CXXFLAGS) $(PYBIND_FLAGS) -I$(CPP_FOLDER)/$(PLATFORM) $$< -o $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)
+	$($(PLATFORM)_CXX) $($(PLATFORM)_FLAGS) $(CXXFLAGS) -I$(CPP_FOLDER)/$(PLATFORM) $$< -o $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)
 endef
 
 $(foreach PLATFORM, $(PLATFORMS), \
diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index 3e9be7c..ade209f 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -1,12 +1,12 @@
 #ifdef _OPENACC
-#warning "Using GPU"
+//#warning "Using GPU"
 #define NS gpu
 #else
 #ifdef _OPENMP
-#warning "Using OpenMP"
+//#warning "Using OpenMP"
 #define NS cpu_par
 #else
-#warning "Using sequential"
+//#warning "Using sequential"
 #define NS cpu_seq
 #endif
 #endif

From e6bb55c03c7d5686ccd59d2360cf5a7e820a5b31 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 3 Mar 2023 16:07:13 +0100
Subject: [PATCH 092/136] #25 Added more warning flags

---
 src/Makefile | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 71ba57c..2037c3c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -13,11 +13,12 @@ endif
 
 CPP_FOLDER=lib/cpp
 #CXXFLAGS += -I../contrib/cpptqdm/ -Iinclude
-CXXFLAGS += -I$(CPP_FOLDER)/include -march=native -Wall -Wextra -Wfloat-equal -Wundef -Wshadow -shared -fPIC -g -std=c++17 -O3
+CXXFLAGS += -I$(CPP_FOLDER)/include -march=native -Wall -Wextra -Wfloat-equal -Wundef -Wshadow -Wuninitialized -Winit-self -shared -fPIC -g -std=c++17 -O3
 PLATFORMS=cpu_seq cpu
 cpu_seq_CXX=$(CXX)
+cpu_seq_FLAGS=-Wno-unknown-pragmas -Wno-comment -Wconversion -Weffc++
 cpu_CXX=$(cpu_seq_CXX)
-cpu_FLAGS=-fopenmp
+cpu_FLAGS=$(cpu_seq_FLAGS) -fopenmp
 
 LIBS=io geometry morphology
 TARGETS = $(foreach PLATFORM, $(PLATFORMS), $(foreach LIB, $(LIBS), $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX)))
@@ -29,6 +30,8 @@ PLATFORMS += gpu
 gpu_CXX = nvc++
 gpu_FLAGS = -acc=gpu -tp=native -Xcudafe --display_error_number #-Minfo=accel
 gpu_FLAGS += --diag_suppress 1626 # Remove the annoying pybind warning that routine is both inline and noinline
+gpu_FLAGS += --diag_suppress 9 # Remove the warning about nested comments
+gpu_FLAGS += -Wnvlink,-w # Disable nvlink warnings
 else
 $(info OpenACC compiler nvc++ not found. Compiling without.)
 endif

From 3c81ebb497fd62e318cbcbc354fecc63a1f9ef95 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 3 Mar 2023 16:07:41 +0100
Subject: [PATCH 093/136] #25 Handled shadow warning for ndarray struct

---
 src/lib/cpp/include/datatypes.hh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/lib/cpp/include/datatypes.hh b/src/lib/cpp/include/datatypes.hh
index cbe1213..2171b86 100644
--- a/src/lib/cpp/include/datatypes.hh
+++ b/src/lib/cpp/include/datatypes.hh
@@ -19,8 +19,8 @@ template <typename voxel_type>
 using np_array = py::array_t<voxel_type, py::array::c_style | py::array::forcecast>;
 
 typedef py::array_t<mask_type, py::array::c_style | py::array::forcecast> np_maskarray;
-typedef py::array_t<real_t, py::array::c_style | py::array::forcecast>    np_realarray;
-typedef py::array_t<uint8_t, py::array::c_style | py::array::forcecast>   np_bytearray;
+typedef py::array_t<real_t,    py::array::c_style | py::array::forcecast> np_realarray;
+typedef py::array_t<uint8_t,   py::array::c_style | py::array::forcecast> np_bytearray;
 
 template <typename T>
 constexpr ssize_t acc_block_size = 1024 * 1024 * 1024 / sizeof(T); // 1 GB
@@ -33,16 +33,16 @@ template <typename T> struct input_ndarray {
   const T *data;
   const vector<ssize_t> shape;
 
-  input_ndarray(const T *data, const vector<ssize_t> &shape): data(data), shape(shape) {}
-  input_ndarray(const void *data, const vector<ssize_t> &shape): data(static_cast<const T*>(data)), shape(shape) {}
+  input_ndarray(const T *arg_data, const vector<ssize_t> &arg_shape): data(arg_data), shape(arg_shape) {}
+  input_ndarray(const void *arg_data, const vector<ssize_t> &arg_shape): data(static_cast<const T*>(arg_data)), shape(arg_shape) {}
 };
 
 template <typename T> struct output_ndarray {
   T *data;
   const vector<ssize_t> shape;
 
-  output_ndarray(T *data, const vector<ssize_t> &shape): data(data), shape(shape) {}
-  output_ndarray(void *data, const vector<ssize_t> &shape): data(static_cast<T*>(data)), shape(shape) {}
+  output_ndarray(T *arg_data, const vector<ssize_t> &arg_shape): data(arg_data), shape(arg_shape) {}
+  output_ndarray(void *arg_data, const vector<ssize_t> &arg_shape): data(static_cast<T*>(arg_data)), shape(arg_shape) {}
 };
 
 typedef std::array<real_t,16> matrix4x4;

From 386153935352658fcad02695c24d0c61090f6d1f Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 3 Mar 2023 16:08:01 +0100
Subject: [PATCH 094/136] #25 Handled unused variable warning for macro
 generated code

---
 src/lib/cpp/include/boilerplate.hh | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/lib/cpp/include/boilerplate.hh b/src/lib/cpp/include/boilerplate.hh
index 0b5b1bc..54852fb 100644
--- a/src/lib/cpp/include/boilerplate.hh
+++ b/src/lib/cpp/include/boilerplate.hh
@@ -54,9 +54,9 @@
 
 #define PUSH_N_DOWN_TO_BUFFER(ARR) \
     ssize_t \
-        ARR##_buffer_Nz = ARR##_Nz, \
-        ARR##_buffer_Ny = ARR##_Ny, \
-        ARR##_buffer_Nx = ARR##_Nx;
+        __attribute__((unused)) ARR##_buffer_Nz = ARR##_Nz, \
+        __attribute__((unused)) ARR##_buffer_Ny = ARR##_Ny, \
+        __attribute__((unused)) ARR##_buffer_Nx = ARR##_Nx;
 
 #ifdef _OPENACC
 #define BLOCK_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
@@ -90,9 +90,9 @@
 
 #define UNPACK_NUMPY(ARR) \
     ssize_t \
-        ARR##_Nz = ARR.shape[0], \
-        ARR##_Ny = ARR.shape[1], \
-        ARR##_Nx = ARR.shape[2], \
-        ARR##_length = ARR##_Nz*ARR##_Ny*ARR##_Nx;
+        __attribute__((unused)) ARR##_Nz = ARR.shape[0], \
+        __attribute__((unused)) ARR##_Ny = ARR.shape[1], \
+        __attribute__((unused)) ARR##_Nx = ARR.shape[2], \
+        __attribute__((unused)) ARR##_length = ARR##_Nz*ARR##_Ny*ARR##_Nx;
 
 #endif
\ No newline at end of file

From 0ccf64713755ac2aff5e9f09f62171f16d2b0816 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 3 Mar 2023 16:08:16 +0100
Subject: [PATCH 095/136] #25 Handled shadow warning

---
 src/lib/cpp/cpu_seq/geometry.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index c79ca2d..78858cb 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -186,8 +186,8 @@ void sample_plane(const input_ndarray<T> &voxels,
             //      printf("u,v = %g,%g -> %.1f,%.1f,%.1f -> %d, %d, %d\n",u,v,X,Y,Z,int(round(x)),int(round(y)),int(round(z)));
 
             T value = 0;
-            std::array<float, 6> bbox = {0.5f, voxels_Nx-0.5f, 0.5f, voxels_Ny-0.5f, 0.5f, voxels_Nz-0.5f};
-            if (in_bbox(x,y,z, bbox))
+            std::array<float, 6> local_bbox = {0.5f, voxels_Nx-0.5f, 0.5f, voxels_Ny-0.5f, 0.5f, voxels_Nz-0.5f};
+            if (in_bbox(x,y,z, local_bbox))
                 value = (T) floor(resample2x2x2<T>(voxels.data, {voxels_Nx, voxels_Ny, voxels_Nz}, {x, y, z}));
             // else
             //     fprintf(stderr,"Sampling outside image: x,y,z = %.1f,%.1f,%.1f, Nx,Ny,Nz = %ld,%ld,%ld\n",x,y,z,Nx,Ny,Nz);

From c159aa9b3d4ef719c9d191987b864e496f58af5e Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 3 Mar 2023 16:37:27 +0100
Subject: [PATCH 096/136] #25 Added explicit type conversions

---
 src/lib/cpp/cpu_seq/geometry.cc | 30 +++++++++++++++---------------
 src/pybind/morphology-pybind.cc |  2 +-
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 78858cb..5f9d560 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -30,9 +30,9 @@ array<real_t, 3> center_of_mass(const input_ndarray<mask_type> &mask) {
     } BLOCK_END();
 
     real_t
-        rcmz = cmz / ((real_t) total_mass),
-        rcmy = cmy / ((real_t) total_mass),
-        rcmx = cmx / ((real_t) total_mass);
+        rcmz = real_t(cmz) / real_t(total_mass),
+        rcmy = real_t(cmy) / real_t(total_mass),
+        rcmx = real_t(cmx) / real_t(total_mass);
 
     print_timestamp("center_of_mass end");
 
@@ -73,9 +73,9 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array
         // m guards this, and then branches are removed
         //if (m != 0)
         real_t
-            X = x - cm[0],
-            Y = y - cm[1],
-            Z = z - cm[2];
+            X = real_t(x) - cm[0],
+            Y = real_t(y) - cm[1],
+            Z = real_t(z) - cm[2];
 
         Ixx += m * (Y*Y + Z*Z);
         Iyy += m * (X*X + Z*Z);
@@ -101,8 +101,8 @@ float resample2x2x2(const T             *voxels,
                     const array<float, 3>   &X) {
     auto  [Nz,Ny,Nx] = shape;
 
-    if (!in_bbox(X[0], X[1], X[2], {0.5f, Nx-0.5f, 0.5f, Ny-0.5f, 0.5f, Nz-0.5f})) {
-        uint64_t voxel_index = floor(X[0])*Ny*Nz + floor(X[1])*Ny + floor(X[2]);
+    if (!in_bbox(X[0], X[1], X[2], {0.5f, float(Nx)-0.5f, 0.5f, float(Ny)-0.5f, 0.5f, float(Nz)-0.5f})) {
+        uint64_t voxel_index = uint64_t(floor(X[0]))*Ny*Nz + uint64_t(floor(X[1]))*Ny + uint64_t(floor(X[2]));
         return voxels[voxel_index];
     }
 
@@ -115,8 +115,8 @@ float resample2x2x2(const T             *voxels,
         Xfrac[0][i] = 1-modf(X[i]-0.5f, &Iminus); // 1-{X[i]-1/2}, floor(X[i]-1/2)
         Xfrac[1][i] =   modf(X[i]+0.5f, &Iplus);  // {X[i]+1/2}, floor(X[i]+1/2)
 
-        Xint[0][i] = Iminus;
-        Xint[1][i] = Iplus;
+        Xint[0][i] = (int64_t) Iminus;
+        Xint[1][i] = (int64_t) Iplus;
     }
 
     for (int ijk = 0; ijk <= 7; ijk++) {
@@ -162,15 +162,15 @@ void sample_plane(const input_ndarray<T> &voxels,
         nu = plane_samples.shape[0],
         nv = plane_samples.shape[1];
     real_t
-        du = (umax - umin) / nu,
-        dv = (vmax - vmin) / nv;
+        du = (umax - umin) / real_t(nu),
+        dv = (vmax - vmin) / real_t(nv);
 
     //#pragma omp parallel for collapse(2)
     for (ssize_t ui = 0; ui < nu; ui++) {
         for (ssize_t vj = 0; vj < nv; vj++) {
             const real_t
-                u = umin + ui*du,
-                v = vmin + vj*dv;
+                u = umin + real_t(ui)*du,
+                v = vmin + real_t(vj)*dv;
 
             // X,Y,Z in micrometers;  x,y,z in voxel index space
             const real_t
@@ -186,7 +186,7 @@ void sample_plane(const input_ndarray<T> &voxels,
             //      printf("u,v = %g,%g -> %.1f,%.1f,%.1f -> %d, %d, %d\n",u,v,X,Y,Z,int(round(x)),int(round(y)),int(round(z)));
 
             T value = 0;
-            std::array<float, 6> local_bbox = {0.5f, voxels_Nx-0.5f, 0.5f, voxels_Ny-0.5f, 0.5f, voxels_Nz-0.5f};
+            std::array<float, 6> local_bbox = {0.5f, float(voxels_Nx)-0.5f, 0.5f, float(voxels_Ny)-0.5f, 0.5f, float(voxels_Nz)-0.5f};
             if (in_bbox(x,y,z, local_bbox))
                 value = (T) floor(resample2x2x2<T>(voxels.data, {voxels_Nx, voxels_Ny, voxels_Nz}, {x, y, z}));
             // else
diff --git a/src/pybind/morphology-pybind.cc b/src/pybind/morphology-pybind.cc
index f9c7891..b8547e7 100644
--- a/src/pybind/morphology-pybind.cc
+++ b/src/pybind/morphology-pybind.cc
@@ -16,7 +16,7 @@ void morphology_3d_sphere_wrapper(
         voxels_info = np_voxels.request(),
         result_info = np_result.request();
 
-    int32_t Nz = voxels_info.shape[0], Ny = voxels_info.shape[1], Nx = voxels_info.shape[2];
+    int64_t Nz = voxels_info.shape[0], Ny = voxels_info.shape[1], Nx = voxels_info.shape[2];
     int64_t N[3] = {Nz, Ny, Nx};
     int64_t strides[3] = {Ny*Nx, Nx, 1};
 

From 1daccb7fa1caa20a61528d8c08add0925ea7287f Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 3 Mar 2023 16:38:12 +0100
Subject: [PATCH 097/136] #25 Made the non-seq io functions call the seq one

---
 src/Makefile                     |  2 +-
 src/lib/cpp/cpu/io.cc            |  8 ++++++--
 src/lib/cpp/cpu_seq/io.cc        |  2 ++
 src/lib/cpp/gpu/io.cc            |  8 ++++++--
 src/lib/cpp/include/datatypes.hh | 14 +++++++++++++
 src/lib/cpp/include/io.hh        | 10 ++++++++--
 src/pybind/io-pybind.cc          | 34 ++++++++++++++++++--------------
 7 files changed, 56 insertions(+), 22 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 2037c3c..463d674 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -16,7 +16,7 @@ CPP_FOLDER=lib/cpp
 CXXFLAGS += -I$(CPP_FOLDER)/include -march=native -Wall -Wextra -Wfloat-equal -Wundef -Wshadow -Wuninitialized -Winit-self -shared -fPIC -g -std=c++17 -O3
 PLATFORMS=cpu_seq cpu
 cpu_seq_CXX=$(CXX)
-cpu_seq_FLAGS=-Wno-unknown-pragmas -Wno-comment -Wconversion -Weffc++
+cpu_seq_FLAGS=-Wno-unknown-pragmas -Wno-comment -Wconversion #-Weffc++
 cpu_CXX=$(cpu_seq_CXX)
 cpu_FLAGS=$(cpu_seq_FLAGS) -fopenmp
 
diff --git a/src/lib/cpp/cpu/io.cc b/src/lib/cpp/cpu/io.cc
index 41b56ec..62be52c 100644
--- a/src/lib/cpp/cpu/io.cc
+++ b/src/lib/cpp/cpu/io.cc
@@ -2,15 +2,17 @@
 #include <fstream>
 
 #include "io.hh"
+#include "../cpu_seq/io.cc"
 
 using namespace std;
+namespace cpu {
 
 template <typename T>
 void load_contiguous_slice(T *data,
         const string filename,
         const uint64_t offset,
         const uint64_t size) {
-    throw runtime_error(string("Library doesn't have a parallel cpu implementation of ") + __FUNCTION__);
+    cpu_seq::load_contiguous_slice(data, filename, offset, size);
 }
 
 template <typename T>
@@ -18,5 +20,7 @@ void write_contiguous_slice(const T *data,
         const string filename,
         const uint64_t offset,
         const uint64_t size) {
-    throw runtime_error(string("Library doesn't have a parallel cpu implementation of ") + __FUNCTION__);
+    cpu_seq::write_contiguous_slice(data, filename, offset, size);
+}
+
 }
diff --git a/src/lib/cpp/cpu_seq/io.cc b/src/lib/cpp/cpu_seq/io.cc
index 2d30477..bf771f2 100644
--- a/src/lib/cpp/cpu_seq/io.cc
+++ b/src/lib/cpp/cpu_seq/io.cc
@@ -4,6 +4,7 @@
 #include "io.hh"
 
 using namespace std;
+namespace cpu_seq {
 
 template <typename T>
 void load_contiguous_slice(T *data,
@@ -39,3 +40,4 @@ void write_contiguous_slice(const T *data,
 }
 
 // TODO non-contiguous
+}
diff --git a/src/lib/cpp/gpu/io.cc b/src/lib/cpp/gpu/io.cc
index 4eb196a..992209a 100644
--- a/src/lib/cpp/gpu/io.cc
+++ b/src/lib/cpp/gpu/io.cc
@@ -2,15 +2,17 @@
 #include <fstream>
 
 #include "io.hh"
+#include "../cpu_seq/io.cc"
 
 using namespace std;
+namespace gpu {
 
 template <typename T>
 void load_contiguous_slice(T *data,
         const string filename,
         const uint64_t offset,
         const uint64_t size) {
-    throw runtime_error(string("Library doesn't have a gpu implementation of ") + __FUNCTION__);
+    cpu_seq::load_contiguous_slice(data, filename, offset, size);
 }
 
 template <typename T>
@@ -18,5 +20,7 @@ void write_contiguous_slice(const T *data,
         const string filename,
         const uint64_t offset,
         const uint64_t size) {
-    throw runtime_error(string("Library doesn't have a gpu implementation of ") + __FUNCTION__);
+    cpu_seq::write_contiguous_slice(data, filename, offset, size);
 }
+
+}
\ No newline at end of file
diff --git a/src/lib/cpp/include/datatypes.hh b/src/lib/cpp/include/datatypes.hh
index 2171b86..b91fc78 100644
--- a/src/lib/cpp/include/datatypes.hh
+++ b/src/lib/cpp/include/datatypes.hh
@@ -1,5 +1,19 @@
 #ifndef datatypes_h
 #define datatypes_h
+
+#ifdef _OPENACC
+//#warning "Using GPU"
+#define NS gpu
+#else
+#ifdef _OPENMP
+//#warning "Using OpenMP"
+#define NS cpu_par
+#else
+//#warning "Using sequential"
+#define NS cpu_seq
+#endif
+#endif
+
 #include <array>
 #include <vector>
 #include <pybind11/pybind11.h>
diff --git a/src/lib/cpp/include/io.hh b/src/lib/cpp/include/io.hh
index a28da76..5720a20 100644
--- a/src/lib/cpp/include/io.hh
+++ b/src/lib/cpp/include/io.hh
@@ -1,9 +1,15 @@
 #ifndef io_h
 #define io_h
 
+#include "datatypes.hh"
+
+namespace NS {
+
 template <typename T>
-void load_contiguous_slice(T *data, const string filename, const uint64_t offset, const uint64_t size);
+void load_contiguous_slice(const T *data, const string filename, const uint64_t offset, const uint64_t size);
 template <typename T>
-void write_contiguous_slice(T *np_data, const string filename, const uint64_t offset, const uint64_t size);
+void write_contiguous_slice(const T *np_data, const string filename, const uint64_t offset, const uint64_t size);
+
+}
 
 #endif
\ No newline at end of file
diff --git a/src/pybind/io-pybind.cc b/src/pybind/io-pybind.cc
index 496b990..e15e1fe 100644
--- a/src/pybind/io-pybind.cc
+++ b/src/pybind/io-pybind.cc
@@ -7,6 +7,8 @@ namespace py = pybind11;
 #include "datatypes.hh"
 #include "io.cc"
 
+namespace python_api {
+
 template <typename T>
 void load_slice(py::array_t<T> &np_data, const string filename,
                 const tuple<uint64_t, uint64_t, uint64_t> offset,
@@ -16,7 +18,7 @@ void load_slice(py::array_t<T> &np_data, const string filename,
     auto [Nz, Ny, Nx] = shape;
     auto [oz, oy, ox] = offset;
     uint64_t flat_offset = oz*Ny*Nx + oy*Nx + ox;
-    load_contiguous_slice<T>(data, filename, flat_offset, data_info.size);
+    NS::load_contiguous_slice<T>(data, filename, flat_offset, data_info.size);
 }
 
 template <typename T>
@@ -29,22 +31,24 @@ void write_slice(const py::array_t<T> &np_data,
     auto [Nz, Ny, Nx] = shape;
     auto [oz, oy, ox] = offset;
     uint64_t flat_offset = oz*Ny*Nx + oy*Nx + ox;
-    write_contiguous_slice<T>(data, filename, flat_offset, data_info.size);
+    NS::write_contiguous_slice<T>(data, filename, flat_offset, data_info.size);
+}
+
 }
 
 PYBIND11_MODULE(io, m) {
     m.doc() = "I/O functions for handling flat binary format files."; // optional module docstring
-    m.def("load_slice", &load_slice<uint8_t>);
-    m.def("load_slice", &load_slice<uint16_t>);
-    m.def("load_slice", &load_slice<uint32_t>);
-    m.def("load_slice", &load_slice<uint64_t>);
-    m.def("load_slice", &load_slice<float>);
-    m.def("load_slice", &load_slice<double>);
-
-    m.def("write_slice", &write_slice<uint8_t>);
-    m.def("write_slice", &write_slice<uint16_t>);
-    m.def("write_slice", &write_slice<uint32_t>);
-    m.def("write_slice", &write_slice<uint64_t>);
-    m.def("write_slice", &write_slice<float>);
-    m.def("write_slice", &write_slice<double>);
+    m.def("load_slice", &python_api::load_slice<uint8_t>);
+    m.def("load_slice", &python_api::load_slice<uint16_t>);
+    m.def("load_slice", &python_api::load_slice<uint32_t>);
+    m.def("load_slice", &python_api::load_slice<uint64_t>);
+    m.def("load_slice", &python_api::load_slice<float>);
+    m.def("load_slice", &python_api::load_slice<double>);
+
+    m.def("write_slice", &python_api::write_slice<uint8_t>);
+    m.def("write_slice", &python_api::write_slice<uint16_t>);
+    m.def("write_slice", &python_api::write_slice<uint32_t>);
+    m.def("write_slice", &python_api::write_slice<uint64_t>);
+    m.def("write_slice", &python_api::write_slice<float>);
+    m.def("write_slice", &python_api::write_slice<double>);
 }
\ No newline at end of file

From 7631ce944fea99b7de4669fae37c770f9f27eae5 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 6 Mar 2023 08:35:32 +0100
Subject: [PATCH 098/136] #25 Incorrect namespace in io

---
 src/lib/cpp/cpu/io.cc   | 2 +-
 src/pybind/io-pybind.cc | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/lib/cpp/cpu/io.cc b/src/lib/cpp/cpu/io.cc
index 62be52c..098cde0 100644
--- a/src/lib/cpp/cpu/io.cc
+++ b/src/lib/cpp/cpu/io.cc
@@ -5,7 +5,7 @@
 #include "../cpu_seq/io.cc"
 
 using namespace std;
-namespace cpu {
+namespace cpu_par {
 
 template <typename T>
 void load_contiguous_slice(T *data,
diff --git a/src/pybind/io-pybind.cc b/src/pybind/io-pybind.cc
index e15e1fe..060d9d9 100644
--- a/src/pybind/io-pybind.cc
+++ b/src/pybind/io-pybind.cc
@@ -4,7 +4,6 @@
 using namespace std;
 namespace py = pybind11;
 
-#include "datatypes.hh"
 #include "io.cc"
 
 namespace python_api {

From ec85d405f95e159bff72d855cd05db770f069c75 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 6 Mar 2023 08:35:54 +0100
Subject: [PATCH 099/136] #25 Added test for the other io implementations

---
 src/test/test_io.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test/test_io.py b/src/test/test_io.py
index 199257d..09ad43a 100644
--- a/src/test/test_io.py
+++ b/src/test/test_io.py
@@ -19,7 +19,7 @@
 dim_size = 128
 dim_shape = (dim_size, dim_size, dim_size)
 partial_factor = 4
-impls = [io_cpu_seq] #, io_cpu, io_gpu]
+impls = [io_cpu_seq, io_cpu, io_gpu]
 
 def random(shape, dtype):
     rnds = np.random.random(shape) * 100
@@ -63,7 +63,7 @@ def test_dtype(impl, dtype):
     for i in range(partial_factor+1):
         impl.load_slice(read_data, individual_tmp_file, (i*partial,0,0), read_data.shape)
         assert np.allclose(data[i*partial:(i+1)*partial], read_data)
-    
+
     # Write past where the file ends
     impl.write_slice(data, individual_tmp_file, (data.shape[0]*2,0,0), data.shape)
     assert os.path.getsize(individual_tmp_file) == 3 * data.nbytes

From 127428b574b182596a83ed59146143decd6dc48e Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 6 Mar 2023 08:44:43 +0100
Subject: [PATCH 100/136] #25 GPU debugging. Device code cannot have assert

---
 src/lib/cpp/cpu_seq/geometry.cc | 27 ++++-----------------------
 src/pybind/geometry-pybind.cc   | 13 -------------
 src/test/test_geometry.py       |  8 ++++----
 3 files changed, 8 insertions(+), 40 deletions(-)

diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 5f9d560..cf5f8cb 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -139,8 +139,8 @@ float resample2x2x2(const T             *voxels,
         //   abort();
         // }
         uint64_t voxel_index = I*Ny*Nz+J*Ny+K;
-        assert(I>=0 && J>=0 && K>=0);
-        assert(I<Nx && J<Ny && K<Nz);
+        //assert(I>=0 && J>=0 && K>=0);
+        //assert(I<Nx && J<Ny && K<Nz);
         float voxel = (float) voxels[voxel_index];
         value += voxel*weight;
     }
@@ -200,7 +200,7 @@ void sample_plane(const input_ndarray<T> &voxels,
 }
 
 /*
-/* TODO only called in test.py. Postpone for now.
+// TODO only called in test.py. Postpone for now.
 // NB: xyz are in indices, not micrometers
 void zero_outside_bbox(const array<real_t,9> &principal_axes,
                const array<real_t,6> &parameter_ranges,
@@ -257,25 +257,6 @@ inline vector4 hom_transform(const vector4 &x, const matrix4x4 &M) {
     return c;
 }
 
-#define loop_mask_start(mask_in,mask_out,COPY) {                                                                                \
-    ssize_t Mx = mask_in.shape[0], My = mask_in.shape[1], Mz = mask_in.shape[2];                                                \
-    ssize_t mask_length = Mx*My*Mz;                                                                                             \
-                                                                                                                                \
-    for (ssize_t block_start = 0; block_start < mask_length; block_start += acc_block_size) {                                   \
-        const mask_type *maskin_buffer  = mask_in.data + block_start;                                                           \
-            mask_type *maskout_buffer = mask_out.data + block_start;                                                            \
-        ssize_t this_block_length = min(acc_block_size, mask_length-block_start);                                               \
-                                                                                                                                \
-        _Pragma(STR(acc parallel loop copy(maskin_buffer[:this_block_length], maskout_buffer[:this_block_length]) copy COPY))   \
-        for (int64_t k = 0; k < this_block_length; k++) {                                                                       \
-            int64_t flat_idx = block_start + k;                                                                                 \
-            int64_t X = (flat_idx  / (My*Mz)), Y = (flat_idx / Mz) % My, Z = flat_idx  % Mz;                                    \
-            std::array<real_t,4> Xs = { X*voxel_size, Y*voxel_size, Z*voxel_size, 1 };                                          \
-            bool mask_value = maskin_buffer[k];
-
-#define loop_mask_end(mask) }}}
-
-/*
 void fill_implant_mask(const input_ndarray<mask_type> implant_mask,
                float voxel_size,
                const array<float,6> &bbox,
@@ -425,7 +406,7 @@ void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance
                     abort();
                 }
 
-                //****** MEAT OF THE IMPLEMENTATION IS HERE ******
+                // ****** MEAT OF THE IMPLEMENTATION IS HERE ******
                 real_t distance = resample2x2x2<float>(edt_block, {this_edt_length/(ey*ez),ey,ez}, {x,y,z});
 
                 if (distance > d_min && distance <= d_max) { // TODO: and W>w_min
diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index ade209f..3051f69 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -1,16 +1,3 @@
-#ifdef _OPENACC
-//#warning "Using GPU"
-#define NS gpu
-#else
-#ifdef _OPENMP
-//#warning "Using OpenMP"
-#define NS cpu_par
-#else
-//#warning "Using sequential"
-#define NS cpu_seq
-#endif
-#endif
-
 #include "geometry.cc"
 
 namespace python_api {
diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index 3d0a3ae..9426e7f 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -57,11 +57,11 @@ def compare_fs(func, baseline_f, cpu_f, gpu_f, should_assert=True, tolerance=1e-
     print (f'({func}) Sequential ran in {baseline_t}')
 
     cpu, cpu_t = run_with_warmup(cpu_f, allocate_result)
-    print (f'({func}) Parallel CPU ran in {cpu_t}, which is {baseline_t / cpu_t} times faster than sequential')
+    print (f'({func}) Parallel CPU ran in {cpu_t}, which is {baseline_t / cpu_t:.02f} times faster than sequential')
     if should_assert: assert_with_print(baseline, cpu, tolerance, 'cpu_seq vs cpu')
 
     gpu, gpu_t = run_with_warmup(gpu_f, allocate_result)
-    print (f'({func}) GPU ran in {gpu_t}, which is {baseline_t / gpu_t} times faster than sequential')
+    print (f'({func}) GPU ran in {gpu_t}, which is {baseline_t / gpu_t:.02f} times faster than sequential')
     if should_assert: assert_with_print(baseline, gpu, tolerance, 'cpu_seq vs gpu')
 
 def test_center_of_mass():
@@ -101,12 +101,12 @@ def test_sample_plane(dtype):
     UVW = E.T
     _, v_vec, w_vec = UVW
     cpu_seq, cpu, gpu = [
-        partial(impl.sample_plane, voxels, voxel_size, cm, v_vec, w_vec, [0, 1024, 0, 1024])
+        partial(impl.sample_plane, voxels, voxel_size, cm, v_vec, w_vec, [0, 128, 0, 128])
         for impl in [m_cpu_seq, m_cpu, m_gpu]
     ]
 
     # TODO the function is unstable, even when they're all calling the sequential implementation, t least when comparing gcc against nvcc, but it differs at most with 1. Hence the higher tolerance for this test. Can be tested with something like for i in range(10000):
-    compare_fs('sample_plane', cpu_seq, cpu, gpu, True, 1.1, ((800,800), np.float32))
+    compare_fs('sample_plane', cpu_seq, cpu, gpu, True, 1.1, ((64,64), np.float32))
 
 if __name__ == '__main__':
     test_center_of_mass()

From 5fc5448bce895b20473309573502fe37154de77e Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 6 Mar 2023 08:45:11 +0100
Subject: [PATCH 101/136] #25 Plane samples is working for all implementations

---
 src/Makefile                    |  3 ++-
 src/lib/cpp/cpu_seq/geometry.cc | 11 ++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 463d674..f64e876 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -28,7 +28,8 @@ CLEANUP = $(TARGETS) $(foreach PLATFORM, $(PLATFORMS), $(CPP_FOLDER)/$(PLATFORM)
 ifneq (, $(shell which nvc++))
 PLATFORMS += gpu
 gpu_CXX = nvc++
-gpu_FLAGS = -acc=gpu -tp=native -Xcudafe --display_error_number #-Minfo=accel
+gpu_FLAGS = -acc=gpu -tp=native #-Minfo=accel
+gpu_FLAGS += -Xcudafe --display_error_number # Getting the warning codes for later suppression
 gpu_FLAGS += --diag_suppress 1626 # Remove the annoying pybind warning that routine is both inline and noinline
 gpu_FLAGS += --diag_suppress 9 # Remove the warning about nested comments
 gpu_FLAGS += -Wnvlink,-w # Disable nvlink warnings
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index cf5f8cb..69f0578 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -165,7 +165,11 @@ void sample_plane(const input_ndarray<T> &voxels,
         du = (umax - umin) / real_t(nu),
         dv = (vmax - vmin) / real_t(nv);
 
-    //#pragma omp parallel for collapse(2)
+    real_t *dat = plane_samples.data;
+
+    #pragma acc data copyin(voxels, voxels.data[:voxels_Nz*voxels_Ny*voxels_Nx], voxels_Nz, voxels_Ny, voxels_Nx) create(dat[:nu*nv]) copyout(dat[:nu*nv])
+    {
+    PRAGMA(PARALLEL_TERM collapse(2))
     for (ssize_t ui = 0; ui < nu; ui++) {
         for (ssize_t vj = 0; vj < nv; vj++) {
             const real_t
@@ -188,13 +192,14 @@ void sample_plane(const input_ndarray<T> &voxels,
             T value = 0;
             std::array<float, 6> local_bbox = {0.5f, float(voxels_Nx)-0.5f, 0.5f, float(voxels_Ny)-0.5f, 0.5f, float(voxels_Nz)-0.5f};
             if (in_bbox(x,y,z, local_bbox))
-                value = (T) floor(resample2x2x2<T>(voxels.data, {voxels_Nx, voxels_Ny, voxels_Nz}, {x, y, z}));
+                value = (T) round(resample2x2x2<T>(voxels.data, {voxels_Nx, voxels_Ny, voxels_Nz}, {x, y, z}));
             // else
             //     fprintf(stderr,"Sampling outside image: x,y,z = %.1f,%.1f,%.1f, Nx,Ny,Nz = %ld,%ld,%ld\n",x,y,z,Nx,Ny,Nz);
 
-            plane_samples.data[ui*nv + vj] = value;
+            dat[ui*nv + vj] = value;
         }
     }
+    }
 }
 
 }

From df57aa085748aec203c92dceceb7905dc3abbe70 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 6 Mar 2023 08:45:52 +0100
Subject: [PATCH 102/136] #25 Added cuda generated file to gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index a724c3c..6fc3a6a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,7 @@ src/meow/runner_processing/*
 *.so
 *.so.dSYM
 a.out
+cudafe
 
 # Ignore the $BONE_DATA symlinks, as they're only there for convinience in vscode
 data_*
\ No newline at end of file

From 6b9d1ddefb0bfc1e7565e3b51f8218c4fd7c85d4 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 6 Mar 2023 09:42:54 +0100
Subject: [PATCH 103/136] #25 Implemented geometry::integrate_axes

---
 src/lib/cpp/cpu/geometry.cc        |  9 ++++++++
 src/lib/cpp/cpu_seq/geometry.cc    | 36 ++++++++++++++++++++++++++++++
 src/lib/cpp/gpu/geometry.cc        |  9 ++++++++
 src/lib/cpp/include/boilerplate.hh |  6 +++++
 src/pybind/geometry-pybind.cc      | 10 ++++-----
 5 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/src/lib/cpp/cpu/geometry.cc b/src/lib/cpp/cpu/geometry.cc
index 4548dd7..7bf6ad9 100644
--- a/src/lib/cpp/cpu/geometry.cc
+++ b/src/lib/cpp/cpu/geometry.cc
@@ -22,6 +22,15 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array
     return cpu_seq::inertia_matrix(mask, cm);
 }
 
+void integrate_axes(const input_ndarray<mask_type> &mask,
+		    const array<real_t,3> &x0,
+		    const array<real_t,3> &v_axis,
+		    const array<real_t,3> &w_axis,
+		    const real_t v_min, const real_t w_min,
+		    output_ndarray<real_t> output) {
+    return cpu_seq::integrate_axes(mask, x0, v_axis, w_axis, v_min, w_min, output);
+}
+
 template <typename T>
 float resample2x2x2(const T        *voxels,
                     const array<ssize_t, 3> &shape,
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 69f0578..4709555 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -202,6 +202,42 @@ void sample_plane(const input_ndarray<T> &voxels,
     }
 }
 
+void integrate_axes(const input_ndarray<mask_type> &mask,
+		    const array<real_t,3> &x0,
+		    const array<real_t,3> &v_axis,
+		    const array<real_t,3> &w_axis,
+		    const real_t v_min, const real_t w_min,
+		    output_ndarray<real_t> output) {
+    UNPACK_NUMPY(mask);
+    ssize_t Nv = output.shape[0], Nw = output.shape[1];
+    real_t *output_data = output.data;
+
+    // TODO: Check v_axis & w_axis projections to certify bounds and get rid of runtime check
+    #pragma acc data create(output_data[:Nv*Nw]) copyout(output_data[:Nv*Nw])
+    {
+    BLOCK_BEGIN(mask, ) {
+
+        mask_type voxel = mask_buffer[flat_index];
+        if (voxel != 0) {
+            real_t xs[3] = {
+                real_t(x) - x0[0],
+                real_t(y) - x0[1],
+                real_t(z) - x0[2]
+            };
+
+            real_t v = dot(xs,v_axis), w = dot(xs,w_axis);
+            int64_t i_v = int64_t(round(v-v_min)), j_w = int64_t(round(w-w_min));
+
+            if(i_v >= 0 && j_w >= 0 && i_v < Nv && j_w < Nw){
+                ATOMIC()
+                output_data[i_v*Nw + j_w] += voxel;
+            }
+        }
+
+    BLOCK_END() }
+    }
+}
+
 }
 
 /*
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index dc6448d..317f572 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -21,6 +21,15 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array
     return cpu_seq::inertia_matrix(mask, cm);
 }
 
+void integrate_axes(const input_ndarray<mask_type> &mask,
+		    const array<real_t,3> &x0,
+		    const array<real_t,3> &v_axis,
+		    const array<real_t,3> &w_axis,
+		    const real_t v_min, const real_t w_min,
+		    output_ndarray<real_t> output) {
+    return cpu_seq::integrate_axes(mask, x0, v_axis, w_axis, v_min, w_min, output);
+}
+
 template <typename T>
 float resample2x2x2(const T        *voxels,
                     const array<ssize_t, 3> &shape,
diff --git a/src/lib/cpp/include/boilerplate.hh b/src/lib/cpp/include/boilerplate.hh
index 54852fb..8eb5dab 100644
--- a/src/lib/cpp/include/boilerplate.hh
+++ b/src/lib/cpp/include/boilerplate.hh
@@ -16,6 +16,12 @@
 #endif
 #endif
 
+#ifdef _OPENACC
+#define ATOMIC() PRAGMA(acc atomic)
+#else
+#define ATOMIC() PRAGMA(omp atomic)
+#endif
+
 // TODO attempt at docstring; not quite working.
 
 /// Inserts boilerplate code for accessing the given parameter, ARR, in a blocked (chunked) manner.
diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index 3051f69..469ff2d 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -30,12 +30,11 @@ void sample_plane(const np_array<T> &np_voxels,
            {plane_samples_info.ptr, plane_samples_info.shape});
 }
 
-real_t resample2x2x2(const np_array<uint8_t> &np_voxels) {
+/*real_t resample2x2x2(const np_array<uint8_t> &np_voxels) {
     auto voxels_info = np_voxels.request();
     return 0.0f;
-}
+}*/
 
-/*
 void integrate_axes(const np_maskarray &np_voxels,
             const array<real_t,3> &x0,
             const array<real_t,3> &v_axis,
@@ -45,12 +44,13 @@ void integrate_axes(const np_maskarray &np_voxels,
     auto voxels_info = np_voxels.request();
     auto output_info  = output.request();
 
-    integrate_axes({voxels_info.ptr, voxels_info.shape},
+    NS::integrate_axes({voxels_info.ptr, voxels_info.shape},
              x0,v_axis,w_axis,
              v_min, w_min,
              {output_info.ptr, output_info.shape});
 }
 
+/*
 void zero_outside_bbox(const array<real_t,9> &principal_axes,
              const array<real_t,6> &parameter_ranges,
              const array<real_t,3> &cm, // TOOD: Med eller uden voxelsize?
@@ -127,7 +127,7 @@ PYBIND11_MODULE(geometry, m) {
 
     m.def("center_of_mass",       &python_api::center_of_mass);
     m.def("inertia_matrix",       &python_api::inertia_matrix);
-    //m.def("integrate_axes",       &python_api::integrate_axes);
+    m.def("integrate_axes",       &python_api::integrate_axes);
     //m.def("zero_outside_bbox",    &python_api::zero_outside_bbox);
     //m.def("fill_implant_mask",    &python_api::fill_implant_mask);
     //m.def("cylinder_projection",  &python_api::cylinder_projection);

From 9d4143780c355ecea5093aa7be66bc4ca1be0d0b Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 6 Mar 2023 09:43:31 +0100
Subject: [PATCH 104/136] #25 Added test for integrate_axes

---
 src/test/test_geometry.py | 40 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index 9426e7f..36152c9 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -108,7 +108,45 @@ def test_sample_plane(dtype):
     # TODO the function is unstable, even when they're all calling the sequential implementation, t least when comparing gcc against nvcc, but it differs at most with 1. Hence the higher tolerance for this test. Can be tested with something like for i in range(10000):
     compare_fs('sample_plane', cpu_seq, cpu, gpu, True, 1.1, ((64,64), np.float32))
 
+def test_integrate_axes():
+    n = 128
+    dtype = np.uint8
+    voxels = np.random.randint(0, np.iinfo(dtype).max, (n,n,n), dtype)
+    cm = m_cpu.center_of_mass(voxels)
+    M  = np.array(m_cpu.inertia_matrix(voxels, cm)).reshape(3,3)
+
+    lam, E = np.linalg.eigh(M)
+    ix = np.argsort(np.abs(lam))
+    lam, E = np.array(lam)[ix], np.array(E)[:,ix]
+
+    v_axis, w_axis = E[:,1], E[:,2]
+
+    (vmin,vmax), _ = axis_parameter_bounds(voxels.shape, cm, v_axis)
+    (wmin,wmax), _ = axis_parameter_bounds(voxels.shape, cm, w_axis)
+
+    cpu_seq, cpu, gpu = [
+        partial(impl.integrate_axes, voxels, cm, v_axis, w_axis, vmin, wmin)
+        for impl in [m_cpu_seq, m_cpu, m_gpu]
+    ]
+
+    compare_fs('integrate_axes', cpu_seq, cpu, gpu, True, 1e-7, ((int(vmax-vmin+2),int(wmax-wmin+2)), float))
+
+def axis_parameter_bounds(shape, center, axis):
+    d     = len(axis)
+    signs = np.sign(axis)
+
+    # (0,0,..,0) corner and furthest corner of grid, relative to center
+#    print(center)
+    x0 = -np.array(center)
+    x1 = np.array(shape)[::-1]-center # Data has z,y,x-order, but we keep x,y,z in geometry calc
+
+    xmin = (signs==1)*x0 + (signs==-1)*x1 # minimizes dot(x,axis)
+    xmax = (signs==1)*x1 + (signs==-1)*x0 # maximizes dot(x,axis)
+
+    return (np.dot(xmin,axis), np.dot(xmax,axis)), (xmin,xmax)
+
 if __name__ == '__main__':
     test_center_of_mass()
     test_inertia_matrix()
-    test_sample_plane(np.uint8)
\ No newline at end of file
+    test_sample_plane(np.uint8)
+    test_integrate_axes()
\ No newline at end of file

From 23d23934787011aedfd140eeae5faebe5e849515 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 6 Mar 2023 09:43:45 +0100
Subject: [PATCH 105/136] #25 Removed unused imports

---
 src/processing_steps/0600_segment_implant_cc.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/processing_steps/0600_segment_implant_cc.py b/src/processing_steps/0600_segment_implant_cc.py
index 1f09060..b3a0d99 100644
--- a/src/processing_steps/0600_segment_implant_cc.py
+++ b/src/processing_steps/0600_segment_implant_cc.py
@@ -2,14 +2,13 @@
 sys.path.append(sys.path[0]+"/../")
 from config.constants import *
 from config.paths import hdf5_root, binary_root
-from lib.py.helpers import commandline_args, update_hdf5, update_hdf5_mask
-from lib.cpp.cpu_seq.geometry import center_of_mass, inertia_matrix, integrate_axes, sample_plane
+from lib.py.helpers import commandline_args, update_hdf5_mask
 from lib.cpp.cpu.io import load_slice
 
 NA = np.newaxis
 
 sample, scale, chunk_size, verbose = commandline_args({"sample" : "<required>",
-                                                       "scale" : 8, 
+                                                       "scale" : 8,
                                                        "chunk_size" : 256,
                                                        "verbose" : 1})
 
@@ -45,8 +44,8 @@
     load_slice(voxel_chunk, f"{binary_root}/voxels/{scale}x/{sample}.uint16",
                (z,0,0), (nz,ny,nx))
     noisy_implant[z:z+chunk_length] = voxel_chunk[:chunk_length] >= implant_threshold_u16
-    
-                                                  
+
+
 if verbose >= 1: print(f"Computing connected components")
 label, n_features = ndi.label(noisy_implant)
 if verbose >= 1: print(f"Counting component volumes")

From de16c73922cda9f033376d368addb0c71fa66883 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 6 Mar 2023 10:28:46 +0100
Subject: [PATCH 106/136] #25 Implemented geometry::zero_outside_bbox

---
 src/lib/cpp/cpu/geometry.cc        |  7 ++++
 src/lib/cpp/cpu_seq/geometry.cc    | 52 +++++++++++++-----------------
 src/lib/cpp/gpu/geometry.cc        | 45 +++-----------------------
 src/lib/cpp/include/boilerplate.hh |  8 ++---
 src/lib/cpp/include/geometry.hh    | 11 +++++++
 src/pybind/geometry-pybind.cc      | 11 ++-----
 6 files changed, 52 insertions(+), 82 deletions(-)

diff --git a/src/lib/cpp/cpu/geometry.cc b/src/lib/cpp/cpu/geometry.cc
index 7bf6ad9..c3f7cac 100644
--- a/src/lib/cpp/cpu/geometry.cc
+++ b/src/lib/cpp/cpu/geometry.cc
@@ -49,4 +49,11 @@ void sample_plane(const input_ndarray<T> &voxels,
     return cpu_seq::sample_plane(voxels, voxel_size, cm, u_axis, v_axis, bbox, plane_samples);
 }
 
+void zero_outside_bbox(const array<real_t,9> &principal_axes,
+                       const array<real_t,6> &parameter_ranges,
+                       const array<real_t,3> &cm,
+                       output_ndarray<mask_type> voxels) {
+    return cpu_seq::zero_outside_bbox(principal_axes, parameter_ranges, cm, voxels);
+}
+
 }
\ No newline at end of file
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 4709555..f756136 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -238,53 +238,45 @@ void integrate_axes(const input_ndarray<mask_type> &mask,
     }
 }
 
-}
-
-/*
-// TODO only called in test.py. Postpone for now.
 // NB: xyz are in indices, not micrometers
 void zero_outside_bbox(const array<real_t,9> &principal_axes,
                const array<real_t,6> &parameter_ranges,
                const array<real_t,3> &cm,
                output_ndarray<mask_type> voxels) {
-    size_t  Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-    int64_t image_length = Nx*Ny*Nz;
 
-    printf("(Nx,Ny,Nz) = (%ld,%ld,%ld), image_length = %ld",Nx,Ny,Nz, image_length);
+    UNPACK_NUMPY(voxels)
 
-    for (int64_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
-        mask_type *buffer = voxels.data + block_start;
-        ssize_t this_block_length = min(acc_block_size, image_length-block_start);
+    BLOCK_BEGIN(voxels, ) {
 
-        //parallel_loop((buffer[:this_block_length]))
-        for (int64_t k = 0; k < this_block_length; k++) {
-            int64_t flat_idx = block_start + k;
-            int64_t x = flat_idx  / (Ny*Nz);
-            int64_t y = (flat_idx / Nz) % Ny;
-            int64_t z = flat_idx  % Nz;
-            // Boilerplate until here. TODO: macroize or lambda out!
+        real_t xs[3] = {
+            real_t(x) - cm[0],
+            real_t(y) - cm[1],
+            real_t(z) - cm[2]};
+        real_t params[3] = { 0, 0, 0 };
 
-            real_t xs[3] = {x-cm[0], y-cm[1], z-cm[2]};
+        for (int uvw = 0; uvw < 3; uvw++)
+            for (int xyz = 0; xyz < 3; xyz++)
+                params[uvw] += xs[xyz] * principal_axes[uvw*3 + xyz]; // u = dot(xs,u_axis), v = dot(xs,v_axis), w = dot(xs,w_axis)
 
-            real_t params[3] = {0,0,0};
+        bool p = false;
 
-            for (int uvw = 0; uvw < 3; uvw++)
-                for (int xyz = 0; xyz < 3; xyz++)
-                    params[uvw] += xs[xyz] * principal_axes[uvw*3+xyz]; // u = dot(xs,u_axis), v = dot(xs,v_axis), w = dot(xs,w_axis)
+        for (int uvw = 0; uvw < 3; uvw++) {
+            real_t
+                param_min = parameter_ranges[uvw*2],
+                param_max = parameter_ranges[uvw*2 + 1];
+            p |= (params[uvw] < param_min) | (params[uvw] > param_max);
+        }
 
-            bool p = false;
+        if (p)
+            voxels_buffer[flat_index] = 0;
 
-            for (int uvw = 0; uvw < 3; uvw++) {
-                real_t param_min = parameter_ranges[uvw*2], param_max = parameter_ranges[uvw*2+1];
-                p |= (params[uvw] < param_min) | (params[uvw] > param_max);
-            }
+    BLOCK_END() }
 
-            if (p) buffer[k] = 0;
+}
 
-        }
-    }
 }
 
+/*
 inline vector4 hom_transform(const vector4 &x, const matrix4x4 &M) {
     vector4 c{{0,0,0,0}};
 
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index 317f572..c70b867 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -48,46 +48,11 @@ void sample_plane(const input_ndarray<T> &voxels,
     return cpu_seq::sample_plane(voxels, voxel_size, cm, u_axis, v_axis, bbox, plane_samples);
 }
 
-/* TODO Only called in test.py. Postponed for now.
-void integrate_axes(const input_ndarray<mask_type> &voxels,
-            const array<real_t,3> &x0,
-            const array<real_t,3> &v_axis,
-            const array<real_t,3> &w_axis,
-            const real_t v_min, const real_t w_min,
-            output_ndarray<real_t> output) {
-    ssize_t Nx = voxels.shape[0], Ny = voxels.shape[1], Nz = voxels.shape[2];
-    ssize_t Nv = output.shape[0], Nw = output.shape[1];
-    int64_t image_length = Nx*Ny*Nz;
-    real_t *output_data = output.data;
-
-    // TODO: Check v_axis & w_axis projections to certify bounds and get rid of runtime check
-
-    for (ssize_t block_start = 0; block_start < image_length; block_start += acc_block_size) {
-        const mask_type *buffer  = voxels.data + block_start;
-        int block_length = min(acc_block_size,image_length-block_start);
-
-        //#pragma acc parallel loop copy(output_data[:Nv*Nw]) copyin(buffer[:block_length], x0, v_axis, w_axis)
-        //parallel_loop((output_data[:Nv*Nw]))
-        for (int64_t k = 0; k < block_length; k++) {
-            if (buffer[k] != 0) {
-                int64_t flat_idx = block_start + k;
-                real_t xs[3] = {
-                    (flat_idx  / (Ny*Nz))  - x0[0],   // x
-                    ((flat_idx / Nz) % Ny) - x0[1],   // y
-                    (flat_idx  % Nz)       - x0[2] }; // z
-
-                mask_type voxel = buffer[k];
-                real_t v = dot(xs,v_axis), w = dot(xs,w_axis);
-                int64_t i_v = round(v-v_min), j_w = round(w-w_min);
-
-                if (i_v >= 0 && j_w >= 0 && i_v < Nv && j_w < Nw) {
-                    //atomic_statement()
-                    output_data[i_v*Nw + j_w] += voxel;
-                }
-            }
-        }
-    }
+void zero_outside_bbox(const array<real_t,9> &principal_axes,
+                       const array<real_t,6> &parameter_ranges,
+                       const array<real_t,3> &cm,
+                       output_ndarray<mask_type> voxels) {
+    return cpu_seq::zero_outside_bbox(principal_axes, parameter_ranges, cm, voxels);
 }
-*/
 
 }
\ No newline at end of file
diff --git a/src/lib/cpp/include/boilerplate.hh b/src/lib/cpp/include/boilerplate.hh
index 8eb5dab..00b88d5 100644
--- a/src/lib/cpp/include/boilerplate.hh
+++ b/src/lib/cpp/include/boilerplate.hh
@@ -32,9 +32,9 @@
 /// @param ARR The array that will be accessed.
 #define FOR_BLOCK_BEGIN(ARR) \
     for (int64_t ARR##_buffer_start = 0; ARR##_buffer_start < ARR##_length; ARR##_buffer_start += acc_block_size<ARR##_type>) { \
-        const ARR##_type *ARR##_buffer = ARR.data + ARR##_buffer_start; \
+        ARR##_type *ARR##_buffer = (ARR##_type *) ARR.data + ARR##_buffer_start; \
         ssize_t ARR##_buffer_length = min(acc_block_size<ARR##_type>, ARR##_length-ARR##_buffer_start); \
-        PRAGMA(acc data copyin(ARR##_buffer[:ARR##_buffer_length])) \
+        PRAGMA(acc data copy(ARR##_buffer[:ARR##_buffer_length])) \
         {
 
 #define FOR_BLOCK_END() } }
@@ -76,7 +76,7 @@
 #else
 #ifdef _OPENMP // Should also capture OpenACC, which is why it's second.
 #define BLOCK_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
-    const ARR##_type *ARR##_buffer = ARR.data; \
+    ARR##_type *ARR##_buffer = (ARR##_type *) ARR.data; \
     FOR_3D_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
     int64_t flat_index = z*ARR##_Ny*ARR##_Nx + y*ARR##_Nx + x;
 
@@ -84,7 +84,7 @@
 #else
 #define BLOCK_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
     int64_t flat_index = 0; \
-    const ARR##_type *ARR##_buffer = ARR.data; \
+    ARR##_type *ARR##_buffer = (ARR##_type *) ARR.data; \
     FOR_3D_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE)
 
 #define BLOCK_END() \
diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
index 237844b..7635d80 100644
--- a/src/lib/cpp/include/geometry.hh
+++ b/src/lib/cpp/include/geometry.hh
@@ -36,6 +36,13 @@ Computes the inertia matrix of the given tomography based of the given center of
 */
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &voxels, const array<real_t,3> &cm);
 
+void integrate_axes(const input_ndarray<mask_type> &mask,
+		    const array<real_t,3> &x0,
+		    const array<real_t,3> &v_axis,
+		    const array<real_t,3> &w_axis,
+		    const real_t v_min, const real_t w_min,
+		    output_ndarray<real_t> output);
+
 template <typename T>
 float resample2x2x2(const T *voxels,
                     const array<ssize_t,3> &shape,
@@ -50,6 +57,10 @@ void sample_plane(const input_ndarray<T> &voxels,
                   const array<real_t, 4>  bbox,    // [umin,umax,vmin,vmax] in micrometers
                   output_ndarray<real_t> plane_samples);
 
+void zero_outside_bbox(const array<real_t,9> &principal_axes,
+               const array<real_t,6> &parameter_ranges,
+               const array<real_t,3> &cm,
+               output_ndarray<mask_type> voxels);
 }
 
 #endif
\ No newline at end of file
diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index 469ff2d..d22a2a9 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -30,11 +30,6 @@ void sample_plane(const np_array<T> &np_voxels,
            {plane_samples_info.ptr, plane_samples_info.shape});
 }
 
-/*real_t resample2x2x2(const np_array<uint8_t> &np_voxels) {
-    auto voxels_info = np_voxels.request();
-    return 0.0f;
-}*/
-
 void integrate_axes(const np_maskarray &np_voxels,
             const array<real_t,3> &x0,
             const array<real_t,3> &v_axis,
@@ -50,19 +45,19 @@ void integrate_axes(const np_maskarray &np_voxels,
              {output_info.ptr, output_info.shape});
 }
 
-/*
 void zero_outside_bbox(const array<real_t,9> &principal_axes,
              const array<real_t,6> &parameter_ranges,
              const array<real_t,3> &cm, // TOOD: Med eller uden voxelsize?
              np_maskarray &np_voxels) {
     auto voxels_info = np_voxels.request();
 
-    zero_outside_bbox(principal_axes,
+    NS::zero_outside_bbox(principal_axes,
               parameter_ranges,
               cm,
               {voxels_info.ptr, voxels_info.shape});
 }
 
+/*
 void fill_implant_mask(const np_maskarray implant_mask,
                float voxel_size,
                const array<float,6> &bbox,
@@ -128,7 +123,7 @@ PYBIND11_MODULE(geometry, m) {
     m.def("center_of_mass",       &python_api::center_of_mass);
     m.def("inertia_matrix",       &python_api::inertia_matrix);
     m.def("integrate_axes",       &python_api::integrate_axes);
-    //m.def("zero_outside_bbox",    &python_api::zero_outside_bbox);
+    m.def("zero_outside_bbox",    &python_api::zero_outside_bbox);
     //m.def("fill_implant_mask",    &python_api::fill_implant_mask);
     //m.def("cylinder_projection",  &python_api::cylinder_projection);
     m.def("sample_plane",         &python_api::sample_plane<uint16_t>);

From 98d8426a3bf8be985ce2207d403f408221677d9e Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 6 Mar 2023 10:29:08 +0100
Subject: [PATCH 107/136] #25 Added test for zero_outside_bbox

---
 src/test/test_geometry.py | 83 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 78 insertions(+), 5 deletions(-)

diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index 36152c9..dae5f56 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -39,12 +39,16 @@ def run_with_warmup(f, allocate_result=None):
 
     @param allocate_result Defines whether the memory for the result should be allocated before running. If it should, it should be a tuple of the shape and the dtype of the array. None otherwise.
     '''
-    alloc = lambda x: np.zeros(x[0], x[1])
-    f() if allocate_result is None else f(alloc(allocate_result))
     if allocate_result is None:
+        f()
         start = datetime.datetime.now()
         result = f()
     else:
+        if type(allocate_result) is tuple:
+            alloc = lambda x: np.zeros(x[0], x[1])
+        else:
+            alloc = lambda x: np.copy(x)
+        f(alloc(allocate_result))
         result = alloc(allocate_result)
         start = datetime.datetime.now()
         f(result)
@@ -52,7 +56,7 @@ def run_with_warmup(f, allocate_result=None):
     return result, end - start
 
 def compare_fs(func, baseline_f, cpu_f, gpu_f, should_assert=True, tolerance=1e-7,
-               allocate_result: tuple[tuple[int],np.dtype]=None):
+               allocate_result: tuple[tuple[int],np.dtype] | np.ndarray=None):
     baseline, baseline_t = run_with_warmup(baseline_f, allocate_result)
     print (f'({func}) Sequential ran in {baseline_t}')
 
@@ -132,7 +136,6 @@ def test_integrate_axes():
     compare_fs('integrate_axes', cpu_seq, cpu, gpu, True, 1e-7, ((int(vmax-vmin+2),int(wmax-wmin+2)), float))
 
 def axis_parameter_bounds(shape, center, axis):
-    d     = len(axis)
     signs = np.sign(axis)
 
     # (0,0,..,0) corner and furthest corner of grid, relative to center
@@ -145,8 +148,78 @@ def axis_parameter_bounds(shape, center, axis):
 
     return (np.dot(xmin,axis), np.dot(xmax,axis)), (xmin,xmax)
 
+def integrate_axes(img, cm, v_axis, w_axis):
+    (vmin,vmax), (vxmin,vxmax) = axis_parameter_bounds(img.shape, cm, v_axis)
+    (wmin,wmax), (wxmin,wxmax) = axis_parameter_bounds(img.shape, cm, w_axis)
+
+    integral = np.zeros((int(vmax-vmin+2),int(wmax-wmin+2)), dtype=float)
+    m_cpu.integrate_axes(img,cm,v_axis, w_axis,vmin, wmin, integral)
+
+    return integral
+
+def bounding_volume(voxels,voxelsize=1.85):
+    cm = np.array(m_cpu.center_of_mass(voxels))
+    M  = np.array(m_cpu.inertia_matrix(voxels,cm)).reshape(3,3)
+
+    lam,E = np.linalg.eigh(M)
+    ix = np.argsort(np.abs(lam))
+    lam,E = np.array(lam)[ix], np.array(E)[:,ix]
+
+    u_axis, v_axis, w_axis = E[:,0], E[:,1], E[:,2]
+    (vmin,vmax), _ = axis_parameter_bounds(voxels.shape, cm, v_axis)
+
+    int_vw = integrate_axes(voxels, cm, v_axis, w_axis)
+    int_uw = integrate_axes(voxels, cm, u_axis, w_axis)
+    int_uv = integrate_axes(voxels, cm, u_axis, v_axis)
+    int_u  = np.sum(int_uv,axis=1)
+    int_v  = np.sum(int_uv,axis=0)
+    int_w  = np.sum(int_uw,axis=0)
+
+    lengths = np.array([np.sum(int_u>0), np.sum(int_v>0), np.sum(int_w>0)])
+    ix = np.argsort(lengths)[::-1]
+    print("lengths: ",lengths, ", ix: ",ix)
+
+    (umin,umax), _ = axis_parameter_bounds(voxels.shape, cm, u_axis)
+    (vmin,vmax), _ = axis_parameter_bounds(voxels.shape, cm, v_axis)
+    (wmin,wmax), _ = axis_parameter_bounds(voxels.shape, cm, w_axis)
+
+    u_prefix, u_postfix = np.sum(int_u[0:int(np.ceil(abs(umin)))]>0), np.sum(int_u[int(np.floor(abs(umin))):]>0)
+    v_prefix, v_postfix = np.sum(int_v[0:int(np.ceil(abs(vmin)))]>0), np.sum(int_v[int(np.floor(abs(vmin))):]>0)
+    w_prefix, w_postfix = np.sum(int_w[0:int(np.ceil(abs(wmin)))]>0), np.sum(int_w[int(np.floor(abs(wmin))):]>0)
+
+
+    return {
+        'principal_axes':np.array([u_axis,v_axis,w_axis]),
+        'principal_axes_ranges':np.array([[-u_prefix*voxelsize,u_postfix*voxelsize],
+                                          [-v_prefix*voxelsize,v_postfix*voxelsize],
+                                          [-w_prefix*voxelsize,w_postfix*voxelsize]]),
+        'centre_of_mass':cm*voxelsize
+    }
+
+def test_zero_outside_bbox():
+    n = 128
+    dtype = np.uint8
+    voxels = np.random.randint(0, np.iinfo(dtype).max, (n,n,n), dtype)
+    voxelsize = 1.85
+    coarse_scale = 6
+    fine_scale = 2
+    mmtofi = 1 / (voxelsize * fine_scale) # Conversion factor from micrometers to index
+
+    implant_bound = bounding_volume(voxels, voxelsize*coarse_scale)
+    uvw_axes   = implant_bound["principal_axes"]
+    uvw_ranges = implant_bound["principal_axes_ranges"] * mmtofi
+    cm         = implant_bound["centre_of_mass"] * mmtofi
+
+    cpu_seq, cpu, gpu = [
+        partial(impl.zero_outside_bbox, uvw_axes.flatten(), uvw_ranges.flatten(), cm)
+        for impl in [m_cpu_seq, m_cpu, m_gpu]
+    ]
+
+    compare_fs('zero_outside_bbox', cpu_seq, cpu, gpu, True, 1e-7, voxels)
+
 if __name__ == '__main__':
     test_center_of_mass()
     test_inertia_matrix()
     test_sample_plane(np.uint8)
-    test_integrate_axes()
\ No newline at end of file
+    test_integrate_axes()
+    test_zero_outside_bbox()
\ No newline at end of file

From 2eda0f4d82c80039424c48c4143c7af72ee34187 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 6 Mar 2023 15:18:15 +0100
Subject: [PATCH 108/136] #25 Implemented geometry::fill_implant_mask

---
 src/lib/cpp/cpu/geometry.cc        |  11 +-
 src/lib/cpp/cpu_seq/geometry.cc    | 179 ++++++++++++++---------------
 src/lib/cpp/gpu/geometry.cc        | 102 +++++++++++++++-
 src/lib/cpp/include/boilerplate.hh |   2 +
 src/lib/cpp/include/geometry.hh    |  41 ++++++-
 5 files changed, 234 insertions(+), 101 deletions(-)

diff --git a/src/lib/cpp/cpu/geometry.cc b/src/lib/cpp/cpu/geometry.cc
index c3f7cac..879ca9f 100644
--- a/src/lib/cpp/cpu/geometry.cc
+++ b/src/lib/cpp/cpu/geometry.cc
@@ -14,8 +14,15 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> &mask) {
     return cpu_seq::center_of_mass(mask);
 }
 
-bool in_bbox(float U, float V, float W, const std::array<float, 6> &bbox) {
-    return cpu_seq::in_bbox(U, V, W, bbox);
+void fill_implant_mask(const input_ndarray<mask_type> mask,
+               float voxel_size,
+               const array<float,6> &bbox,
+               float r_fraction,
+               const matrix4x4 &Muvw,
+               output_ndarray<mask_type> solid_implant_mask,
+               output_ndarray<float> rsqr_maxs,
+               output_ndarray<float> profile) {
+    return cpu_seq::fill_implant_mask(mask, voxel_size, bbox, r_fraction, Muvw, solid_implant_mask, rsqr_maxs, profile);
 }
 
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array<real_t,3> &cm) {
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index f756136..1fd5546 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -39,21 +39,93 @@ array<real_t, 3> center_of_mass(const input_ndarray<mask_type> &mask) {
     return array<real_t, 3>{ rcmz, rcmy, rcmx };
 }
 
-bool in_bbox(float U, float V, float W, const std::array<float, 6> &bbox) {
-    const auto& [U_min, U_max, V_min, V_max, W_min, W_max] = bbox;
+void fill_implant_mask(const input_ndarray<mask_type> mask,
+               float voxel_size,
+               const array<float,6> &bbox,
+               float r_fraction,
+               const matrix4x4 &Muvw,
+               output_ndarray<mask_type> solid_implant_mask,
+               output_ndarray<float> rsqr_maxs,
+               output_ndarray<float> profile) {
+    UNPACK_NUMPY(mask)
+
+    real_t theta_min = real_t(M_PI), theta_max = real_t(-M_PI);
+    ssize_t n_segments = rsqr_maxs.shape[0];
+    const auto [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
+    float     *rsqr_maxs_d     = rsqr_maxs.data;
+    float     *profile_d       = profile.data;
+
+
+    //BLOCK_BEGIN(mask, ) {
+    #pragma omp parallel for collapse(3)
+    for (int64_t z = 0; z < mask_Nz; z++) { for (int64_t y = 0; y < mask_Ny; y++) { for (int64_t x = 0; x < mask_Nx; x++) {
+        //mask_type *solid_mask_buffer = solid_implant_mask.data + mask_buffer_start;
 
-    bool inside =
-        U >= U_min &&
-        U <= U_max &&
-        V >= V_min &&
-        V <= V_max &&
-        W >= W_min &&
-        W <= W_max;
+        mask_type mask_value = mask.data[z*mask_Ny*mask_Nx + y*mask_Nx + x];
+        std::array<real_t, 4> Xs = {
+            real_t(x) * voxel_size,
+            real_t(y) * voxel_size,
+            real_t(z) * voxel_size,
+            1 };
 
-    // printf("in_bbox: (%.1f,%.1f,%.1f) \in ([%.1f,%.1f],[%.1f,%.1f],[%.1f,%.1f]) == %d\n",
-    //      U,V,W,U_min,U_max,V_min,V_max,U_min,U_max,inside);
+        if (mask_value) {
+            auto [U,V,W,c] = hom_transform(Xs, Muvw);
+
+            real_t r_sqr = V*V+W*W;
+            real_t theta = atan2(V,W);
+
+            int U_i = int(floor((U-U_min)*real_t(n_segments-1)/(U_max-U_min)));
+
+        //    if (U_i >= 0 && U_i < n_segments) {
+            if ( in_bbox(U,V,W,bbox) ) {
+                rsqr_maxs_d[U_i] = max(rsqr_maxs_d[U_i], float(r_sqr));
+                theta_min = min(theta_min, theta);
+                theta_max = max(theta_max, theta);
+            //      W_min     = min(W_min,     W);
+            } else {
+                // Otherwise we've calculated it wrong!
+                //  fprintf(stderr,"U-coordinate out of bounds: U_i = %ld, U = %g, U_min = %g, U_max = %g\n",U_i,U,U_min,U_max);
+            }
+        }
+
+    //FOR_3D_END() }
+    }}}
+
+    double theta_center = (theta_max+theta_min)/2;
+
+    //FOR_3D_BEGIN(mask, ) {
+    #pragma omp parallel for collapse(3)
+    for (int64_t z = 0; z < mask_Nz; z++) { for (int64_t y = 0; y < mask_Ny; y++) { for (int64_t x = 0; x < mask_Nx; x++) {
+        std::array<real_t, 4> Xs = {
+            real_t(x) * voxel_size,
+            real_t(y) * voxel_size,
+            real_t(z) * voxel_size,
+            1 };
+        int64_t flat_index = z*mask_Ny*mask_Nx + y*mask_Nx + x;
+        mask_type mask_value = mask.data[flat_index];
+
+        // Second pass does the actual work
+        auto [U,V,W,c] = hom_transform(Xs,Muvw);
+        float r_sqr = V*V+W*W;
+        float theta = atan2(V,W);
+        int U_i = int(floor((U-U_min)*real_t(n_segments-1)/(U_max-U_min)));
+
+        bool solid_mask_value = false;
+        if (U_i >= 0 && U_i < n_segments && W >= W_min) { // TODO: Full bounding box check?
+            solid_mask_value = mask_value | (r_sqr <= r_fraction*rsqr_maxs_d[U_i]);
+
+            if (theta >= theta_min && theta <= theta_center && r_sqr <= rsqr_maxs_d[U_i]) {
+                ATOMIC()
+                profile_d[U_i] += solid_mask_value;
+            }
+        }
+
+        solid_implant_mask.data[flat_index] = solid_mask_value;
+
+    //BLOCK_END() }
+    //FOR_3D_END() }
+    }}}
 
-    return inside;
 }
 
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array<real_t,3> &cm) {
@@ -277,89 +349,6 @@ void zero_outside_bbox(const array<real_t,9> &principal_axes,
 }
 
 /*
-inline vector4 hom_transform(const vector4 &x, const matrix4x4 &M) {
-    vector4 c{{0,0,0,0}};
-
-    for (int i = 0; i < 4; i++) {
-        real_t sum = 0;
-        #pragma simd parallel for reduction(+:sum)
-        for (int j=0;j<4;j++)
-            sum += M[i*4+j]*x[j];
-        c[i] = sum;
-    }
-    return c;
-}
-
-void fill_implant_mask(const input_ndarray<mask_type> implant_mask,
-               float voxel_size,
-               const array<float,6> &bbox,
-               float r_fraction,
-               const matrix4x4 &Muvw,
-               output_ndarray<mask_type> solid_implant_mask,
-               output_ndarray<float> rsqr_maxs,
-               output_ndarray<float> profile) {
-    real_t theta_min = M_PI, theta_max = -M_PI;
-    ssize_t n_segments = rsqr_maxs.shape[0];
-    const auto [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
-
-    printf("implant_mask.shape = %ld,%ld,%ld\n",implant_mask.shape[0],implant_mask.shape[1],implant_mask.shape[2]);
-    printf("solid_implant_mask.shape = %ld,%ld,%ld\n",solid_implant_mask.shape[0],solid_implant_mask.shape[1],solid_implant_mask.shape[2]);
-
-    fprintf(stderr,"voxel_size = %g, U_min = %g, U_max = %g, r_frac = %g, n_segments = %ld\n",
-        voxel_size, U_min, U_max, r_fraction, n_segments);
-
-    float     *rsqr_maxs_d     = rsqr_maxs.data;
-    float     *profile_d       = profile.data;
-
-    // First pass computes some bounds -- possibly separate out to avoid repeating
-    //loop_mask_start(implant_mask, solid_implant_mask, (maskin_buffer[:this_block_length], rsqr_maxs_d[:n_segments], Muvw[:16], bbox[:6]) );
-    if (mask_value) {
-        auto [U,V,W,c] = hom_transform(Xs,Muvw);
-
-        real_t r_sqr = V*V+W*W;
-        real_t theta = atan2(V,W);
-
-        int U_i = floor((U-U_min)*(n_segments-1)/(U_max-U_min));
-
-        //    if (U_i >= 0 && U_i < n_segments) {
-        if ( in_bbox(U,V,W,bbox) ) {
-            rsqr_maxs_d[U_i] = max(rsqr_maxs_d[U_i], float(r_sqr));
-            theta_min = min(theta_min, theta);
-            theta_max = max(theta_max, theta);
-            //      W_min     = min(W_min,     W);
-        } else {
-            // Otherwise we've calculated it wrong!
-            //  fprintf(stderr,"U-coordinate out of bounds: U_i = %ld, U = %g, U_min = %g, U_max = %g\n",U_i,U,U_min,U_max);
-        }
-    }
-    //loop_mask_end(implant_mask);
-
-    double theta_center = (theta_max+theta_min)/2;
-
-    fprintf(stderr,"theta_min, theta_center, theta_max = %g,%g,%g\n", theta_min, theta_center, theta_max);
-
-    // Second pass does the actual work
-    //loop_mask_start(implant_mask, solid_implant_mask,
-            (rsqr_maxs_d[:n_segments], profile_d[:n_segments]) );
-    auto [U,V,W,c] = hom_transform(Xs,Muvw);
-    float r_sqr = V*V+W*W;
-    float theta = atan2(V,W);
-    int U_i = floor((U-U_min)*(n_segments-1)/(U_max-U_min));
-
-    bool solid_mask_value = false;
-    if (U_i >= 0 && U_i < n_segments && W >= W_min) { // TODO: Full bounding box check?
-        solid_mask_value = mask_value | (r_sqr <= r_fraction*rsqr_maxs_d[U_i]);
-
-        if (theta >= theta_min && theta <= theta_center && r_sqr <= rsqr_maxs_d[U_i]) {
-            //atomic_statement()
-            profile_d[U_i] += solid_mask_value;
-        }
-    }
-    maskout_buffer[k] = solid_mask_value;
-
-    //loop_mask_end(implant_mask);
-}
-
 void compute_front_mask(const input_ndarray<mask_type> solid_implant,
         const float voxel_size,
         const matrix4x4 &Muvw,
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index c70b867..5267619 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -13,8 +13,106 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> &mask) {
     return cpu_seq::center_of_mass(mask);
 }
 
-bool in_bbox(float U, float V, float W, const std::array<float, 6> &bbox) {
-    return cpu_seq::in_bbox(U, V, W, bbox);
+void fill_implant_mask(const input_ndarray<mask_type> mask,
+               float voxel_size,
+               const array<float,6> &bbox,
+               float r_fraction,
+               const matrix4x4 &Muvw,
+               output_ndarray<mask_type> solid_implant_mask,
+               output_ndarray<float> rsqr_maxs,
+               output_ndarray<float> profile) {
+    UNPACK_NUMPY(mask)
+
+    real_t theta_min = real_t(M_PI), theta_max = real_t(-M_PI);
+    ssize_t n_segments = rsqr_maxs.shape[0];
+    const auto [U_min, U_max, V_min, V_max, W_min, W_max] = bbox;
+    float     *rsqr_maxs_d     = rsqr_maxs.data;
+    float     *profile_d       = profile.data;
+
+    #pragma acc data copyin(U_min) create(rsqr_maxs_d[:n_segments], profile_d[:n_segments]) copyout(rsqr_maxs_d[:n_segments], profile_d[:n_segments])
+    {
+        for (int64_t mask_buffer_start = 0; mask_buffer_start < mask_length; mask_buffer_start += acc_block_size<mask_type>) {
+            ssize_t mask_buffer_length = min(acc_block_size<mask_type>, mask_length-mask_buffer_start);
+            mask_type *mask_buffer = (mask_type *) mask.data + mask_buffer_start;
+            #pragma acc data copy(mask_buffer[:mask_buffer_length])
+            {
+                #pragma acc parallel loop
+                for (int64_t flat_index = 0; flat_index < mask_buffer_length; flat_index++) {
+                    int64_t
+                        global_index = mask_buffer_start + flat_index,
+                        z = global_index / (mask_Ny * mask_Nx),
+                        y = (global_index / mask_Nx) % mask_Ny,
+                        x = global_index % mask_Nx;
+                    mask_type mask_value = mask_buffer[flat_index];
+                    std::array<real_t, 4> Xs = {
+                        real_t(x) * voxel_size,
+                        real_t(y) * voxel_size,
+                        real_t(z) * voxel_size,
+                        1 };
+
+                    if (mask_value) {
+                        auto [U,V,W,c] = hom_transform(Xs, Muvw);
+
+                        real_t r_sqr = V*V+W*W;
+                        real_t theta = atan2(V,W);
+
+                        int U_i = int(floor((U-U_min)*real_t(n_segments-1)/(U_max-U_min)));
+
+                        if ( in_bbox(U,V,W,bbox) ) {
+                            rsqr_maxs_d[U_i] = max(rsqr_maxs_d[U_i], float(r_sqr));
+                            theta_min = min(theta_min, theta);
+                            theta_max = max(theta_max, theta);
+                        } else {
+                            // Otherwise we've calculated it wrong!
+                        }
+                    }
+                }
+            }
+        }
+
+        double theta_center = (theta_max+theta_min)/2;
+
+        for (int64_t mask_buffer_start = 0; mask_buffer_start < mask_length; mask_buffer_start += acc_block_size<mask_type>) {
+            mask_type *mask_buffer = (mask_type *) mask.data + mask_buffer_start;
+            ssize_t mask_buffer_length = min(acc_block_size<mask_type>, mask_length-mask_buffer_start);
+            mask_type *solid_mask_buffer = solid_implant_mask.data + mask_buffer_start;
+            #pragma acc data copy(mask_buffer[:mask_buffer_length]) create(solid_mask_buffer[:mask_buffer_length]) copyout(solid_mask_buffer[:mask_buffer_length])
+            {
+                #pragma acc parallel loop
+                for (int64_t flat_index = 0; flat_index < mask_buffer_length; flat_index++) {
+                    int64_t
+                        global_index = mask_buffer_start + flat_index,
+                        z = global_index / (mask_Ny * mask_Nx),
+                        y = (global_index / mask_Nx) % mask_Ny,
+                        x = global_index % mask_Nx;
+                    mask_type mask_value = mask_buffer[flat_index];
+                    std::array<real_t, 4> Xs = {
+                        real_t(x) * voxel_size,
+                        real_t(y) * voxel_size,
+                        real_t(z) * voxel_size,
+                        1 };
+
+                    // Second pass does the actual work
+                    auto [U,V,W,c] = hom_transform(Xs,Muvw);
+                    float r_sqr = V*V+W*W;
+                    float theta = atan2(V,W);
+                    int U_i = int(floor((U-U_min)*real_t(n_segments-1)/(U_max-U_min)));
+
+                    bool solid_mask_value = false;
+                    if (U_i >= 0 && U_i < n_segments && W >= W_min) { // TODO: Full bounding box check?
+                        solid_mask_value = mask_value | (r_sqr <= r_fraction*rsqr_maxs_d[U_i]);
+
+                        if (theta >= theta_min && theta <= theta_center && r_sqr <= rsqr_maxs_d[U_i]) {
+                            ATOMIC()
+                            profile_d[U_i] += solid_mask_value;
+                        }
+                    }
+
+                    solid_mask_buffer[flat_index] = solid_mask_value;
+                }
+            }
+        }
+    }
 }
 
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array<real_t,3> &cm) {
diff --git a/src/lib/cpp/include/boilerplate.hh b/src/lib/cpp/include/boilerplate.hh
index 00b88d5..c0ba698 100644
--- a/src/lib/cpp/include/boilerplate.hh
+++ b/src/lib/cpp/include/boilerplate.hh
@@ -77,6 +77,7 @@
 #ifdef _OPENMP // Should also capture OpenACC, which is why it's second.
 #define BLOCK_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
     ARR##_type *ARR##_buffer = (ARR##_type *) ARR.data; \
+    __attribute__((unused)) int64_t ARR##_buffer_start = 0; \
     FOR_3D_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
     int64_t flat_index = z*ARR##_Ny*ARR##_Nx + y*ARR##_Nx + x;
 
@@ -85,6 +86,7 @@
 #define BLOCK_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
     int64_t flat_index = 0; \
     ARR##_type *ARR##_buffer = (ARR##_type *) ARR.data; \
+    __attribute__((unused)) int64_t ARR##_buffer_start = 0; \
     FOR_3D_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE)
 
 #define BLOCK_END() \
diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
index 7635d80..c168ed4 100644
--- a/src/lib/cpp/include/geometry.hh
+++ b/src/lib/cpp/include/geometry.hh
@@ -9,12 +9,42 @@ using namespace std;
 
 #define dot(a,b) (a[0]*b[0] + a[1]*b[1] + a[2]*b[2])
 
-void print_timestamp(string message) {
+inline void print_timestamp(string message) {
     auto now = chrono::system_clock::to_time_t(chrono::system_clock::now());
     tm local_tm = *localtime(&now);
     fprintf(stderr,"%s at %02d:%02d:%02d\n", message.c_str(), local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec);
 }
 
+inline vector4 hom_transform(const vector4 &x, const matrix4x4 &M) {
+    vector4 c{{ 0, 0, 0, 0 }};
+
+    for (int i = 0; i < 4; i++) {
+        real_t sum = 0;
+        #pragma simd parallel for reduction(+:sum)
+        for (int j = 0; j < 4; j++)
+            sum += M[i*4 + j] * x[j];
+        c[i] = sum;
+    }
+    return c;
+}
+
+inline bool in_bbox(float U, float V, float W, const std::array<float, 6> &bbox) {
+    const auto& [U_min, U_max, V_min, V_max, W_min, W_max] = bbox;
+
+    bool inside =
+        U >= U_min &&
+        U <= U_max &&
+        V >= V_min &&
+        V <= V_max &&
+        W >= W_min &&
+        W <= W_max;
+
+    // printf("in_bbox: (%.1f,%.1f,%.1f) \in ([%.1f,%.1f],[%.1f,%.1f],[%.1f,%.1f]) == %d\n",
+    //      U,V,W,U_min,U_max,V_min,V_max,U_min,U_max,inside);
+
+    return inside;
+}
+
 namespace NS {
 
 /*
@@ -25,7 +55,14 @@ Computes the center of mass of the given tomography.
 */
 array<real_t,3> center_of_mass(const input_ndarray<mask_type> &voxels);
 
-bool in_bbox(float U, float V, float W, const std::array<float, 6> &bbox);
+void fill_implant_mask(const input_ndarray<mask_type> implant_mask,
+               float voxel_size,
+               const array<float,6> &bbox,
+               float r_fraction,
+               const matrix4x4 &Muvw,
+               output_ndarray<mask_type> solid_implant_mask,
+               output_ndarray<float> rsqr_maxs,
+               output_ndarray<float> profile);
 
 /*
 Computes the inertia matrix of the given tomography based of the given center of mass.

From caf7e60ee71462ececa380df9a6d032f82fab076 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 6 Mar 2023 15:18:43 +0100
Subject: [PATCH 109/136] #25 Added test for fill_implant_mask

---
 src/pybind/geometry-pybind.cc |  6 +++---
 src/test/test_geometry.py     | 37 ++++++++++++++++++++++++++++++++++-
 2 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index d22a2a9..ac3116b 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -57,7 +57,6 @@ void zero_outside_bbox(const array<real_t,9> &principal_axes,
               {voxels_info.ptr, voxels_info.shape});
 }
 
-/*
 void fill_implant_mask(const np_maskarray implant_mask,
                float voxel_size,
                const array<float,6> &bbox,
@@ -72,7 +71,7 @@ void fill_implant_mask(const np_maskarray implant_mask,
         rsqr_info          = rsqr_maxs.request(),
         profile_info       =  profile.request();
 
-    return fill_implant_mask({implant_info.ptr,       implant_info.shape},
+    return NS::fill_implant_mask({implant_info.ptr,       implant_info.shape},
                  voxel_size, bbox, r_fraction, Muvw,
                  {solid_implant_info.ptr, solid_implant_info.shape},
                  {rsqr_info.ptr,          rsqr_info.shape},
@@ -80,6 +79,7 @@ void fill_implant_mask(const np_maskarray implant_mask,
                  );
 }
 
+/*
 void compute_front_mask(const np_array<uint8_t> &np_solid_implant,
         const float voxel_size,
         const matrix4x4 &Muvw,
@@ -124,7 +124,7 @@ PYBIND11_MODULE(geometry, m) {
     m.def("inertia_matrix",       &python_api::inertia_matrix);
     m.def("integrate_axes",       &python_api::integrate_axes);
     m.def("zero_outside_bbox",    &python_api::zero_outside_bbox);
-    //m.def("fill_implant_mask",    &python_api::fill_implant_mask);
+    m.def("fill_implant_mask",    &python_api::fill_implant_mask);
     //m.def("cylinder_projection",  &python_api::cylinder_projection);
     m.def("sample_plane",         &python_api::sample_plane<uint16_t>);
     m.def("sample_plane",         &python_api::sample_plane<uint8_t>);
diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index dae5f56..6525151 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -217,9 +217,44 @@ def test_zero_outside_bbox():
 
     compare_fs('zero_outside_bbox', cpu_seq, cpu, gpu, True, 1e-7, voxels)
 
+def test_fill_implant_mask():
+    n = 128
+    dtype = np.uint8
+    implant = np.random.randint(0, np.iinfo(dtype).max, (n,n,n), dtype)
+    # Values hardcoded from running 770c_pag on processing_steps/0800_implant_data.py
+    voxel_size = 3.75
+    bbox_flat = (-3041.39336716053, 2955.146870664342, -1743.0321403974565, 1744.4435665884819, 367.6267143127782, 1764.022543822563)
+    rsqr_fraction = 0.7
+    Muvwp_flat = (-0.9969205263686536, -0.07827989472162836, 0.004660706729396567, 3351.6367031993477, -0.004165804965960026, -0.006484313676985426, -0.9999702066630287, 3287.1018168847136, -0.07830654571765466, 0.996894476384658, -0.006138149566672908, -1739.8123507003322, 0.0, 0.0, 0.0, 1.0)
+    n_bins = 1024
+
+    solid_implant_mask = np.zeros(implant.shape, np.uint8)
+    rsqr_maxs = np.zeros((n_bins, ), np.uint8)
+    profile = np.zeros((n_bins, ), np.uint8)
+
+    impls = [m_cpu_seq, m_cpu, m_gpu]
+    result_solid_implant_mask = [solid_implant_mask.copy() for _ in impls]
+    result_rsqr_maxs = [rsqr_maxs.copy() for _ in impls]
+    result_profile = [profile.copy() for _ in impls]
+    cpu_seq, cpu, gpu = [
+        partial(impl.fill_implant_mask, implant, voxel_size, bbox_flat, rsqr_fraction, Muvwp_flat, solid_implant_mask, rsqr_maxs, profile)
+        for i, impl in enumerate(impls)
+    ]
+
+    compare_fs('test_fill_implant_mask', cpu_seq, cpu, gpu, False)
+
+    assert_with_print(result_solid_implant_mask[0], result_solid_implant_mask[1], 1e-7, "cpu_seq vs cpu")
+    assert_with_print(result_solid_implant_mask[0], result_solid_implant_mask[2], 1e-7, "cpu_seq vs gpu")
+    assert_with_print(result_rsqr_maxs[0], result_rsqr_maxs[1], 1e-7, "cpu_seq vs cpu")
+    assert_with_print(result_rsqr_maxs[0], result_rsqr_maxs[2], 1e-7, "cpu_seq vs gpu")
+    assert_with_print(result_profile[0], result_profile[1], 1e-7, "cpu_seq vs cpu")
+    assert_with_print(result_profile[0], result_profile[2], 1e-7, "cpu_seq vs gpu")
+
+
 if __name__ == '__main__':
     test_center_of_mass()
     test_inertia_matrix()
     test_sample_plane(np.uint8)
     test_integrate_axes()
-    test_zero_outside_bbox()
\ No newline at end of file
+    test_zero_outside_bbox()
+    test_fill_implant_mask()
\ No newline at end of file

From 3a35e5f45ee5a68aa94d95a1de03511a80223428 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Tue, 7 Mar 2023 13:33:50 +0100
Subject: [PATCH 110/136] #25 Fixed integrate axes not returning anything

---
 src/lib/cpp/cpu/geometry.cc      |  2 +-
 src/lib/cpp/cpu_seq/geometry.cc  | 14 +++++++++-----
 src/lib/cpp/gpu/geometry.cc      |  2 +-
 src/lib/cpp/include/datatypes.hh |  4 ++--
 src/lib/cpp/include/geometry.hh  |  2 +-
 src/pybind/geometry-pybind.cc    |  6 +++---
 src/test/test_geometry.py        | 14 ++++++++++++--
 7 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/src/lib/cpp/cpu/geometry.cc b/src/lib/cpp/cpu/geometry.cc
index 879ca9f..68af2ce 100644
--- a/src/lib/cpp/cpu/geometry.cc
+++ b/src/lib/cpp/cpu/geometry.cc
@@ -34,7 +34,7 @@ void integrate_axes(const input_ndarray<mask_type> &mask,
 		    const array<real_t,3> &v_axis,
 		    const array<real_t,3> &w_axis,
 		    const real_t v_min, const real_t w_min,
-		    output_ndarray<real_t> output) {
+		    output_ndarray<uint64_t> output) {
     return cpu_seq::integrate_axes(mask, x0, v_axis, w_axis, v_min, w_min, output);
 }
 
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 1fd5546..6a79125 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -279,10 +279,10 @@ void integrate_axes(const input_ndarray<mask_type> &mask,
 		    const array<real_t,3> &v_axis,
 		    const array<real_t,3> &w_axis,
 		    const real_t v_min, const real_t w_min,
-		    output_ndarray<real_t> output) {
+		    output_ndarray<uint64_t> output) {
     UNPACK_NUMPY(mask);
     ssize_t Nv = output.shape[0], Nw = output.shape[1];
-    real_t *output_data = output.data;
+    uint64_t *output_data = output.data;
 
     // TODO: Check v_axis & w_axis projections to certify bounds and get rid of runtime check
     #pragma acc data create(output_data[:Nv*Nw]) copyout(output_data[:Nv*Nw])
@@ -297,10 +297,14 @@ void integrate_axes(const input_ndarray<mask_type> &mask,
                 real_t(z) - x0[2]
             };
 
-            real_t v = dot(xs,v_axis), w = dot(xs,w_axis);
-            int64_t i_v = int64_t(round(v-v_min)), j_w = int64_t(round(w-w_min));
+            real_t
+                v = dot(xs, v_axis),
+                w = dot(xs, w_axis);
+            int64_t
+                i_v = int64_t(round(v - v_min)),
+                j_w = int64_t(round(w - w_min));
 
-            if(i_v >= 0 && j_w >= 0 && i_v < Nv && j_w < Nw){
+            if (i_v >= 0 && j_w >= 0 && i_v < Nv && j_w < Nw) {
                 ATOMIC()
                 output_data[i_v*Nw + j_w] += voxel;
             }
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index 5267619..a22cb41 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -124,7 +124,7 @@ void integrate_axes(const input_ndarray<mask_type> &mask,
 		    const array<real_t,3> &v_axis,
 		    const array<real_t,3> &w_axis,
 		    const real_t v_min, const real_t w_min,
-		    output_ndarray<real_t> output) {
+		    output_ndarray<uint64_t> output) {
     return cpu_seq::integrate_axes(mask, x0, v_axis, w_axis, v_min, w_min, output);
 }
 
diff --git a/src/lib/cpp/include/datatypes.hh b/src/lib/cpp/include/datatypes.hh
index b91fc78..cf37cef 100644
--- a/src/lib/cpp/include/datatypes.hh
+++ b/src/lib/cpp/include/datatypes.hh
@@ -29,8 +29,8 @@ typedef float gauss_type;
 typedef float real_t;
 
 namespace py = pybind11;
-template <typename voxel_type>
-using np_array = py::array_t<voxel_type, py::array::c_style | py::array::forcecast>;
+template <typename T>
+using np_array = py::array_t<T, py::array::c_style | py::array::forcecast>;
 
 typedef py::array_t<mask_type, py::array::c_style | py::array::forcecast> np_maskarray;
 typedef py::array_t<real_t,    py::array::c_style | py::array::forcecast> np_realarray;
diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
index c168ed4..8bd5cd1 100644
--- a/src/lib/cpp/include/geometry.hh
+++ b/src/lib/cpp/include/geometry.hh
@@ -78,7 +78,7 @@ void integrate_axes(const input_ndarray<mask_type> &mask,
 		    const array<real_t,3> &v_axis,
 		    const array<real_t,3> &w_axis,
 		    const real_t v_min, const real_t w_min,
-		    output_ndarray<real_t> output);
+		    output_ndarray<uint64_t> output);
 
 template <typename T>
 float resample2x2x2(const T *voxels,
diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index ac3116b..5c50f69 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -21,7 +21,7 @@ void sample_plane(const np_array<T> &np_voxels,
           const array<real_t,3> u_axis,
           const array<real_t,3> v_axis,
           const array<real_t,4>  bbox,    // [umin,umax,vmin,vmax] in micrometers
-          np_array<real_t> np_plane_samples) {
+          np_array<real_t> &np_plane_samples) {
     auto voxels_info = np_voxels.request();
     auto plane_samples_info  = np_plane_samples.request();
 
@@ -35,9 +35,9 @@ void integrate_axes(const np_maskarray &np_voxels,
             const array<real_t,3> &v_axis,
             const array<real_t,3> &w_axis,
             const real_t v_min, const real_t w_min,
-            np_realarray &output) {
+            np_array<uint64_t> &output) {
     auto voxels_info = np_voxels.request();
-    auto output_info  = output.request();
+    auto output_info = output.request();
 
     NS::integrate_axes({voxels_info.ptr, voxels_info.shape},
              x0,v_axis,w_axis,
diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index 6525151..ae43815 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -125,6 +125,9 @@ def test_integrate_axes():
 
     v_axis, w_axis = E[:,1], E[:,2]
 
+    # TODO de her kan også bruges til test:
+    v_axis, w_axis = np.array([1,0,0], np.float32), np.array([0,1,0], np.float32)
+
     (vmin,vmax), _ = axis_parameter_bounds(voxels.shape, cm, v_axis)
     (wmin,wmax), _ = axis_parameter_bounds(voxels.shape, cm, w_axis)
 
@@ -132,8 +135,15 @@ def test_integrate_axes():
         partial(impl.integrate_axes, voxels, cm, v_axis, w_axis, vmin, wmin)
         for impl in [m_cpu_seq, m_cpu, m_gpu]
     ]
-
-    compare_fs('integrate_axes', cpu_seq, cpu, gpu, True, 1e-7, ((int(vmax-vmin+2),int(wmax-wmin+2)), float))
+    #$void integrate_axes(const np_maskarray &np_voxels,
+    #$            const array<real_t,3> &x0,
+    #$            const array<real_t,3> &v_axis,
+    #$            const array<real_t,3> &w_axis,
+    #$            const real_t v_min,
+    #             const real_t w_min,
+    #$            np_realarray output) {
+
+    compare_fs('integrate_axes', cpu_seq, cpu, gpu, True, 1e-7, ((int(vmax-vmin+2),int(wmax-wmin+2)), np.uint64))
 
 def axis_parameter_bounds(shape, center, axis):
     signs = np.sign(axis)

From 7103f1d9b807f4d47d0f577a0c39236737011a01 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Tue, 7 Mar 2023 13:34:08 +0100
Subject: [PATCH 111/136] #25 Fixed incorrect GPU results for integrate axes

---
 src/lib/cpp/cpu_seq/geometry.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 6a79125..ff735fe 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -285,7 +285,7 @@ void integrate_axes(const input_ndarray<mask_type> &mask,
     uint64_t *output_data = output.data;
 
     // TODO: Check v_axis & w_axis projections to certify bounds and get rid of runtime check
-    #pragma acc data create(output_data[:Nv*Nw]) copyout(output_data[:Nv*Nw])
+    #pragma acc data copy(output_data[:Nv*Nw]) copyin(x0, v_axis, w_axis, v_min, w_min)
     {
     BLOCK_BEGIN(mask, ) {
 

From d48d75d55cda58598fffbb2493cea16aef8545e3 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Tue, 7 Mar 2023 14:09:04 +0100
Subject: [PATCH 112/136] #25 Fixed zero_outside_bbox

---
 src/lib/cpp/cpu_seq/geometry.cc | 4 +++-
 src/test/test_geometry.py       | 7 +++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index ff735fe..23cfc54 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -322,6 +322,8 @@ void zero_outside_bbox(const array<real_t,9> &principal_axes,
 
     UNPACK_NUMPY(voxels)
 
+    #pragma acc data copyin(principal_axes, parameter_ranges, cm)
+    {
     BLOCK_BEGIN(voxels, ) {
 
         real_t xs[3] = {
@@ -347,7 +349,7 @@ void zero_outside_bbox(const array<real_t,9> &principal_axes,
             voxels_buffer[flat_index] = 0;
 
     BLOCK_END() }
-
+    }
 }
 
 }
diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index ae43815..a67e6af 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -215,10 +215,9 @@ def test_zero_outside_bbox():
     fine_scale = 2
     mmtofi = 1 / (voxelsize * fine_scale) # Conversion factor from micrometers to index
 
-    implant_bound = bounding_volume(voxels, voxelsize*coarse_scale)
-    uvw_axes   = implant_bound["principal_axes"]
-    uvw_ranges = implant_bound["principal_axes_ranges"] * mmtofi
-    cm         = implant_bound["centre_of_mass"] * mmtofi
+    uvw_axes = np.array([[1,0,0],[0,1,0],[0,0,1]], np.float32)
+    uvw_ranges = np.array([-16,16]*3, np.float32)
+    cm = np.array(m_cpu.center_of_mass(voxels))
 
     cpu_seq, cpu, gpu = [
         partial(impl.zero_outside_bbox, uvw_axes.flatten(), uvw_ranges.flatten(), cm)

From 066778c4630ed381e45dc4cd9f4d6ae922e63707 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Tue, 7 Mar 2023 14:14:46 +0100
Subject: [PATCH 113/136] #25 Added additional checking of the results in
 test_geometry

---
 src/test/test_geometry.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index a67e6af..0caba8b 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -16,8 +16,12 @@
 #n = 2344 # ~12 GB, used for testing whether blocked works.
 n = 128
 
+def assert_interesting_result(result):
+    checksum = result.sum() if type(result) is np.ndarray else sum(result)
+    assert (checksum < 0 or checksum > 0) # Sanity check that there's an actual result to compare to.
+
 def assert_with_print(a, b, tolerance=1e-7, names=None):
-    na, nb = np.array(a), np.array(b)
+    na, nb = np.array(a, dtype=np.float64), np.array(b, dtype=np.float64)
     nabs = np.abs(na - nb)
     all_close = np.alltrue(nabs < tolerance)
     if not all_close:
@@ -25,6 +29,8 @@ def assert_with_print(a, b, tolerance=1e-7, names=None):
         print ('b', nb)
         print ('absolute error (AE) (abs(a-b))', nabs)
         print ('AE sum', np.sum(nabs))
+        suma, sumb = na.sum(), nb.sum()
+        print ('checksums', suma, sumb, np.abs(suma - sumb), suma / sumb)
         diffs = np.argwhere(nabs > tolerance)
         print (f'differing on {diffs.shape} elements')
         for i in diffs[:5]: # Only print 5 first
@@ -59,6 +65,7 @@ def compare_fs(func, baseline_f, cpu_f, gpu_f, should_assert=True, tolerance=1e-
                allocate_result: tuple[tuple[int],np.dtype] | np.ndarray=None):
     baseline, baseline_t = run_with_warmup(baseline_f, allocate_result)
     print (f'({func}) Sequential ran in {baseline_t}')
+    if should_assert: assert_interesting_result(baseline)
 
     cpu, cpu_t = run_with_warmup(cpu_f, allocate_result)
     print (f'({func}) Parallel CPU ran in {cpu_t}, which is {baseline_t / cpu_t:.02f} times faster than sequential')
@@ -90,6 +97,8 @@ def test_inertia_matrix():
     # TODO assert disabled due to floating point associativity error accumulation
     compare_fs('inertia_matrix', baseline, cpu, gpu, should_assert=False)
 
+    assert_interesting_result(baseline())
+
 @pytest.mark.parametrize("dtype", [np.uint8, np.uint16])
 def test_sample_plane(dtype):
     # TODO something that isn't just random data?

From ce816048f5b5defeee52a7b16eae844611bca676 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Wed, 8 Mar 2023 13:39:01 +0100
Subject: [PATCH 114/136] #25 Fixed fill_implant_mask not giving any results

---
 src/lib/cpp/cpu_seq/geometry.cc | 51 ++++++++++++++++-----------------
 src/lib/cpp/gpu/geometry.cc     | 32 +++++++++++++--------
 src/lib/cpp/include/geometry.hh |  1 -
 src/test/test_geometry.py       | 35 ++++++++++++++--------
 4 files changed, 67 insertions(+), 52 deletions(-)

diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 23cfc54..826be41 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -56,11 +56,10 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
     float     *profile_d       = profile.data;
 
 
-    //BLOCK_BEGIN(mask, ) {
-    #pragma omp parallel for collapse(3)
-    for (int64_t z = 0; z < mask_Nz; z++) { for (int64_t y = 0; y < mask_Ny; y++) { for (int64_t x = 0; x < mask_Nx; x++) {
-        //mask_type *solid_mask_buffer = solid_implant_mask.data + mask_buffer_start;
-
+    #pragma omp parallel for collapse(3) reduction(max:rsqr_maxs_d[:n_segments], theta_max) reduction(min:theta_min)
+    for (int64_t z = 0; z < mask_Nz; z++) {
+        for (int64_t y = 0; y < mask_Ny; y++) {
+            for (int64_t x = 0; x < mask_Nx; x++) {
         mask_type mask_value = mask.data[z*mask_Ny*mask_Nx + y*mask_Nx + x];
         std::array<real_t, 4> Xs = {
             real_t(x) * voxel_size,
@@ -71,13 +70,13 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
         if (mask_value) {
             auto [U,V,W,c] = hom_transform(Xs, Muvw);
 
-            real_t r_sqr = V*V+W*W;
-            real_t theta = atan2(V,W);
+                    real_t r_sqr = V*V + W*W;
+                    real_t theta = atan2(V, W);
 
-            int U_i = int(floor((U-U_min)*real_t(n_segments-1)/(U_max-U_min)));
+                    int U_i = int(floor((U - U_min) * real_t(n_segments-1) / (U_max - U_min)));
 
         //    if (U_i >= 0 && U_i < n_segments) {
-            if ( in_bbox(U,V,W,bbox) ) {
+                    if ( in_bbox(U, V, W, bbox) ) {
                 rsqr_maxs_d[U_i] = max(rsqr_maxs_d[U_i], float(r_sqr));
                 theta_min = min(theta_min, theta);
                 theta_max = max(theta_max, theta);
@@ -87,15 +86,16 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
                 //  fprintf(stderr,"U-coordinate out of bounds: U_i = %ld, U = %g, U_min = %g, U_max = %g\n",U_i,U,U_min,U_max);
             }
         }
+            }
+        }
+    }
 
-    //FOR_3D_END() }
-    }}}
-
-    double theta_center = (theta_max+theta_min)/2;
+    real_t theta_center = (theta_max + theta_min) / 2;
 
-    //FOR_3D_BEGIN(mask, ) {
-    #pragma omp parallel for collapse(3)
-    for (int64_t z = 0; z < mask_Nz; z++) { for (int64_t y = 0; y < mask_Ny; y++) { for (int64_t x = 0; x < mask_Nx; x++) {
+    #pragma omp parallel for collapse(3) reduction(+:profile_d[:n_segments])
+    for (int64_t z = 0; z < mask_Nz; z++) {
+        for (int64_t y = 0; y < mask_Ny; y++) {
+            for (int64_t x = 0; x < mask_Nx; x++) {
         std::array<real_t, 4> Xs = {
             real_t(x) * voxel_size,
             real_t(y) * voxel_size,
@@ -105,27 +105,24 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
         mask_type mask_value = mask.data[flat_index];
 
         // Second pass does the actual work
-        auto [U,V,W,c] = hom_transform(Xs,Muvw);
-        float r_sqr = V*V+W*W;
-        float theta = atan2(V,W);
-        int U_i = int(floor((U-U_min)*real_t(n_segments-1)/(U_max-U_min)));
+                auto [U,V,W,c] = hom_transform(Xs, Muvw);
+                float r_sqr = V*V + W*W;
+                float theta = atan2(V, W);
+                int U_i = int(floor((U - U_min) * real_t(n_segments-1) / (U_max - U_min)));
 
         bool solid_mask_value = false;
         if (U_i >= 0 && U_i < n_segments && W >= W_min) { // TODO: Full bounding box check?
-            solid_mask_value = mask_value | (r_sqr <= r_fraction*rsqr_maxs_d[U_i]);
+                    solid_mask_value = mask_value | (r_sqr <= r_fraction * rsqr_maxs_d[U_i]);
 
             if (theta >= theta_min && theta <= theta_center && r_sqr <= rsqr_maxs_d[U_i]) {
-                ATOMIC()
                 profile_d[U_i] += solid_mask_value;
             }
         }
 
         solid_implant_mask.data[flat_index] = solid_mask_value;
-
-    //BLOCK_END() }
-    //FOR_3D_END() }
-    }}}
-
+            }
+        }
+    }
 }
 
 array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array<real_t,3> &cm) {
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index a22cb41..a78c3eb 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -29,14 +29,17 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
     float     *rsqr_maxs_d     = rsqr_maxs.data;
     float     *profile_d       = profile.data;
 
-    #pragma acc data copyin(U_min) create(rsqr_maxs_d[:n_segments], profile_d[:n_segments]) copyout(rsqr_maxs_d[:n_segments], profile_d[:n_segments])
+    #pragma acc data copyin(U_min, U_max, W_min, Muvw, mask_Nz, mask_Ny, mask_Nx, voxel_size, n_segments, bbox) copy(rsqr_maxs_d[:n_segments], profile_d[:n_segments])
+    {
+        #pragma acc data copy(theta_min, theta_max)
     {
         for (int64_t mask_buffer_start = 0; mask_buffer_start < mask_length; mask_buffer_start += acc_block_size<mask_type>) {
             ssize_t mask_buffer_length = min(acc_block_size<mask_type>, mask_length-mask_buffer_start);
             mask_type *mask_buffer = (mask_type *) mask.data + mask_buffer_start;
-            #pragma acc data copy(mask_buffer[:mask_buffer_length])
+                #pragma acc data copyin(mask_buffer_start, mask_buffer[:mask_buffer_length])
             {
-                #pragma acc parallel loop
+                    // TODO the reduction on rsqr_maxs_d kills performance, and allocates more memory than what's available on the GPU! The real solution would be using atomic, but OpenACC doesn't like it on that particular statement.
+                    #pragma acc parallel loop reduction(max:theta_max) reduction(min:theta_min) reduction(max:rsqr_maxs_d[:n_segments])
                 for (int64_t flat_index = 0; flat_index < mask_buffer_length; flat_index++) {
                     int64_t
                         global_index = mask_buffer_start + flat_index,
@@ -53,32 +56,36 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
                     if (mask_value) {
                         auto [U,V,W,c] = hom_transform(Xs, Muvw);
 
-                        real_t r_sqr = V*V+W*W;
+                            real_t r_sqr = V*V + W*W;
                         real_t theta = atan2(V,W);
 
-                        int U_i = int(floor((U-U_min)*real_t(n_segments-1)/(U_max-U_min)));
+                            int U_i = int(floor((U - U_min) * real_t(n_segments-1) / (U_max - U_min)));
 
                         if ( in_bbox(U,V,W,bbox) ) {
+                                //#pragma acc atomic update
                             rsqr_maxs_d[U_i] = max(rsqr_maxs_d[U_i], float(r_sqr));
                             theta_min = min(theta_min, theta);
                             theta_max = max(theta_max, theta);
                         } else {
                             // Otherwise we've calculated it wrong!
+                            }
                         }
                     }
                 }
             }
         }
 
-        double theta_center = (theta_max+theta_min)/2;
+        real_t theta_center = (theta_max + theta_min) / 2;
 
+        #pragma acc data copyin(theta_center)
+        {
         for (int64_t mask_buffer_start = 0; mask_buffer_start < mask_length; mask_buffer_start += acc_block_size<mask_type>) {
             mask_type *mask_buffer = (mask_type *) mask.data + mask_buffer_start;
             ssize_t mask_buffer_length = min(acc_block_size<mask_type>, mask_length-mask_buffer_start);
             mask_type *solid_mask_buffer = solid_implant_mask.data + mask_buffer_start;
             #pragma acc data copy(mask_buffer[:mask_buffer_length]) create(solid_mask_buffer[:mask_buffer_length]) copyout(solid_mask_buffer[:mask_buffer_length])
             {
-                #pragma acc parallel loop
+                    #pragma acc parallel loop // reduction(+:profile_d[:n_segments])
                 for (int64_t flat_index = 0; flat_index < mask_buffer_length; flat_index++) {
                     int64_t
                         global_index = mask_buffer_start + flat_index,
@@ -93,14 +100,14 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
                         1 };
 
                     // Second pass does the actual work
-                    auto [U,V,W,c] = hom_transform(Xs,Muvw);
-                    float r_sqr = V*V+W*W;
-                    float theta = atan2(V,W);
-                    int U_i = int(floor((U-U_min)*real_t(n_segments-1)/(U_max-U_min)));
+                        auto [U,V,W,c] = hom_transform(Xs, Muvw);
+                        float r_sqr = V*V + W*W;
+                        float theta = atan2(V, W);
+                        int U_i = int(floor((U - U_min) * real_t(n_segments-1) / (U_max - U_min)));
 
                     bool solid_mask_value = false;
                     if (U_i >= 0 && U_i < n_segments && W >= W_min) { // TODO: Full bounding box check?
-                        solid_mask_value = mask_value | (r_sqr <= r_fraction*rsqr_maxs_d[U_i]);
+                            solid_mask_value = mask_value | (r_sqr <= r_fraction * rsqr_maxs_d[U_i]);
 
                         if (theta >= theta_min && theta <= theta_center && r_sqr <= rsqr_maxs_d[U_i]) {
                             ATOMIC()
@@ -109,6 +116,7 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
                     }
 
                     solid_mask_buffer[flat_index] = solid_mask_value;
+                    }
                 }
             }
         }
diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
index 8bd5cd1..c7ec8dc 100644
--- a/src/lib/cpp/include/geometry.hh
+++ b/src/lib/cpp/include/geometry.hh
@@ -20,7 +20,6 @@ inline vector4 hom_transform(const vector4 &x, const matrix4x4 &M) {
 
     for (int i = 0; i < 4; i++) {
         real_t sum = 0;
-        #pragma simd parallel for reduction(+:sum)
         for (int j = 0; j < 4; j++)
             sum += M[i*4 + j] * x[j];
         c[i] = sum;
diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index 0caba8b..ce7446b 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -6,9 +6,12 @@
 import cpu_seq.geometry as m_cpu_seq
 import cpu.geometry as m_cpu
 import gpu.geometry as m_gpu
+sys.path.append(sys.path[0]+'/../')
+from config.paths import hdf5_root
 
 import datetime
 from functools import partial
+import h5py
 import numpy as np
 import pytest
 
@@ -25,9 +28,9 @@ def assert_with_print(a, b, tolerance=1e-7, names=None):
     nabs = np.abs(na - nb)
     all_close = np.alltrue(nabs < tolerance)
     if not all_close:
-        print ('a', na)
-        print ('b', nb)
-        print ('absolute error (AE) (abs(a-b))', nabs)
+        #print ('a', na)
+        #print ('b', nb)
+        #print ('absolute error (AE) (abs(a-b))', nabs)
         print ('AE sum', np.sum(nabs))
         suma, sumb = na.sum(), nb.sum()
         print ('checksums', suma, sumb, np.abs(suma - sumb), suma / sumb)
@@ -238,29 +241,36 @@ def test_zero_outside_bbox():
 def test_fill_implant_mask():
     n = 128
     dtype = np.uint8
-    implant = np.random.randint(0, np.iinfo(dtype).max, (n,n,n), dtype)
-    # Values hardcoded from running 770c_pag on processing_steps/0800_implant_data.py
-    voxel_size = 3.75
-    bbox_flat = (-3041.39336716053, 2955.146870664342, -1743.0321403974565, 1744.4435665884819, 367.6267143127782, 1764.022543822563)
-    rsqr_fraction = 0.7
-    Muvwp_flat = (-0.9969205263686536, -0.07827989472162836, 0.004660706729396567, 3351.6367031993477, -0.004165804965960026, -0.006484313676985426, -0.9999702066630287, 3287.1018168847136, -0.07830654571765466, 0.996894476384658, -0.006138149566672908, -1739.8123507003322, 0.0, 0.0, 0.0, 1.0)
+    implant = np.random.randint(0, 2, (n,n,n), dtype)
+    voxel_size = 1
+    bbox_flat = np.array([-16,16] * 3, np.float32)
+    rsqr_fraction = 1#0.7
+    Muvwp_flat = np.array([
+       1, 0, 0, 0,
+       0, 1, 0, 0,
+       0, 0, 1, 0,
+       0, 0, 0, 1
+    ], np.float32)
     n_bins = 1024
 
     solid_implant_mask = np.zeros(implant.shape, np.uint8)
-    rsqr_maxs = np.zeros((n_bins, ), np.uint8)
-    profile = np.zeros((n_bins, ), np.uint8)
+    rsqr_maxs = np.zeros((n_bins, ), np.float32)
+    profile = np.zeros((n_bins, ), np.float32)
 
     impls = [m_cpu_seq, m_cpu, m_gpu]
     result_solid_implant_mask = [solid_implant_mask.copy() for _ in impls]
     result_rsqr_maxs = [rsqr_maxs.copy() for _ in impls]
     result_profile = [profile.copy() for _ in impls]
     cpu_seq, cpu, gpu = [
-        partial(impl.fill_implant_mask, implant, voxel_size, bbox_flat, rsqr_fraction, Muvwp_flat, solid_implant_mask, rsqr_maxs, profile)
+        partial(impl.fill_implant_mask, implant, voxel_size, bbox_flat, rsqr_fraction, Muvwp_flat, result_solid_implant_mask[i], result_rsqr_maxs[i], result_profile[i])
         for i, impl in enumerate(impls)
     ]
 
     compare_fs('test_fill_implant_mask', cpu_seq, cpu, gpu, False)
 
+    assert_interesting_result(result_solid_implant_mask[0])
+    assert_interesting_result(result_rsqr_maxs[0])
+    assert_interesting_result(result_profile[0])
     assert_with_print(result_solid_implant_mask[0], result_solid_implant_mask[1], 1e-7, "cpu_seq vs cpu")
     assert_with_print(result_solid_implant_mask[0], result_solid_implant_mask[2], 1e-7, "cpu_seq vs gpu")
     assert_with_print(result_rsqr_maxs[0], result_rsqr_maxs[1], 1e-7, "cpu_seq vs cpu")
@@ -270,6 +280,7 @@ def test_fill_implant_mask():
 
 
 if __name__ == '__main__':
+    np.random.seed(42)
     test_center_of_mass()
     test_inertia_matrix()
     test_sample_plane(np.uint8)

From 56ad65f0256fbc9e7110f8f766db0959d6af2a0e Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Wed, 8 Mar 2023 13:51:44 +0100
Subject: [PATCH 115/136] #25 Added implementation and test for
 geometry::compute_front_mask

---
 src/lib/cpp/cpu/geometry.cc        |   8 ++
 src/lib/cpp/cpu_seq/geometry.cc    | 157 +++++++++++++++--------------
 src/lib/cpp/gpu/geometry.cc        | 116 +++++++++++----------
 src/lib/cpp/include/boilerplate.hh |  40 ++++++++
 src/lib/cpp/include/datatypes.hh   |   2 +
 src/lib/cpp/include/geometry.hh    |   6 ++
 src/pybind/geometry-pybind.cc      |   6 +-
 src/test/test_geometry.py          |  32 +++++-
 8 files changed, 235 insertions(+), 132 deletions(-)

diff --git a/src/lib/cpp/cpu/geometry.cc b/src/lib/cpp/cpu/geometry.cc
index 68af2ce..b173519 100644
--- a/src/lib/cpp/cpu/geometry.cc
+++ b/src/lib/cpp/cpu/geometry.cc
@@ -14,6 +14,14 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> &mask) {
     return cpu_seq::center_of_mass(mask);
 }
 
+void compute_front_mask(const input_ndarray<mask_type> solid_implant,
+        const float voxel_size,
+        const matrix4x4 &Muvw,
+        std::array<float,6> bbox,
+        output_ndarray<mask_type> front_mask) {
+    return cpu_seq::compute_front_mask(solid_implant, voxel_size, Muvw, bbox, front_mask);
+}
+
 void fill_implant_mask(const input_ndarray<mask_type> mask,
                float voxel_size,
                const array<float,6> &bbox,
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 826be41..8ad180c 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -39,6 +39,33 @@ array<real_t, 3> center_of_mass(const input_ndarray<mask_type> &mask) {
     return array<real_t, 3>{ rcmz, rcmy, rcmx };
 }
 
+void compute_front_mask(const input_ndarray<mask_type> solid_implant,
+        const float voxel_size,
+        const matrix4x4 &Muvw,
+        std::array<float,6> bbox,
+        output_ndarray<mask_type> front_mask) {
+    const auto [U_min, U_max, V_min, V_max, W_min, W_max] = bbox;
+    UNPACK_NUMPY(solid_implant)
+
+    BLOCK_BEGIN_WITH_OUTPUT(solid_implant, front_mask, ) {
+
+        std::array<real_t, 4> Xs = {
+            real_t(x) * voxel_size,
+            real_t(y) * voxel_size,
+            real_t(z) * voxel_size,
+            1 };
+        mask_type mask_value = solid_implant_buffer[flat_index];
+
+        if (mask_value) {
+            front_mask_buffer[flat_index] = 0;
+        } else {
+            auto [U,V,W,c] = hom_transform(Xs, Muvw);
+            front_mask_buffer[flat_index] = W > W_min;
+        }
+
+    BLOCK_END_WITH_OUTPUT() }
+}
+
 void fill_implant_mask(const input_ndarray<mask_type> mask,
                float voxel_size,
                const array<float,6> &bbox,
@@ -60,32 +87,32 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
     for (int64_t z = 0; z < mask_Nz; z++) {
         for (int64_t y = 0; y < mask_Ny; y++) {
             for (int64_t x = 0; x < mask_Nx; x++) {
-        mask_type mask_value = mask.data[z*mask_Ny*mask_Nx + y*mask_Nx + x];
-        std::array<real_t, 4> Xs = {
-            real_t(x) * voxel_size,
-            real_t(y) * voxel_size,
-            real_t(z) * voxel_size,
-            1 };
+                mask_type mask_value = mask.data[z*mask_Ny*mask_Nx + y*mask_Nx + x];
+                std::array<real_t, 4> Xs = {
+                    real_t(x) * voxel_size,
+                    real_t(y) * voxel_size,
+                    real_t(z) * voxel_size,
+                    1 };
 
-        if (mask_value) {
-            auto [U,V,W,c] = hom_transform(Xs, Muvw);
+                if (mask_value) {
+                    auto [U,V,W,c] = hom_transform(Xs, Muvw);
 
                     real_t r_sqr = V*V + W*W;
                     real_t theta = atan2(V, W);
 
                     int U_i = int(floor((U - U_min) * real_t(n_segments-1) / (U_max - U_min)));
 
-        //    if (U_i >= 0 && U_i < n_segments) {
+                //    if (U_i >= 0 && U_i < n_segments) {
                     if ( in_bbox(U, V, W, bbox) ) {
-                rsqr_maxs_d[U_i] = max(rsqr_maxs_d[U_i], float(r_sqr));
-                theta_min = min(theta_min, theta);
-                theta_max = max(theta_max, theta);
-            //      W_min     = min(W_min,     W);
-            } else {
-                // Otherwise we've calculated it wrong!
-                //  fprintf(stderr,"U-coordinate out of bounds: U_i = %ld, U = %g, U_min = %g, U_max = %g\n",U_i,U,U_min,U_max);
-            }
-        }
+                        rsqr_maxs_d[U_i] = max(rsqr_maxs_d[U_i], float(r_sqr));
+                        theta_min = min(theta_min, theta);
+                        theta_max = max(theta_max, theta);
+                    //      W_min     = min(W_min,     W);
+                    } else {
+                        // Otherwise we've calculated it wrong!
+                        //  fprintf(stderr,"U-coordinate out of bounds: U_i = %ld, U = %g, U_min = %g, U_max = %g\n",U_i,U,U_min,U_max);
+                    }
+                }
             }
         }
     }
@@ -96,30 +123,30 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
     for (int64_t z = 0; z < mask_Nz; z++) {
         for (int64_t y = 0; y < mask_Ny; y++) {
             for (int64_t x = 0; x < mask_Nx; x++) {
-        std::array<real_t, 4> Xs = {
-            real_t(x) * voxel_size,
-            real_t(y) * voxel_size,
-            real_t(z) * voxel_size,
-            1 };
-        int64_t flat_index = z*mask_Ny*mask_Nx + y*mask_Nx + x;
-        mask_type mask_value = mask.data[flat_index];
-
-        // Second pass does the actual work
+                std::array<real_t, 4> Xs = {
+                    real_t(x) * voxel_size,
+                    real_t(y) * voxel_size,
+                    real_t(z) * voxel_size,
+                    1 };
+                int64_t flat_index = z*mask_Ny*mask_Nx + y*mask_Nx + x;
+                mask_type mask_value = mask.data[flat_index];
+
+                // Second pass does the actual work
                 auto [U,V,W,c] = hom_transform(Xs, Muvw);
                 float r_sqr = V*V + W*W;
                 float theta = atan2(V, W);
                 int U_i = int(floor((U - U_min) * real_t(n_segments-1) / (U_max - U_min)));
 
-        bool solid_mask_value = false;
-        if (U_i >= 0 && U_i < n_segments && W >= W_min) { // TODO: Full bounding box check?
+                bool solid_mask_value = false;
+                if (U_i >= 0 && U_i < n_segments && W >= W_min) { // TODO: Full bounding box check?
                     solid_mask_value = mask_value | (r_sqr <= r_fraction * rsqr_maxs_d[U_i]);
 
-            if (theta >= theta_min && theta <= theta_center && r_sqr <= rsqr_maxs_d[U_i]) {
-                profile_d[U_i] += solid_mask_value;
-            }
-        }
+                    if (theta >= theta_min && theta <= theta_center && r_sqr <= rsqr_maxs_d[U_i]) {
+                        profile_d[U_i] += solid_mask_value;
+                    }
+                }
 
-        solid_implant_mask.data[flat_index] = solid_mask_value;
+                solid_implant_mask.data[flat_index] = solid_mask_value;
             }
         }
     }
@@ -321,55 +348,37 @@ void zero_outside_bbox(const array<real_t,9> &principal_axes,
 
     #pragma acc data copyin(principal_axes, parameter_ranges, cm)
     {
-    BLOCK_BEGIN(voxels, ) {
-
-        real_t xs[3] = {
-            real_t(x) - cm[0],
-            real_t(y) - cm[1],
-            real_t(z) - cm[2]};
-        real_t params[3] = { 0, 0, 0 };
-
-        for (int uvw = 0; uvw < 3; uvw++)
-            for (int xyz = 0; xyz < 3; xyz++)
-                params[uvw] += xs[xyz] * principal_axes[uvw*3 + xyz]; // u = dot(xs,u_axis), v = dot(xs,v_axis), w = dot(xs,w_axis)
+        BLOCK_BEGIN(voxels, ) {
 
-        bool p = false;
-
-        for (int uvw = 0; uvw < 3; uvw++) {
-            real_t
-                param_min = parameter_ranges[uvw*2],
-                param_max = parameter_ranges[uvw*2 + 1];
-            p |= (params[uvw] < param_min) | (params[uvw] > param_max);
-        }
+            real_t xs[3] = {
+                real_t(x) - cm[0],
+                real_t(y) - cm[1],
+                real_t(z) - cm[2]};
+            real_t params[3] = { 0, 0, 0 };
+
+            for (int uvw = 0; uvw < 3; uvw++)
+                for (int xyz = 0; xyz < 3; xyz++)
+                    params[uvw] += xs[xyz] * principal_axes[uvw*3 + xyz]; // u = dot(xs,u_axis), v = dot(xs,v_axis), w = dot(xs,w_axis)
+
+            bool p = false;
+
+            for (int uvw = 0; uvw < 3; uvw++) {
+                real_t
+                    param_min = parameter_ranges[uvw*2],
+                    param_max = parameter_ranges[uvw*2 + 1];
+                p |= (params[uvw] < param_min) | (params[uvw] > param_max);
+            }
 
-        if (p)
-            voxels_buffer[flat_index] = 0;
+            if (p)
+                voxels_buffer[flat_index] = 0;
 
-    BLOCK_END() }
+        BLOCK_END() }
     }
 }
 
 }
 
 /*
-void compute_front_mask(const input_ndarray<mask_type> solid_implant,
-        const float voxel_size,
-        const matrix4x4 &Muvw,
-        std::array<float,6> bbox,
-        output_ndarray<mask_type> front_mask) {
-    const auto [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
-
-    loop_mask_start(solid_implant, front_mask, () );
-
-    if (!mask_value) {
-        auto [U,V,W,c] = hom_transform(Xs,Muvw);
-        maskout_buffer[k] = W>W_min;
-    } else
-        maskout_buffer[k] = 0;
-
-    loop_mask_end(solid_implant)
-}
-
 void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance Transform in um, should be low-resolution (will be interpolated)
              const input_ndarray<uint8_t> C,  // Material classification images (probability per voxel, 0..1 -> 0..255)
              float voxel_size,           // Voxel size for Cs
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index a78c3eb..57126dc 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -13,6 +13,14 @@ array<real_t,3> center_of_mass(const input_ndarray<mask_type> &mask) {
     return cpu_seq::center_of_mass(mask);
 }
 
+void compute_front_mask(const input_ndarray<mask_type> solid_implant,
+        const float voxel_size,
+        const matrix4x4 &Muvw,
+        std::array<float,6> bbox,
+        output_ndarray<mask_type> front_mask) {
+    return cpu_seq::compute_front_mask(solid_implant, voxel_size, Muvw, bbox, front_mask);
+}
+
 void fill_implant_mask(const input_ndarray<mask_type> mask,
                float voxel_size,
                const array<float,6> &bbox,
@@ -32,42 +40,42 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
     #pragma acc data copyin(U_min, U_max, W_min, Muvw, mask_Nz, mask_Ny, mask_Nx, voxel_size, n_segments, bbox) copy(rsqr_maxs_d[:n_segments], profile_d[:n_segments])
     {
         #pragma acc data copy(theta_min, theta_max)
-    {
-        for (int64_t mask_buffer_start = 0; mask_buffer_start < mask_length; mask_buffer_start += acc_block_size<mask_type>) {
-            ssize_t mask_buffer_length = min(acc_block_size<mask_type>, mask_length-mask_buffer_start);
-            mask_type *mask_buffer = (mask_type *) mask.data + mask_buffer_start;
+        {
+            for (int64_t mask_buffer_start = 0; mask_buffer_start < mask_length; mask_buffer_start += acc_block_size<mask_type>) {
+                ssize_t mask_buffer_length = min(acc_block_size<mask_type>, mask_length-mask_buffer_start);
+                mask_type *mask_buffer = (mask_type *) mask.data + mask_buffer_start;
                 #pragma acc data copyin(mask_buffer_start, mask_buffer[:mask_buffer_length])
-            {
+                {
                     // TODO the reduction on rsqr_maxs_d kills performance, and allocates more memory than what's available on the GPU! The real solution would be using atomic, but OpenACC doesn't like it on that particular statement.
                     #pragma acc parallel loop reduction(max:theta_max) reduction(min:theta_min) reduction(max:rsqr_maxs_d[:n_segments])
-                for (int64_t flat_index = 0; flat_index < mask_buffer_length; flat_index++) {
-                    int64_t
-                        global_index = mask_buffer_start + flat_index,
-                        z = global_index / (mask_Ny * mask_Nx),
-                        y = (global_index / mask_Nx) % mask_Ny,
-                        x = global_index % mask_Nx;
-                    mask_type mask_value = mask_buffer[flat_index];
-                    std::array<real_t, 4> Xs = {
-                        real_t(x) * voxel_size,
-                        real_t(y) * voxel_size,
-                        real_t(z) * voxel_size,
-                        1 };
-
-                    if (mask_value) {
-                        auto [U,V,W,c] = hom_transform(Xs, Muvw);
+                    for (int64_t flat_index = 0; flat_index < mask_buffer_length; flat_index++) {
+                        int64_t
+                            global_index = mask_buffer_start + flat_index,
+                            z = global_index / (mask_Ny * mask_Nx),
+                            y = (global_index / mask_Nx) % mask_Ny,
+                            x = global_index % mask_Nx;
+                        mask_type mask_value = mask_buffer[flat_index];
+                        std::array<real_t, 4> Xs = {
+                            real_t(x) * voxel_size,
+                            real_t(y) * voxel_size,
+                            real_t(z) * voxel_size,
+                            1 };
+
+                        if (mask_value) {
+                            auto [U,V,W,c] = hom_transform(Xs, Muvw);
 
                             real_t r_sqr = V*V + W*W;
-                        real_t theta = atan2(V,W);
+                            real_t theta = atan2(V,W);
 
                             int U_i = int(floor((U - U_min) * real_t(n_segments-1) / (U_max - U_min)));
 
-                        if ( in_bbox(U,V,W,bbox) ) {
+                            if ( in_bbox(U,V,W,bbox) ) {
                                 //#pragma acc atomic update
-                            rsqr_maxs_d[U_i] = max(rsqr_maxs_d[U_i], float(r_sqr));
-                            theta_min = min(theta_min, theta);
-                            theta_max = max(theta_max, theta);
-                        } else {
-                            // Otherwise we've calculated it wrong!
+                                rsqr_maxs_d[U_i] = max(rsqr_maxs_d[U_i], float(r_sqr));
+                                theta_min = min(theta_min, theta);
+                                theta_max = max(theta_max, theta);
+                            } else {
+                                // Otherwise we've calculated it wrong!
                             }
                         }
                     }
@@ -79,43 +87,43 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
 
         #pragma acc data copyin(theta_center)
         {
-        for (int64_t mask_buffer_start = 0; mask_buffer_start < mask_length; mask_buffer_start += acc_block_size<mask_type>) {
-            mask_type *mask_buffer = (mask_type *) mask.data + mask_buffer_start;
-            ssize_t mask_buffer_length = min(acc_block_size<mask_type>, mask_length-mask_buffer_start);
-            mask_type *solid_mask_buffer = solid_implant_mask.data + mask_buffer_start;
-            #pragma acc data copy(mask_buffer[:mask_buffer_length]) create(solid_mask_buffer[:mask_buffer_length]) copyout(solid_mask_buffer[:mask_buffer_length])
-            {
+            for (int64_t mask_buffer_start = 0; mask_buffer_start < mask_length; mask_buffer_start += acc_block_size<mask_type>) {
+                mask_type *mask_buffer = (mask_type *) mask.data + mask_buffer_start;
+                ssize_t mask_buffer_length = min(acc_block_size<mask_type>, mask_length-mask_buffer_start);
+                mask_type *solid_mask_buffer = solid_implant_mask.data + mask_buffer_start;
+                #pragma acc data copy(mask_buffer[:mask_buffer_length]) create(solid_mask_buffer[:mask_buffer_length]) copyout(solid_mask_buffer[:mask_buffer_length])
+                {
                     #pragma acc parallel loop // reduction(+:profile_d[:n_segments])
-                for (int64_t flat_index = 0; flat_index < mask_buffer_length; flat_index++) {
-                    int64_t
-                        global_index = mask_buffer_start + flat_index,
-                        z = global_index / (mask_Ny * mask_Nx),
-                        y = (global_index / mask_Nx) % mask_Ny,
-                        x = global_index % mask_Nx;
-                    mask_type mask_value = mask_buffer[flat_index];
-                    std::array<real_t, 4> Xs = {
-                        real_t(x) * voxel_size,
-                        real_t(y) * voxel_size,
-                        real_t(z) * voxel_size,
-                        1 };
-
-                    // Second pass does the actual work
+                    for (int64_t flat_index = 0; flat_index < mask_buffer_length; flat_index++) {
+                        int64_t
+                            global_index = mask_buffer_start + flat_index,
+                            z = global_index / (mask_Ny * mask_Nx),
+                            y = (global_index / mask_Nx) % mask_Ny,
+                            x = global_index % mask_Nx;
+                        mask_type mask_value = mask_buffer[flat_index];
+                        std::array<real_t, 4> Xs = {
+                            real_t(x) * voxel_size,
+                            real_t(y) * voxel_size,
+                            real_t(z) * voxel_size,
+                            1 };
+
+                        // Second pass does the actual work
                         auto [U,V,W,c] = hom_transform(Xs, Muvw);
                         float r_sqr = V*V + W*W;
                         float theta = atan2(V, W);
                         int U_i = int(floor((U - U_min) * real_t(n_segments-1) / (U_max - U_min)));
 
-                    bool solid_mask_value = false;
-                    if (U_i >= 0 && U_i < n_segments && W >= W_min) { // TODO: Full bounding box check?
+                        bool solid_mask_value = false;
+                        if (U_i >= 0 && U_i < n_segments && W >= W_min) { // TODO: Full bounding box check?
                             solid_mask_value = mask_value | (r_sqr <= r_fraction * rsqr_maxs_d[U_i]);
 
-                        if (theta >= theta_min && theta <= theta_center && r_sqr <= rsqr_maxs_d[U_i]) {
-                            ATOMIC()
-                            profile_d[U_i] += solid_mask_value;
+                            if (theta >= theta_min && theta <= theta_center && r_sqr <= rsqr_maxs_d[U_i]) {
+                                ATOMIC()
+                                profile_d[U_i] += solid_mask_value;
+                            }
                         }
-                    }
 
-                    solid_mask_buffer[flat_index] = solid_mask_value;
+                        solid_mask_buffer[flat_index] = solid_mask_value;
                     }
                 }
             }
diff --git a/src/lib/cpp/include/boilerplate.hh b/src/lib/cpp/include/boilerplate.hh
index c0ba698..c3c1dad 100644
--- a/src/lib/cpp/include/boilerplate.hh
+++ b/src/lib/cpp/include/boilerplate.hh
@@ -39,6 +39,16 @@
 
 #define FOR_BLOCK_END() } }
 
+#define FOR_BLOCK_BEGIN_WITH_OUTPUT(ARR_IN, ARR_OUT) \
+    for (int64_t ARR_IN##_buffer_start = 0; ARR_IN##_buffer_start < ARR_IN##_length; ARR_IN##_buffer_start += acc_block_size<ARR_IN##_type> / 2) { \
+        ARR_IN##_type *ARR_IN##_buffer = (ARR_IN##_type *) ARR_IN.data + ARR_IN##_buffer_start; \
+        ARR_OUT##_type *ARR_OUT##_buffer = (ARR_OUT##_type *) ARR_OUT.data + ARR_IN##_buffer_start; \
+        ssize_t ARR_IN##_buffer_length = min(acc_block_size<ARR_IN##_type>, ARR_IN##_length - ARR_IN##_buffer_start); \
+        PRAGMA(acc data copyin(ARR_IN##_buffer[:ARR_IN##_buffer_length]) copy(ARR_OUT##_buffer[:ARR_IN##_buffer_length])) \
+        {
+
+#define FOR_BLOCK_END_WITH_OUTPUT() } }
+
 #define FOR_3D_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
     PRAGMA(PARALLEL_TERM collapse(3) EXTRA_PRAGMA_CLAUSE) \
     for (int64_t z = 0; z < ARR##_Nz; z++) { \
@@ -73,6 +83,15 @@
 #define BLOCK_END() \
     FOR_FLAT_END() \
     FOR_BLOCK_END()
+
+#define BLOCK_BEGIN_WITH_OUTPUT(ARR_IN, ARR_OUT, EXTRA_PRAGMA_CLAUSE) \
+    FOR_BLOCK_BEGIN_WITH_OUTPUT(ARR_IN, ARR_OUT) \
+    PUSH_N_DOWN_TO_BUFFER(ARR_IN) \
+    FOR_FLAT_BEGIN(ARR_IN##_buffer, global, EXTRA_PRAGMA_CLAUSE)
+
+#define BLOCK_END_WITH_OUTPUT() \
+    FOR_FLAT_END() \
+    FOR_BLOCK_END_WITH_OUTPUT()
 #else
 #ifdef _OPENMP // Should also capture OpenACC, which is why it's second.
 #define BLOCK_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
@@ -82,6 +101,16 @@
     int64_t flat_index = z*ARR##_Ny*ARR##_Nx + y*ARR##_Nx + x;
 
 #define BLOCK_END() FOR_3D_END()
+
+#define BLOCK_BEGIN_WITH_OUTPUT(ARR_IN, ARR_OUT, EXTRA_PRAGMA_CLAUSE) \
+    ARR_IN##_type *ARR_IN##_buffer = (ARR_IN##_type *) ARR_IN.data; \
+    ARR_OUT##_type *ARR_OUT##_buffer = (ARR_OUT##_type *) ARR_OUT.data; \
+    __attribute__((unused)) int64_t ARR_IN##_buffer_start = 0; \
+    FOR_3D_BEGIN(ARR_IN, EXTRA_PRAGMA_CLAUSE) \
+    int64_t flat_index = z*ARR_IN##_Ny*ARR_IN##_Nx + y*ARR_IN##_Nx + x;
+
+#define BLOCK_END_WITH_OUTPUT() FOR_3D_END()
+
 #else
 #define BLOCK_BEGIN(ARR, EXTRA_PRAGMA_CLAUSE) \
     int64_t flat_index = 0; \
@@ -93,6 +122,17 @@
     flat_index++; \
     FOR_3D_END()
 
+#define BLOCK_BEGIN_WITH_OUTPUT(ARR_IN, ARR_OUT, EXTRA_PRAGMA_CLAUSE) \
+    int64_t flat_index = 0; \
+    ARR_IN##_type *ARR_IN##_buffer = (ARR_IN##_type *) ARR_IN.data; \
+    ARR_OUT##_type *ARR_OUT##_buffer = (ARR_OUT##_type *) ARR_OUT.data; \
+    __attribute__((unused)) int64_t ARR_IN##_buffer_start = 0; \
+    FOR_3D_BEGIN(ARR_IN, EXTRA_PRAGMA_CLAUSE)
+
+#define BLOCK_END_WITH_OUTPUT() \
+    flat_index++; \
+    FOR_3D_END()
+
 #endif
 #endif
 
diff --git a/src/lib/cpp/include/datatypes.hh b/src/lib/cpp/include/datatypes.hh
index cf37cef..72d898f 100644
--- a/src/lib/cpp/include/datatypes.hh
+++ b/src/lib/cpp/include/datatypes.hh
@@ -27,6 +27,8 @@ typedef mask_type voxels_type;
 typedef uint16_t field_type;
 typedef float gauss_type;
 typedef float real_t;
+typedef mask_type solid_implant_type;
+typedef mask_type front_mask_type;
 
 namespace py = pybind11;
 template <typename T>
diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
index c7ec8dc..39641ab 100644
--- a/src/lib/cpp/include/geometry.hh
+++ b/src/lib/cpp/include/geometry.hh
@@ -54,6 +54,12 @@ Computes the center of mass of the given tomography.
 */
 array<real_t,3> center_of_mass(const input_ndarray<mask_type> &voxels);
 
+void compute_front_mask(const input_ndarray<mask_type> solid_implant,
+        const float voxel_size,
+        const matrix4x4 &Muvw,
+        std::array<float,6> bbox,
+        output_ndarray<mask_type> front_mask);
+
 void fill_implant_mask(const input_ndarray<mask_type> implant_mask,
                float voxel_size,
                const array<float,6> &bbox,
diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index 5c50f69..b947b1b 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -79,7 +79,6 @@ void fill_implant_mask(const np_maskarray implant_mask,
                  );
 }
 
-/*
 void compute_front_mask(const np_array<uint8_t> &np_solid_implant,
         const float voxel_size,
         const matrix4x4 &Muvw,
@@ -88,11 +87,12 @@ void compute_front_mask(const np_array<uint8_t> &np_solid_implant,
     auto solid_implant_info = np_solid_implant.request();
     auto front_mask_info    = np_front_mask.request();
 
-    ::compute_front_mask({solid_implant_info.ptr, solid_implant_info.shape},
+    return NS::compute_front_mask({solid_implant_info.ptr, solid_implant_info.shape},
             voxel_size, Muvw, bbox,
             {front_mask_info.ptr, front_mask_info.shape});
 }
 
+/*
 void cylinder_projection(const np_array<float>  &np_edt,  // Euclidean Distance Transform in um, should be low-resolution (will be interpolated)
                const np_bytearray     &np_Cs,  // Material classification images (probability per voxel, 0..1 -> 0..255)
                float Cs_voxel_size,           // Voxel size for Cs
@@ -128,5 +128,5 @@ PYBIND11_MODULE(geometry, m) {
     //m.def("cylinder_projection",  &python_api::cylinder_projection);
     m.def("sample_plane",         &python_api::sample_plane<uint16_t>);
     m.def("sample_plane",         &python_api::sample_plane<uint8_t>);
-    //m.def("compute_front_mask",   &python_api::compute_front_mask);
+    m.def("compute_front_mask",   &python_api::compute_front_mask);
 }
diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index ce7446b..33f4a75 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -278,6 +278,35 @@ def test_fill_implant_mask():
     assert_with_print(result_profile[0], result_profile[1], 1e-7, "cpu_seq vs cpu")
     assert_with_print(result_profile[0], result_profile[2], 1e-7, "cpu_seq vs gpu")
 
+def test_compute_front_mask():
+    n = 128
+    dtype = np.uint8
+    implant = np.random.randint(0, 2, (n,n,n), dtype)
+    voxel_size = 1
+    bbox_flat = np.array([-16,16] * 3, np.float32)
+    rsqr_fraction = 1#0.7
+    Muvwp_flat = np.array([
+       1, 0, 0, 64,
+       0, 1, 0, 64,
+       0, 0, 1, 64,
+       0, 0, 0, 1
+    ], np.float32)
+    n_bins = 1024
+
+    solid_implant_mask = np.zeros(implant.shape, np.uint8)
+    rsqr_maxs = np.zeros((n_bins, ), np.float32)
+    profile = np.zeros((n_bins, ), np.float32)
+
+    m_cpu.fill_implant_mask(implant, voxel_size, bbox_flat, rsqr_fraction, Muvwp_flat, solid_implant_mask, rsqr_maxs, profile)
+
+    impls = [m_cpu_seq, m_cpu, m_gpu]
+
+    cpu_seq, cpu, gpu = [
+        partial(impl.compute_front_mask, solid_implant_mask, voxel_size, Muvwp_flat, bbox_flat)
+        for i, impl in enumerate(impls)
+    ]
+
+    compare_fs('test_compute_front_mask', cpu_seq, cpu, gpu, True, 1e-7, (solid_implant_mask.shape, solid_implant_mask.dtype))
 
 if __name__ == '__main__':
     np.random.seed(42)
@@ -286,4 +315,5 @@ def test_fill_implant_mask():
     test_sample_plane(np.uint8)
     test_integrate_axes()
     test_zero_outside_bbox()
-    test_fill_implant_mask()
\ No newline at end of file
+    test_fill_implant_mask()
+    test_compute_front_mask()
\ No newline at end of file

From 3b6d642377afbbe218691c01c6069cb1696865a0 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Mar 2023 13:11:46 +0100
Subject: [PATCH 116/136] #25 Added additional debug launch configurations

---
 .vscode/launch.json | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index b48c6cc..cb878e1 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -4,6 +4,15 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
+        {
+            "name": "Python: pre-cleanup/cylinder_surface2",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/pre-cleanup-src/analysis/cylinder_surface2.py",
+            "console": "integratedTerminal",
+            "args": ["770c_pag"],
+            "justMyCode": false
+        },
         {
             "name": "Python: Test geometry",
             "type": "python",
@@ -31,5 +40,14 @@
             "args": ["770c_pag"],
             "justMyCode": false
         },
+        {
+            "name": "Python: 0800_implant_data",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/src/processing_steps/0800_implant_data.py",
+            "console": "integratedTerminal",
+            "args": ["770c_pag"],
+            "justMyCode": false
+        },
     ]
 }
\ No newline at end of file

From 9a17224e96294502aab530a42305ec1d346215ea Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Mar 2023 13:12:22 +0100
Subject: [PATCH 117/136] #25 Fixed imports of the old cylinder_surface script

---
 pre-cleanup-src/analysis/cylinder_surface2.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/pre-cleanup-src/analysis/cylinder_surface2.py b/pre-cleanup-src/analysis/cylinder_surface2.py
index f9002fd..c31b484 100644
--- a/pre-cleanup-src/analysis/cylinder_surface2.py
+++ b/pre-cleanup-src/analysis/cylinder_surface2.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 import os, sys, h5py, numpy as np, pathlib, tqdm, vedo, matplotlib.pyplot as plt, edt, vedo.pointcloud as pc, scipy.ndimage as ndi
-sys.path.append(sys.path[0]+"/../")
+sys.path.append(sys.path[0]+"/../../src")
 from config.paths import *
-from helper_functions import *
-from pybind_kernels.geometry import cylinder_projection
+from lib.py.helpers import commandline_args
+from lib.cpp.cpu_seq.geometry import cylinder_projection
 NA = np.newaxis
 
 
@@ -25,10 +25,10 @@ def homogeneous_transform(xs, M):
 
 def np_save(path,data):
     output_dir = os.path.dirname(path)
-    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)        
+    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
     np.save(path,data)
 
-    
+
 # Requires: implant-FoR
 #           soft-tissue/bone segmentation + blood analysis
 #           EDT-field
@@ -55,18 +55,18 @@ def np_save(path,data):
     print(f"Cant't read implant frame-of-reference: {e}")
     print(f"Make sure you have run segment-implant-cc.py and implant-FoR.py for {sample} at scale {mask_scale}x")
     sys.exit(-1)
-    
+
 try:
     blood_mask    = h5mask["blood/mask"][:]
     solid_implant = h5mask["implant_solid/mask"][:]
-    h5mask.close()    
+    h5mask.close()
 except Exception as e:
     print(f"Cant't read masks: {e}")
     print("Make sure you have run compute_histograms.py, generate_xx_probabilities.py, segment_from_distributions,\n"+
           "and segment-blood-cc.py")
     sys.exit(-1)
 
-    
+
 P0_binfile            = f"{binary_root}/segmented/P0/{segment_scale}x/{sample}.uint16"
 P1_binfile            = f"{binary_root}/segmented/P1/{segment_scale}x/{sample}.uint16"
 edt_binfile           = f"{binary_root}/fields/implant-edt/{mask_scale}x/{sample}.uint16"
@@ -92,4 +92,4 @@ def np_save(path,data):
                     d_min, d_max, theta_min, theta_max,
                     tuple(bbox.flatten()), tuple(Muvwp.flatten()),
                     images, counts)
-                    
+

From 3b1d79404083ca421d27c007315d080a90d3b760 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 9 Mar 2023 13:13:34 +0100
Subject: [PATCH 118/136] #25 Added geometry::cylinder_projection

---
 src/lib/cpp/cpu/geometry.cc     |  20 ++-
 src/lib/cpp/cpu_seq/geometry.cc | 283 +++++++++++++-------------------
 src/lib/cpp/gpu/geometry.cc     |  20 ++-
 src/lib/cpp/include/geometry.hh |  69 +++++++-
 src/pybind/geometry-pybind.cc   |   7 +-
 src/test/test_geometry.py       |  13 ++
 6 files changed, 222 insertions(+), 190 deletions(-)

diff --git a/src/lib/cpp/cpu/geometry.cc b/src/lib/cpp/cpu/geometry.cc
index b173519..491c171 100644
--- a/src/lib/cpp/cpu/geometry.cc
+++ b/src/lib/cpp/cpu/geometry.cc
@@ -22,6 +22,19 @@ void compute_front_mask(const input_ndarray<mask_type> solid_implant,
     return cpu_seq::compute_front_mask(solid_implant, voxel_size, Muvw, bbox, front_mask);
 }
 
+void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance Transform in um, should be low-resolution (will be interpolated)
+             const input_ndarray<uint8_t> C,  // Material classification images (probability per voxel, 0..1 -> 0..255)
+             float voxel_size,           // Voxel size for Cs
+             float d_min, float d_max,       // Distance shell to map to cylinder
+             float theta_min, float theta_max, // Angle range (wrt cylinder center)
+             std::array<float,6> bbox,
+             const matrix4x4 &Muvw,           // Transform from zyx (in um) to U'V'W' cylinder FoR (in um)
+             output_ndarray<float>    image,  // Probability-weighted volume of (class,theta,U)-voxels
+             output_ndarray<int64_t>  count   // Number of (class,theta,U)-voxels
+             ){
+    return cpu_seq::cylinder_projection(edt, C, voxel_size, d_min, d_max, theta_min, theta_max, bbox, Muvw, image, count);
+}
+
 void fill_implant_mask(const input_ndarray<mask_type> mask,
                float voxel_size,
                const array<float,6> &bbox,
@@ -46,13 +59,6 @@ void integrate_axes(const input_ndarray<mask_type> &mask,
     return cpu_seq::integrate_axes(mask, x0, v_axis, w_axis, v_min, w_min, output);
 }
 
-template <typename T>
-float resample2x2x2(const T        *voxels,
-                    const array<ssize_t, 3> &shape,
-                    const array<float, 3>   &X) {
-    return cpu_seq::resample2x2x2(voxels, shape, X);
-}
-
 template <typename T>
 void sample_plane(const input_ndarray<T> &voxels,
                   const real_t voxel_size, // In micrometers
diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 8ad180c..d7793b6 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -47,6 +47,7 @@ void compute_front_mask(const input_ndarray<mask_type> solid_implant,
     const auto [U_min, U_max, V_min, V_max, W_min, W_max] = bbox;
     UNPACK_NUMPY(solid_implant)
 
+    // TODO move the typedefs here, rather than having them globally in datatypes.hh
     BLOCK_BEGIN_WITH_OUTPUT(solid_implant, front_mask, ) {
 
         std::array<real_t, 4> Xs = {
@@ -66,6 +67,121 @@ void compute_front_mask(const input_ndarray<mask_type> solid_implant,
     BLOCK_END_WITH_OUTPUT() }
 }
 
+void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance Transform in um, should be low-resolution (will be interpolated)
+             const input_ndarray<uint8_t> C,  // Material classification images (probability per voxel, 0..1 -> 0..255)
+             float voxel_size,           // Voxel size for Cs
+             float d_min, float d_max,       // Distance shell to map to cylinder
+             float theta_min, float theta_max, // Angle range (wrt cylinder center)
+             std::array<float,6> bbox,
+             const matrix4x4 &Muvw,           // Transform from zyx (in um) to U'V'W' cylinder FoR (in um)
+             output_ndarray<float>    image,  // Probability-weighted volume of (class,theta,U)-voxels
+             output_ndarray<int64_t>  count   // Number of (class,theta,U)-voxels
+             ){
+    UNPACK_NUMPY(C);
+    UNPACK_NUMPY(edt);
+
+    ssize_t n_theta = image.shape[0], n_U = image.shape[1];
+
+    const auto& [U_min, U_max, V_min, V_max, W_min, W_max] = bbox;
+
+    real_t
+        edz = edt_Nz / real_t(C_Nz),
+        edy = edt_Ny / real_t(C_Ny),
+        edx = edt_Nx / real_t(C_Nx);
+
+    //printf("Segmenting from %g to %g micrometers distance of implant.\n",d_min,d_max);
+    //printf("Bounding box is [U_min,U_max,V_min,V_max,W_min,W_max] = [[%g,%g],[%g,%g],[%g,%g]]\n",
+    //    U_min,U_max,V_min,V_max,W_min,W_max);
+    //printf("EDT field is (%ld,%ld,%ld)\n",ex,ey,ez);
+
+    real_t th_min = 1234, th_max = -1234;
+    ssize_t n_shell = 0;
+    ssize_t n_shell_bbox = 0;
+
+    ssize_t block_height = 64;
+
+    //TODO: new acc/openmp macro in parallel.hh
+    // TODO postponed, to get a working edition first
+    //typedef uint8_t C_type;
+    //BLOCK_BEGIN(C, "reduction(+:n_shell,n_shell_bbox)") {
+    //BLOCK_END()
+
+    {
+        float   *image_d = image.data;
+        int64_t *count_d = count.data;
+
+        for (ssize_t block_start = 0, edt_block_start = 0; block_start < C_length; block_start += block_height*C_Ny*C_Nz, edt_block_start += block_height*edt_Ny*edt_Nz) {
+            const uint8_t *C_buffer = C.data + block_start;
+            const float  *edt_block = edt.data + max(block_start - edt_Ny*edt_Nz, 0L);
+
+            ssize_t  this_block_length = min(block_height*C_Ny*C_Nz,C_length-block_start);
+            ssize_t  this_edt_length   = min((block_height+2)*edt_Ny*edt_Nz,edt_length-block_start);
+
+            //#pragma acc parallel loop copy(C_buffer[:this_block_length], image_d[:n_theta*n_U], count_d[:n_theta*n_U], bbox[:6], Muvw[:16], edt_block[:this_edt_length]) reduction(+:n_shell,n_shell_bbox)
+            //#pragma omp parallel for reduction(+:n_shell,n_shell_bbox)
+            for (int64_t k = 0; k < this_block_length; k++) {
+                const int64_t flat_idx = block_start + k;
+                const int64_t X = (flat_idx  / (C_Ny*C_Nz)), Y = (flat_idx / C_Nz) % C_Ny, Z = flat_idx  % C_Nz; // Integer indices: Cs[c,X,Y,Z]
+                // Index into local block
+                const int64_t Xl = (k  / (C_Ny*C_Nz)), Yl = (k / C_Nz) % C_Ny, Zl = k  % C_Nz;
+                // Index into local edt block. Note EDT has 1-slice padding top+bottom
+                const float  x = (Xl+1)*edx, y = Yl*edy, z = Zl*edy;
+
+                if (x > block_height) {
+                    printf("Block number k=%ld.\nX,Y,Z=%ld,%ld,%ld\nXl,Yl,Zl=%ld,%ld,%ld\nx,y,z=%.2f, %.2f, %.2f\n",k,X,Y,Z,Xl,Yl,Zl,x,y,z);
+                    abort();
+                }
+
+                // ****** MEAT OF THE IMPLEMENTATION IS HERE ******
+                real_t distance = resample2x2x2<float>(edt_block, {this_edt_length/(edt_Ny*edt_Nz),edt_Ny,edt_Nz}, {x,y,z});
+
+                if (distance > d_min && distance <= d_max) { // TODO: and W>w_min
+                    array<real_t,4> Xs = {X*voxel_size, Y*voxel_size, Z*voxel_size, 1};
+                    auto [U,V,W,c] = hom_transform(Xs,Muvw);
+                    n_shell ++;
+
+                    //        printf("distance = %.1f, U,V,W = %.2f,%.2f,%.2f\n",distance,U,V,W);
+                    if (in_bbox(U,V,W,bbox)) {
+                        real_t theta    = atan2(V,W);
+
+                        if (theta >= theta_min && theta <= theta_max) {
+                            n_shell_bbox++;
+
+                            ssize_t theta_i = floor( (theta-theta_min) * (n_theta-1)/(theta_max-theta_min) );
+                            ssize_t U_i     = floor( (U    -    U_min) * (n_U    -1)/(    U_max-    U_min) );
+
+                            real_t p = C_buffer[k]/255.;
+
+                            assert(theta >= theta_min);
+                            assert(theta <= theta_max);
+                            assert(U >= U_min);
+                            assert(U <= U_max);
+                            assert(theta_i >= 0);
+                            assert(theta_i < n_theta);
+                            assert(U_i >= 0);
+                            assert(U_i < n_U);
+
+                            if (p > 0) {
+                                th_min = min(theta,th_min);
+                                th_max = max(theta,th_max);
+
+                                //atomic_statement()
+                                image_d[theta_i*n_U + U_i] += p;
+
+                                //atomic_statement()
+                                count_d[theta_i*n_U + U_i] += 1;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    printf("n_shell = %ld, n_shell_bbox = %ld\n",n_shell,n_shell_bbox);
+    printf("theta_min, theta_max = %.2f,%.2f\n",theta_min,theta_max);
+    printf("th_min,       th_max = %.2f,%.2f\n",th_min,th_max);
+}
+
 void fill_implant_mask(const input_ndarray<mask_type> mask,
                float voxel_size,
                const array<float,6> &bbox,
@@ -191,59 +307,6 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array
     };
 }
 
-template <typename T>
-float resample2x2x2(const T             *voxels,
-                    const array<ssize_t, 3> &shape,
-                    const array<float, 3>   &X) {
-    auto  [Nz,Ny,Nx] = shape;
-
-    if (!in_bbox(X[0], X[1], X[2], {0.5f, float(Nx)-0.5f, 0.5f, float(Ny)-0.5f, 0.5f, float(Nz)-0.5f})) {
-        uint64_t voxel_index = uint64_t(floor(X[0]))*Ny*Nz + uint64_t(floor(X[1]))*Ny + uint64_t(floor(X[2]));
-        return voxels[voxel_index];
-    }
-
-    float   Xfrac[2][3]; // {Xminus[3], Xplus[3]}
-    int64_t Xint[2][3];  // {Iminus[3], Iplus[3]}
-    float   value = 0;
-
-    for (int i = 0; i < 3; i++) {
-        float Iminus, Iplus;
-        Xfrac[0][i] = 1-modf(X[i]-0.5f, &Iminus); // 1-{X[i]-1/2}, floor(X[i]-1/2)
-        Xfrac[1][i] =   modf(X[i]+0.5f, &Iplus);  // {X[i]+1/2}, floor(X[i]+1/2)
-
-        Xint[0][i] = (int64_t) Iminus;
-        Xint[1][i] = (int64_t) Iplus;
-    }
-
-    for (int ijk = 0; ijk <= 7; ijk++) {
-        float  weight = 1;
-        int64_t IJK[3] = {0,0,0};
-
-        for (int axis = 0; axis < 3; axis++) { // x-1/2 or x+1/2
-            int pm    = (ijk >> axis) & 1;
-            IJK[axis] = Xint[pm][axis];
-            weight   *= Xfrac[pm][axis];
-        }
-
-        auto [I,J,K] = IJK;
-        // if (I<0 || J<0 || K<0) {
-        //   printf("(I,J,K) = (%ld,%ld,%ld)\n",I,J,K);
-        //   abort();
-        // }
-        // if (I>=int(Nx) || J>=int(Ny) || K>=int(Nz)) {
-        //   printf("(I,J,K) = (%ld,%ld,%ld), (Nx,Ny,Nz) = (%ld,%ld,%ld)\n",I,J,K,Nx,Ny,Nz);
-        //   abort();
-        // }
-        uint64_t voxel_index = I*Ny*Nz+J*Ny+K;
-        //assert(I>=0 && J>=0 && K>=0);
-        //assert(I<Nx && J<Ny && K<Nz);
-        float voxel = (float) voxels[voxel_index];
-        value += voxel*weight;
-    }
-
-    return value;
-}
-
 template <typename T>
 void sample_plane(const input_ndarray<T> &voxels,
                   const real_t voxel_size, // In micrometers
@@ -377,117 +440,3 @@ void zero_outside_bbox(const array<real_t,9> &principal_axes,
 }
 
 }
-
-/*
-void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance Transform in um, should be low-resolution (will be interpolated)
-             const input_ndarray<uint8_t> C,  // Material classification images (probability per voxel, 0..1 -> 0..255)
-             float voxel_size,           // Voxel size for Cs
-             float d_min, float d_max,       // Distance shell to map to cylinder
-             float theta_min, float theta_max, // Angle range (wrt cylinder center)
-             std::array<float,6> bbox,
-             const matrix4x4 &Muvw,           // Transform from zyx (in um) to U'V'W' cylinder FoR (in um)
-             output_ndarray<float>    image,  // Probability-weighted volume of (class,theta,U)-voxels
-             output_ndarray<int64_t>  count   // Number of (class,theta,U)-voxels
-             ){
-    ssize_t n_theta = image.shape[0], n_U = image.shape[1];
-
-    const auto& [U_min,U_max,V_min,V_max,W_min,W_max] = bbox;
-
-    ssize_t ex = edt.shape[0], ey = edt.shape[1], ez = edt.shape[2];
-    ssize_t Cx = C.shape[0],   Cy = C.shape[1],   Cz = C.shape[2];
-
-    real_t edx = ex/real_t(Cx), edy = ey/real_t(Cy), edz = ex/real_t(Cz);
-
-    ssize_t edt_length       = ex*ey*ez;
-    ssize_t C_length         = Cx*Cy*Cz;
-
-    printf("Segmenting from %g to %g micrometers distance of implant.\n",d_min,d_max);
-
-    printf("Bounding box is [U_min,U_max,V_min,V_max,W_min,W_max] = [[%g,%g],[%g,%g],[%g,%g]]\n",
-        U_min,U_max,V_min,V_max,W_min,W_max);
-    printf("EDT field is (%ld,%ld,%ld)\n",ex,ey,ez);
-
-    real_t th_min = 1234, th_max = -1234;
-    ssize_t n_shell = 0;
-    ssize_t n_shell_bbox = 0;
-
-    ssize_t block_height = 64;
-
-    //TODO: new acc/openmp macro in parallel.hh
-    {
-        float   *image_d = image.data;
-        int64_t *count_d = count.data;
-
-        for (ssize_t block_start = 0, edt_block_start = 0; block_start < C_length; block_start += block_height*Cy*Cz, edt_block_start += block_height*ey*ez) {
-            const uint8_t *C_buffer = C.data + block_start;
-            const float  *edt_block = edt.data + max(block_start-ey*ez,0L);
-
-            ssize_t  this_block_length = min(block_height*Cy*Cz,C_length-block_start);
-            ssize_t  this_edt_length   = min((block_height+2)*ey*ez,edt_length-block_start);
-
-            //#pragma acc parallel loop copy(C_buffer[:this_block_length], image_d[:n_theta*n_U], count_d[:n_theta*n_U], bbox[:6], Muvw[:16], edt_block[:this_edt_length]) reduction(+:n_shell,n_shell_bbox)
-            #pragma omp parallel for reduction(+:n_shell,n_shell_bbox)
-            for (int64_t k = 0; k < this_block_length; k++) {
-                const int64_t flat_idx = block_start + k;
-                const int64_t X = (flat_idx  / (Cy*Cz)), Y = (flat_idx / Cz) % Cy, Z = flat_idx  % Cz; // Integer indices: Cs[c,X,Y,Z]
-                // Index into local block
-                const int64_t Xl = (k  / (Cy*Cz)), Yl = (k / Cz) % Cy, Zl = k  % Cz;
-                // Index into local edt block. Note EDT has 1-slice padding top+bottom
-                const float  x = (Xl+1)*edx, y = Yl*edy, z = Zl*edy;
-
-                if (x > block_height) {
-                    printf("Block number k=%ld.\nX,Y,Z=%ld,%ld,%ld\nXl,Yl,Zl=%ld,%ld,%ld\nx,y,z=%.2f, %.2f, %.2f\n",k,X,Y,Z,Xl,Yl,Zl,x,y,z);
-                    abort();
-                }
-
-                // ****** MEAT OF THE IMPLEMENTATION IS HERE ******
-                real_t distance = resample2x2x2<float>(edt_block, {this_edt_length/(ey*ez),ey,ez}, {x,y,z});
-
-                if (distance > d_min && distance <= d_max) { // TODO: and W>w_min
-                    array<real_t,4> Xs = {X*voxel_size, Y*voxel_size, Z*voxel_size, 1};
-                    auto [U,V,W,c] = hom_transform(Xs,Muvw);
-                    n_shell ++;
-
-                    //        printf("distance = %.1f, U,V,W = %.2f,%.2f,%.2f\n",distance,U,V,W);
-                    if (in_bbox(U,V,W,bbox)) {
-                        real_t theta    = atan2(V,W);
-
-                        if (theta >= theta_min && theta <= theta_max) {
-                            n_shell_bbox++;
-
-                            ssize_t theta_i = floor( (theta-theta_min) * (n_theta-1)/(theta_max-theta_min) );
-                            ssize_t U_i     = floor( (U    -    U_min) * (n_U    -1)/(    U_max-    U_min) );
-
-                            real_t p = C_buffer[k]/255.;
-
-                            assert(theta >= theta_min);
-                            assert(theta <= theta_max);
-                            assert(U >= U_min);
-                            assert(U <= U_max);
-                            assert(theta_i >= 0);
-                            assert(theta_i < n_theta);
-                            assert(U_i >= 0);
-                            assert(U_i < n_U);
-
-                            if (p > 0) {
-                                th_min = min(theta,th_min);
-                                th_max = max(theta,th_max);
-
-                                //atomic_statement()
-                                image_d[theta_i*n_U + U_i] += p;
-
-                                //atomic_statement()
-                                count_d[theta_i*n_U + U_i] += 1;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-    printf("n_shell = %ld, n_shell_bbox = %ld\n",n_shell,n_shell_bbox);
-    printf("theta_min, theta_max = %.2f,%.2f\n",theta_min,theta_max);
-    printf("th_min,       th_max = %.2f,%.2f\n",th_min,th_max);
-}
-
-*/
\ No newline at end of file
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index 57126dc..b76db65 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -21,6 +21,19 @@ void compute_front_mask(const input_ndarray<mask_type> solid_implant,
     return cpu_seq::compute_front_mask(solid_implant, voxel_size, Muvw, bbox, front_mask);
 }
 
+void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance Transform in um, should be low-resolution (will be interpolated)
+             const input_ndarray<uint8_t> C,  // Material classification images (probability per voxel, 0..1 -> 0..255)
+             float voxel_size,           // Voxel size for Cs
+             float d_min, float d_max,       // Distance shell to map to cylinder
+             float theta_min, float theta_max, // Angle range (wrt cylinder center)
+             std::array<float,6> bbox,
+             const matrix4x4 &Muvw,           // Transform from zyx (in um) to U'V'W' cylinder FoR (in um)
+             output_ndarray<float>    image,  // Probability-weighted volume of (class,theta,U)-voxels
+             output_ndarray<int64_t>  count   // Number of (class,theta,U)-voxels
+             ){
+    return cpu_seq::cylinder_projection(edt, C, voxel_size, d_min, d_max, theta_min, theta_max, bbox, Muvw, image, count);
+}
+
 void fill_implant_mask(const input_ndarray<mask_type> mask,
                float voxel_size,
                const array<float,6> &bbox,
@@ -144,13 +157,6 @@ void integrate_axes(const input_ndarray<mask_type> &mask,
     return cpu_seq::integrate_axes(mask, x0, v_axis, w_axis, v_min, w_min, output);
 }
 
-template <typename T>
-float resample2x2x2(const T        *voxels,
-                    const array<ssize_t, 3> &shape,
-                    const array<float, 3>   &X) {
-    return cpu_seq::resample2x2x2(voxels, shape, X);
-}
-
 template <typename T>
 void sample_plane(const input_ndarray<T> &voxels,
                   const real_t voxel_size, // In micrometers
diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
index 39641ab..629aa53 100644
--- a/src/lib/cpp/include/geometry.hh
+++ b/src/lib/cpp/include/geometry.hh
@@ -44,6 +44,59 @@ inline bool in_bbox(float U, float V, float W, const std::array<float, 6> &bbox)
     return inside;
 }
 
+template <typename T>
+float resample2x2x2(const T             *voxels,
+                    const array<ssize_t, 3> &shape,
+                    const array<float, 3>   &X) {
+    auto  [Nz,Ny,Nx] = shape;
+
+    if (!in_bbox(X[0], X[1], X[2], {0.5f, float(Nx)-0.5f, 0.5f, float(Ny)-0.5f, 0.5f, float(Nz)-0.5f})) {
+        uint64_t voxel_index = uint64_t(floor(X[0]))*Ny*Nz + uint64_t(floor(X[1]))*Ny + uint64_t(floor(X[2]));
+        return voxels[voxel_index];
+    }
+
+    float   Xfrac[2][3]; // {Xminus[3], Xplus[3]}
+    int64_t Xint[2][3];  // {Iminus[3], Iplus[3]}
+    float   value = 0;
+
+    for (int i = 0; i < 3; i++) {
+        float Iminus, Iplus;
+        Xfrac[0][i] = 1-modf(X[i]-0.5f, &Iminus); // 1-{X[i]-1/2}, floor(X[i]-1/2)
+        Xfrac[1][i] =   modf(X[i]+0.5f, &Iplus);  // {X[i]+1/2}, floor(X[i]+1/2)
+
+        Xint[0][i] = (int64_t) Iminus;
+        Xint[1][i] = (int64_t) Iplus;
+    }
+
+    for (int ijk = 0; ijk <= 7; ijk++) {
+        float  weight = 1;
+        int64_t IJK[3] = {0,0,0};
+
+        for (int axis = 0; axis < 3; axis++) { // x-1/2 or x+1/2
+            int pm    = (ijk >> axis) & 1;
+            IJK[axis] = Xint[pm][axis];
+            weight   *= Xfrac[pm][axis];
+        }
+
+        auto [I,J,K] = IJK;
+        // if (I<0 || J<0 || K<0) {
+        //   printf("(I,J,K) = (%ld,%ld,%ld)\n",I,J,K);
+        //   abort();
+        // }
+        // if (I>=int(Nx) || J>=int(Ny) || K>=int(Nz)) {
+        //   printf("(I,J,K) = (%ld,%ld,%ld), (Nx,Ny,Nz) = (%ld,%ld,%ld)\n",I,J,K,Nx,Ny,Nz);
+        //   abort();
+        // }
+        uint64_t voxel_index = I*Ny*Nz+J*Ny+K;
+        //assert(I>=0 && J>=0 && K>=0);
+        //assert(I<Nx && J<Ny && K<Nz);
+        float voxel = (float) voxels[voxel_index];
+        value += voxel*weight;
+    }
+
+    return value;
+}
+
 namespace NS {
 
 /*
@@ -60,6 +113,17 @@ void compute_front_mask(const input_ndarray<mask_type> solid_implant,
         std::array<float,6> bbox,
         output_ndarray<mask_type> front_mask);
 
+void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance Transform in um, should be low-resolution (will be interpolated)
+             const input_ndarray<uint8_t> C,  // Material classification images (probability per voxel, 0..1 -> 0..255)
+             float voxel_size,           // Voxel size for Cs
+             float d_min, float d_max,       // Distance shell to map to cylinder
+             float theta_min, float theta_max, // Angle range (wrt cylinder center)
+             std::array<float,6> bbox,
+             const matrix4x4 &Muvw,           // Transform from zyx (in um) to U'V'W' cylinder FoR (in um)
+             output_ndarray<float>    image,  // Probability-weighted volume of (class,theta,U)-voxels
+             output_ndarray<int64_t>  count   // Number of (class,theta,U)-voxels
+             );
+
 void fill_implant_mask(const input_ndarray<mask_type> implant_mask,
                float voxel_size,
                const array<float,6> &bbox,
@@ -85,11 +149,6 @@ void integrate_axes(const input_ndarray<mask_type> &mask,
 		    const real_t v_min, const real_t w_min,
 		    output_ndarray<uint64_t> output);
 
-template <typename T>
-float resample2x2x2(const T *voxels,
-                    const array<ssize_t,3> &shape,
-                    const array<float,3> &X);
-
 template <typename T>
 void sample_plane(const input_ndarray<T> &voxels,
                   const real_t voxel_size, // In micrometers
diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index b947b1b..ebd09b7 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -92,7 +92,6 @@ void compute_front_mask(const np_array<uint8_t> &np_solid_implant,
             {front_mask_info.ptr, front_mask_info.shape});
 }
 
-/*
 void cylinder_projection(const np_array<float>  &np_edt,  // Euclidean Distance Transform in um, should be low-resolution (will be interpolated)
                const np_bytearray     &np_Cs,  // Material classification images (probability per voxel, 0..1 -> 0..255)
                float Cs_voxel_size,           // Voxel size for Cs
@@ -108,12 +107,12 @@ void cylinder_projection(const np_array<float>  &np_edt,  // Euclidean Distance
     auto images_info = np_images.request();
     auto counts_info = np_counts.request();
 
-    ::cylinder_projection({edt_info.ptr,edt_info.shape},
+    NS::cylinder_projection({edt_info.ptr,edt_info.shape},
               {Cs_info.ptr, Cs_info.shape},
               Cs_voxel_size,d_min,d_max,theta_min,theta_max,bbox,Muvw,
               {images_info.ptr, images_info.shape},
               {counts_info.ptr, counts_info.shape});
-}*/
+}
 
 }
 
@@ -125,7 +124,7 @@ PYBIND11_MODULE(geometry, m) {
     m.def("integrate_axes",       &python_api::integrate_axes);
     m.def("zero_outside_bbox",    &python_api::zero_outside_bbox);
     m.def("fill_implant_mask",    &python_api::fill_implant_mask);
-    //m.def("cylinder_projection",  &python_api::cylinder_projection);
+    m.def("cylinder_projection",  &python_api::cylinder_projection);
     m.def("sample_plane",         &python_api::sample_plane<uint16_t>);
     m.def("sample_plane",         &python_api::sample_plane<uint8_t>);
     m.def("compute_front_mask",   &python_api::compute_front_mask);
diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index 33f4a75..6070c93 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -10,6 +10,7 @@
 from config.paths import hdf5_root
 
 import datetime
+import edt
 from functools import partial
 import h5py
 import numpy as np
@@ -308,6 +309,18 @@ def test_compute_front_mask():
 
     compare_fs('test_compute_front_mask', cpu_seq, cpu, gpu, True, 1e-7, (solid_implant_mask.shape, solid_implant_mask.dtype))
 
+# TODO postponed because it's not used until after segment_from_distributions, i.e. in the last analysis phase.
+#def test_cylinder_projection():
+#    n = 128
+#    implant_mask = np.zeros((n,n,n), np.uint8)
+#    implant_mask[:,n//2-4:n//2+4,n//2-4:n//2+4] = 1
+#    edt_field = edt.edt(~implant_mask, parallel=16)
+#
+#    m_cpu_seq.cylinder_projection(edt_field, Cs, Cs_voxel_size,
+#                    d_min, d_max, theta_min, theta_max,
+#                    tuple(bbox.flatten()), tuple(Muvwp.flatten()),
+#                    images, counts)
+
 if __name__ == '__main__':
     np.random.seed(42)
     test_center_of_mass()

From 50864ecdcae372ca38c134c36ecb0c814194470e Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Mar 2023 16:10:34 +0100
Subject: [PATCH 119/136] #25 Added launch configuration for step 0800

---
 .vscode/launch.json | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index cb878e1..2012182 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -40,6 +40,15 @@
             "args": ["770c_pag"],
             "justMyCode": false
         },
+        {
+            "name": "Python: 0700_implant_FoR",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/src/processing_steps/0700_implant_FoR.py",
+            "console": "integratedTerminal",
+            "args": ["770c_pag"],
+            "justMyCode": false
+        },
         {
             "name": "Python: 0800_implant_data",
             "type": "python",

From 8fa4c8575e4d37878ad6912f52d83b5af2488a0e Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Mar 2023 16:11:13 +0100
Subject: [PATCH 120/136] #25 Added rc file for running vedo over ssh

---
 src/vedo_setup.sh | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100755 src/vedo_setup.sh

diff --git a/src/vedo_setup.sh b/src/vedo_setup.sh
new file mode 100755
index 0000000..4dcb108
--- /dev/null
+++ b/src/vedo_setup.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+#
+set -x
+export DISPLAY=:99.0
+Xvfb :99 -screen 0 1024x1024x24 > /dev/null 2&>1 &
+sleep 3
+set +x
+exec "$@"

From 36485901330415b7122f5a4c2b88f0080c44fe60 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Mar 2023 16:16:56 +0100
Subject: [PATCH 121/136] #25 Somethings is not right with the zyx vs xyz
 ordering of values

---
 src/lib/cpp/cpu_seq/geometry.cc |  2 +-
 src/lib/cpp/include/geometry.hh |  2 +-
 src/test/test_geometry.py       | 89 +++++++++++++++++++++++++--------
 3 files changed, 69 insertions(+), 24 deletions(-)

diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index d7793b6..5c0acfa 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -36,7 +36,7 @@ array<real_t, 3> center_of_mass(const input_ndarray<mask_type> &mask) {
 
     print_timestamp("center_of_mass end");
 
-    return array<real_t, 3>{ rcmz, rcmy, rcmx };
+    return array<real_t, 3>{ rcmx, rcmy, rcmz };
 }
 
 void compute_front_mask(const input_ndarray<mask_type> solid_implant,
diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
index 629aa53..937c5d2 100644
--- a/src/lib/cpp/include/geometry.hh
+++ b/src/lib/cpp/include/geometry.hh
@@ -48,7 +48,7 @@ template <typename T>
 float resample2x2x2(const T             *voxels,
                     const array<ssize_t, 3> &shape,
                     const array<float, 3>   &X) {
-    auto  [Nz,Ny,Nx] = shape;
+    auto  [Nx,Ny,Nz] = shape;
 
     if (!in_bbox(X[0], X[1], X[2], {0.5f, float(Nx)-0.5f, 0.5f, float(Ny)-0.5f, 0.5f, float(Nz)-0.5f})) {
         uint64_t voxel_index = uint64_t(floor(X[0]))*Ny*Nz + uint64_t(floor(X[1]))*Ny + uint64_t(floor(X[2]));
diff --git a/src/test/test_geometry.py b/src/test/test_geometry.py
index 6070c93..8ee1f87 100644
--- a/src/test/test_geometry.py
+++ b/src/test/test_geometry.py
@@ -9,10 +9,12 @@
 sys.path.append(sys.path[0]+'/../')
 from config.paths import hdf5_root
 
+import argparse
 import datetime
 import edt
 from functools import partial
 import h5py
+import matplotlib.pyplot as plt
 import numpy as np
 import pytest
 
@@ -43,6 +45,27 @@ def assert_with_print(a, b, tolerance=1e-7, names=None):
             print (names)
     assert all_close
 
+def compare_fs(func, baseline_f, cpu_f, gpu_f, should_assert=True, tolerance=1e-7,
+               allocate_result: tuple[tuple[int],np.dtype] | np.ndarray=None):
+    baseline, baseline_t = run_with_warmup(baseline_f, allocate_result)
+    print (f'({func}) Sequential ran in {baseline_t}')
+    if should_assert: assert_interesting_result(baseline)
+
+    cpu, cpu_t = run_with_warmup(cpu_f, allocate_result)
+    print (f'({func}) Parallel CPU ran in {cpu_t}, which is {baseline_t / cpu_t:.02f} times faster than sequential')
+    if should_assert: assert_with_print(baseline, cpu, tolerance, 'cpu_seq vs cpu')
+
+    gpu, gpu_t = run_with_warmup(gpu_f, allocate_result)
+    print (f'({func}) GPU ran in {gpu_t}, which is {baseline_t / gpu_t:.02f} times faster than sequential')
+    if should_assert: assert_with_print(baseline, gpu, tolerance, 'cpu_seq vs gpu')
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Unit tests for the geometry C++ functions.")
+
+    parser.add_argument('tests', nargs='*', help='Which test(s) to run. It can be either the name, or the index of the test.')
+
+    return parser.parse_args()
+
 def run_with_warmup(f, allocate_result=None):
     '''
     Runs the given function and returns the result and how long time it took to run.
@@ -65,20 +88,6 @@ def run_with_warmup(f, allocate_result=None):
     end = datetime.datetime.now()
     return result, end - start
 
-def compare_fs(func, baseline_f, cpu_f, gpu_f, should_assert=True, tolerance=1e-7,
-               allocate_result: tuple[tuple[int],np.dtype] | np.ndarray=None):
-    baseline, baseline_t = run_with_warmup(baseline_f, allocate_result)
-    print (f'({func}) Sequential ran in {baseline_t}')
-    if should_assert: assert_interesting_result(baseline)
-
-    cpu, cpu_t = run_with_warmup(cpu_f, allocate_result)
-    print (f'({func}) Parallel CPU ran in {cpu_t}, which is {baseline_t / cpu_t:.02f} times faster than sequential')
-    if should_assert: assert_with_print(baseline, cpu, tolerance, 'cpu_seq vs cpu')
-
-    gpu, gpu_t = run_with_warmup(gpu_f, allocate_result)
-    print (f'({func}) GPU ran in {gpu_t}, which is {baseline_t / gpu_t:.02f} times faster than sequential')
-    if should_assert: assert_with_print(baseline, gpu, tolerance, 'cpu_seq vs gpu')
-
 def test_center_of_mass():
     voxels = np.random.randint(0, 256, (n,n,n), np.uint8)
 
@@ -104,7 +113,7 @@ def test_inertia_matrix():
     assert_interesting_result(baseline())
 
 @pytest.mark.parametrize("dtype", [np.uint8, np.uint16])
-def test_sample_plane(dtype):
+def test_sample_plane(dtype, debug=False):
     # TODO something that isn't just random data?
     n = 128
     voxels = np.random.randint(0, np.iinfo(dtype).max, (n,n,n), dtype)
@@ -125,6 +134,21 @@ def test_sample_plane(dtype):
     # TODO the function is unstable, even when they're all calling the sequential implementation, t least when comparing gcc against nvcc, but it differs at most with 1. Hence the higher tolerance for this test. Can be tested with something like for i in range(10000):
     compare_fs('sample_plane', cpu_seq, cpu, gpu, True, 1.1, ((64,64), np.float32))
 
+    if debug:
+        voxels = np.zeros((n,n,n), dtype)
+        voxels[:, n//2-5:n//2+5, n//2-5:n//2-2] = 1
+        voxels[:, n//2-5:n//2+5, n//2+2:n//2+5] = 1
+        voxel_size = 1
+        cm = m_cpu.center_of_mass(voxels)
+        # TODO plan vektorne er z y x, ikke x y z!! Trace hvorfor!
+        v_vec = np.array([0,0,1], np.float32)
+        w_vec = np.array([0,1,0], np.float32)
+        bbox = [-n//2, n//2, -n//2, n//2]
+        result = np.zeros((n, n), np.float32)
+        m_cpu_seq.sample_plane(voxels, voxel_size, cm, v_vec, w_vec, bbox, result)
+        plt.imshow(result)
+        plt.savefig('pis.png')
+
 def test_integrate_axes():
     n = 128
     dtype = np.uint8
@@ -323,10 +347,31 @@ def test_compute_front_mask():
 
 if __name__ == '__main__':
     np.random.seed(42)
-    test_center_of_mass()
-    test_inertia_matrix()
-    test_sample_plane(np.uint8)
-    test_integrate_axes()
-    test_zero_outside_bbox()
-    test_fill_implant_mask()
-    test_compute_front_mask()
\ No newline at end of file
+    args = parse_args()
+
+    if len(args.tests) == 0:
+        test_center_of_mass()
+        test_inertia_matrix()
+        test_sample_plane(np.uint8)
+        test_integrate_axes()
+        test_zero_outside_bbox()
+        test_fill_implant_mask()
+        test_compute_front_mask()
+    else:
+        for test in args.tests:
+            if test == '1' or test == 'center_of_mass':
+                test_center_of_mass()
+            elif test == '2' or test == 'inertia_matrix':
+                test_inertia_matrix()
+            elif test == '3' or test == 'sample_plane':
+                test_sample_plane(np.uint8, debug=True)
+            elif test == '4' or test == 'integrate_axes':
+                test_integrate_axes()
+            elif test == '5' or test == 'zero_outside_bbox':
+                test_zero_outside_bbox()
+            elif test == '6' or test == 'fill_implant_mask':
+                test_fill_implant_mask()
+            elif test == '7' or test == 'compute_front_mask':
+                test_compute_front_mask()
+            else:
+                print (f'WARNING: skipping unknown test: "{test}"')

From 5052bc895144bbb448691273d3fd4d539176bd59 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Mar 2023 16:17:28 +0100
Subject: [PATCH 122/136] #25 Removed Bohrium dependency from esrf_read. Might
 not be the right thing, as it is very slow right now.

---
 src/lib/py/esrf_read.py | 42 ++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/lib/py/esrf_read.py b/src/lib/py/esrf_read.py
index 1d62c4a..1e3475b 100644
--- a/src/lib/py/esrf_read.py
+++ b/src/lib/py/esrf_read.py
@@ -2,8 +2,8 @@
 # Read metadata and data from raw tomograms from ESRF.
 # (C) James Avery for the MAXIBONE project, 2018
 import numpy as np;
-import bohrium as bh;
-#import numpy as bh;
+#import bohrium as bh;
+import numpy as bh;
 #import jax.numpy as jp
 import numpy.ma as ma;
 import sys,re,os,tqdm;
@@ -13,7 +13,7 @@
 
 def esrf_edf_metadata(filename):
     meta = {};
-    header_length = 1024;            
+    header_length = 1024;
     with open(filename,"r",encoding="latin-1") as f:
         header = f.read(header_length);
 
@@ -22,19 +22,19 @@ def esrf_edf_metadata(filename):
             kv = re.split("[=;]",l);
             if(len(kv)>=2):
                 meta[kv[0].strip()] = kv[1].strip();
-            
+
         assert meta["ByteOrder"] == "LowByteFirst";
 
         if(meta["DataType"] == "UnsignedShort"):
             meta["NumpyType"] = np.uint16;
         if(meta["DataType"] == "Float"):
-            meta["NumpyType"] = np.float32;            
-        
+            meta["NumpyType"] = np.float32;
+
         return meta;
-    
+
 def esrf_edf_to_npy(filename):
     meta = esrf_edf_metadata(filename);
-    header_length = 1024;        
+    header_length = 1024;
 
     with open(filename,"rb") as f:
         f.seek(header_length,os.SEEK_SET);
@@ -71,8 +71,8 @@ def esrf_full_tomogram(info):
 
 def esrf_edf_to_bh(filename):
     meta = esrf_edf_metadata(filename);
-    (nx,ny) = (int(meta["Dim_2"]), int(meta["Dim_1"]));    
-    header_length = 1024;        
+    (nx,ny) = (int(meta["Dim_2"]), int(meta["Dim_1"]));
+    header_length = 1024;
 
     with open(filename,"rb") as f:
         f.seek(header_length,os.SEEK_SET);
@@ -107,8 +107,8 @@ def esrf_edfrange_to_bh(info,region):
 
 def esrf_edf_to_jp(filename):
     meta = esrf_edf_metadata(filename);
-    (nx,ny) = (int(meta["Dim_2"]), int(meta["Dim_1"]));    
-    header_length = 1024;        
+    (nx,ny) = (int(meta["Dim_2"]), int(meta["Dim_1"]));
+    header_length = 1024;
 
     with open(filename,"rb") as f:
         f.seek(header_length,os.SEEK_SET);
@@ -145,7 +145,7 @@ def esrf_read_xml(filename):
     fields = ["subvolume_name","sizex","sizey","sizez","originx","originy","originz","voxelsize","valmin","valmax","byte_order","s1","s2","S1","S2"];
     fieldstrings = ["\<{}\>(.*)\<\/{}\>".format(f,f) for f in fields];
     res = [re.compile(s,re.IGNORECASE) for s in fieldstrings];
-    xmlmeta = {};    
+    xmlmeta = {};
     with open(filename,"r") as file:
         for l in file.readlines():
             for i in range(len(fields)):
@@ -156,7 +156,7 @@ def esrf_read_xml(filename):
         xmlmeta["subvolume_name"]=xmlmeta["subvolume_name"].replace("%04d","{:04d}");
         xmlmeta["filename"]=filename;
         xmlmeta["dirname"]=os.path.dirname(filename);
-    
+
     # Change printf template to python3 format template
     return xmlmeta;
 
@@ -165,27 +165,27 @@ def readfile(filename):
     with open(filename,'r') as f:
         return f.readlines()
 
-        
+
 # def frame_histogram(frame,i,bin_edges):
-# #    print("Calculating histogram for frame",i)        
+# #    print("Calculating histogram for frame",i)
 #     count =  np.histogram(frame.compressed(),bins=bin_edges)[0];
 # #    print("Completed histogram for frame",i)
 #     return count
 
 # #To get a total histogram, simply do np.sum(count,axis=0)
 # def progressive_histogram(xml,nbins=2048,bin_edges=np.array([]),num_cores=4):
-    
+
 #     if(len(bin_edges)==0):
 #         bin_edges = np.linspace(float(xml["valmin"]), float(xml["valmax"]), nbins + 1);
 #         nbins = len(bin_edges)-1;
 
 
 #     nz     = int(xml["sizez"]);
-#     print("sizez = ",nz)    
+#     print("sizez = ",nz)
 #     meta,frame  = esrf_edf_n_to_npy(xml,0);
 #     frames = np.ma.empty((4*num_cores, frame.shape[0], frame.shape[1]));
 #     counts = np.empty((nz,nbins),dtype=int);
-    
+
 #     for i in range(0,nz,4*num_cores):
 #         chunk_length = min(4*num_cores,nz-i);
 #         for j in range(chunk_length):
@@ -193,6 +193,6 @@ def readfile(filename):
 #             _, frames[j] = esrf_edf_n_to_npy(xml,i+j);
 #         counts[i:i+chunk_length] = np.array(Parallel(n_jobs=num_cores)(delayed(frame_histogram)(frames[j],i+j,bin_edges)
 #                                                                       for j in range(chunk_length)));
-        
+
 #     return counts, bin_edges;
-        
+

From df03abe34453f424c8d89becb5e493ae9e71f658 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Mar 2023 16:18:44 +0100
Subject: [PATCH 123/136] #25 Ensured that 0200 runs.

---
 proc-steps-checklist.txt                      |  2 ++
 .../0200_generate_byte_hdf5.py                | 26 +++++++++----------
 2 files changed, 15 insertions(+), 13 deletions(-)
 create mode 100644 proc-steps-checklist.txt

diff --git a/proc-steps-checklist.txt b/proc-steps-checklist.txt
new file mode 100644
index 0000000..34a94f6
--- /dev/null
+++ b/proc-steps-checklist.txt
@@ -0,0 +1,2 @@
+0100 - Haven't checked. Requires ERDA to be set up properly.
+0200 - Runs!
diff --git a/src/processing_steps/0200_generate_byte_hdf5.py b/src/processing_steps/0200_generate_byte_hdf5.py
index 559bc85..5731316 100755
--- a/src/processing_steps/0200_generate_byte_hdf5.py
+++ b/src/processing_steps/0200_generate_byte_hdf5.py
@@ -6,10 +6,10 @@
 # /voxels:              uint8(Nz,Ny,Nx). Nz = sum(scan_dimensions[:,0]), ny = minimum(subvolume_dimensions[:,1]), nx = minimum(subvolume_dimensions[:,2])
 import h5py, sys, os.path, pathlib, tqdm
 sys.path.append(sys.path[0]+"/../")
-import bohrium as bh # TODO: Get rid of Bohrium dependence without losing too much performance
+#import bohrium as bh # TODO: Get rid of Bohrium dependence without losing too much performance
 from lib.py.esrf_read import *
 import numpy   as np, matplotlib.pyplot as plt
-from config.paths import *
+from config.paths import hdf5_root_fast as hdf5_root, esrf_implants_root
 from lib.py.helpers import commandline_args
 from PIL import Image
 
@@ -45,7 +45,7 @@ def normalize(A,value_range,nbits=16,dtype=np.uint16):
 
 for i in range(len(subvolume_metadata)):
     if verbose >= 1: print(f"{i} {sample}/{subvolume_metadata[i]['experiment']}: {subvolume_range[i]}")
-if verbose >= 1: print((global_vmin, global_vmax), (Nz,Ny,Nx))    
+if verbose >= 1: print((global_vmin, global_vmax), (Nz,Ny,Nx))
 if verbose >= 1: print(subvolume_dimensions)
 if verbose >= 1: print(subvolume_range)
 
@@ -103,7 +103,7 @@ def normalize(A,value_range,nbits=16,dtype=np.uint16):
 def cylinder_mask(Ny,Nx):
     ys = np.linspace(-1,1,Ny)
     xs = np.linspace(-1,1,Nx)
-    return (xs[NA,:]**2 + ys[:,NA]**2) < 1 
+    return (xs[NA,:]**2 + ys[:,NA]**2) < 1
 
 mask = np.array(cylinder_mask(Ny,Nx))
 
@@ -113,10 +113,10 @@ def cylinder_mask(Ny,Nx):
     (sy,sx)        = ((ny-Ny)//2+((ny-Ny)%2), (nx-Nx)//2+((nx-Nx)%2))
     (ey,ex)        = (ny-(ny-Ny)//2, nx-(nx-Nx)//2)
     if verbose >= 1: print((sy,ey),(sx,ex))
-    
+
     # if verbose >= 1: print(f"Loading {subvolume_info['experiment']}")
     # tomo = normalize(esrf_full_tomogram_bh(subvolume_info), (global_vmin,global_vmax));
-    # if verbose >= 1: print(f"Writing {subvolume_info['experiment']}")    
+    # if verbose >= 1: print(f"Writing {subvolume_info['experiment']}")
     # h5tomo[z_offset:z_offset+nz] = tomo[:,sy:ey,sx:ex];
     # del tomo
     chunk = np.zeros((chunk_length,Ny,Nx),dtype=np.uint16);
@@ -129,7 +129,7 @@ def cylinder_mask(Ny,Nx):
         if verbose >= 1: print(f"Chunk shape: {slab_data.shape}")
         if verbose >= 1: print("Max value before masking:", slab_data.max())
         slab_data *= mask[NA,:,:]
-        if verbose >= 1: print("Max value after masking:", slab_data.max())        
+        if verbose >= 1: print("Max value after masking:", slab_data.max())
         chunk[:chunk_end-z] = normalize(slab_data,(global_vmin,global_vmax))
         if verbose >= 1: print("Max value after normalizing:", chunk.max())
 
@@ -138,22 +138,22 @@ def cylinder_mask(Ny,Nx):
         #     slice_data = jp.array(slice_data[sy:ey,sx:ex].copy())
         #     chunk[j] = normalize(slice_data[sy:ey,sx:ex],(global_vmin,global_vmax)) * mask
 
-            
+
         if verbose >= 1: print(f"Writing {sample} MSB slice {z+z_offset}:{chunk_end+z_offset} ({i}-{z})");
         chunk_msb = ((chunk[:chunk_end-z]>>8)&0xff).astype(np.uint8)
         if verbose >= 1: print("chunk_msb.max: ", chunk_msb.max())
-        chunk_msb = chunk_msb.copy2numpy()
+        #chunk_msb = chunk_msb.copy2numpy()
         if verbose >= 1: print("chunk_msb.copy2numpy().max: ", chunk_msb.max())
         h5tomo_msb[z_offset+z:z_offset+chunk_end] = chunk_msb[:]
-        
+
         if verbose >= 1: print(f"Writing {sample} LSB slice {z+z_offset}:{chunk_end+z_offset} ({i}-{z})");
         chunk_lsb = (chunk[:chunk_end-z]&0xff).astype(np.uint8)
         if verbose >= 1: print("chunk_lsb.max: ", chunk_lsb.max())
-        chunk_lsb = chunk_lsb.copy2numpy()
+        #chunk_lsb = chunk_lsb.copy2numpy()
         if verbose >= 1: print("chunk_lsb.copy2numpy().max: ", chunk_lsb.max())
         h5tomo_lsb[z_offset+z:z_offset+chunk_end] = chunk_lsb[:]
-        np.flush()
-        
+        #np.flush()
+
     z_offset += nz;
 
 h5file_msb.close()

From 75f11a2db3e1bcdf27dd063f7842a2d0f5796abb Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Mar 2023 16:19:13 +0100
Subject: [PATCH 124/136] #25 Verified that 0300 - 0600 (inclusive) runs

---
 proc-steps-checklist.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/proc-steps-checklist.txt b/proc-steps-checklist.txt
index 34a94f6..30b6929 100644
--- a/proc-steps-checklist.txt
+++ b/proc-steps-checklist.txt
@@ -1,2 +1,6 @@
 0100 - Haven't checked. Requires ERDA to be set up properly.
 0200 - Runs!
+0300 - Verified!
+0400 - Runs!
+0500 - Runs!
+0600 - Runs!
\ No newline at end of file

From f28163d1063eaa957b32bca5cd150da49d130e39 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Fri, 10 Mar 2023 16:20:11 +0100
Subject: [PATCH 125/136] #25 Started working on step 0700, which doesn't look
 quite right.

---
 src/processing_steps/0700_implant_FoR.py | 174 ++++++++++-------------
 1 file changed, 77 insertions(+), 97 deletions(-)

diff --git a/src/processing_steps/0700_implant_FoR.py b/src/processing_steps/0700_implant_FoR.py
index dd65424..b311755 100644
--- a/src/processing_steps/0700_implant_FoR.py
+++ b/src/processing_steps/0700_implant_FoR.py
@@ -2,7 +2,7 @@
 sys.path.append(sys.path[0]+"/../")
 from config.constants import *
 from config.paths import hdf5_root, binary_root
-from lib.cpp.cpu_seq.geometry import center_of_mass, inertia_matrix, sample_plane
+from lib.cpp.cpu.geometry import center_of_mass, inertia_matrix, sample_plane
 from lib.cpp.gpu.morphology import erode_3d_sphere as erode_3d, dilate_3d_sphere as dilate_3d
 import matplotlib.pyplot as plt
 from matplotlib.colors import colorConverter
@@ -15,16 +15,16 @@
 
 # Hvor skal disse hen?
 def circle_center(p0,p1,p2):
-    m1, m2               = (p0+p1)/2, (p0+p2)/2   # Midpoints 
+    m1, m2               = (p0+p1)/2, (p0+p2)/2   # Midpoints
     (dx1,dy1), (dx2,dy2) = (p1-p0), (p2-p0)       # Slopes of connecting lines
     n1, n2               = np.array([dy1,-dx1]).T, np.array([dy2,-dx2]).T # Normals perpendicular to connecting lines
-    
+
     A       = np.array([n1,-n2]).T   # Solve m1 + t1*n1 == m2 + t2*n2   <=> t1*n1 - t2*n2 = m2-m1
-    
+
     (t1,t2) = la.solve(A, m2-m1)
 
     c1, c2 = m1+t1*n1, m2+t2*n2  # Center of circle!
-    
+
     assert(np.allclose(c1,c2))
 
     return c1
@@ -55,7 +55,6 @@ def open_3d(image, r):
 
     return I1[r:-r,r:-r,r:-r].astype(image.dtype)
 
-
 def coordinate_image(shape):
     Nz,Ny,Nx   = shape
     if verbose >= 1: print(f"Broadcasting coordinates for {shape} image")
@@ -66,8 +65,6 @@ def coordinate_image(shape):
     if verbose >= 1: print(f"Done")
     return zyxs
 
-
-
 def proj(u,v):                  # Project u onto v
     return (np.dot(u,v)/np.dot(v,v))*v
 
@@ -77,7 +74,6 @@ def gramschmidt(u,v,w):
 
     return np.array([u/la.norm(u), vp/la.norm(v), wp/la.norm(w)])
 
-
 def highest_peaks(data,n,height=0.7):
     peaks, info = signal.find_peaks(data,height=height*data.max())
     return peaks[np.argsort(info['peak_heights'])][:n]
@@ -85,7 +81,7 @@ def highest_peaks(data,n,height=0.7):
 def largest_cc_of(mask):
     label, n_features = ndi.label(mask)
     bincnts           = np.bincount(label[label>0],minlength=n_features+1)
-    
+
     largest_cc_ix   = np.argmax(bincnts)
     return (label==largest_cc_ix)
 
@@ -110,24 +106,23 @@ def homogeneous_transform(xs, M):
     if verbose >= 1: print(hxs.shape, M.shape)
     return hxs @ M.T
 
-
 def zyx_to_UVWp_transform():
     Tcm   = hom_translate(-cm*voxel_size)
     Muvw  = hom_linear(UVW)
     TW0   = hom_translate((0,0,-w0*voxel_size))
     Tcp   = hom_translate(-cp)
-    Muvwp = hom_linear(UVWp)    
+    Muvwp = hom_linear(UVWp)
 
     return Muvwp @ Tcp @ TW0 @ Muvw @ Tcm
 
 vaxis = {'z':np.array((0,0,1.)), 'y':np.array((0,-1.,0)), 'z2':np.array((0,0,1.))}
 daxis = {'z':np.array([-1,1,0]), 'y':np.array([0,0,1]), 'z2':np.array([-1.5,0,0])}
-    
+
 def figure_FoR_UVW(debug=True):
-    vol = vedo.Volume(implant,alpha=[0,0,0.05,0.2])
+    vol = vedo.Volume(implant, alpha=[0,0,0.05,0.2])
     u_arrow = vedo.Arrow(cm[::-1],cm[::-1]+1/np.sqrt(ls[0]/ls[2])*100*u_vec[::-1],c='r',s=0.7)
     v_arrow = vedo.Arrow(cm[::-1],cm[::-1]+1/np.sqrt(ls[1]/ls[2])*100*v_vec[::-1],c='g',s=0.7)
-    w_arrow = vedo.Arrow(cm[::-1],cm[::-1]+100*w_vec[::-1],c='b',s=0.7)    
+    w_arrow = vedo.Arrow(cm[::-1],cm[::-1]+100*w_vec[::-1],c='b',s=0.7)
 
     for axis in vaxis.keys():
         pl = vedo.Plotter(offscreen=True, interactive=False,sharecam=False)
@@ -147,35 +142,34 @@ def figure_FoR_UVW(debug=True):
             'viewup':-vaxis[axis]
         })
 
-
-# TODO: Fix lengths (voxel_size times...)        
+# TODO: Fix lengths (voxel_size times...)
 def figure_FoR_UVWp(debug=True):
-    implant_uvwps = homogeneous_transform(implant_zyxs*voxel_size,Muvwp)
-    pts = pc.Points(implant_uvwps)
-    
+    implant_uvwps = homogeneous_transform(implant_zyxs * voxel_size, Muvwp)
+    pts = pc.Points(implant_uvwps[:,:3])
+
     u_arrow = vedo.Arrow([0,0,0],1/np.sqrt(ls[0]/ls[2])*100*np.array([0,0,1]),c='r',s=0.7)
     v_arrow = vedo.Arrow([0,0,0],1/np.sqrt(ls[1]/ls[2])*100*v_vec[::-1],c='g',s=0.7)
-    w_arrow = vedo.Arrow([0,0,0],100*w_vec[::-1],c='b',s=0.7)    
+    w_arrow = vedo.Arrow([0,0,0],100*w_vec[::-1],c='b',s=0.7)
 
-    pl = vedo.Plotter(offscreen=True, interactive=False, sharecam=False)        
+    pl = vedo.Plotter(offscreen=True, interactive=False, sharecam=False)
     for axis in vaxis.keys():
         pl.show([pts,u_arrow,v_arrow,w_arrow],camera={
             'pos': np.array((nz/2,ny/2,nx/2)) + 2.5*ny*daxis[axis],
             'focalPoint': (nz/2,ny/2,nx/2),
             'viewup':-vaxis[axis]
         })
-        
+
         pl.screenshot(f"{image_output_dir}/implant-FoR_UVWp-{axis}.png")
 
     if debug:
-        vedo.show([pts,u_arrow,v_arrow,w_arrow],interactive=True)        
-        
+        vedo.show([pts,u_arrow,v_arrow,w_arrow],interactive=True)
+
 def figure_FoR_circle(name,center,v_vec,w_vec,radius,implant_bbox,debug=True):
     from matplotlib.patches import Circle
     from matplotlib.lines import Line2D
-    
+
     [U_min,U_max,V_min,V_max,W_min,W_max] = implant_bbox
-    
+
     sample = np.zeros((800,800),dtype=np.float32)
     sample_bbox = (-2905.,2905,-1000,4810.)
     sample_plane(voxels,voxel_size,
@@ -194,7 +188,7 @@ def figure_FoR_circle(name,center,v_vec,w_vec,radius,implant_bbox,debug=True):
     p2 = np.array((V_max,W_min))
 
     m1, m2 = (p0+p1)/2, (p0+p2)/2
-    
+
     ax.add_patch(Circle((0,0), radius*1.01, ec='black',fc=circle_color))
     ax.add_patch(Circle(p1, radius/40, fc='purple'))
     ax.add_patch(Circle(p2, radius/40, fc='purple'))
@@ -206,8 +200,8 @@ def figure_FoR_circle(name,center,v_vec,w_vec,radius,implant_bbox,debug=True):
     ax.add_line(Line2D([p0[0],p2[0]],[p0[1],p2[1]],c='red'))
 
     ax.add_line(Line2D([m1[0]*1.05,0],[m1[1]*1.05,0],c='green'))
-    ax.add_line(Line2D([m2[0]*1.05,0],[m2[1]*1.05,0],c='green'))        
-    
+    ax.add_line(Line2D([m2[0]*1.05,0],[m2[1]*1.05,0],c='green'))
+
     fig.savefig(f"{image_output_dir}/implant-FoR_{name}.png",dpi=300)
 
     if debug:
@@ -218,92 +212,88 @@ def figure_FoR_profiles(debug):
     ax1 = fig1.add_subplot(111)
     ax1.plot((Up_bins[1:]+Up_bins[:-1])/2, Up_integrals);
     fig1.savefig(f"{image_output_dir}/implant-FoR_Up-profile.png")
-    
+
     fig2 = plt.figure()
     ax2 = fig2.add_subplot(111)
     ax2.plot((theta_bins[1:]+theta_bins[:-1])/2, theta_integrals)
-    fig2.savefig(f"{image_output_dir}/implant-FoR_theta-profile.png")        
-    
+    fig2.savefig(f"{image_output_dir}/implant-FoR_theta-profile.png")
+
     if debug:
         plt.show()
 
-    
-        
-        
-def figure_FoR_cylinder(debug=True): 
+def figure_FoR_cylinder(debug=True):
 #    center_line = vedo.Arrow(C1,C2)
     center_line = vedo.Cylinder((C1+C2)/2,r=implant_radius_voxels/20,height=implant_length_voxels, axis=(C2-C1),alpha=1,c='r')
     cylinder = vedo.Cylinder((C1+C2)/2,r=implant_radius_voxels,height=implant_length_voxels, axis=(C2-C1),alpha=0.3)
-    
+
     Up_arrow = vedo.Arrow(Cp, UVW2xyz(cp+implant_length*u_prime), c='r')
     Vp_arrow = vedo.Arrow(Cp, UVW2xyz(cp+implant_radius*2*v_prime), c='g')
     Wp_arrow = vedo.Arrow(Cp, UVW2xyz(cp+implant_radius*2*w_prime), c='b')
 
     vol = vedo.Volume(implant,alpha=[0,0,0.05,0.1])
 
-        
-    pl = vedo.Plotter(offscreen=True, interactive=False,sharecam=False)        
+    pl = vedo.Plotter(offscreen=True, interactive=False,sharecam=False)
     for axis in vaxis.keys():
         pl.show([vol,center_line,Vp_arrow,Wp_arrow,cylinder],camera={
             'pos': np.array((nz/2,ny/2,nx/2)) + 2.5*ny*daxis[axis],
             'focalPoint': (nz/2,ny/2,nx/2),
             'viewup':-vaxis[axis]
         })
-    
-        pl.screenshot(f"{image_output_dir}/implant-FoR_cylinder-{axis}.png")    
-    
+
+        pl.screenshot(f"{image_output_dir}/implant-FoR_cylinder-{axis}.png")
+
     if debug:
         vedo.show([vol,cylinder,Up_arrow,Vp_arrow,Wp_arrow],interactive=True)
 
 def figure_FoR_voxels(name,voxels,debug=True):
     vol = vedo.Volume(voxels,alpha=[0,0,0.05,0.1])
 
-    pl  = vedo.Plotter(offscreen=True, interactive=False,sharecam=False)    
+    pl  = vedo.Plotter(offscreen=True, interactive=False,sharecam=False)
     for axis in vaxis.keys():
         pl.show([vol],camera={
             'pos': np.array((nz/2,ny/2,nx/2)) + 2.5*ny*daxis[axis],
             'focalPoint': (nz/2,ny/2,nx/2),
             'viewup':-vaxis[axis]
-        })        
+        })
         pl.screenshot(f"{image_output_dir}/implant-FoR_voxels_{name}-{axis}.png")
 
     if debug:
         vedo.show([vol],interactive=True)
 
-    
-
-
-        
 if __name__ == "__main__":
     sample, scale, verbose = commandline_args({"sample" : "<required>",
                                                "scale" : 8,
                                                "verbose" : 1})
-    
+
     if(scale<8):
         if verbose >= 1: print(f"Selected scale is {scale}x: This should not be run at high resolution, use scale>=8.")
         #sys.exit(-1)
 
     ## STEP 0: LOAD MASKS, VOXELS, AND METADATA
     image_output_dir = f"{hdf5_root}/processed/implant-FoR/{sample}/"
-    if verbose >= 1: print(f"Storing all debug-images to {image_output_dir}")    
+    if verbose >= 1: print(f"Storing all debug-images to {image_output_dir}")
     pathlib.Path(image_output_dir).mkdir(parents=True, exist_ok=True)
-    
+
     if verbose >= 1: print(f"Loading {scale}x implant mask from {hdf5_root}/masks/{scale}x/{sample}.h5")
     implant_file = h5py.File(f"{hdf5_root}/masks/{scale}x/{sample}.h5",'r')
-    implant      = implant_file["implant/mask"][:]
+    implant      = implant_file["implant/mask"][:].astype(np.uint8)
     voxel_size   = implant_file["implant"].attrs["voxel_size"]
     implant_file.close()
-    
+
     if verbose >= 1: print(f"Loading {scale}x voxels from {binary_root}/voxels/{scale}x/{sample}.uint16")
     voxels  = np.fromfile(f"{binary_root}/voxels/{scale}x/{sample}.uint16",dtype=np.uint16).reshape(implant.shape)
 
+    plt.imshow(voxels[voxels.shape[0]//2,:,:]); plt.savefig(f'{image_output_dir}/voxels-sanity-xy.png')
+    plt.imshow(voxels[:,voxels.shape[0]//2,:]); plt.savefig(f'{image_output_dir}/voxels-sanity-xz.png')
+    plt.imshow(voxels[:,:,voxels.shape[0]//2]); plt.savefig(f'{image_output_dir}/voxels-sanity-yz.png')
+
     nz,ny,nx = implant.shape
 
     ### STEP 1: COMPUTE IMPLANT PRINCIPAL AXES FRAME OF REFERENCE
     ## STEP1A: DIAGONALIZE MOMENT OF INTERTIA MATRIX TO GET PRINCIPAL AXES
     cm    = np.array(center_of_mass(implant))                  # in downsampled-voxel index coordinates
     if verbose >= 1: print(f"Center of mass is: {cm}")
-    IM    = np.array(inertia_matrix(implant,cm)).reshape(3,3)  
+    IM    = np.array(inertia_matrix(implant,cm)).reshape(3,3)
     ls,E  = la.eigh(IM)
 
     ## STEP 1B: PRINCIPAL AXES ARE ONLY DEFINED UP TO A SIGN.
@@ -318,7 +308,7 @@ def figure_FoR_voxels(name,voxels,debug=True):
         E[:,0] *= -1
     if sample == "770_pag":
         E[:,2] *= -1
-    
+
     ix = np.argsort(np.abs(ls));
     ls, E = ls[ix], E[:,ix]
     UVW = E.T
@@ -335,7 +325,7 @@ def figure_FoR_voxels(name,voxels,debug=True):
     w0  = implant_uvws[:,2].min();  # In {scale}x voxel units
     w0v = np.array([0,0,w0])        # w-shift to get to center of implant back-plane
 
-    
+
     ## 2B: Transform to backplane-centered coordinates in physical units
     implant_UVWs = (implant_uvws - w0v)*voxel_size     # Physical U,V,W-coordinates, relative to implant back-plane center, in micrometers
     implant_Us,implant_Vs,implant_Ws = implant_UVWs.T  # Implant point coordinates
@@ -349,7 +339,7 @@ def figure_FoR_voxels(name,voxels,debug=True):
     for i in tqdm.tqdm(range(len(U_bins)-1),"Cylinder centres as fn of U"):
         # Everything is in micrometers
         U0,U1 = U_bins[i], U_bins[i+1]
-        
+
         slab = implant_UVWs[(implant_Us>=U0) & (implant_Us<=U1)]
         slab_Us, slab_Vs, slab_Ws = slab.T
 
@@ -360,17 +350,16 @@ def figure_FoR_voxels(name,voxels,debug=True):
         p1 = np.array([V0,0])
         p2 = np.array([V1,0])
 
-        # Will be way faster to 
+        # Will be way faster to
         c = circle_center(p0,p1,p2)     # circle center in VW-coordinates
         Cs[i] = np.array([(U0+U1)/2, c[0], c[1]])
-        Rs[i] = la.norm(p0-c)        
-        
+        Rs[i] = la.norm(p0-c)
 
-    ## 2D: Best circle centers along U forms a helix, due to the winding screw threads. To get the best cylinder, 
+    ## 2D: Best circle centers along U forms a helix, due to the winding screw threads. To get the best cylinder,
     ##     we solve for direction vector u_prime so C(U) = C0 + U*u_prime + e(U) with minimal least square residual error e(U)
     ##     where C0 is the mean of the segment circle centers.
-    # 
-    # U*u_prime = C(U) - C0  
+    #
+    # U*u_prime = C(U) - C0
     #
     # Cs: (N,3)
     # U: N -> (N,3)
@@ -378,10 +367,10 @@ def figure_FoR_voxels(name,voxels,debug=True):
     C0 = np.mean(Cs,axis=0)
     u_prime, _,_,_ = la.lstsq(Ub, Cs-C0)
     u_prime = u_prime[0]
-    
+
     UVWp = gramschmidt(u_prime,np.array([0,1,0]),np.array([0,0,1]))
     u_prime, v_prime, w_prime = UVWp # U',V',W' in U,V,W coordinates
-    
+
     c1 = C0 + implant_Us.min()*u_prime
     c2 = C0 + implant_Us.max()*u_prime
     cp = (c1+c2)/2
@@ -393,30 +382,28 @@ def UVW2xyz(p):
 
     C1, C2, Cp = UVW2xyz(c1), UVW2xyz(c2), UVW2xyz(cp)
 
-
-    
     implant_length = (implant_Us.max()-implant_Us.min())
     implant_radius = Rs.max()
 
     implant_length_voxels = implant_length/voxel_size
     implant_radius_voxels = implant_radius/voxel_size
-    
+
     figure_FoR_cylinder(verbose >= 2)
 
     ### 3: In the cylinder coordinates, find radii and angle ranges to fill in the "holes" in the implant and make it solid
     ###    (More robust than closing operations, as we don't want to effect the screw threads).
 
     ## 3A: Transform to implant cylinder coordinates
-    implant_UVWps = (implant_UVWs - cp) @ UVWp # We now transform to fully screw aligned coordinates with screw center origin    
+    implant_UVWps = (implant_UVWs - cp) @ UVWp # We now transform to fully screw aligned coordinates with screw center origin
     implant_Ups, implant_Vps, implant_Wps = implant_UVWps.T
 
-    Up_min, Up_max = implant_Ups.min(), implant_Ups.max()    
+    Up_min, Up_max = implant_Ups.min(), implant_Ups.max()
     Vp_min, Vp_max = implant_Vps.min(), implant_Vps.max()
     Wp_min, Wp_max = implant_Wps.min(), implant_Wps.max()
 
     #TODO: Local circle figure (instead of showing global fit on local slice, which isn't snug)
     bbox_uvwp = [Up_min,Up_max,Vp_min,Vp_max,Wp_min,Wp_max]
-    figure_FoR_circle("prime-circle",Cp*voxel_size,v_vec,w_vec,implant_radius,bbox_uvwp,verbose >= 2)    
+    figure_FoR_circle("prime-circle",Cp*voxel_size,v_vec,w_vec,implant_radius,bbox_uvwp,verbose >= 2)
 
     ## 3B: Profile of radii and angles
     implant_thetas = np.arctan2(implant_Vps,implant_Wps)
@@ -441,8 +428,8 @@ def UVW2xyz(p):
     zyxs = coordinate_image(implant.shape)
     uvws = (zyxs - cm) @ E                  # raw voxel-scale relative to center of mass
     UVWs = (uvws - w0v) * voxel_size        # Micrometer scale relative to backplane-center
-    Us,Vs,Ws = UVWs[...,0], UVWs[...,1], UVWs[...,2]        # UVW physical image coordinates 
-        
+    Us,Vs,Ws = UVWs[...,0], UVWs[...,1], UVWs[...,2]        # UVW physical image coordinates
+
     UVWps = (UVWs - cp) @ UVWp                # relative to center-of-implant-before-sawing-in-half
     Ups,Vps,Wps = UVWps[...,0], UVWps[...,1], UVWps[...,2]      # U',V',W' physical image coordinates
     thetas, rs = np.arctan2(Vps,Wps), np.sqrt(Vps**2+Wps**2)    # This is the good reference frame for cylindrical coords
@@ -457,18 +444,17 @@ def UVW2xyz(p):
     solid_implant_UVWps   = ((((np.array(np.nonzero(solid_quarter)).T - cm) @ E) - w0v)*voxel_size - cp) @ UVWp
     Up_integrals, Up_bins = np.histogram(solid_implant_UVWps[:,0],200)
 
-    figure_FoR_profiles(verbose >= 2)    
+    figure_FoR_profiles(verbose >= 2)
     figure_FoR_voxels("solid_implant",solid_implant,verbose >= 2)
 
     back_mask  = (Ws<0)
     front_mask = largest_cc_of((Ws>50)*(~solid_implant))#*(thetas>=theta_from)*(thetas<=theta_to)
 
     # back_part = voxels*back_mask
-   
-    front_part = voxels*front_mask
-    figure_FoR_voxels("back_part", voxels*back_mask, verbose >= 2) 
-    figure_FoR_voxels("front_part",voxels*front_mask, verbose >= 2) 
 
+    front_part = voxels*front_mask
+    figure_FoR_voxels("back_part", voxels*back_mask, verbose >= 2)
+    figure_FoR_voxels("front_part",voxels*front_mask, verbose >= 2)
 
     Cp_zyx = Cp[::-1]*voxel_size
 
@@ -489,10 +475,10 @@ def UVW2xyz(p):
     update_hdf5(f"{output_dir}/{sample}.h5",
                 group_name="implant-FoR",
                 datasets={"UVW":UVW,
-                          "UVWp": UVWp,                      
-                          "center_of_mass":cm*voxel_size, 
+                          "UVWp": UVWp,
+                          "center_of_mass":cm*voxel_size,
                           "center_of_cylinder_UVW": cp,
-                          "UVWp_transform": Muvwp,                      
+                          "UVWp_transform": Muvwp,
                           "center_of_cylinder_zyx": Cp_zyx, # Cp is in scaled voxel xyz
                           "bounding_box_UVWp": np.array([[implant_Ups.min(),implant_Ups.max()],
                                                          [implant_Vps.min(),implant_Vps.max()],
@@ -502,7 +488,7 @@ def UVW2xyz(p):
                           "theta_range": np.array([theta_from, theta_to])
                 },
                 attributes={"backplane_W_shift":w0*voxel_size,
-                            "implant_radius": implant_radius                        
+                            "implant_radius": implant_radius
                 },
                 dimensions={
                     "center_of_mass":"zyx micrometers",
@@ -515,7 +501,6 @@ def UVW2xyz(p):
                 chunk_shape=None
         )
 
-
     output_dir = f"{hdf5_root}/masks/{scale}x/"
     pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
     if verbose >= 1: print(f"Saving implant_solid mask to {output_dir}/{sample}.h5")
@@ -542,40 +527,35 @@ def UVW2xyz(p):
                      datasets={"mask":front_mask},
                      attributes={"sample":sample, "scale":scale, "voxel_size":voxel_size})
 
-
     if verbose >= 1: print(f"Computing bone region")
     hist, bins = np.histogram(front_part, 256)
     hist[0] = 0
     peaks, info = signal.find_peaks(hist,height=0.5*hist.max())
-        
+
     try:
         p1, p2 = peaks[np.argsort(info['peak_heights'])[:2]]
         midpoint = int(round((bins[p1]+bins[p2+1])/2)) # p1 is left-edge of p1-bin, p2+1 is right edge of p2-bin
         if verbose >= 1: print(f"p1, p2 = ({p1,bins[p1]}), ({p2,bins[p2]}); midpoint = {midpoint}")
-        
-        bone_mask1 = front_part > midpoint                                                                                                                                                                                                                                       
-        closing_diameter, opening_diameter = 400, 300           # micrometers                                                                                                                                                                   
+
+        bone_mask1 = front_part > midpoint
+        closing_diameter, opening_diameter = 400, 300           # micrometers
         closing_voxels = 2*int(round(closing_diameter/(2*voxel_size))) + 1 # Scale & ensure odd length
         opening_voxels = 2*int(round(opening_diameter/(2*voxel_size))) + 1 # Scale & ensure odd length
-        
+
         for i in tqdm.tqdm(range(1),f"Closing with sphere of diameter {closing_diameter} micrometers, {closing_voxels} voxels.\n"):
             bone_region_mask = close_3d(bone_mask1, closing_voxels//2)
-            
+
         for i in tqdm.tqdm(range(1),f"Opening with sphere of diameter {opening_diameter} micrometers, {opening_voxels} voxels.\n"):
             bone_region_mask &= ~solid_implant #~open_3d(implant_shell_mask, opening_voxels)
             bone_region_mask = open_3d(bone_region_mask,opening_voxels//2)
-            
-    
+
         bone_region_mask = largest_cc_of(bone_region_mask)
     except:
         if verbose >= 1: print(f"Wasnt able to separate into resin and bone region. Assuming all is bone region.")
         bone_region_mask = front_mask
-    
+
         if verbose >= 1: print(f"Saving bone_region mask to {output_dir}/{sample}.h5")
         update_hdf5_mask(f"{output_dir}/{sample}.h5",
                          group_name="bone_region",
                          datasets={"mask":bone_region_mask},
                          attributes={"sample":sample, "scale":scale, "voxel_size":voxel_size})
-
-
-

From 3c39c269d356b7412e2064ab9a7019f4ce886de2 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Mar 2023 12:05:46 +0100
Subject: [PATCH 126/136] #25 Handled pybind conversion for io

---
 src/pybind/io-pybind.cc | 37 ++++++++++++++++++++++++-------------
 src/test/test_io.py     |  3 +++
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/src/pybind/io-pybind.cc b/src/pybind/io-pybind.cc
index 060d9d9..0e5d680 100644
--- a/src/pybind/io-pybind.cc
+++ b/src/pybind/io-pybind.cc
@@ -17,7 +17,18 @@ void load_slice(py::array_t<T> &np_data, const string filename,
     auto [Nz, Ny, Nx] = shape;
     auto [oz, oy, ox] = offset;
     uint64_t flat_offset = oz*Ny*Nx + oy*Nx + ox;
+
+    cout <<
+        Nz << " " << Ny << " " << Nx << " " <<
+        oz << " " << oy << " " << ox << " " <<
+        flat_offset << endl;
+
     NS::load_contiguous_slice<T>(data, filename, flat_offset, data_info.size);
+
+    T checksum = (T) 0;
+    for (int64_t i = 0; i < data_info.size; i++)
+        checksum += data[i];
+    cout << checksum << " " << sizeof(T) << endl;
 }
 
 template <typename T>
@@ -37,17 +48,17 @@ void write_slice(const py::array_t<T> &np_data,
 
 PYBIND11_MODULE(io, m) {
     m.doc() = "I/O functions for handling flat binary format files."; // optional module docstring
-    m.def("load_slice", &python_api::load_slice<uint8_t>);
-    m.def("load_slice", &python_api::load_slice<uint16_t>);
-    m.def("load_slice", &python_api::load_slice<uint32_t>);
-    m.def("load_slice", &python_api::load_slice<uint64_t>);
-    m.def("load_slice", &python_api::load_slice<float>);
-    m.def("load_slice", &python_api::load_slice<double>);
-
-    m.def("write_slice", &python_api::write_slice<uint8_t>);
-    m.def("write_slice", &python_api::write_slice<uint16_t>);
-    m.def("write_slice", &python_api::write_slice<uint32_t>);
-    m.def("write_slice", &python_api::write_slice<uint64_t>);
-    m.def("write_slice", &python_api::write_slice<float>);
-    m.def("write_slice", &python_api::write_slice<double>);
+    m.def("load_slice", &python_api::load_slice<uint8_t>, py::arg("np_data").noconvert(), py::arg("filename"), py::arg("offset"), py::arg("shape"));
+    m.def("load_slice", &python_api::load_slice<uint16_t>, py::arg("np_data").noconvert(), py::arg("filename"), py::arg("offset"), py::arg("shape"));
+    m.def("load_slice", &python_api::load_slice<uint32_t>, py::arg("np_data").noconvert(), py::arg("filename"), py::arg("offset"), py::arg("shape"));
+    m.def("load_slice", &python_api::load_slice<uint64_t>, py::arg("np_data").noconvert(), py::arg("filename"), py::arg("offset"), py::arg("shape"));
+    m.def("load_slice", &python_api::load_slice<float>, py::arg("np_data").noconvert(), py::arg("filename"), py::arg("offset"), py::arg("shape"));
+    m.def("load_slice", &python_api::load_slice<double>, py::arg("np_data").noconvert(), py::arg("filename"), py::arg("offset"), py::arg("shape"));
+
+    m.def("write_slice", &python_api::write_slice<uint8_t>, py::arg("np_data").noconvert(), py::arg("filename"), py::arg("offset"), py::arg("shape"));
+    m.def("write_slice", &python_api::write_slice<uint16_t>, py::arg("np_data").noconvert(), py::arg("filename"), py::arg("offset"), py::arg("shape"));
+    m.def("write_slice", &python_api::write_slice<uint32_t>, py::arg("np_data").noconvert(), py::arg("filename"), py::arg("offset"), py::arg("shape"));
+    m.def("write_slice", &python_api::write_slice<uint64_t>, py::arg("np_data").noconvert(), py::arg("filename"), py::arg("offset"), py::arg("shape"));
+    m.def("write_slice", &python_api::write_slice<float>, py::arg("np_data").noconvert(), py::arg("filename"), py::arg("offset"), py::arg("shape"));
+    m.def("write_slice", &python_api::write_slice<double>, py::arg("np_data").noconvert(), py::arg("filename"), py::arg("offset"), py::arg("shape"));
 }
\ No newline at end of file
diff --git a/src/test/test_io.py b/src/test/test_io.py
index 09ad43a..ee16cd8 100644
--- a/src/test/test_io.py
+++ b/src/test/test_io.py
@@ -32,6 +32,9 @@ def test_dtype(impl, dtype):
     if os.path.exists(individual_tmp_file):
         os.remove(individual_tmp_file)
     data = random(dim_shape, dtype)
+
+    assert (len(np.unique(data)) > 1) # Assert interesting data
+
     partial = dim_size // partial_factor
 
     # Write out a new file

From 9f06542bde4f6628781ef4e0a7fc0bf8d899459c Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Mar 2023 12:18:54 +0100
Subject: [PATCH 127/136] #25 Fixed pybind implicit conversion for geometry

---
 src/pybind/geometry-pybind.cc | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/pybind/geometry-pybind.cc b/src/pybind/geometry-pybind.cc
index ebd09b7..7499275 100644
--- a/src/pybind/geometry-pybind.cc
+++ b/src/pybind/geometry-pybind.cc
@@ -119,13 +119,13 @@ void cylinder_projection(const np_array<float>  &np_edt,  // Euclidean Distance
 PYBIND11_MODULE(geometry, m) {
     m.doc() = "Voxel Geometry Module"; // optional module docstring
 
-    m.def("center_of_mass",       &python_api::center_of_mass);
-    m.def("inertia_matrix",       &python_api::inertia_matrix);
-    m.def("integrate_axes",       &python_api::integrate_axes);
-    m.def("zero_outside_bbox",    &python_api::zero_outside_bbox);
-    m.def("fill_implant_mask",    &python_api::fill_implant_mask);
-    m.def("cylinder_projection",  &python_api::cylinder_projection);
-    m.def("sample_plane",         &python_api::sample_plane<uint16_t>);
-    m.def("sample_plane",         &python_api::sample_plane<uint8_t>);
-    m.def("compute_front_mask",   &python_api::compute_front_mask);
+    m.def("center_of_mass",       &python_api::center_of_mass, py::arg("np_voxels"));
+    m.def("inertia_matrix",       &python_api::inertia_matrix, py::arg("np_voxels"), py::arg("cm"));
+    m.def("integrate_axes",       &python_api::integrate_axes, py::arg("np_voxels"), py::arg("x0"), py::arg("v_axis"), py::arg("w_axis"), py::arg("v_min"), py::arg("w_min"), py::arg("output").noconvert());
+    m.def("zero_outside_bbox",    &python_api::zero_outside_bbox, py::arg("principal_axes"), py::arg("parameter_ranges"), py::arg("cm"), py::arg("np_voxels").noconvert());
+    m.def("fill_implant_mask",    &python_api::fill_implant_mask, py::arg("implant_mask"), py::arg("voxel_size"), py::arg("bbox"), py::arg("r_fraction"), py::arg("Muvw"), py::arg("solid_implant_mask").noconvert(), py::arg("rsqr_maxs").noconvert(), py::arg("profile").noconvert());
+    m.def("cylinder_projection",  &python_api::cylinder_projection, py::arg("np_edt"), py::arg("np_Cs"), py::arg("Cs_voxel_size"), py::arg("d_min"), py::arg("d_max"), py::arg("theta_min"), py::arg("theta_max"), py::arg("bbox"), py::arg("Muvw"), py::arg("np_images").noconvert(), py::arg("np_counts").noconvert());
+    m.def("sample_plane",         &python_api::sample_plane<uint16_t>, py::arg("np_voxels"), py::arg("voxel_size"), py::arg("cm"), py::arg("u_axis"), py::arg("v_axis"), py::arg("bbox"), py::arg("np_plano_samples").noconvert());
+    m.def("sample_plane",         &python_api::sample_plane<uint8_t>, py::arg("np_voxels"), py::arg("voxel_size"), py::arg("cm"), py::arg("u_axis"), py::arg("v_axis"), py::arg("bbox"), py::arg("np_plano_samples").noconvert());
+    m.def("compute_front_mask",   &python_api::compute_front_mask, py::arg("np_solid_implant"), py::arg("voxel_size"), py::arg("Muvw"), py::arg("bbox"), py::arg("np_front_mask").noconvert());
 }

From b0a46b4f4e42378fa3f566b32078c6edc9612733 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Mar 2023 12:20:07 +0100
Subject: [PATCH 128/136] #25 Fixed implicit pybind conversion for morphology

---
 src/pybind/morphology-pybind.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pybind/morphology-pybind.cc b/src/pybind/morphology-pybind.cc
index b8547e7..b91daa7 100644
--- a/src/pybind/morphology-pybind.cc
+++ b/src/pybind/morphology-pybind.cc
@@ -28,6 +28,6 @@ void morphology_3d_sphere_wrapper(
 
 PYBIND11_MODULE(morphology, m) {
     m.doc() = "Morphology operations."; // optional module docstring
-    m.def("dilate_3d_sphere", &morphology_3d_sphere_wrapper<std::bit_or<mask_type>, false>);
-    m.def("erode_3d_sphere", &morphology_3d_sphere_wrapper<std::bit_and<mask_type>, true>);
+    m.def("dilate_3d_sphere", &morphology_3d_sphere_wrapper<std::bit_or<mask_type>, false>, py::arg("np_voxels"), py::arg("radius"), py::arg("np_result").noconvert());
+    m.def("erode_3d_sphere", &morphology_3d_sphere_wrapper<std::bit_and<mask_type>, true>, py::arg("np_voxels"), py::arg("radius"), py::arg("np_result").noconvert());
 }
\ No newline at end of file

From 610e1d5547165ce9ed4e55bc7fbe34b8269de090 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Mon, 13 Mar 2023 12:36:39 +0100
Subject: [PATCH 129/136] #25 Verified steps are correct up until 600

---
 proc-steps-checklist.txt                      |  8 +++---
 src/processing_steps/0500_rescale_cupy_bin.py | 26 +++++++++----------
 src/pybind/io-pybind.cc                       | 11 +-------
 3 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/proc-steps-checklist.txt b/proc-steps-checklist.txt
index 30b6929..b8ba576 100644
--- a/proc-steps-checklist.txt
+++ b/proc-steps-checklist.txt
@@ -1,6 +1,6 @@
 0100 - Haven't checked. Requires ERDA to be set up properly.
-0200 - Runs!
+0200 - Verified!
 0300 - Verified!
-0400 - Runs!
-0500 - Runs!
-0600 - Runs!
\ No newline at end of file
+0400 - Verified!
+0500 - Verified!
+0600 - Verified!
\ No newline at end of file
diff --git a/src/processing_steps/0500_rescale_cupy_bin.py b/src/processing_steps/0500_rescale_cupy_bin.py
index f7d5f47..271d17c 100644
--- a/src/processing_steps/0500_rescale_cupy_bin.py
+++ b/src/processing_steps/0500_rescale_cupy_bin.py
@@ -13,9 +13,9 @@
 pinned_mempool.free_all_blocks()
 
 if __name__ == "__main__":
-    sample, image, chunk_size, dtype, verbose = commandline_args({"sample" : "<required>", 
+    sample, image, chunk_size, dtype, verbose = commandline_args({"sample" : "<required>",
                                                                   "image" :  "voxels",
-                                                                  "chunk_size" : 32*2, 
+                                                                  "chunk_size" : 32*2,
                                                                   "dtype" : "uint16",
                                                                   "verbose" : 1})
 
@@ -29,13 +29,13 @@
     if verbose >= 1: print(f"Input metadata from {input_meta}")
     if verbose >= 1: print(f"Input flat binary {dtype} data from {input_bin}")
     if verbose >= 1: print(f"Output flat binary {dtype} data to {output_root}/[1,2,4,8,16,32]x/{sample}.{dtype}")
-    
+
     meta_h5    = h5py.File(input_meta, 'r')
     full_Nz, Ny, Nx = meta_h5['voxels'].shape
     shifts     = meta_h5['volume_matching_shifts'][:] # TODO: Do this in a neater way
     Nz         = full_Nz - np.sum(shifts)
-    meta_h5.close()    
-    
+    meta_h5.close()
+
     if verbose >= 1: print(f"Downscaling from 1x {(Nz,Ny,Nx)} to 2x {(Nz//2,Ny//2,Nx//2)}")
     if(chunk_size % 32):
         if verbose >= 1: print(f"Chunk size {chunk_size} is invalid: must be divisible by 32.")
@@ -49,12 +49,12 @@
     voxels4x  = np.empty((Nz//4,Ny//4,Nx//4),dtype=T)
     voxels8x  = np.empty((Nz//8,Ny//8,Nx//8),dtype=T)
     voxels16x = np.empty((Nz//16,Ny//16,Nx//16),dtype=T)
-    voxels32x = np.empty((Nz//32,Ny//32,Nx//32),dtype=T)            
-    voxels    = [voxels2x,voxels4x,voxels8x,voxels16x,voxels32x];    
-    
+    voxels32x = np.empty((Nz//32,Ny//32,Nx//32),dtype=T)
+    voxels    = [voxels2x,voxels4x,voxels8x,voxels16x,voxels32x];
+
     for z in tqdm.tqdm(range(0,Nz,chunk_size),f"{sample}: Reading and scaling {chunk_size}-layer chunks"):
         zend  = min(z+chunk_size, Nz)
-        chunk_items = (zend-z) * Ny * Nx 
+        chunk_items = (zend-z) * Ny * Nx
         # # CHECK: Is a simple fread faster than numpy fromfile?
         # voxels1x_np = np.empty((zend-z,Ny,Nx),dtype=T);
         # load_slice(voxels1x_np,input_bin,(z,0,0),voxels1x_np.shape)
@@ -65,7 +65,7 @@
         except:
             if verbose >= 1: print(f"Read failed. chunk_items = {chunk_items} = {(zend-z)*Ny*Nx}, z = {z}, zend-z = {zend-z}")
             sys.exit(-1)
-            
+
 #        if verbose >= 1: print(f"Used GPU memory: {mempool.used_bytes()//1000000}MB out of {mempool.total_bytes()/1000000}MB. {pinned_mempool.n_free_blocks()} free pinned blocks.")
         voxels2x_chunk = downsample2x(voxels1x_chunk)
         del voxels1x_chunk
@@ -91,11 +91,11 @@
         del voxels8x_chunk
         del voxels16x_chunk
         del voxels32x_chunk
-        
+
     if verbose >= 1: print(f"Allocating {(Nz//2,Ny//2,Nx//2)}={Nz//2*Ny//2*Nx//2} {dtype} for voxels2x on GPU")
-    
+
     for i in tqdm.tqdm(range(len(scales)),f"{sample}: Downscaling to all smaller scales: {scales[2:]}"):
         output_dir = f"{output_root}/{scales[i]}x/"
-        pathlib.Path(f"{output_dir}").mkdir(parents=True, exist_ok=True)            
+        pathlib.Path(f"{output_dir}").mkdir(parents=True, exist_ok=True)
         if verbose >= 1: print(f"Writing out scale {scales[i]}x {(voxels[i].shape)} to {output_dir}/{sample}.uint16")
         voxels[i].tofile(f"{output_dir}/{sample}.uint16")
diff --git a/src/pybind/io-pybind.cc b/src/pybind/io-pybind.cc
index 0e5d680..8da8e5d 100644
--- a/src/pybind/io-pybind.cc
+++ b/src/pybind/io-pybind.cc
@@ -18,17 +18,7 @@ void load_slice(py::array_t<T> &np_data, const string filename,
     auto [oz, oy, ox] = offset;
     uint64_t flat_offset = oz*Ny*Nx + oy*Nx + ox;
 
-    cout <<
-        Nz << " " << Ny << " " << Nx << " " <<
-        oz << " " << oy << " " << ox << " " <<
-        flat_offset << endl;
-
     NS::load_contiguous_slice<T>(data, filename, flat_offset, data_info.size);
-
-    T checksum = (T) 0;
-    for (int64_t i = 0; i < data_info.size; i++)
-        checksum += data[i];
-    cout << checksum << " " << sizeof(T) << endl;
 }
 
 template <typename T>
@@ -41,6 +31,7 @@ void write_slice(const py::array_t<T> &np_data,
     auto [Nz, Ny, Nx] = shape;
     auto [oz, oy, ox] = offset;
     uint64_t flat_offset = oz*Ny*Nx + oy*Nx + ox;
+
     NS::write_contiguous_slice<T>(data, filename, flat_offset, data_info.size);
 }
 

From e8172006f40308aa049118ce9e206a727a721772 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Tue, 25 Apr 2023 15:12:52 +0200
Subject: [PATCH 130/136] Explicit type conversions

---
 src/lib/cpp/cpu_seq/geometry.cc | 99 ++++++++++++++++-----------------
 1 file changed, 49 insertions(+), 50 deletions(-)

diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 5c0acfa..3c35f5e 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -18,7 +18,6 @@ array<real_t, 3> center_of_mass(const input_ndarray<mask_type> &mask) {
     uint64_t total_mass = 0, cmz = 0, cmy = 0, cmx = 0;
 
     BLOCK_BEGIN(mask, reduction(+:total_mass,cmz,cmy,cmx)); {
-    // TODO James approves; now RUN!
 
         mask_type m = mask_buffer[flat_index];
 
@@ -85,9 +84,9 @@ void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance
     const auto& [U_min, U_max, V_min, V_max, W_min, W_max] = bbox;
 
     real_t
-        edz = edt_Nz / real_t(C_Nz),
-        edy = edt_Ny / real_t(C_Ny),
-        edx = edt_Nx / real_t(C_Nx);
+        //edz = real_t(edt_Nz) / real_t(C_Nz),
+        edy = real_t(edt_Ny) / real_t(C_Ny),
+        edx = real_t(edt_Nx) / real_t(C_Nx);
 
     //printf("Segmenting from %g to %g micrometers distance of implant.\n",d_min,d_max);
     //printf("Bounding box is [U_min,U_max,V_min,V_max,W_min,W_max] = [[%g,%g],[%g,%g],[%g,%g]]\n",
@@ -125,9 +124,9 @@ void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance
                 // Index into local block
                 const int64_t Xl = (k  / (C_Ny*C_Nz)), Yl = (k / C_Nz) % C_Ny, Zl = k  % C_Nz;
                 // Index into local edt block. Note EDT has 1-slice padding top+bottom
-                const float  x = (Xl+1)*edx, y = Yl*edy, z = Zl*edy;
+                const float  x = float(Xl+1)*edx, y = float(Yl)*edy, z = float(Zl)*edy;
 
-                if (x > block_height) {
+                if (x > float(block_height)) {
                     printf("Block number k=%ld.\nX,Y,Z=%ld,%ld,%ld\nXl,Yl,Zl=%ld,%ld,%ld\nx,y,z=%.2f, %.2f, %.2f\n",k,X,Y,Z,Xl,Yl,Zl,x,y,z);
                     abort();
                 }
@@ -136,7 +135,7 @@ void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance
                 real_t distance = resample2x2x2<float>(edt_block, {this_edt_length/(edt_Ny*edt_Nz),edt_Ny,edt_Nz}, {x,y,z});
 
                 if (distance > d_min && distance <= d_max) { // TODO: and W>w_min
-                    array<real_t,4> Xs = {X*voxel_size, Y*voxel_size, Z*voxel_size, 1};
+                    array<real_t,4> Xs = {real_t(X)*voxel_size, real_t(Y)*voxel_size, real_t(Z)*voxel_size, 1};
                     auto [U,V,W,c] = hom_transform(Xs,Muvw);
                     n_shell ++;
 
@@ -147,10 +146,10 @@ void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance
                         if (theta >= theta_min && theta <= theta_max) {
                             n_shell_bbox++;
 
-                            ssize_t theta_i = floor( (theta-theta_min) * (n_theta-1)/(theta_max-theta_min) );
-                            ssize_t U_i     = floor( (U    -    U_min) * (n_U    -1)/(    U_max-    U_min) );
+                            ssize_t theta_i = ssize_t(floor( (theta-theta_min) * real_t(n_theta-1)/(theta_max-theta_min) ));
+                            ssize_t U_i     = ssize_t(floor( (U    -    U_min) * real_t(n_U    -1)/(    U_max-    U_min) ));
 
-                            real_t p = C_buffer[k]/255.;
+                            real_t p = real_t(C_buffer[k])/255.f;
 
                             assert(theta >= theta_min);
                             assert(theta <= theta_max);
@@ -307,6 +306,46 @@ array<real_t,9> inertia_matrix(const input_ndarray<mask_type> &mask, const array
     };
 }
 
+void integrate_axes(const input_ndarray<mask_type> &mask,
+		    const array<real_t,3> &x0,
+		    const array<real_t,3> &v_axis,
+		    const array<real_t,3> &w_axis,
+		    const real_t v_min, const real_t w_min,
+		    output_ndarray<uint64_t> output) {
+    UNPACK_NUMPY(mask);
+    ssize_t Nv = output.shape[0], Nw = output.shape[1];
+    uint64_t *output_data = output.data;
+
+    // TODO: Check v_axis & w_axis projections to certify bounds and get rid of runtime check
+    #pragma acc data copy(output_data[:Nv*Nw]) copyin(x0, v_axis, w_axis, v_min, w_min)
+    {
+    BLOCK_BEGIN(mask, ) {
+
+        mask_type voxel = mask_buffer[flat_index];
+        if (voxel != 0) {
+            real_t xs[3] = {
+                real_t(x) - x0[0],
+                real_t(y) - x0[1],
+                real_t(z) - x0[2]
+            };
+
+            real_t
+                v = dot(xs, v_axis),
+                w = dot(xs, w_axis);
+            int64_t
+                i_v = int64_t(round(v - v_min)),
+                j_w = int64_t(round(w - w_min));
+
+            if (i_v >= 0 && j_w >= 0 && i_v < Nv && j_w < Nw) {
+                ATOMIC()
+                output_data[i_v*Nw + j_w] += voxel;
+            }
+        }
+
+    BLOCK_END() }
+    }
+}
+
 template <typename T>
 void sample_plane(const input_ndarray<T> &voxels,
                   const real_t voxel_size, // In micrometers
@@ -361,46 +400,6 @@ void sample_plane(const input_ndarray<T> &voxels,
     }
 }
 
-void integrate_axes(const input_ndarray<mask_type> &mask,
-		    const array<real_t,3> &x0,
-		    const array<real_t,3> &v_axis,
-		    const array<real_t,3> &w_axis,
-		    const real_t v_min, const real_t w_min,
-		    output_ndarray<uint64_t> output) {
-    UNPACK_NUMPY(mask);
-    ssize_t Nv = output.shape[0], Nw = output.shape[1];
-    uint64_t *output_data = output.data;
-
-    // TODO: Check v_axis & w_axis projections to certify bounds and get rid of runtime check
-    #pragma acc data copy(output_data[:Nv*Nw]) copyin(x0, v_axis, w_axis, v_min, w_min)
-    {
-    BLOCK_BEGIN(mask, ) {
-
-        mask_type voxel = mask_buffer[flat_index];
-        if (voxel != 0) {
-            real_t xs[3] = {
-                real_t(x) - x0[0],
-                real_t(y) - x0[1],
-                real_t(z) - x0[2]
-            };
-
-            real_t
-                v = dot(xs, v_axis),
-                w = dot(xs, w_axis);
-            int64_t
-                i_v = int64_t(round(v - v_min)),
-                j_w = int64_t(round(w - w_min));
-
-            if (i_v >= 0 && j_w >= 0 && i_v < Nv && j_w < Nw) {
-                ATOMIC()
-                output_data[i_v*Nw + j_w] += voxel;
-            }
-        }
-
-    BLOCK_END() }
-    }
-}
-
 // NB: xyz are in indices, not micrometers
 void zero_outside_bbox(const array<real_t,9> &principal_axes,
                const array<real_t,6> &parameter_ranges,

From 8dfe3073e40fb6961aa178ff85318d4d8a2f46b3 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Tue, 25 Apr 2023 15:13:44 +0200
Subject: [PATCH 131/136] Added debug image generation

---
 src/processing_steps/0700_implant_FoR.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/processing_steps/0700_implant_FoR.py b/src/processing_steps/0700_implant_FoR.py
index b311755..81943e8 100644
--- a/src/processing_steps/0700_implant_FoR.py
+++ b/src/processing_steps/0700_implant_FoR.py
@@ -176,6 +176,11 @@ def figure_FoR_circle(name,center,v_vec,w_vec,radius,implant_bbox,debug=True):
                  tuple(center), tuple(v_vec), tuple(w_vec),
                  sample_bbox,sample)
 
+    print (voxel_size, cm, v_vec, w_vec, sample_bbox)
+    plt.imshow(sample)
+    plt.savefig(f'{image_output_dir}/sample_plane_check.png')
+    plt.clf()
+
     fig = plt.figure()
     ax = fig.add_subplot(111)
     ax.imshow(sample.T[::-1], extent=sample_bbox,cmap='RdYlBu')
@@ -283,6 +288,9 @@ def figure_FoR_voxels(name,voxels,debug=True):
     if verbose >= 1: print(f"Loading {scale}x voxels from {binary_root}/voxels/{scale}x/{sample}.uint16")
     voxels  = np.fromfile(f"{binary_root}/voxels/{scale}x/{sample}.uint16",dtype=np.uint16).reshape(implant.shape)
 
+    plt.imshow(implant[implant.shape[0]//2,:,:]); plt.savefig(f'{image_output_dir}/implant-sanity-xy.png')
+    plt.imshow(implant[:,implant.shape[0]//2,:]); plt.savefig(f'{image_output_dir}/implant-sanity-xz.png')
+    plt.imshow(implant[:,:,implant.shape[0]//2]); plt.savefig(f'{image_output_dir}/implant-sanity-yz.png')
     plt.imshow(voxels[voxels.shape[0]//2,:,:]); plt.savefig(f'{image_output_dir}/voxels-sanity-xy.png')
     plt.imshow(voxels[:,voxels.shape[0]//2,:]); plt.savefig(f'{image_output_dir}/voxels-sanity-xz.png')
     plt.imshow(voxels[:,:,voxels.shape[0]//2]); plt.savefig(f'{image_output_dir}/voxels-sanity-yz.png')

From 2b7fbe14dcee238860b478e6e3c1ce245b6d98e8 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Tue, 25 Apr 2023 15:14:47 +0200
Subject: [PATCH 132/136] Added note about how to profile OpenACC. Should not
 be here in the future

---
 src/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/Makefile b/src/Makefile
index f64e876..57bc345 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -50,6 +50,8 @@ $(foreach PLATFORM, $(PLATFORMS), \
 	) \
 )
 
+# TODO lightweight openacc profiling can be done with the environment variable NV_ACC_TIME=1 !!!
+
 test: all
 	$(PYTHON) -m pytest -n auto test
 

From d402b9778104152d2e8408e612af3550c25a925a Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Tue, 25 Apr 2023 15:16:27 +0200
Subject: [PATCH 133/136] #25 Changed in_bbox to work on tuples, rather than
 three parameters

---
 src/lib/cpp/include/geometry.hh | 55 ++++++++++++++-------------------
 1 file changed, 24 insertions(+), 31 deletions(-)

diff --git a/src/lib/cpp/include/geometry.hh b/src/lib/cpp/include/geometry.hh
index 937c5d2..307f7d9 100644
--- a/src/lib/cpp/include/geometry.hh
+++ b/src/lib/cpp/include/geometry.hh
@@ -27,45 +27,38 @@ inline vector4 hom_transform(const vector4 &x, const matrix4x4 &M) {
     return c;
 }
 
-inline bool in_bbox(float U, float V, float W, const std::array<float, 6> &bbox) {
-    const auto& [U_min, U_max, V_min, V_max, W_min, W_max] = bbox;
-
-    bool inside =
-        U >= U_min &&
-        U <= U_max &&
-        V >= V_min &&
-        V <= V_max &&
-        W >= W_min &&
-        W <= W_max;
-
-    // printf("in_bbox: (%.1f,%.1f,%.1f) \in ([%.1f,%.1f],[%.1f,%.1f],[%.1f,%.1f]) == %d\n",
-    //      U,V,W,U_min,U_max,V_min,V_max,U_min,U_max,inside);
-
-    return inside;
+inline bool in_bbox(const std::array<float, 3> index, const std::array<float, 6> &bbox) {
+    const auto& [z, y, x] = index;
+    const auto& [zmin, zmax, ymin, ymax, xmin, xmax] = bbox;
+
+    return
+        z >= zmin && z <= zmax &&
+        y >= ymin && y <= ymax &&
+        x >= xmin && x <= xmax;
 }
 
 template <typename T>
-float resample2x2x2(const T             *voxels,
-                    const array<ssize_t, 3> &shape,
-                    const array<float, 3>   &X) {
-    auto  [Nx,Ny,Nz] = shape;
+float resample2x2x2(const T                      *voxels,
+                    const std::array<ssize_t, 3> &shape,
+                    const std::array<float, 3>   &index) {
+    auto  [Nz,Ny,Nx] = shape;
 
-    if (!in_bbox(X[0], X[1], X[2], {0.5f, float(Nx)-0.5f, 0.5f, float(Ny)-0.5f, 0.5f, float(Nz)-0.5f})) {
-        uint64_t voxel_index = uint64_t(floor(X[0]))*Ny*Nz + uint64_t(floor(X[1]))*Ny + uint64_t(floor(X[2]));
+    if (!in_bbox(index, {0.5f, float(Nx)-0.5f, 0.5f, float(Ny)-0.5f, 0.5f, float(Nz)-0.5f})) {
+        uint64_t voxel_index = uint64_t(floor(index[0]))*Nz*Ny + uint64_t(floor(index[1]))*Nx + uint64_t(floor(index[2]));
         return voxels[voxel_index];
     }
 
-    float   Xfrac[2][3]; // {Xminus[3], Xplus[3]}
-    int64_t Xint[2][3];  // {Iminus[3], Iplus[3]}
+    float   Ifrac[2][3]; // {Xminus[3], Xplus[3]}
+    int64_t Iint[2][3];  // {Iminus[3], Iplus[3]}
     float   value = 0;
 
     for (int i = 0; i < 3; i++) {
         float Iminus, Iplus;
-        Xfrac[0][i] = 1-modf(X[i]-0.5f, &Iminus); // 1-{X[i]-1/2}, floor(X[i]-1/2)
-        Xfrac[1][i] =   modf(X[i]+0.5f, &Iplus);  // {X[i]+1/2}, floor(X[i]+1/2)
+        Ifrac[0][i] = 1-std::modf(index[i]-0.5f, &Iminus); // 1-{X[i]-1/2}, floor(X[i]-1/2)
+        Ifrac[1][i] =   std::modf(index[i]+0.5f, &Iplus);  // {X[i]+1/2}, floor(X[i]+1/2)
 
-        Xint[0][i] = (int64_t) Iminus;
-        Xint[1][i] = (int64_t) Iplus;
+        Iint[0][i] = (int64_t) Iminus;
+        Iint[1][i] = (int64_t) Iplus;
     }
 
     for (int ijk = 0; ijk <= 7; ijk++) {
@@ -74,8 +67,8 @@ float resample2x2x2(const T             *voxels,
 
         for (int axis = 0; axis < 3; axis++) { // x-1/2 or x+1/2
             int pm    = (ijk >> axis) & 1;
-            IJK[axis] = Xint[pm][axis];
-            weight   *= Xfrac[pm][axis];
+            IJK[axis] = Iint[pm][axis];
+            weight   *= Ifrac[pm][axis];
         }
 
         auto [I,J,K] = IJK;
@@ -87,11 +80,11 @@ float resample2x2x2(const T             *voxels,
         //   printf("(I,J,K) = (%ld,%ld,%ld), (Nx,Ny,Nz) = (%ld,%ld,%ld)\n",I,J,K,Nx,Ny,Nz);
         //   abort();
         // }
-        uint64_t voxel_index = I*Ny*Nz+J*Ny+K;
+        uint64_t voxel_index = I*Ny*Nx + J*Nx + K;
         //assert(I>=0 && J>=0 && K>=0);
         //assert(I<Nx && J<Ny && K<Nz);
         float voxel = (float) voxels[voxel_index];
-        value += voxel*weight;
+        value += voxel * weight;
     }
 
     return value;

From 2176297e0a83d9ee24fcc050e9f34d9e6c20319d Mon Sep 17 00:00:00 2001
From: James Avery <avery@nbi.ku.dk>
Date: Tue, 25 Apr 2023 15:41:00 +0200
Subject: [PATCH 134/136] Fix link path

---
 src/processing_steps/1400_rescale_cupy_bin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/processing_steps/1400_rescale_cupy_bin.py b/src/processing_steps/1400_rescale_cupy_bin.py
index ec15bd3..d68be24 120000
--- a/src/processing_steps/1400_rescale_cupy_bin.py
+++ b/src/processing_steps/1400_rescale_cupy_bin.py
@@ -1 +1 @@
-processing_steps/0500_rescale_cupy_bin.py
\ No newline at end of file
+0500_rescale_cupy_bin.py
\ No newline at end of file

From 3ecc845abf768975b2e23bae8091af9c8e465052 Mon Sep 17 00:00:00 2001
From: James Avery <avery@nbi.ku.dk>
Date: Tue, 25 Apr 2023 15:52:47 +0200
Subject: [PATCH 135/136] in_bbox predicate now takes a std::array<real_t,3>
 for coordinates.

---
 src/lib/cpp/cpu_seq/geometry.cc | 8 ++++----
 src/lib/cpp/gpu/geometry.cc     | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/lib/cpp/cpu_seq/geometry.cc b/src/lib/cpp/cpu_seq/geometry.cc
index 3c35f5e..fc7c45f 100644
--- a/src/lib/cpp/cpu_seq/geometry.cc
+++ b/src/lib/cpp/cpu_seq/geometry.cc
@@ -140,7 +140,7 @@ void cylinder_projection(const input_ndarray<float>  edt,  // Euclidean Distance
                     n_shell ++;
 
                     //        printf("distance = %.1f, U,V,W = %.2f,%.2f,%.2f\n",distance,U,V,W);
-                    if (in_bbox(U,V,W,bbox)) {
+                    if (in_bbox({{U,V,W}},bbox)) {
                         real_t theta    = atan2(V,W);
 
                         if (theta >= theta_min && theta <= theta_max) {
@@ -218,7 +218,7 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
                     int U_i = int(floor((U - U_min) * real_t(n_segments-1) / (U_max - U_min)));
 
                 //    if (U_i >= 0 && U_i < n_segments) {
-                    if ( in_bbox(U, V, W, bbox) ) {
+                    if ( in_bbox({{U, V, W}}, bbox) ) {
                         rsqr_maxs_d[U_i] = max(rsqr_maxs_d[U_i], float(r_sqr));
                         theta_min = min(theta_min, theta);
                         theta_max = max(theta_max, theta);
@@ -388,8 +388,8 @@ void sample_plane(const input_ndarray<T> &voxels,
             //      printf("u,v = %g,%g -> %.1f,%.1f,%.1f -> %d, %d, %d\n",u,v,X,Y,Z,int(round(x)),int(round(y)),int(round(z)));
 
             T value = 0;
-            std::array<float, 6> local_bbox = {0.5f, float(voxels_Nx)-0.5f, 0.5f, float(voxels_Ny)-0.5f, 0.5f, float(voxels_Nz)-0.5f};
-            if (in_bbox(x,y,z, local_bbox))
+            std::array<float, 6> local_bbox = {0.5f, float(voxels_Nx)-0.5f, 0.5f, float(voxels_Ny)-0.5f, 0.5f, float(voxels_Nz)-0.5f}; 
+            if (in_bbox({{x,y,z}}, local_bbox))
                 value = (T) round(resample2x2x2<T>(voxels.data, {voxels_Nx, voxels_Ny, voxels_Nz}, {x, y, z}));
             // else
             //     fprintf(stderr,"Sampling outside image: x,y,z = %.1f,%.1f,%.1f, Nx,Ny,Nz = %ld,%ld,%ld\n",x,y,z,Nx,Ny,Nz);
diff --git a/src/lib/cpp/gpu/geometry.cc b/src/lib/cpp/gpu/geometry.cc
index b76db65..230ddc0 100644
--- a/src/lib/cpp/gpu/geometry.cc
+++ b/src/lib/cpp/gpu/geometry.cc
@@ -82,7 +82,7 @@ void fill_implant_mask(const input_ndarray<mask_type> mask,
 
                             int U_i = int(floor((U - U_min) * real_t(n_segments-1) / (U_max - U_min)));
 
-                            if ( in_bbox(U,V,W,bbox) ) {
+                            if ( in_bbox({{U,V,W}},bbox) ) {
                                 //#pragma acc atomic update
                                 rsqr_maxs_d[U_i] = max(rsqr_maxs_d[U_i], float(r_sqr));
                                 theta_min = min(theta_min, theta);
@@ -175,4 +175,4 @@ void zero_outside_bbox(const array<real_t,9> &principal_axes,
     return cpu_seq::zero_outside_bbox(principal_axes, parameter_ranges, cm, voxels);
 }
 
-}
\ No newline at end of file
+}

From bfe1508f1e4dde101947e0ef2a0886b1bbf3c951 Mon Sep 17 00:00:00 2001
From: James Avery <avery@nbi.ku.dk>
Date: Tue, 25 Apr 2023 16:03:50 +0200
Subject: [PATCH 136/136] Added pip_install target to install python
 requirements

---
 src/Makefile | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 57bc345..fdc1bd9 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -37,7 +37,7 @@ else
 $(info OpenACC compiler nvc++ not found. Compiling without.)
 endif
 
-all: $(TARGETS)
+all: $(TARGETS) pip_install
 
 define GEN_RULE
 $(CPP_FOLDER)/$(PLATFORM)/$(LIB)$(PYBIND_SUFFIX): pybind/$(LIB)-pybind.cc $(CPP_FOLDER)/$(PLATFORM)/$(LIB).cc $(CPP_FOLDER)/include/*.hh
@@ -52,6 +52,9 @@ $(foreach PLATFORM, $(PLATFORMS), \
 
 # TODO lightweight openacc profiling can be done with the environment variable NV_ACC_TIME=1 !!!
 
+pip_install:
+	$(PYTHON) -m pip install -r requirements.txt
+
 test: all
 	$(PYTHON) -m pytest -n auto test
 
@@ -59,4 +62,4 @@ test_%: test/test_%.py all
 	$(PYTHON) -m pytest -n auto $<
 
 clean:
-	rm -rf $(CLEANUP) __pycache__ test/__pycache__ .pytest_cache lib/cpp/**/*.so
\ No newline at end of file
+	rm -rf $(CLEANUP) __pycache__ test/__pycache__ .pytest_cache lib/cpp/**/*.so