diff --git a/CMakeLists.txt b/CMakeLists.txt index d0481c4e..6698cd8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,10 @@ project(anari_library_visionaray LANGUAGES C CXX) include(GNUInstallDirs) +# build external libraries + +add_subdirectory(external) + # note: we're often tracking the most recent changes from ANARI-SDK # (on branch "next_release") find_package(anari 0.10.0 REQUIRED) @@ -33,9 +37,6 @@ endif() option(ANARI_VISIONARAY_ENABLE_NANOVDB "Enable NanoVDB spatial field type" OFF) set(nanovdb ${ANARI_VISIONARAY_ENABLE_NANOVDB}) -if (nanovdb) - find_package(OpenVDB COMPONENTS nanovdb REQUIRED) -endif() anari_generate_queries( NAME visionaray @@ -130,6 +131,8 @@ target_sources(${PROJECT_NAME} PRIVATE if (nanovdb) target_sources(${PROJECT_NAME} PRIVATE scene/volume/spatial_field/NanoVDBField.cpp) target_compile_definitions(${PROJECT_NAME} PRIVATE WITH_NANOVDB=1) + target_link_libraries(${PROJECT_NAME} PRIVATE + $) endif() include(GenerateExportHeader) @@ -137,7 +140,7 @@ generate_export_header(${PROJECT_NAME} EXPORT_MACRO_NAME "VISIONARAY_DEVICE_INTERFACE" ) -target_link_libraries(${PROJECT_NAME} +target_link_libraries(${PROJECT_NAME} PUBLIC visionaray::visionaray anari::anari anari::helium) target_include_directories(${PROJECT_NAME} PUBLIC @@ -173,7 +176,13 @@ if (cuda) target_compile_definitions(${PROJECT_NAME}_cuda PRIVATE WITH_CUDA=1) if (nanovdb) target_sources(${PROJECT_NAME}_cuda PRIVATE - scene/volume/spatial_field/NanoVDBField.cpp) + scene/volume/spatial_field/NanoVDBField.cu) + set_source_files_properties( + scene/volume/spatial_field/NanoVDBField.cu + PROPERTIES COMPILE_FLAGS "--extended-lambda --expt-relaxed-constexpr" + ) + target_link_libraries(${PROJECT_NAME}_cuda PRIVATE + $) target_compile_definitions(${PROJECT_NAME}_cuda PRIVATE WITH_NANOVDB=1) endif() @@ -181,11 +190,11 @@ if (cuda) EXPORT_MACRO_NAME "VISIONARAY_DEVICE_INTERFACE" ) - target_link_libraries(${PROJECT_NAME}_cuda + target_link_libraries(${PROJECT_NAME}_cuda PUBLIC visionaray::visionaray anari::anari anari::helium) if (TARGET CUDA::cudart) - target_link_libraries(${PROJECT_NAME}_cuda CUDA::cudart) + target_link_libraries(${PROJECT_NAME}_cuda PUBLIC CUDA::cudart) endif() target_include_directories(${PROJECT_NAME}_cuda PUBLIC @@ -205,9 +214,7 @@ if (hip) ) target_compile_definitions(${PROJECT_NAME}_hip PRIVATE WITH_HIP=1) if (nanovdb) - target_sources(${PROJECT_NAME}_hip PRIVATE - scene/volume/spatial_field/NanoVDBField.cpp) - target_compile_definitions(${PROJECT_NAME}_hip PRIVATE WITH_NANOVDB=1) + message(WARNING "No VDB support with HIP") endif() target_link_libraries(${PROJECT_NAME}_hip diff --git a/external/nanovdb/CMakeLists.txt b/external/nanovdb/CMakeLists.txt new file mode 100644 index 00000000..dae56ed2 --- /dev/null +++ b/external/nanovdb/CMakeLists.txt @@ -0,0 +1,3 @@ +project(vsnray_nanovdb LANGUAGES CXX CUDA) +add_library(${PROJECT_NAME} INTERFACE) +target_include_directories(${PROJECT_NAME} INTERFACE ${CMAKE_CURRENT_LIST_DIR}/..) diff --git a/external/nanovdb/CNanoVDB.h b/external/nanovdb/CNanoVDB.h new file mode 100644 index 00000000..c714f94d --- /dev/null +++ b/external/nanovdb/CNanoVDB.h @@ -0,0 +1,715 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +// +// Simple C-wrapper for the nanovdb structure +// Meant for systems where you lack a C++ compiler. 
+// +#ifndef __CNANOVDB__ +#define __CNANOVDB__ + +#define CNANOVDB_DATA_ALIGNMENT 32 +#define CNANOVDB_ALIGNMENT_PADDING(x, n) (-(x) & ((n)-1)) + +#define USE_SINGLE_ROOT_KEY + +#ifdef __OPENCL_VERSION__ + +#define CNANOVDB_GLOBAL __global +#define RESTRICT restrict + +// OpenCL doesn't define these basic types: +typedef unsigned long uint64_t; +typedef long int64_t; +typedef unsigned int uint32_t; +typedef int int32_t; +typedef short int16_t; +typedef unsigned short uint16_t; +typedef unsigned char uint8_t; + +#else + +#define CNANOVDB_GLOBAL +#define RESTRICT __restrict + +#endif + + +enum cnanovdb_GridType +{ + cnanovdb_GridType_Unknown = 0, + cnanovdb_GridType_Float = 1, + cnanovdb_GridType_Double = 2, + cnanovdb_GridType_Int16 = 3, + cnanovdb_GridType_Int32 = 4, + cnanovdb_GridType_Int64 = 5, + cnanovdb_GridType_Vec3f = 6, + cnanovdb_GridType_Vec3d = 7, + cnanovdb_GridType_Mask = 8, + cnanovdb_GridType_FP16 = 9, + cnanovdb_GridType_End = 10 +}; + +#define ROOT_LEVEL 3 + +#define DEFINEMASK_int(LOG2DIM, SIZE) \ +typedef struct \ +{ \ + uint64_t mWords[SIZE >> 6]; \ +} cnanovdb_mask##LOG2DIM; \ +\ +static void cnanovdb_mask##LOG2DIM##_clear(CNANOVDB_GLOBAL cnanovdb_mask##LOG2DIM *RESTRICT mask) \ +{ for (uint32_t i = 0; i < (SIZE >> 6); i++) mask->mWords[i] = 0; } \ +\ +static bool cnanovdb_mask##LOG2DIM##_isOn(const CNANOVDB_GLOBAL cnanovdb_mask##LOG2DIM *RESTRICT mask, uint32_t n) \ +{ return 0 != (mask->mWords[n >> 6] & (((uint64_t)(1)) << (n & 63))); } \ +/**/ + +#define DEFINEMASK(LOG2DIM) \ + DEFINEMASK_int(LOG2DIM, (1U << (3*LOG2DIM))) + +#define INSTANTIATE(LOG2DIM) \ + DEFINEMASK(LOG2DIM) + +INSTANTIATE(3) +INSTANTIATE(4) +INSTANTIATE(5) + +typedef struct +{ + float mMatF[9]; // r,c = 3*r + c + float mInvMatF[9]; // r,c = 3*r + c + float mVecF[3]; + float mTaperF; + double mMatD[9]; // r,c = 3*r + c + double mInvMatD[9]; // r,c = 3*r + c + double mVecD[3]; + double mTaperD; +} cnanovdb_map; + +typedef struct +{ + float mVec[3]; +} cnanovdb_Vec3F; + +typedef struct +{ + int32_t mVec[3]; +} cnanovdb_coord; + +static int +cnanovdb_coord_compare(const CNANOVDB_GLOBAL cnanovdb_coord *a, const cnanovdb_coord *b) +{ + if (a->mVec[0] < b->mVec[0]) + return -1; + if (a->mVec[0] > b->mVec[0]) + return 1; + if (a->mVec[1] < b->mVec[1]) + return -1; + if (a->mVec[1] > b->mVec[1]) + return 1; + if (a->mVec[2] < b->mVec[2]) + return -1; + if (a->mVec[2] > b->mVec[2]) + return 1; + return 0; +} + +#ifdef USE_SINGLE_ROOT_KEY +static uint64_t +cnanovdb_coord_to_key(const cnanovdb_coord *RESTRICT ijk) +{ + // Define to workaround a bug with 64-bit shifts in the AMD OpenCL compiler. 
+#if defined(AVOID_64BIT_SHIFT) + uint2 key = (uint2)( ((uint32_t)ijk->mVec[2]) >> 12, 0) | + (uint2)((((uint32_t)ijk->mVec[1]) >> 12) << 21, + ((uint32_t)ijk->mVec[1]) >> 23) | + (uint2)(0, (((uint32_t)ijk->mVec[0]) >> 12) << 10); + return *(uint64_t *)&key; +#else + return ((uint64_t) (((uint32_t)ijk->mVec[2]) >> 12)) | + (((uint64_t) (((uint32_t)ijk->mVec[1]) >> 12)) << 21) | + (((uint64_t) (((uint32_t)ijk->mVec[0]) >> 12)) << 42); +#endif +} +#else +static void +cnanovdb_coord_to_key(cnanovdb_coord *RESTRICT key, const cnanovdb_coord *RESTRICT ijk) +{ + key->mVec[0] = ijk->mVec[0] & ~((1u << 12) - 1u); + key->mVec[1] = ijk->mVec[1] & ~((1u << 12) - 1u); + key->mVec[2] = ijk->mVec[2] & ~((1u << 12) - 1u); +} +#endif + +static void +cnanovdb_map_apply(cnanovdb_Vec3F *dst, const CNANOVDB_GLOBAL cnanovdb_map *RESTRICT map, const cnanovdb_Vec3F *src) +{ + float sx = src->mVec[0]; + float sy = src->mVec[1]; + float sz = src->mVec[2]; + dst->mVec[0] = sx * map->mMatF[0] + sy * map->mMatF[1] + sz * map->mMatF[2] + map->mVecF[0]; + dst->mVec[1] = sx * map->mMatF[3] + sy * map->mMatF[4] + sz * map->mMatF[5] + map->mVecF[1]; + dst->mVec[2] = sx * map->mMatF[6] + sy * map->mMatF[7] + sz * map->mMatF[8] + map->mVecF[2]; +} + +static void +cnanovdb_map_applyInverse(cnanovdb_Vec3F *dst, const CNANOVDB_GLOBAL cnanovdb_map *RESTRICT map, const cnanovdb_Vec3F *src) +{ + float sx = src->mVec[0] - map->mVecF[0]; + float sy = src->mVec[1] - map->mVecF[1]; + float sz = src->mVec[2] - map->mVecF[2]; + dst->mVec[0] = sx * map->mInvMatF[0] + sy * map->mInvMatF[1] + sz * map->mInvMatF[2]; + dst->mVec[1] = sx * map->mInvMatF[3] + sy * map->mInvMatF[4] + sz * map->mInvMatF[5]; + dst->mVec[2] = sx * map->mInvMatF[6] + sy * map->mInvMatF[7] + sz * map->mInvMatF[8]; +} + +static void +cnanovdb_map_applyJacobi(cnanovdb_Vec3F *dst, const CNANOVDB_GLOBAL cnanovdb_map *RESTRICT map, const cnanovdb_Vec3F *src) +{ + float sx = src->mVec[0]; + float sy = src->mVec[1]; + float sz = src->mVec[2]; + dst->mVec[0] = sx * map->mMatF[0] + sy * map->mMatF[1] + sz * map->mMatF[2]; + dst->mVec[1] = sx * map->mMatF[3] + sy * map->mMatF[4] + sz * map->mMatF[5]; + dst->mVec[2] = sx * map->mMatF[6] + sy * map->mMatF[7] + sz * map->mMatF[8]; +} + +static void +cnanovdb_map_applyInverseJacobi(cnanovdb_Vec3F *dst, const CNANOVDB_GLOBAL cnanovdb_map *RESTRICT map, const cnanovdb_Vec3F *src) +{ + float sx = src->mVec[0]; + float sy = src->mVec[1]; + float sz = src->mVec[2]; + dst->mVec[0] = sx * map->mInvMatF[0] + sy * map->mInvMatF[1] + sz * map->mInvMatF[2]; + dst->mVec[1] = sx * map->mInvMatF[3] + sy * map->mInvMatF[4] + sz * map->mInvMatF[5]; + dst->mVec[2] = sx * map->mInvMatF[6] + sy * map->mInvMatF[7] + sz * map->mInvMatF[8]; +} + +static void +cnanovdb_map_applyIJT(cnanovdb_Vec3F *dst, const CNANOVDB_GLOBAL cnanovdb_map *RESTRICT map, const cnanovdb_Vec3F *src) +{ + float sx = src->mVec[0]; + float sy = src->mVec[1]; + float sz = src->mVec[2]; + dst->mVec[0] = sx * map->mInvMatF[0] + sy * map->mInvMatF[3] + sz * map->mInvMatF[6]; + dst->mVec[1] = sx * map->mInvMatF[1] + sy * map->mInvMatF[4] + sz * map->mInvMatF[7]; + dst->mVec[2] = sx * map->mInvMatF[2] + sy * map->mInvMatF[5] + sz * map->mInvMatF[8]; +} + +typedef struct +{ + int64_t mByteOffset; // byte offset to the blind data, relative to the GridData. + uint64_t mElementCount; // number of elements, e.g. point count + uint32_t mFlags; // flags + uint32_t mSemantic; // semantic meaning of the data. 
+ uint32_t mDataClass; // 4 bytes + uint32_t mDataType; // 4 bytes + char mName[256]; + uint8_t _reserved[CNANOVDB_ALIGNMENT_PADDING(sizeof(int64_t)+sizeof(uint64_t)+2*sizeof(uint32_t)+2*sizeof(uint32_t)+256*sizeof(char), CNANOVDB_DATA_ALIGNMENT)]; +} cnanovdb_gridblindmetadata; + +typedef struct +{ + uint64_t mMagic; // 8B magic to validate it is valid grid data. + uint64_t mChecksum; // 8B. Checksum of grid buffer. + uint32_t mVersion;// 4B. compacted major.minor.path version number. + uint32_t mFlags; // 4B. flags for grid. + uint32_t mGridIndex;// 4B. Index of this grid in the buffer + uint32_t mGridCount; // 4B. Total number of grids in the buffer + uint64_t mGridSize; // 8B. byte count of this entire grid occupied in the buffer. + char mGridName[256]; // 256B + cnanovdb_map mMap; // 264B. affine transformation between index and world space in both single and double precision + double mBBox[6]; // 48B. floating-point bounds of active values in WORLD SPACE + double mVoxelSize[3]; // 24B. size of a voxel in world units + uint32_t mGridClass; // 4B. + uint32_t mGridType; // 4B. + uint64_t mBlindMetadataOffset; // 8B. offset of GridBlindMetaData structures. + int32_t mBlindMetadataCount; // 4B. count of GridBlindMetaData structures. + uint32_t _reserved[CNANOVDB_ALIGNMENT_PADDING(8 + 8 + 4 + 4 + 4 + 4 + 8 + 256 + 24 + 24 + sizeof(cnanovdb_map) + 24 + 4 + 4 + 8 + 4, CNANOVDB_DATA_ALIGNMENT) / 4]; +} cnanovdb_griddata; + +static void +cnanovdb_griddata_worldToIndex(cnanovdb_Vec3F *dst, const CNANOVDB_GLOBAL cnanovdb_griddata *RESTRICT grid, const cnanovdb_Vec3F *src) +{ + cnanovdb_map_applyInverse(dst, &grid->mMap, src); +} + +static void +cnanovdb_griddata_indexToWorld(cnanovdb_Vec3F *dst, const CNANOVDB_GLOBAL cnanovdb_griddata *RESTRICT grid, const cnanovdb_Vec3F *src) +{ + cnanovdb_map_apply(dst, &grid->mMap, src); +} + +static void +cnanovdb_griddata_worldToIndexDir(cnanovdb_Vec3F *dst, const CNANOVDB_GLOBAL cnanovdb_griddata *RESTRICT grid, const cnanovdb_Vec3F *src) +{ + cnanovdb_map_applyInverseJacobi(dst, &grid->mMap, src); +} + +static void +cnanovdb_griddata_indexToWorldDir(cnanovdb_Vec3F *dst, const CNANOVDB_GLOBAL cnanovdb_griddata *RESTRICT grid, const cnanovdb_Vec3F *src) +{ + cnanovdb_map_applyJacobi(dst, &grid->mMap, src); +} + +static void +cnanovdb_griddata_applyIJT(cnanovdb_Vec3F *dst, const CNANOVDB_GLOBAL cnanovdb_griddata *RESTRICT grid, const cnanovdb_Vec3F *src) +{ + cnanovdb_map_applyIJT(dst, &grid->mMap, src); +} + +typedef struct +{ + uint64_t mNodeOffset[ROOT_LEVEL + 1]; + uint32_t mNodeCount[ROOT_LEVEL]; + uint32_t mTileCount[ROOT_LEVEL]; + uint64_t mVoxelCount; + uint8_t _reserved[CNANOVDB_ALIGNMENT_PADDING(4*sizeof(uint64_t)+(3+3)*sizeof(uint32_t)+sizeof(uint64_t), CNANOVDB_DATA_ALIGNMENT)]; +} cnanovdb_treedata; + +static const CNANOVDB_GLOBAL cnanovdb_treedata * +cnanovdb_griddata_tree(const CNANOVDB_GLOBAL cnanovdb_griddata *RESTRICT griddata) +{ + return (const CNANOVDB_GLOBAL cnanovdb_treedata *)(griddata + 1); +} + +#define CREATE_TILEENTRY(VALUETYPE, SUFFIX) \ +typedef union \ +{ \ + VALUETYPE value; \ + uint64_t child; \ +} cnanovdb_tileentry##SUFFIX; \ +/**/ + +typedef struct +{ + cnanovdb_coord mKey; + const CNANOVDB_GLOBAL void *mNode[4]; +} cnanovdb_readaccessor; + + +static void +cnanovdb_readaccessor_insert(cnanovdb_readaccessor *RESTRICT acc, int childlevel, const CNANOVDB_GLOBAL void *RESTRICT node, const cnanovdb_coord *RESTRICT ijk) +{ + acc->mNode[childlevel] = node; + acc->mKey.mVec[0] = ijk->mVec[0]; + acc->mKey.mVec[1] = ijk->mVec[1]; + 
acc->mKey.mVec[2] = ijk->mVec[2]; +} + +#define CREATE_LEAF_NODE_int(LEVEL, LOG2DIM, CHILDTOTAL, TOTAL, MASK, VALUETYPE, STATSTYPE, SUFFIX) \ +typedef struct \ +{ \ + cnanovdb_coord mBBox_min; \ + uint8_t mBBoxDif[3]; \ + uint8_t mFlags; \ + cnanovdb_mask##LOG2DIM mValueMask; \ + VALUETYPE mMinimum; \ + VALUETYPE mMaximum; \ + STATSTYPE mAverage; \ + STATSTYPE mStdDevi; \ + uint32_t _reserved[ CNANOVDB_ALIGNMENT_PADDING(sizeof(cnanovdb_mask##LOG2DIM)+2*sizeof(VALUETYPE)+2*sizeof(STATSTYPE)+sizeof(cnanovdb_coord)+sizeof(uint8_t[3])+sizeof(uint8_t), CNANOVDB_DATA_ALIGNMENT)/4]; \ + VALUETYPE mVoxels[1u << (3*LOG2DIM)]; \ +} cnanovdb_node##LEVEL##SUFFIX; \ +\ +static uint32_t \ +cnanovdb_node##LEVEL##SUFFIX##_CoordToOffset(const cnanovdb_coord *RESTRICT ijk) \ +{ \ + return ( ( ( ijk->mVec[0] & MASK ) >> CHILDTOTAL ) << ( 2 * LOG2DIM ) ) + \ + ( ( ( ijk->mVec[1] & MASK ) >> CHILDTOTAL ) << ( LOG2DIM ) ) + \ + ( ( ijk->mVec[2] & MASK ) >> CHILDTOTAL ); \ +} \ +\ +static VALUETYPE \ +cnanovdb_node##LEVEL##SUFFIX##_getValue(const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX *RESTRICT node, const cnanovdb_coord *RESTRICT ijk) \ +{ \ + uint32_t n = cnanovdb_node##LEVEL##SUFFIX##_CoordToOffset(ijk); \ + return node->mVoxels[n]; \ +} \ +\ +static VALUETYPE \ +cnanovdb_node##LEVEL##SUFFIX##_getValueAndCache(const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX *RESTRICT node, const cnanovdb_coord *RESTRICT ijk, cnanovdb_readaccessor *RESTRICT /* DO NOT REMOVE: Required for C99 compliance */ acc) \ +{ \ + (void)(acc); \ + uint32_t n = cnanovdb_node##LEVEL##SUFFIX##_CoordToOffset(ijk); \ + return node->mVoxels[n]; \ +} \ +\ +static bool \ +cnanovdb_node##LEVEL##SUFFIX##_isActive(const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX *RESTRICT node, const cnanovdb_coord *RESTRICT ijk) \ +{ \ + uint32_t n = cnanovdb_node##LEVEL##SUFFIX##_CoordToOffset(ijk); \ + if (cnanovdb_mask##LOG2DIM##_isOn(&node->mValueMask, n)) \ + return true; \ + return false; \ +} \ +\ +static bool \ +cnanovdb_node##LEVEL##SUFFIX##_isActiveAndCache(const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX *RESTRICT node, const cnanovdb_coord *RESTRICT ijk, cnanovdb_readaccessor *RESTRICT /* DO NOT REMOVE: Required for C99 compliance */ acc) \ +{ \ + (void)(acc); \ + uint32_t n = cnanovdb_node##LEVEL##SUFFIX##_CoordToOffset(ijk); \ + if (cnanovdb_mask##LOG2DIM##_isOn(&node->mValueMask, n)) \ + return true; \ + return false; \ +} \ +\ +static const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX * \ +cnanovdb_tree_getNode##LEVEL##SUFFIX(const CNANOVDB_GLOBAL cnanovdb_treedata *RESTRICT tree, uint64_t i) \ +{ \ + const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX *basenode = (const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX *)((CNANOVDB_GLOBAL uint8_t *)(tree) + tree->mNodeOffset[LEVEL]); \ + return basenode + i; \ +} \ +\ +/**/ + +#define CREATE_LEAF_NODE(LEVEL, LOG2DIM, TOTAL, VALUETYPE, STATSTYPE, SUFFIX) \ +CREATE_LEAF_NODE_int(LEVEL, LOG2DIM, (TOTAL-LOG2DIM), TOTAL, ((1u << TOTAL) - 1u), VALUETYPE, STATSTYPE, SUFFIX) + +#define CREATE_INTERNAL_NODE_int(CHILDLEVEL, LEVEL, LOG2DIM, CHILDTOTAL, TOTAL, MASK, VALUETYPE, STATSTYPE, SUFFIX) \ +typedef struct \ +{ \ + cnanovdb_coord mBBox_min, mBBox_max; \ + int32_t mOffset; \ + uint32_t mFlags; \ + cnanovdb_mask##LOG2DIM mValueMask, mChildMask; \ + VALUETYPE mMinimum, mMaximum; \ + STATSTYPE mAverage, mStdDevi; \ + uint8_t _reserved[CNANOVDB_ALIGNMENT_PADDING(sizeof(cnanovdb_mask##LOG2DIM)+sizeof(VALUETYPE)*2+sizeof(STATSTYPE)*2+sizeof(cnanovdb_coord)*2+sizeof(int32_t)+sizeof(uint32_t), 
CNANOVDB_DATA_ALIGNMENT)]; \ + cnanovdb_tileentry##SUFFIX mTable[1u << (3*LOG2DIM)]; \ +} cnanovdb_node##LEVEL##SUFFIX; \ +\ +static uint32_t \ +cnanovdb_node##LEVEL##SUFFIX##_CoordToOffset(const cnanovdb_coord *RESTRICT ijk) \ +{ \ + return ( ( ( ijk->mVec[0] & MASK ) >> CHILDTOTAL ) << ( 2 * LOG2DIM ) ) + \ + ( ( ( ijk->mVec[1] & MASK ) >> CHILDTOTAL ) << ( LOG2DIM ) ) + \ + ( ( ijk->mVec[2] & MASK ) >> CHILDTOTAL ); \ +} \ +\ +static const CNANOVDB_GLOBAL cnanovdb_node##CHILDLEVEL##SUFFIX * \ +cnanovdb_node##LEVEL##SUFFIX##_getChild(const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX *RESTRICT node, uint32_t n) \ +{ \ + const CNANOVDB_GLOBAL cnanovdb_node##CHILDLEVEL##SUFFIX *childnode = (const CNANOVDB_GLOBAL cnanovdb_node##CHILDLEVEL##SUFFIX *)( ((CNANOVDB_GLOBAL uint8_t *)node) + node->mTable[n].child); \ + return childnode; \ +} \ +\ +static VALUETYPE \ +cnanovdb_node##LEVEL##SUFFIX##_getValue(const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX *RESTRICT node, const cnanovdb_coord *RESTRICT ijk) \ +{ \ + uint32_t n = cnanovdb_node##LEVEL##SUFFIX##_CoordToOffset(ijk); \ + if (cnanovdb_mask##LOG2DIM##_isOn(&node->mChildMask, n)) \ + { \ + const CNANOVDB_GLOBAL cnanovdb_node##CHILDLEVEL##SUFFIX *child = cnanovdb_node##LEVEL##SUFFIX##_getChild(node, n); \ + return cnanovdb_node##CHILDLEVEL##SUFFIX##_getValue(child, ijk); \ + } \ + return node->mTable[n].value; \ +} \ +\ +static VALUETYPE \ +cnanovdb_node##LEVEL##SUFFIX##_getValueAndCache(const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX *RESTRICT node, const cnanovdb_coord *RESTRICT ijk, cnanovdb_readaccessor *RESTRICT acc) \ +{ \ + uint32_t n = cnanovdb_node##LEVEL##SUFFIX##_CoordToOffset(ijk); \ + if (cnanovdb_mask##LOG2DIM##_isOn(&node->mChildMask, n)) \ + { \ + const CNANOVDB_GLOBAL cnanovdb_node##CHILDLEVEL##SUFFIX *child = cnanovdb_node##LEVEL##SUFFIX##_getChild(node, n); \ + cnanovdb_readaccessor_insert(acc, CHILDLEVEL, child, ijk); \ + return cnanovdb_node##CHILDLEVEL##SUFFIX##_getValueAndCache(child, ijk, acc); \ + } \ + return node->mTable[n].value; \ +} \ +\ +static bool \ +cnanovdb_node##LEVEL##SUFFIX##_isActive(const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX *RESTRICT node, const cnanovdb_coord *RESTRICT ijk) \ +{ \ + uint32_t n = cnanovdb_node##LEVEL##SUFFIX##_CoordToOffset(ijk); \ + if (cnanovdb_mask##LOG2DIM##_isOn(&node->mChildMask, n)) \ + { \ + const CNANOVDB_GLOBAL cnanovdb_node##CHILDLEVEL##SUFFIX *child = cnanovdb_node##LEVEL##SUFFIX##_getChild(node, n); \ + return cnanovdb_node##CHILDLEVEL##SUFFIX##_isActive(child, ijk); \ + } \ + return cnanovdb_mask##LOG2DIM##_isOn(&node->mValueMask, n) ? true : false; \ +} \ +\ +static bool \ +cnanovdb_node##LEVEL##SUFFIX##_isActiveAndCache(const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX *RESTRICT node, const cnanovdb_coord *RESTRICT ijk, cnanovdb_readaccessor *RESTRICT acc) \ +{ \ + uint32_t n = cnanovdb_node##LEVEL##SUFFIX##_CoordToOffset(ijk); \ + if (cnanovdb_mask##LOG2DIM##_isOn(&node->mChildMask, n)) \ + { \ + const CNANOVDB_GLOBAL cnanovdb_node##CHILDLEVEL##SUFFIX *child = cnanovdb_node##LEVEL##SUFFIX##_getChild(node, n); \ + cnanovdb_readaccessor_insert(acc, CHILDLEVEL, child, ijk); \ + return cnanovdb_node##CHILDLEVEL##SUFFIX##_isActiveAndCache(child, ijk, acc); \ + } \ + return cnanovdb_mask##LOG2DIM##_isOn(&node->mValueMask, n) ? 
true : false; \ +} \ +\ +static const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX * \ +cnanovdb_tree_getNode##LEVEL##SUFFIX(const CNANOVDB_GLOBAL cnanovdb_treedata *RESTRICT tree, uint64_t i) \ +{ \ + const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX *basenode = (const CNANOVDB_GLOBAL cnanovdb_node##LEVEL##SUFFIX *)((CNANOVDB_GLOBAL uint8_t *)(tree) + tree->mNodeOffset[LEVEL]); \ + return basenode + i; \ +} \ +\ +/**/ + +#define CREATE_INTERNAL_NODE(CHILDLEVEL, LEVEL, LOG2DIM, TOTAL, VALUETYPE, STATSTYPE, SUFFIX) \ +CREATE_INTERNAL_NODE_int(CHILDLEVEL, LEVEL, LOG2DIM, (TOTAL-LOG2DIM), TOTAL, ((1u << TOTAL) - 1u), VALUETYPE, STATSTYPE, SUFFIX) + + +#ifdef USE_SINGLE_ROOT_KEY +#define DEFINE_KEY(KEY) \ + uint64_t KEY; +#define KEYSIZE sizeof(uint64_t) + +#define KEYSEARCH(SUFFIX) \ + uint64_t key; \ + key = cnanovdb_coord_to_key(ijk); \ +\ + for (int i = low; i < high; i++) \ + { \ + const CNANOVDB_GLOBAL cnanovdb_rootdata_tile##SUFFIX *tile = tiles + i; \ + if (tile->key == key) \ + return tile; \ + } \ +/**/ +#else +#define DEFINE_KEY(KEY) \ + cnanovdb_coord KEY; +#define KEYSIZE sizeof(cnanovdb_coord) +#define KEYSEARCH(SUFFIX) \ + cnanovdb_coord key; \ + cnanovdb_coord_to_key(&key, ijk); \ + \ + while (low != high) \ + { \ + int32_t mid = low + (( high - low ) >> 1 ); \ + const CNANOVDB_GLOBAL cnanovdb_rootdata_tile##SUFFIX *tile = tiles + mid; \ + \ + int keycmp = cnanovdb_coord_compare(&tile->key, &key); \ + if (keycmp == 0) \ + { \ + return tile; \ + } \ + \ + if (keycmp < 0) \ + low = mid + 1; \ + else \ + high = mid; \ + } \ +/**/ +#endif + + +#define CREATE_ROOTDATA(VALUETYPE, STATSTYPE, SUFFIX) \ +typedef struct \ +{ \ + DEFINE_KEY(key); \ + int64_t child; \ + uint32_t state; \ + VALUETYPE value; \ + uint8_t _reserved[CNANOVDB_ALIGNMENT_PADDING(sizeof(KEYSIZE)+sizeof(VALUETYPE)+sizeof(int64_t)+sizeof(uint32_t), CNANOVDB_DATA_ALIGNMENT)]; \ +} cnanovdb_rootdata_tile##SUFFIX; \ + \ +typedef struct \ +{ \ + cnanovdb_coord mBBox_min, mBBox_max; \ + uint32_t mTableSize; \ + VALUETYPE mBackground; \ + VALUETYPE mMinimum, mMaximum; \ + STATSTYPE mAverage, mStdDevi; \ + uint32_t _reserved[CNANOVDB_ALIGNMENT_PADDING(sizeof(cnanovdb_coord)*2+sizeof(uint32_t)+sizeof(VALUETYPE)*3+sizeof(STATSTYPE)*2, CNANOVDB_DATA_ALIGNMENT)/4]; \ +} cnanovdb_rootdata##SUFFIX; \ + \ +static const CNANOVDB_GLOBAL cnanovdb_rootdata##SUFFIX * \ +cnanovdb_treedata_root##SUFFIX(const CNANOVDB_GLOBAL cnanovdb_treedata *RESTRICT treedata) \ +{ \ + return (const CNANOVDB_GLOBAL cnanovdb_rootdata##SUFFIX *) ((const CNANOVDB_GLOBAL uint8_t *)(treedata) + treedata->mNodeOffset[ROOT_LEVEL]); \ +} \ + \ +static const CNANOVDB_GLOBAL cnanovdb_rootdata_tile##SUFFIX * \ +cnanovdb_rootdata##SUFFIX##_getTile(const CNANOVDB_GLOBAL cnanovdb_rootdata##SUFFIX *RESTRICT rootdata, uint32_t n) \ +{ \ + const CNANOVDB_GLOBAL cnanovdb_rootdata_tile##SUFFIX *basetile = (const CNANOVDB_GLOBAL cnanovdb_rootdata_tile##SUFFIX *) (rootdata + 1); \ + return basetile + n; \ +} \ + \ +static const CNANOVDB_GLOBAL cnanovdb_node2##SUFFIX * \ +cnanovdb_rootdata##SUFFIX##_getChild(const CNANOVDB_GLOBAL cnanovdb_rootdata##SUFFIX *RESTRICT rootdata, const CNANOVDB_GLOBAL cnanovdb_rootdata_tile##SUFFIX *RESTRICT tile) \ +{ \ + CNANOVDB_GLOBAL cnanovdb_node2##SUFFIX *basenode = (CNANOVDB_GLOBAL cnanovdb_node2##SUFFIX *) (((CNANOVDB_GLOBAL uint8_t *) rootdata) + tile->child); \ + return basenode; \ +} \ + \ +static const CNANOVDB_GLOBAL cnanovdb_rootdata_tile##SUFFIX * \ +cnanovdb_rootdata##SUFFIX##_findTile(const CNANOVDB_GLOBAL 
cnanovdb_rootdata##SUFFIX *RESTRICT rootdata, const cnanovdb_coord *RESTRICT ijk) \ +{ \ + int32_t low = 0, high = rootdata->mTableSize; \ + const CNANOVDB_GLOBAL cnanovdb_rootdata_tile##SUFFIX *tiles = cnanovdb_rootdata##SUFFIX##_getTile(rootdata, 0); \ + \ + KEYSEARCH(SUFFIX) \ + return 0; \ +} \ + \ +static VALUETYPE \ +cnanovdb_rootdata##SUFFIX##_getValue(const CNANOVDB_GLOBAL cnanovdb_rootdata##SUFFIX *RESTRICT rootdata, const cnanovdb_coord *RESTRICT ijk) \ +{ \ + const CNANOVDB_GLOBAL cnanovdb_rootdata_tile##SUFFIX *tile = cnanovdb_rootdata##SUFFIX##_findTile(rootdata, ijk); \ + if (!tile) \ + return rootdata->mBackground; \ + if (tile->child == 0) \ + return tile->value; \ + return cnanovdb_node2##SUFFIX##_getValue( cnanovdb_rootdata##SUFFIX##_getChild(rootdata, tile), ijk ); \ +} \ + \ +static VALUETYPE \ +cnanovdb_rootdata##SUFFIX##_getValueAndCache(const CNANOVDB_GLOBAL cnanovdb_rootdata##SUFFIX *RESTRICT rootdata, const cnanovdb_coord *RESTRICT ijk, cnanovdb_readaccessor *RESTRICT acc) \ +{ \ + const CNANOVDB_GLOBAL cnanovdb_rootdata_tile##SUFFIX *tile = cnanovdb_rootdata##SUFFIX##_findTile(rootdata, ijk); \ + if (!tile) \ + return rootdata->mBackground; \ + if (tile->child == 0) \ + return tile->value; \ + const CNANOVDB_GLOBAL cnanovdb_node2##SUFFIX *child = cnanovdb_rootdata##SUFFIX##_getChild(rootdata, tile); \ + cnanovdb_readaccessor_insert(acc, 2, child, ijk); \ + return cnanovdb_node2##SUFFIX##_getValueAndCache( child, ijk, acc ); \ +} \ +\ +static bool \ +cnanovdb_rootdata##SUFFIX##_isActive(const CNANOVDB_GLOBAL cnanovdb_rootdata##SUFFIX *RESTRICT rootdata, const cnanovdb_coord *RESTRICT ijk) \ +{ \ + const CNANOVDB_GLOBAL cnanovdb_rootdata_tile##SUFFIX *tile = cnanovdb_rootdata##SUFFIX##_findTile(rootdata, ijk); \ + if (!tile) \ + return false; \ + if (tile->child == 0) \ + return tile->state; \ + return cnanovdb_node2##SUFFIX##_isActive( cnanovdb_rootdata##SUFFIX##_getChild(rootdata, tile), ijk ); \ +} \ + \ +static bool \ +cnanovdb_rootdata##SUFFIX##_isActiveAndCache(const CNANOVDB_GLOBAL cnanovdb_rootdata##SUFFIX *RESTRICT rootdata, const cnanovdb_coord *RESTRICT ijk, cnanovdb_readaccessor *RESTRICT acc) \ +{ \ + const CNANOVDB_GLOBAL cnanovdb_rootdata_tile##SUFFIX *tile = cnanovdb_rootdata##SUFFIX##_findTile(rootdata, ijk); \ + if (!tile) \ + return false; \ + if (tile->child == 0) \ + return tile->state; \ + const CNANOVDB_GLOBAL cnanovdb_node2##SUFFIX *child = cnanovdb_rootdata##SUFFIX##_getChild(rootdata, tile); \ + cnanovdb_readaccessor_insert(acc, 2, child, ijk); \ + return cnanovdb_node2##SUFFIX##_isActiveAndCache( child, ijk, acc ); \ +} \ +/**/ + + +inline void +cnanovdb_readaccessor_init(cnanovdb_readaccessor *RESTRICT acc, + const CNANOVDB_GLOBAL void /*cnanovdb_rootdata* */ *RESTRICT rootdata) +{ + acc->mNode[0] = acc->mNode[1] = acc->mNode[2] = 0; + acc->mNode[3] = rootdata; +} + +#define DEFINE_ISCACHED(LEVEL, MASK) \ +inline bool \ +cnanovdb_readaccessor_isCached##LEVEL(cnanovdb_readaccessor *RESTRICT acc, int32_t dirty) \ +{ \ + if (!acc->mNode[LEVEL]) \ + return false; \ + if (dirty & ~MASK) \ + { \ + acc->mNode[LEVEL] = 0; \ + return false; \ + } \ + return true; \ +} \ +/**/ + +DEFINE_ISCACHED(0, ((1u << 3) - 1u) ) +DEFINE_ISCACHED(1, ((1u << 7) - 1u) ) +DEFINE_ISCACHED(2, ((1u << 12) - 1u) ) + +inline int32_t +cnanovdb_readaccessor_computeDirty(const cnanovdb_readaccessor *RESTRICT acc, const cnanovdb_coord *RESTRICT ijk) +{ + return (ijk->mVec[0] ^ acc->mKey.mVec[0]) | + (ijk->mVec[1] ^ acc->mKey.mVec[1]) | + (ijk->mVec[2] ^ acc->mKey.mVec[2]); 
+} + +#define CREATE_ACCESSOR(VALUETYPE, SUFFIX) \ +inline VALUETYPE \ +cnanovdb_readaccessor_getValue##SUFFIX(cnanovdb_readaccessor *RESTRICT acc, const cnanovdb_coord *RESTRICT ijk) \ +{ \ + int32_t dirty = cnanovdb_readaccessor_computeDirty(acc, ijk); \ + \ + if (cnanovdb_readaccessor_isCached0(acc, dirty)) \ + return cnanovdb_node0##SUFFIX##_getValue( ((CNANOVDB_GLOBAL cnanovdb_node0##SUFFIX *) acc->mNode[0]), ijk); \ + if (cnanovdb_readaccessor_isCached1(acc, dirty)) \ + return cnanovdb_node1##SUFFIX##_getValueAndCache( ((CNANOVDB_GLOBAL cnanovdb_node1##SUFFIX *) acc->mNode[1]), ijk, acc); \ + if (cnanovdb_readaccessor_isCached2(acc, dirty)) \ + return cnanovdb_node2##SUFFIX##_getValueAndCache( ((CNANOVDB_GLOBAL cnanovdb_node2##SUFFIX *) acc->mNode[2]), ijk, acc); \ + \ + return cnanovdb_rootdata##SUFFIX##_getValueAndCache( ((CNANOVDB_GLOBAL cnanovdb_rootdata##SUFFIX *)acc->mNode[3]), ijk, acc); \ +} \ +\ +inline bool \ +cnanovdb_readaccessor_isActive##SUFFIX(cnanovdb_readaccessor *RESTRICT acc, const cnanovdb_coord *RESTRICT ijk) \ +{ \ + int32_t dirty = cnanovdb_readaccessor_computeDirty(acc, ijk); \ + \ + if (cnanovdb_readaccessor_isCached0(acc, dirty)) \ + return cnanovdb_node0##SUFFIX##_isActive( ((CNANOVDB_GLOBAL cnanovdb_node0##SUFFIX *) acc->mNode[0]), ijk); \ + if (cnanovdb_readaccessor_isCached1(acc, dirty)) \ + return cnanovdb_node1##SUFFIX##_isActiveAndCache( ((CNANOVDB_GLOBAL cnanovdb_node1##SUFFIX *) acc->mNode[1]), ijk, acc); \ + if (cnanovdb_readaccessor_isCached2(acc, dirty)) \ + return cnanovdb_node2##SUFFIX##_isActiveAndCache( ((CNANOVDB_GLOBAL cnanovdb_node2##SUFFIX *) acc->mNode[2]), ijk, acc); \ + \ + return cnanovdb_rootdata##SUFFIX##_isActiveAndCache( ((CNANOVDB_GLOBAL cnanovdb_rootdata##SUFFIX *)acc->mNode[3]), ijk, acc); \ +} \ +/**/ + + +#define CREATE_GRIDTYPE(VALUETYPE, STATSTYPE, SUFFIX) \ +CREATE_TILEENTRY(VALUETYPE, SUFFIX) \ +CREATE_LEAF_NODE(0, 3, 3, VALUETYPE, STATSTYPE, SUFFIX) \ +CREATE_INTERNAL_NODE(0, 1, 4, 7, VALUETYPE, STATSTYPE, SUFFIX) \ +CREATE_INTERNAL_NODE(1, 2, 5, 12, VALUETYPE, STATSTYPE, SUFFIX) \ +CREATE_ROOTDATA(VALUETYPE, STATSTYPE, SUFFIX) \ +CREATE_ACCESSOR(VALUETYPE, SUFFIX) \ +/**/ + +CREATE_GRIDTYPE(float, float, F) +CREATE_GRIDTYPE(cnanovdb_Vec3F, float, F3) + +static int +cnanovdb_griddata_valid(const CNANOVDB_GLOBAL cnanovdb_griddata *RESTRICT grid) +{ + if (!grid) + return 0; + if (grid->mMagic != 0x304244566f6e614eUL && grid->mMagic != 0x314244566f6e614eUL) + return 0; + return 1; +} + +static int +cnanovdb_griddata_validF(const CNANOVDB_GLOBAL cnanovdb_griddata *RESTRICT grid) +{ + if (!cnanovdb_griddata_valid(grid)) + return 0; + if (grid->mGridType != cnanovdb_GridType_Float) + return 0; + return 1; +} + +static int +cnanovdb_griddata_validF3(const CNANOVDB_GLOBAL cnanovdb_griddata *RESTRICT grid) +{ + if (!cnanovdb_griddata_valid(grid)) + return 0; + if (grid->mGridType != cnanovdb_GridType_Vec3f) + return 0; + return 1; +} + +#endif diff --git a/external/nanovdb/GridHandle.h b/external/nanovdb/GridHandle.h new file mode 100644 index 00000000..05e49204 --- /dev/null +++ b/external/nanovdb/GridHandle.h @@ -0,0 +1,493 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/GridHandle.h + + \author Ken Museth + + \date January 8, 2020 + + \brief Defines GridHandle, which manages a host, and possibly a device, + memory buffer containing one or more NanoVDB grids. 
+*/ + +#ifndef NANOVDB_GRID_HANDLE_H_HAS_BEEN_INCLUDED +#define NANOVDB_GRID_HANDLE_H_HAS_BEEN_INCLUDED + +#include // for std::ifstream +#include // for std::cerr/cout +#include +#include + +#include // for toGridType +#include +#include // for updateGridCount + +namespace nanovdb { + +// --------------------------> GridHandle <------------------------------------ + +struct GridHandleMetaData {uint64_t offset, size; GridType gridType;}; + +/// @brief This class serves to manage a buffer containing one or more NanoVDB Grids. +/// +/// @note It is important to note that this class does NOT depend on OpenVDB. +template +class GridHandle +{ + std::vector mMetaData; + BufferT mBuffer; + + template + static T* no_const(const T* ptr) { return const_cast(ptr); } + +public: + using BufferType = BufferT; + + /// @brief Move constructor from a host buffer + /// @param buffer buffer containing one or more NanoGrids that will be moved into this GridHandle + /// @throw Will throw and error with the buffer does not contain a valid NanoGrid! + template::hasDeviceDual, int>::type = 0> + GridHandle(T&& buffer); + + /// @brief Move constructor from a dual host-device buffer + /// @param buffer buffer containing one or more NanoGrids that will be moved into this GridHandle + /// @throw Will throw and error with the buffer does not contain a valid NanoGrid! + template::hasDeviceDual, int>::type = 0> + GridHandle(T&& buffer); + + /// @brief Constructs an empty GridHandle + GridHandle() = default; + + /// @brief Disallow copy-construction + GridHandle(const GridHandle&) = delete; + + /// @brief Move copy-constructor + GridHandle(GridHandle&& other) noexcept { + mBuffer = std::move(other.mBuffer); + mMetaData = std::move(other.mMetaData); + } + + /// @brief clear this GridHandle to an empty handle + void reset() { + mBuffer.clear(); + mMetaData.clear(); + } + + /// @brief Disallow copy assignment operation + GridHandle& operator=(const GridHandle&) = delete; + + /// @brief Move copy assignment operation + GridHandle& operator=(GridHandle&& other) noexcept { + mBuffer = std::move(other.mBuffer); + mMetaData = std::move(other.mMetaData); + return *this; + } + + /// @brief Performs a deep copy of the GridHandle, possibly templated on a different buffer type + /// @tparam OtherBufferT Buffer type of the deep copy + /// @param buffer optional buffer used for allocation + /// @return A new handle of the specified buffer type that contains a deep copy of the current handle + template + GridHandle copy(const OtherBufferT& buffer = OtherBufferT()) const; + + /// @brief Return a reference to the buffer + BufferT& buffer() { return mBuffer; } + + /// @brief Return a const reference to the buffer + const BufferT& buffer() const { return mBuffer; } + + /// @brief Returns a non-const pointer to the data. + /// @warning Note that the return pointer can be NULL if the GridHandle was not initialized + void* data() { return mBuffer.data(); } + + /// @brief Returns a const pointer to the data. + /// @warning Note that the return pointer can be NULL if the GridHandle was not initialized + const void* data() const { return mBuffer.data(); } + + template + typename util::enable_if::hasDeviceDual, const void*>::type + deviceData() const { return mBuffer.deviceData(); } + template + typename util::enable_if::hasDeviceDual, void*>::type + deviceData() { return mBuffer.deviceData(); } + + /// @brief Returns the size in bytes of the raw memory buffer managed by this GridHandle. 
+ uint64_t size() const { return mBuffer.size(); } + + //@{ + /// @brief Return true if this handle is empty, i.e. has no allocated memory + bool empty() const { return this->size() == 0; } + bool isEmpty() const { return this->size() == 0; } + //@} + + /// @brief Return true if this handle contains any grids + operator bool() const { return !this->empty(); } + + /// @brief Returns a const host pointer to the @a n'th NanoVDB grid encoded in this GridHandle. + /// @tparam ValueT Value type of the grid point to be returned + /// @param n Index of the (host) grid pointer to be returned + /// @warning Note that the return pointer can be NULL if the GridHandle has no host grid, @a n is invalid + /// or if the template parameter does not match the specified grid! + template + const NanoGrid* grid(uint32_t n = 0) const; + + /// @brief Returns a host pointer to the @a n'th NanoVDB grid encoded in this GridHandle. + /// @tparam ValueT Value type of the grid point to be returned + /// @param n Index of the (host) grid pointer to be returned + /// @warning Note that the return pointer can be NULL if the GridHandle has no host grid, @a n is invalid + /// or if the template parameter does not match the specified grid! + template + NanoGrid* grid(uint32_t n = 0) {return const_cast*>(static_cast(this)->template grid(n));} + + /// @brief Return a const pointer to the @a n'th grid encoded in this GridHandle on the device, e.g. GPU + /// @tparam ValueT Value type of the grid point to be returned + /// @param n Index of the (device) grid pointer to be returned + /// @warning Note that the return pointer can be NULL if the GridHandle has no device grid, @a n is invalid, + /// or if the template parameter does not match the specified grid. + template + typename util::enable_if::hasDeviceDual, const NanoGrid*>::type + deviceGrid(uint32_t n=0) const; + + /// @brief Return a const pointer to the @a n'th grid encoded in this GridHandle on the device, e.g. GPU + /// @tparam ValueT Value type of the grid point to be returned + /// @param n Index of the grid pointer to be returned + /// @param verbose if non-zero error messages will be printed in case something failed + /// @warning Note that the return pointer can be NULL if the GridHandle was not initialized, @a n is invalid, + /// or if the template parameter does not match the specified grid. + template + typename util::enable_if::hasDeviceDual, NanoGrid*>::type + deviceGrid(uint32_t n=0){return const_cast*>(static_cast(this)->template deviceGrid(n));} + + /// @brief Upload the grid to the device, e.g. from CPU to GPU + /// @note This method is only available if the buffer supports devices + template + typename util::enable_if::hasDeviceDual, void>::type + deviceUpload(void* stream = nullptr, bool sync = true) { mBuffer.deviceUpload(stream, sync); } + + /// @brief Download the grid from the device, e.g. from GPU to CPU + /// @note This method is only available if the buffer supports devices + template + typename util::enable_if::hasDeviceDual, void>::type + deviceDownload(void* stream = nullptr, bool sync = true) { mBuffer.deviceDownload(stream, sync); } + + /// @brief Check if the buffer in this handle has any padding, i.e. if the buffer is larger than the combined size of all its grids + /// @return true if the combined size of all grids is smaller than the buffer size + bool isPadded() const {return mMetaData.empty() ?
false : mMetaData.back().offset + mMetaData.back().size != mBuffer.size();} + + /// @brief Return the total number of grids contained in this buffer + uint32_t gridCount() const {return static_cast(mMetaData.size());} + + /// @brief Return the grid size of the @a n'th grid in this GridHandle + /// @param n index of the grid (assumed to be less than gridCount()) + /// @return Return the byte size of the specified grid + uint64_t gridSize(uint32_t n = 0) const {return mMetaData[n].size; } + + /// @brief Return the GridType of the @a n'th grid in this GridHandle + /// @param n index of the grid (assumed to be less than gridCount()) + /// @return Return the GridType of the specified grid + GridType gridType(uint32_t n = 0) const {return mMetaData[n].gridType; } + + /// @brief Access to the GridData of the n'th grid in the current handle + /// @param n zero-based ID of the grid + /// @return Const pointer to the n'th GridData in the current handle + const GridData* gridData(uint32_t n = 0) const; + + /// @brief Returns a const point to the @a n'th grid meta data + /// @param n zero-based ID of the grid + /// @warning Note that the return pointer can be NULL if the GridHandle was not initialized + const GridMetaData* gridMetaData(uint32_t n = 0) const; + + /// @brief Write a specific grid in this buffer to an output stream + /// @param os output stream that the buffer will be written to + /// @param n zero-based index of the grid to be written to stream + void write(std::ostream& os, uint32_t n) const { + if (const GridData* data = this->gridData(n)) { + os.write((const char*)data, data->mGridSize); + } else { + throw std::runtime_error("GridHandle does not contain a #" + std::to_string(n) + " grid"); + } + } + + /// @brief Write the entire grid buffer to an output stream + /// @param os output stream that the buffer will be written to + void write(std::ostream& os) const { + for (uint32_t n=0; ngridCount(); ++n) this->write(os, n); + } + + /// @brief Write this entire grid buffer to a file + /// @param fileName string name of the output file + void write(const std::string &fileName) const { + std::ofstream os(fileName, std::ios::out | std::ios::binary | std::ios::trunc); + if (!os.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for output"); + this->write(os); + } + + /// @brief Write a specific grid to file + /// @param fileName string name of the output file + /// @param n zero-based index of the grid to be written to file + void write(const std::string &fileName, uint32_t n) const { + std::ofstream os(fileName, std::ios::out | std::ios::binary | std::ios::trunc); + if (!os.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for output"); + this->write(os, n); + } + + /// @brief Read an entire raw grid buffer from an input stream + /// @param is input stream containing a raw grid buffer + /// @param pool optional pool from which to allocate the new grid buffer + /// @throw Will throw a std::logic_error if the stream does not contain a valid raw grid + void read(std::istream& is, const BufferT& pool = BufferT()); + + /// @brief Read a specific grid from an input stream containing a raw grid buffer + /// @param is input stream containing a raw grid buffer + /// @param n zero-based index of the grid to be read + /// @param pool optional pool from which to allocate the new grid buffer + /// @throw Will throw a std::logic_error if the stream does not contain a valid raw grid + void read(std::istream& is, uint32_t n, const 
BufferT& pool = BufferT()); + + /// @brief Read a specific grid from an input stream containing a raw grid buffer + /// @param is input stream containing a raw grid buffer + /// @param gridName string name of the grid to be read + /// @param pool optional pool from which to allocate the new grid buffer + /// @throw Will throw a std::logic_error if the stream does not contain a valid raw grid with the specified name + void read(std::istream& is, const std::string &gridName, const BufferT& pool = BufferT()); + + /// @brief Read a raw grid buffer from a file + /// @param filename string name of the input file containing a raw grid buffer + /// @param pool optional pool from which to allocate the new grid buffer + void read(const std::string &fileName, const BufferT& pool = BufferT()) { + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + this->read(is, pool); + } + + /// @brief Read a specific grid from a file containing a raw grid buffer + /// @param filename string name of the input file containing a raw grid buffer + /// @param n zero-based index of the grid to be read + /// @param pool optional pool from which to allocate the new grid buffer + /// @throw Will throw a std::ios_base::failure if the file does not exist and a + /// std::logic_error if the file does not contain a valid raw grid + void read(const std::string &fileName, uint32_t n, const BufferT& pool = BufferT()) { + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + this->read(is, n, pool); + } + + /// @brief Read a specific grid from a file containing a raw grid buffer + /// @param filename string name of the input file containing a raw grid buffer + /// @param gridName string name of the grid to be read + /// @param pool optional pool from which to allocate the new grid buffer + /// @throw Will throw a std::ios_base::failure if the file does not exist and a + /// std::logic_error if the file does not contain a valid raw grid with the specified name + void read(const std::string &fileName, const std::string &gridName, const BufferT& pool = BufferT()) { + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + this->read(is, gridName, pool); + } +}; // GridHandle + +// --------------------------> Implementation of private methods in GridHandle <------------------------------------ + +template +inline const GridData* GridHandle::gridData(uint32_t n) const +{ + const void *data = this->data(); + if (data == nullptr || n >= mMetaData.size()) return nullptr; + return util::PtrAdd(data, mMetaData[n].offset); +}// const GridData* GridHandle::gridData(uint32_t n) const + +template +inline const GridMetaData* GridHandle::gridMetaData(uint32_t n) const +{ + const auto *data = this->data(); + if (data == nullptr || n >= mMetaData.size()) return nullptr; + return util::PtrAdd(data, mMetaData[n].offset); +}// const GridMetaData* GridHandle::gridMetaData(uint32_t n) const + +inline __hostdev__ void cpyGridHandleMeta(const GridData *data, GridHandleMetaData *meta) +{ + uint64_t offset = 0; + for (auto *p=meta, *q=p+data->mGridCount; p!=q; ++p) { + *p = {offset, data->mGridSize, data->mGridType}; + offset += p->size; + data = util::PtrAdd(data, p->size); + } +}// void
cpyGridHandleMeta(const GridData *data, GridHandleMetaData *meta) + +template +template::hasDeviceDual, int>::type> +GridHandle::GridHandle(T&& buffer) +{ + static_assert(util::is_same::value, "Expected U==BufferT"); + mBuffer = std::move(buffer); + if (auto *data = reinterpret_cast(mBuffer.data())) { + if (!data->isValid()) throw std::runtime_error("GridHandle was constructed with an invalid host buffer"); + mMetaData.resize(data->mGridCount); + cpyGridHandleMeta(data, mMetaData.data()); + } +}// GridHandle::GridHandle(T&& buffer) + +template +template +inline GridHandle GridHandle::copy(const OtherBufferT& other) const +{ + if (mBuffer.isEmpty()) return GridHandle();// return an empty handle + auto buffer = OtherBufferT::create(mBuffer.size(), &other); + std::memcpy(buffer.data(), mBuffer.data(), mBuffer.size());// deep copy of buffer + return GridHandle(std::move(buffer)); +}// GridHandle GridHandle::copy(const OtherBufferT& other) const + +template +template +inline const NanoGrid* GridHandle::grid(uint32_t n) const +{ + const void *data = mBuffer.data(); + if (data == nullptr || n >= mMetaData.size() || mMetaData[n].gridType != toGridType()) return nullptr; + return util::PtrAdd>(data, mMetaData[n].offset); +}// const NanoGrid* GridHandle::grid(uint32_t n) const + +template +template +inline typename util::enable_if::hasDeviceDual, const NanoGrid*>::type +GridHandle::deviceGrid(uint32_t n) const +{ + const void *data = mBuffer.deviceData(); + if (data == nullptr || n >= mMetaData.size() || mMetaData[n].gridType != toGridType()) return nullptr; + return util::PtrAdd>(data, mMetaData[n].offset); +}// GridHandle::deviceGrid(uint32_t n) cons + +template +void GridHandle::read(std::istream& is, const BufferT& pool) +{ + GridData data; + is.read((char*)&data, sizeof(GridData)); + if (data.isValid()) { + uint64_t size = data.mGridSize, sum = 0u; + while(data.mGridIndex + 1u < data.mGridCount) {// loop over remaining raw grids in stream + is.seekg(data.mGridSize - sizeof(GridData), std::ios::cur);// skip grid + is.read((char*)&data, sizeof(GridData)); + sum += data.mGridSize; + } + auto buffer = BufferT::create(size + sum, &pool); + is.seekg(-int64_t(sum + sizeof(GridData)), std::ios::cur);// rewind to start + is.read((char*)(buffer.data()), buffer.size()); + *this = GridHandle(std::move(buffer)); + } else { + is.seekg(-sizeof(GridData), std::ios::cur);// rewind + throw std::logic_error("This stream does not contain a valid raw grid buffer"); + } +}// void GridHandle::read(std::istream& is, const BufferT& pool) + +template +void GridHandle::read(std::istream& is, uint32_t n, const BufferT& pool) +{ + GridData data; + is.read((char*)&data, sizeof(GridData)); + if (data.isValid()) { + if (n>=data.mGridCount) throw std::runtime_error("stream does not contain a #" + std::to_string(n) + " grid"); + while(data.mGridIndex != n) { + is.seekg(data.mGridSize - sizeof(GridData), std::ios::cur);// skip grid + is.read((char*)&data, sizeof(GridData)); + } + auto buffer = BufferT::create(data.mGridSize, &pool); + is.seekg(-sizeof(GridData), std::ios::cur);// rewind + is.read((char*)(buffer.data()), data.mGridSize); + tools::updateGridCount((GridData*)buffer.data(), 0u, 1u); + *this = GridHandle(std::move(buffer)); + } else { + is.seekg(-sizeof(GridData), std::ios::cur);// rewind sizeof(GridData) bytes to undo initial read + throw std::logic_error("This file does not contain a valid raw buffer"); + } +}// void GridHandle::read(std::istream& is, uint32_t n, const BufferT& pool) + +template +void 
GridHandle::read(std::istream& is, const std::string &gridName, const BufferT& pool) +{ + static const std::streamsize byteSize = sizeof(GridData); + GridData data; + is.read((char*)&data, byteSize); + is.seekg(-byteSize, std::ios::cur);// rewind + if (data.isValid()) { + uint32_t n = 0; + while(data.mGridName != gridName && n++ < data.mGridCount) { + is.seekg(data.mGridSize, std::ios::cur);// skip grid + is.read((char*)&data, byteSize);// read sizeof(GridData) bytes + is.seekg(-byteSize, std::ios::cur);// rewind + } + if (n>data.mGridCount) throw std::runtime_error("No raw grid named \""+gridName+"\""); + auto buffer = BufferT::create(data.mGridSize, &pool); + is.read((char*)(buffer.data()), data.mGridSize); + tools::updateGridCount((GridData*)buffer.data(), 0u, 1u); + *this = GridHandle(std::move(buffer)); + } else { + throw std::logic_error("This file does not contain a valid raw buffer"); + } +}// void GridHandle::read(std::istream& is, const std::string &gridName n, const BufferT& pool) + +// --------------------------> free-standing functions <------------------------------------ + +/// @brief Split all grids in a single GridHandle into a vector of multiple GridHandles each with a single grid +/// @tparam BufferT Type of the input and output grid buffers +/// @param handle GridHandle with grids that will be slip into individual GridHandles +/// @param pool optional pool used for allocation of output GridHandle +/// @return Vector of GridHandles each containing a single grid +template class VectorT = std::vector> +inline VectorT> +splitGrids(const GridHandle &handle, const BufferT* other = nullptr) +{ + using HandleT = GridHandle; + const void *ptr = handle.data(); + if (ptr == nullptr) return VectorT(); + VectorT handles(handle.gridCount()); + for (auto &h : handles) { + const GridData *src = reinterpret_cast(ptr); + NANOVDB_ASSERT(src->isValid()); + auto buffer = BufferT::create(src->mGridSize, other); + GridData *dst = reinterpret_cast(buffer.data()); + std::memcpy(dst, src, src->mGridSize); + tools::updateGridCount(dst, 0u, 1u); + h = HandleT(std::move(buffer)); + ptr = util::PtrAdd(ptr, src->mGridSize); + } + return std::move(handles); +}// splitGrids + +/// @brief Combines (or merges) multiple GridHandles into a single GridHandle containing all grids +/// @tparam BufferT Type of the input and output grid buffers +/// @param handles Vector of GridHandles to be combined +/// @param pool optional pool used for allocation of output GridHandle +/// @return single GridHandle containing all input grids +template class VectorT> +inline GridHandle +mergeGrids(const VectorT> &handles, const BufferT* pool = nullptr) +{ + uint64_t size = 0u; + uint32_t counter = 0u, gridCount = 0u; + for (auto &h : handles) { + gridCount += h.gridCount(); + for (uint32_t n=0; n(dst); + NANOVDB_ASSERT(data->isValid()); + tools::updateGridCount(data, counter++, gridCount); + dst = util::PtrAdd(dst, data->mGridSize); + src = util::PtrAdd(src, data->mGridSize); + } + } + return GridHandle(std::move(buffer)); +}// mergeGrids + +} // namespace nanovdb + +#if defined(__CUDACC__) +#include +#endif// defined(__CUDACC__) + +#endif // NANOVDB_GRID_HANDLE_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/HostBuffer.h b/external/nanovdb/HostBuffer.h new file mode 100644 index 00000000..70c9ce0f --- /dev/null +++ b/external/nanovdb/HostBuffer.h @@ -0,0 +1,590 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! 
+ @file nanovdb/HostBuffer.h + + @date April 20, 2021 + + @brief HostBuffer - a buffer that contains a shared or private bump + pool to either externally or internally managed host memory. + + @details This HostBuffer can be used in multiple ways, most of which are + demonstrated in the examples below. Memory in the pool can + be managed or unmanged (e.g. internal or external) and can + be shared between multiple buffers or belong to a single buffer. + + Example that uses HostBuffer::create inside io::readGrids to create a + full self-managed buffer, i.e. not shared and without padding, per grid in the file. + @code + auto handles = nanovdb::io::readGrids("file.nvdb"); + @endcode + + Example that uses HostBuffer::createFull. Assuming you have a raw pointer + to a NanoVDB grid of unknown type, this examples shows how to create its + GridHandle which can be used to enquire about the grid type and meta data. + @code + void *data;// pointer to a NanoVDB grid of unknown type + uint64_t size;// byte size of NanoVDB grid of unknown type + auto buffer = nanovdb::HostBuffer::createFull(size, data); + nanovdb::GridHandle<> gridHandle(std::move(buffer)); + @endcode + + Example that uses HostBuffer::createPool for internally managed host memory. + Suppose you want to read multiple grids in multiple files, but reuse the same + fixed sized memory buffer to both avoid memory fragmentation as well as + exceeding the fixed memory ceiling! + @code + auto pool = nanovdb::HostBuffer::createPool(1 << 30);// 1 GB memory pool + std::vector> frames;// vector of grid names + for (int i=0; i array(new char[size + NANOVDB_DATA_ALIGNMENT]);// scoped pool of 1 GB with padding + void *buffer = nanovdb::alignPtr(array.get());// 32B aligned buffer + auto pool = nanovdb::HostBuffer::createPool(poolSize, buffer); + auto handles = nanovdb::io::readGrids("file.nvdb", 0, pool); + @endcode +*/ + +#ifndef NANOVDB_HOSTBUFFER_H_HAS_BEEN_INCLUDED +#define NANOVDB_HOSTBUFFER_H_HAS_BEEN_INCLUDED + +#include // for NANOVDB_DATA_ALIGNMENT; +#include // for types like int32_t etc +#include // for fprintf +#include // for std::malloc/std::realloc/std::free +#include // for std::make_shared +#include // for std::mutex +#include // for std::unordered_set +#include // for assert +#include // for std::stringstream +#include // for memcpy + +#define checkPtr(ptr, msg) \ + { \ + ptrAssert((ptr), (msg), __FILE__, __LINE__); \ + } + +namespace nanovdb { + +template +struct BufferTraits +{ + static constexpr bool hasDeviceDual = false; +}; + +// ----------------------------> HostBuffer <-------------------------------------- + +/// @brief This is a buffer that contains a shared or private pool +/// to either externally or internally managed host memory. +/// +/// @note Terminology: +/// Pool: 0 = buffer.size() < buffer.poolSize() +/// Buffer: 0 < buffer.size() < buffer.poolSize() +/// Full: 0 < buffer.size() = buffer.poolSize() +/// Empty: 0 = buffer.size() = buffer.poolSize() +class HostBuffer +{ + struct Pool;// forward declaration of private pool struct + std::shared_ptr mPool; + uint64_t mSize; // total number of bytes for the NanoVDB grid. + void* mData; // raw buffer for the NanoVDB grid. 
+ +#if defined(DEBUG) || defined(_DEBUG) + static inline void ptrAssert(void* ptr, const char* msg, const char* file, int line, bool abort = true) + { + if (ptr == nullptr) { + fprintf(stderr, "NULL pointer error: %s %s %d\n", msg, file, line); + if (abort) + exit(1); + } + if (uint64_t(ptr) % NANOVDB_DATA_ALIGNMENT) { + fprintf(stderr, "Alignment pointer error: %s %s %d\n", msg, file, line); + if (abort) + exit(1); + } + } +#else + static inline void ptrAssert(void*, const char*, const char*, int, bool = true) + { + } +#endif + +public: + /// @brief Return a full buffer or an empty buffer + HostBuffer(uint64_t bufferSize = 0); + + /// @brief Move copy-constructor + HostBuffer(HostBuffer&& other); + + /// @brief Custom descructor + ~HostBuffer() { this->clear(); } + + /// @brief Move copy assignment operation + HostBuffer& operator=(HostBuffer&& other); + + /// @brief Disallow copy-construction + HostBuffer(const HostBuffer&) = delete; + + /// @brief Disallow copy assignment operation + HostBuffer& operator=(const HostBuffer&) = delete; + + /// @brief Return a pool buffer which satisfies: buffer.size == 0, + /// buffer.poolSize() == poolSize, and buffer.data() == nullptr. + /// If data==nullptr, memory for the pool will be allocated. + /// + /// @throw If poolSize is zero. + static HostBuffer createPool(uint64_t poolSize, void *data = nullptr); + + /// @brief Return a full buffer which satisfies: buffer.size == bufferSize, + /// buffer.poolSize() == bufferSize, and buffer.data() == data. + /// If data==nullptr, memory for the pool will be allocated. + /// + /// @throw If bufferSize is zero. + static HostBuffer createFull(uint64_t bufferSize, void *data = nullptr); + + /// @brief Return a buffer with @c bufferSize bytes managed by + /// the specified memory @c pool. If none is provided, i.e. + /// @c pool == nullptr or @c pool->poolSize() == 0, one is + /// created with size @c bufferSize, i.e. a full buffer is returned. + /// + /// @throw If the specified @c pool has insufficient memory for + /// the requested buffer size. + static HostBuffer create(uint64_t bufferSize, const HostBuffer* pool = nullptr); + + /// @brief Initialize as a full buffer with the specified size. If data is NULL + /// the memory is internally allocated. + void init(uint64_t bufferSize, void *data = nullptr); + + //@{ + /// @brief Retuns a pointer to the raw memory buffer managed by this allocator. + /// + /// @warning Note that the pointer can be NULL if the allocator was not initialized! + const void* data() const { return mData; } + void* data() { return mData; } + //@} + + //@{ + /// @brief Returns the size in bytes associated with this buffer. + uint64_t bufferSize() const { return mSize; } + uint64_t size() const { return this->bufferSize(); } + //@} + + /// @brief Returns the size in bytes of the memory pool shared with this instance. + uint64_t poolSize() const; + + /// @brief Return true if memory is managed (using std::malloc and std:free) by the + /// shared pool in this buffer. Else memory is assumed to be managed externally. + bool isManaged() const; + + //@{ + /// @brief Returns true if this buffer has no memory associated with it + bool isEmpty() const { return !mPool || mSize == 0 || mData == nullptr; } + bool empty() const { return this->isEmpty(); } + //@} + + /// @brief Return true if this is a pool, i.e. an empty buffer with a nonempty + /// internal pool, i.e. 
this->size() == 0 and this->poolSize() != 0 + bool isPool() const { return mSize == 0 && this->poolSize() > 0; } + + /// @brief Return true if the pool exists, is nonempty but has no more available memory + bool isFull() const; + + /// @brief Clear this buffer so it is empty. + void clear(); + + /// @brief Clears all existing buffers that are registered against the memory pool + /// and resets the pool so it can be reused to create new buffers. + /// + /// @throw If this instance is not empty or contains no pool. + /// + /// @warning This method is not thread-safe! + void reset(); + + /// @brief Total number of bytes from the pool currently in use by buffers + uint64_t poolUsage() const; + + /// @brief resize the pool size. It will attempt to resize the existing + /// memory block, but if that fails a deep copy is performed. + /// If @c data is not NULL it will be used as new externally + /// managed memory for the pool. All registered buffers are + /// updated so GridHandle::grid might return a new address (if + /// deep copy was performed). + /// + /// @note This method can be use to resize the memory pool and even + /// change it from internally to externally managed memory or vice versa. + /// + /// @throw if @c poolSize is less than this->poolUsage() the used memory + /// or allocations fail. + void resizePool(uint64_t poolSize, void *data = nullptr); + +}; // HostBuffer class + +// --------------------------> Implementation of HostBuffer::Pool <------------------------------------ + +// This is private struct of HostBuffer so you can safely ignore the API +struct HostBuffer::Pool +{ + using HashTableT = std::unordered_set; + std::mutex mMutex; // mutex for updating mRegister and mFree + HashTableT mRegister; + void *mData, *mFree; + uint64_t mSize, mPadding; + bool mManaged; + + /// @brief External memory ctor + Pool(uint64_t size = 0, void* data = nullptr) + : mData(data) + , mFree(mData) + , mSize(size) + , mPadding(0) + , mManaged(data == nullptr) + { + if (mManaged) { + mData = Pool::alloc(mSize); + if (mData == nullptr) throw std::runtime_error("Pool::Pool malloc failed"); + } + mPadding = alignmentPadding(mData); + if (!mManaged && mPadding != 0) { + throw std::runtime_error("Pool::Pool: external memory buffer is not aligned to " + + std::to_string(NANOVDB_DATA_ALIGNMENT) + + " bytes.\nHint: use nanovdb::alignPtr or std::aligned_alloc (C++17 only)"); + } + mFree = util::PtrAdd(mData, mPadding); + } + + /// @brief Custom destructor + ~Pool() + { + assert(mRegister.empty()); + if (mManaged) std::free(mData); + } + + /// @brief Disallow copy-construction + Pool(const Pool&) = delete; + + /// @brief Disallow move-construction + Pool(const Pool&&) = delete; + + /// @brief Disallow copy assignment operation + Pool& operator=(const Pool&) = delete; + + /// @brief Disallow move assignment operation + Pool& operator=(const Pool&&) = delete; + + /// @brief Return the total number of bytes used from this Pool by buffers + uint64_t usage() const { return util::PtrDiff(mFree, mData) - mPadding; } + + /// @brief Allocate a buffer of the specified size and add it to the register + void add(HostBuffer* buffer, uint64_t size) + { + void *alignedFree = util::PtrAdd(mFree, alignmentPadding(mFree)); + + if (util::PtrAdd(alignedFree, size) > util::PtrAdd(mData, mPadding + mSize)) { + std::stringstream ss; + ss << "HostBuffer::Pool: insufficient memory\n" + << "\tA buffer requested " << size << " bytes with " << NANOVDB_DATA_ALIGNMENT + << "-bytes alignment from a pool with " + << mSize << " bytes 
of which\n\t" << (util::PtrDiff(alignedFree, mData) - mPadding) + << " bytes are used by " << mRegister.size() << " other buffer(s). " + << "Pool is " << (mManaged ? "internally" : "externally") << " managed.\n"; + //std::cerr << ss.str(); + throw std::runtime_error(ss.str()); + } + buffer->mSize = size; + const std::lock_guard lock(mMutex); + mRegister.insert(buffer); + buffer->mData = alignedFree; + mFree = util::PtrAdd(alignedFree, size); + } + + /// @brief Remove the specified buffer from the register + void remove(HostBuffer *buffer) + { + const std::lock_guard lock(mMutex); + mRegister.erase(buffer); + } + + /// @brief Replaces buffer1 with buffer2 in the register + void replace(HostBuffer *buffer1, HostBuffer *buffer2) + { + const std::lock_guard lock(mMutex); + mRegister.erase( buffer1); + mRegister.insert(buffer2); + } + + /// @brief Reset the register and all its buffers + void reset() + { + for (HostBuffer *buffer : mRegister) { + buffer->mPool.reset(); + buffer->mSize = 0; + buffer->mData = nullptr; + } + mRegister.clear(); + mFree = util::PtrAdd(mData, mPadding); + } + + /// @brief Resize this Pool and update registered buffers as needed. If data is no NULL + /// it is used as externally managed memory. + void resize(uint64_t size, void *data = nullptr) + { + const uint64_t memUsage = this->usage(); + + const bool managed = (data == nullptr); + + if (!managed && alignmentPadding(data) != 0) { + throw std::runtime_error("Pool::resize: external memory buffer is not aligned to " + + std::to_string(NANOVDB_DATA_ALIGNMENT) + " bytes"); + } + + if (memUsage > size) { + throw std::runtime_error("Pool::resize: insufficient memory"); + } + + uint64_t padding = 0; + if (mManaged && managed && size != mSize) { // managed -> managed + padding = mPadding; + data = Pool::realloc(mData, memUsage, size, padding); // performs both copy and free of mData + } else if (!mManaged && managed) { // un-managed -> managed + data = Pool::alloc(size); + padding = alignmentPadding(data); + } + + if (data == nullptr) { + throw std::runtime_error("Pool::resize: allocation failed"); + } else if (data != mData) { + void* paddedData = util::PtrAdd(data, padding); + + if (!(mManaged && managed)) { // no need to copy if managed -> managed + memcpy(paddedData, util::PtrAdd(mData, mPadding), memUsage); + } + + for (HostBuffer* buffer : mRegister) { // update registered buffers + //buffer->mData = paddedData + ptrdiff_t(buffer->mData - (mData + mPadding)); + buffer->mData = util::PtrAdd(paddedData, util::PtrDiff(buffer->mData, util::PtrAdd(mData, mPadding))); + } + mFree = util::PtrAdd(paddedData, memUsage); // update the free pointer + if (mManaged && !managed) {// only free if managed -> un-managed + std::free(mData); + } + + mData = data; + mPadding = padding; + } + mSize = size; + mManaged = managed; + } + /// @brief Return true is all the memory in this pool is in use. + bool isFull() const + { + assert(mFree <= util::PtrAdd(mData, mPadding + mSize)); + return mSize > 0 ? 
mFree == util::PtrAdd(mData, mPadding + mSize) : false; + } + +private: + + static void* alloc(uint64_t size) + { +//#if (__cplusplus >= 201703L) +// return std::aligned_alloc(NANOVDB_DATA_ALIGNMENT, size);//C++17 or newer +//#else + // make sure we alloc enough space to align the result + return std::malloc(size + NANOVDB_DATA_ALIGNMENT); +//#endif + } + + static void* realloc(void* const origData, + uint64_t origSize, + uint64_t desiredSize, + uint64_t& padding) + { + // make sure we alloc enough space to align the result + void* data = std::realloc(origData, desiredSize + NANOVDB_DATA_ALIGNMENT); + + if (data != nullptr && data != origData) { + uint64_t newPadding = alignmentPadding(data); + // Number of padding bytes may have changed -- move data if that's the case + if (newPadding != padding) { + // Realloc should not happen when shrinking down buffer, but let's be safe + std::memmove(util::PtrAdd(data, newPadding), + util::PtrAdd(data, padding), + math::Min(origSize, desiredSize)); + padding = newPadding; + } + } + + return data; + } + +};// struct HostBuffer::Pool + +// --------------------------> Implementation of HostBuffer <------------------------------------ + +inline HostBuffer::HostBuffer(uint64_t size) : mPool(nullptr), mSize(size), mData(nullptr) +{ + if (size>0) { + mPool = std::make_shared(size); + mData = mPool->mFree; + mPool->mRegister.insert(this); + mPool->mFree = util::PtrAdd(mPool->mFree, size); + } +} + +inline HostBuffer::HostBuffer(HostBuffer&& other) : mPool(other.mPool), mSize(other.mSize), mData(other.mData) +{ + if (mPool && mSize != 0) { + mPool->replace(&other, this); + } + other.mPool.reset(); + other.mSize = 0; + other.mData = nullptr; +} + +inline void HostBuffer::init(uint64_t bufferSize, void *data) +{ + if (bufferSize == 0) { + throw std::runtime_error("HostBuffer: invalid buffer size"); + } + if (mPool) { + mPool.reset(); + } + if (!mPool || mPool->mSize != bufferSize) { + mPool = std::make_shared(bufferSize, data); + } + mPool->add(this, bufferSize); +} + +inline HostBuffer& HostBuffer::operator=(HostBuffer&& other) +{ + if (mPool) { + mPool->remove(this); + } + mPool = other.mPool; + mSize = other.mSize; + mData = other.mData; + if (mPool && mSize != 0) { + mPool->replace(&other, this); + } + other.mPool.reset(); + other.mSize = 0; + other.mData = nullptr; + return *this; +} + +inline uint64_t HostBuffer::poolSize() const +{ + return mPool ? mPool->mSize : 0u; +} + +inline uint64_t HostBuffer::poolUsage() const +{ + return mPool ? mPool->usage(): 0u; +} + +inline bool HostBuffer::isManaged() const +{ + return mPool ? mPool->mManaged : false; +} + +inline bool HostBuffer::isFull() const +{ + return mPool ? 
mPool->isFull() : false; +} + +inline HostBuffer HostBuffer::createPool(uint64_t poolSize, void *data) +{ + if (poolSize == 0) { + throw std::runtime_error("HostBuffer: invalid pool size"); + } + HostBuffer buffer; + buffer.mPool = std::make_shared(poolSize, data); + // note the buffer is NOT registered by its pool since it is not using its memory + buffer.mSize = 0; + buffer.mData = nullptr; + return buffer; +} + +inline HostBuffer HostBuffer::createFull(uint64_t bufferSize, void *data) +{ + if (bufferSize == 0) { + throw std::runtime_error("HostBuffer: invalid buffer size"); + } + HostBuffer buffer; + buffer.mPool = std::make_shared(bufferSize, data); + buffer.mPool->add(&buffer, bufferSize); + return buffer; +} + +inline HostBuffer HostBuffer::create(uint64_t bufferSize, const HostBuffer* pool) +{ + HostBuffer buffer; + if (pool == nullptr || !pool->mPool) { + buffer.mPool = std::make_shared(bufferSize); + } else { + buffer.mPool = pool->mPool; + } + buffer.mPool->add(&buffer, bufferSize); + return buffer; +} + +inline void HostBuffer::clear() +{ + if (mPool) {// remove self from the buffer register in the pool + mPool->remove(this); + } + mPool.reset(); + mSize = 0; + mData = nullptr; +} + +inline void HostBuffer::reset() +{ + if (this->size()>0) { + throw std::runtime_error("HostBuffer: only empty buffers can call reset"); + } + if (!mPool) { + throw std::runtime_error("HostBuffer: this buffer contains no pool to reset"); + } + mPool->reset(); +} + +inline void HostBuffer::resizePool(uint64_t size, void *data) +{ + if (!mPool) { + throw std::runtime_error("HostBuffer: this buffer contains no pool to resize"); + } + mPool->resize(size, data); +} + +} // namespace nanovdb + +#endif // end of NANOVDB_HOSTBUFFER_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/NanoVDB.h b/external/nanovdb/NanoVDB.h new file mode 100644 index 00000000..36b412b0 --- /dev/null +++ b/external/nanovdb/NanoVDB.h @@ -0,0 +1,6624 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/NanoVDB.h + + \author Ken Museth + + \date January 8, 2020 + + \brief Implements a light-weight self-contained VDB data-structure in a + single file! In other words, this is a significantly watered-down + version of the OpenVDB implementation, with few dependencies - so + a one-stop-shop for a minimalistic VDB data structure that run on + most platforms! + + \note It is important to note that NanoVDB (by design) is a read-only + sparse GPU (and CPU) friendly data structure intended for applications + like rendering and collision detection. As such it obviously lacks + a lot of the functionality and features of OpenVDB grids. NanoVDB + is essentially a compact linearized (or serialized) representation of + an OpenVDB tree with getValue methods only. For best performance use + the ReadAccessor::getValue method as opposed to the Tree::getValue + method. Note that since a ReadAccessor caches previous access patterns + it is by design not thread-safe, so use one instantiation per thread + (it is very light-weight). Also, it is not safe to copy accessors between + the GPU and CPU! In fact, client code should only interface + with the API of the Grid class (all other nodes of the NanoVDB data + structure can safely be ignored by most client codes)! + + + \warning NanoVDB grids can only be constructed via tools like createNanoGrid + or the GridBuilder. This explains why none of the grid nodes defined below + have public constructors or destructors. 
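+
+    A minimal, illustrative read example, assuming a float grid obtained elsewhere from a
+    nanovdb::GridHandle (e.g. via GridHandle::grid<float>()):
+    \code
+        const nanovdb::NanoGrid<float>* grid = handle.grid<float>();   // handle is a nanovdb::GridHandle<>
+        auto acc = grid->getAccessor();                                 // light-weight; use one instance per thread
+        const float v = acc.getValue(nanovdb::Coord(1, 2, 3));         // accelerated random access
+    \endcode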
+ + \details Please see the following paper for more details on the data structure: + K. Museth, “VDB: High-Resolution Sparse Volumes with Dynamic Topology”, + ACM Transactions on Graphics 32(3), 2013, which can be found here: + http://www.museth.org/Ken/Publications_files/Museth_TOG13.pdf + + NanoVDB was first published there: https://dl.acm.org/doi/fullHtml/10.1145/3450623.3464653 + + + Overview: This file implements the following fundamental class that when combined + forms the backbone of the VDB tree data structure: + + Coord- a signed integer coordinate + Vec3 - a 3D vector + Vec4 - a 4D vector + BBox - a bounding box + Mask - a bitmask essential to the non-root tree nodes + Map - an affine coordinate transformation + Grid - contains a Tree and a map for world<->index transformations. Use + this class as the main API with client code! + Tree - contains a RootNode and getValue methods that should only be used for debugging + RootNode - the top-level node of the VDB data structure + InternalNode - the internal nodes of the VDB data structure + LeafNode - the lowest level tree nodes that encode voxel values and state + ReadAccessor - implements accelerated random access operations + + Semantics: A VDB data structure encodes values and (binary) states associated with + signed integer coordinates. Values encoded at the leaf node level are + denoted voxel values, and values associated with other tree nodes are referred + to as tile values, which by design cover a larger coordinate index domain. + + + Memory layout: + + It's important to emphasize that all the grid data (defined below) are explicitly 32 byte + aligned, which implies that any memory buffer that contains a NanoVDB grid must also be at + 32 byte aligned. That is, the memory address of the beginning of a buffer (see ascii diagram below) + must be divisible by 32, i.e. uintptr_t(&buffer)%32 == 0! If this is not the case, the C++ standard + says the behaviour is undefined! Normally this is not a concerns on GPUs, because they use 256 byte + aligned allocations, but the same cannot be said about the CPU. + + GridData is always at the very beginning of the buffer immediately followed by TreeData! + The remaining nodes and blind-data are allowed to be scattered throughout the buffer, + though in practice they are arranged as: + + GridData: 672 bytes (e.g. magic, checksum, major, flags, index, count, size, name, map, world bbox, voxel size, class, type, offset, count) + + TreeData: 64 bytes (node counts and byte offsets) + + ... optional padding ... + + RootData: size depends on ValueType (index bbox, voxel count, tile count, min/max/avg/standard deviation) + + Array of: RootData::Tile + + ... optional padding ... + + Array of: Upper InternalNodes of size 32^3: bbox, two bit masks, 32768 tile values, and min/max/avg/standard deviation values + + ... optional padding ... + + Array of: Lower InternalNodes of size 16^3: bbox, two bit masks, 4096 tile values, and min/max/avg/standard deviation values + + ... optional padding ... + + Array of: LeafNodes of size 8^3: bbox, bit masks, 512 voxel values, and min/max/avg/standard deviation values + + + Notation: "]---[" implies it has optional padding, and "][" implies zero padding + + [GridData(672B)][TreeData(64B)]---[RootData][N x Root::Tile]---[InternalData<5>]---[InternalData<4>]---[LeafData<3>]---[BLINDMETA...]---[BLIND0]---[BLIND1]---etc. 
+ ^ ^ ^ ^ ^ ^ + | | | | | | + +-- Start of 32B aligned buffer | | | | +-- Node0::DataType* leafData + GridType::DataType* gridData | | | | + | | | +-- Node1::DataType* lowerData + RootType::DataType* rootData --+ | | + | +-- Node2::DataType* upperData + | + +-- RootType::DataType::Tile* tile + +*/ + +#ifndef NANOVDB_NANOVDB_H_HAS_BEEN_INCLUDED +#define NANOVDB_NANOVDB_H_HAS_BEEN_INCLUDED + +// The following two header files are the only mandatory dependencies +#include // for __hostdev__ and lots of other utility functions +#include // for Coord, BBox, Vec3, Vec4 etc + +// Do not change this value! 32 byte alignment is fixed in NanoVDB +#define NANOVDB_DATA_ALIGNMENT 32 + +// NANOVDB_MAGIC_NUMB is currently used for both grids and files (starting with v32.6.0) +// NANOVDB_MAGIC_GRID will soon be used exclusively for grids (serialized to a single buffer) +// NANOVDB_MAGIC_FILE will soon be used exclusively for files +// NANOVDB_MAGIC_NODE will soon be used exclusively for NodeManager +// NANOVDB_MAGIC_FRAG will soon be used exclusively for a fragmented grid, i.e. a grid that is not serialized +// | : 0 in 30 corresponds to 0 in NanoVDB0 +#define NANOVDB_MAGIC_NUMB 0x304244566f6e614eUL // "NanoVDB0" in hex - little endian (uint64_t) +#define NANOVDB_MAGIC_GRID 0x314244566f6e614eUL // "NanoVDB1" in hex - little endian (uint64_t) +#define NANOVDB_MAGIC_FILE 0x324244566f6e614eUL // "NanoVDB2" in hex - little endian (uint64_t) +#define NANOVDB_MAGIC_NODE 0x334244566f6e614eUL // "NanoVDB3" in hex - little endian (uint64_t) +#define NANOVDB_MAGIC_FRAG 0x344244566f6e614eUL // "NanoVDB4" in hex - little endian (uint64_t) +#define NANOVDB_MAGIC_MASK 0x00FFFFFFFFFFFFFFUL // use this mask to remove the number + +//#define NANOVDB_MAGIC_NUMBER 0x304244566f6e614eUL +//#define NANOVDB_USE_NEW_MAGIC_NUMBERS// used to enable use of the new magic numbers described above + +#define NANOVDB_MAJOR_VERSION_NUMBER 32 // reflects changes to the ABI and hence also the file format +#define NANOVDB_MINOR_VERSION_NUMBER 7 // reflects changes to the API but not ABI +#define NANOVDB_PATCH_VERSION_NUMBER 0 // reflects changes that does not affect the ABI or API + +#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1 + +// This replaces a Coord key at the root level with a single uint64_t +#define NANOVDB_USE_SINGLE_ROOT_KEY + +// This replaces three levels of Coord keys in the ReadAccessor with one Coord +//#define NANOVDB_USE_SINGLE_ACCESSOR_KEY + +// Use this to switch between std::ofstream or FILE implementations +//#define NANOVDB_USE_IOSTREAMS + +// Use this to switch between old and new accessor methods +#define NANOVDB_NEW_ACCESSOR_METHODS + +#define NANOVDB_FPN_BRANCHLESS + +#if !defined(NANOVDB_ALIGN) +#define NANOVDB_ALIGN(n) alignas(n) +#endif // !defined(NANOVDB_ALIGN) + +namespace nanovdb {// ================================================================= + +// --------------------------> Build types <------------------------------------ + +/// @brief Dummy type for a voxel whose value equals an offset into an external value array +class ValueIndex{}; + +/// @brief Dummy type for a voxel whose value equals an offset into an external value array of active values +class ValueOnIndex{}; + +/// @brief Like @c ValueIndex but with a mutable mask +class ValueIndexMask{}; + +/// @brief Like @c ValueOnIndex but with a mutable mask +class ValueOnIndexMask{}; + +/// @brief Dummy type for a voxel whose value equals its binary active state +class ValueMask{}; + +/// @brief Dummy type for a 16 bit floating point values 
(placeholder for IEEE 754 Half) +class Half{}; + +/// @brief Dummy type for a 4bit quantization of float point values +class Fp4{}; + +/// @brief Dummy type for a 8bit quantization of float point values +class Fp8{}; + +/// @brief Dummy type for a 16bit quantization of float point values +class Fp16{}; + +/// @brief Dummy type for a variable bit quantization of floating point values +class FpN{}; + +/// @brief Dummy type for indexing points into voxels +class Point{}; + +// --------------------------> GridType <------------------------------------ + +/// @brief return the number of characters (including null termination) required to convert enum type to a string +template +__hostdev__ inline constexpr uint32_t strlen(){return (uint32_t)EnumT::StrLen - (uint32_t)EnumT::End;} + +/// @brief List of types that are currently supported by NanoVDB +/// +/// @note To expand on this list do: +/// 1) Add the new type between Unknown and End in the enum below +/// 2) Add the new type to OpenToNanoVDB::processGrid that maps OpenVDB types to GridType +/// 3) Verify that the ConvertTrait in NanoToOpenVDB.h works correctly with the new type +/// 4) Add the new type to toGridType (defined below) that maps NanoVDB types to GridType +/// 5) Add the new type to toStr (defined below) +enum class GridType : uint32_t { Unknown = 0, // unknown value type - should rarely be used + Float = 1, // single precision floating point value + Double = 2, // double precision floating point value + Int16 = 3, // half precision signed integer value + Int32 = 4, // single precision signed integer value + Int64 = 5, // double precision signed integer value + Vec3f = 6, // single precision floating 3D vector + Vec3d = 7, // double precision floating 3D vector + Mask = 8, // no value, just the active state + Half = 9, // half precision floating point value (placeholder for IEEE 754 Half) + UInt32 = 10, // single precision unsigned integer value + Boolean = 11, // boolean value, encoded in bit array + RGBA8 = 12, // RGBA packed into 32bit word in reverse-order, i.e. R is lowest byte. 
+ Fp4 = 13, // 4bit quantization of floating point value + Fp8 = 14, // 8bit quantization of floating point value + Fp16 = 15, // 16bit quantization of floating point value + FpN = 16, // variable bit quantization of floating point value + Vec4f = 17, // single precision floating 4D vector + Vec4d = 18, // double precision floating 4D vector + Index = 19, // index into an external array of active and inactive values + OnIndex = 20, // index into an external array of active values + IndexMask = 21, // like Index but with a mutable mask + OnIndexMask = 22, // like OnIndex but with a mutable mask + PointIndex = 23, // voxels encode indices to co-located points + Vec3u8 = 24, // 8bit quantization of floating point 3D vector (only as blind data) + Vec3u16 = 25, // 16bit quantization of floating point 3D vector (only as blind data) + UInt8 = 26, // 8 bit unsigned integer values (eg 0 -> 255 gray scale) + End = 27,// total number of types in this enum (excluding StrLen since it's not a type) + StrLen = End + 12};// this entry is used to determine the minimum size of c-string + +/// @brief Maps a GridType to a c-string +/// @param dst destination string of size 12 or larger +/// @param gridType GridType enum to be mapped to a string +/// @return Retuns a c-string used to describe a GridType +__hostdev__ inline char* toStr(char *dst, GridType gridType) +{ + switch (gridType){ + case GridType::Unknown: return util::strcpy(dst, "?"); + case GridType::Float: return util::strcpy(dst, "float"); + case GridType::Double: return util::strcpy(dst, "double"); + case GridType::Int16: return util::strcpy(dst, "int16"); + case GridType::Int32: return util::strcpy(dst, "int32"); + case GridType::Int64: return util::strcpy(dst, "int64"); + case GridType::Vec3f: return util::strcpy(dst, "Vec3f"); + case GridType::Vec3d: return util::strcpy(dst, "Vec3d"); + case GridType::Mask: return util::strcpy(dst, "Mask"); + case GridType::Half: return util::strcpy(dst, "Half"); + case GridType::UInt32: return util::strcpy(dst, "uint32"); + case GridType::Boolean: return util::strcpy(dst, "bool"); + case GridType::RGBA8: return util::strcpy(dst, "RGBA8"); + case GridType::Fp4: return util::strcpy(dst, "Float4"); + case GridType::Fp8: return util::strcpy(dst, "Float8"); + case GridType::Fp16: return util::strcpy(dst, "Float16"); + case GridType::FpN: return util::strcpy(dst, "FloatN"); + case GridType::Vec4f: return util::strcpy(dst, "Vec4f"); + case GridType::Vec4d: return util::strcpy(dst, "Vec4d"); + case GridType::Index: return util::strcpy(dst, "Index"); + case GridType::OnIndex: return util::strcpy(dst, "OnIndex"); + case GridType::IndexMask: return util::strcpy(dst, "IndexMask"); + case GridType::OnIndexMask: return util::strcpy(dst, "OnIndexMask"); + case GridType::PointIndex: return util::strcpy(dst, "PointIndex"); + case GridType::Vec3u8: return util::strcpy(dst, "Vec3u8"); + case GridType::Vec3u16: return util::strcpy(dst, "Vec3u16"); + case GridType::UInt8: return util::strcpy(dst, "uint8"); + default: return util::strcpy(dst, "End"); + } +} + +// --------------------------> GridClass <------------------------------------ + +/// @brief Classes (superset of OpenVDB) that are currently supported by NanoVDB +enum class GridClass : uint32_t { Unknown = 0, + LevelSet = 1, // narrow band level set, e.g. SDF + FogVolume = 2, // fog volume, e.g. density + Staggered = 3, // staggered MAC grid, e.g. 
velocity + PointIndex = 4, // point index grid + PointData = 5, // point data grid + Topology = 6, // grid with active states only (no values) + VoxelVolume = 7, // volume of geometric cubes, e.g. colors cubes in Minecraft + IndexGrid = 8, // grid whose values are offsets, e.g. into an external array + TensorGrid = 9, // Index grid for indexing learnable tensor features + End = 10,// total number of types in this enum (excluding StrLen since it's not a type) + StrLen = End + 7};// this entry is used to determine the minimum size of c-string + + +/// @brief Retuns a c-string used to describe a GridClass +/// @param dst destination string of size 7 or larger +/// @param gridClass GridClass enum to be converted to a string +__hostdev__ inline char* toStr(char *dst, GridClass gridClass) +{ + switch (gridClass){ + case GridClass::Unknown: return util::strcpy(dst, "?"); + case GridClass::LevelSet: return util::strcpy(dst, "SDF"); + case GridClass::FogVolume: return util::strcpy(dst, "FOG"); + case GridClass::Staggered: return util::strcpy(dst, "MAC"); + case GridClass::PointIndex: return util::strcpy(dst, "PNTIDX"); + case GridClass::PointData: return util::strcpy(dst, "PNTDAT"); + case GridClass::Topology: return util::strcpy(dst, "TOPO"); + case GridClass::VoxelVolume: return util::strcpy(dst, "VOX"); + case GridClass::IndexGrid: return util::strcpy(dst, "INDEX"); + case GridClass::TensorGrid: return util::strcpy(dst, "TENSOR"); + default: return util::strcpy(dst, "END"); + } +} + +// --------------------------> GridFlags <------------------------------------ + +/// @brief Grid flags which indicate what extra information is present in the grid buffer. +enum class GridFlags : uint32_t { + HasLongGridName = 1 << 0, // grid name is longer than 256 characters + HasBBox = 1 << 1, // nodes contain bounding-boxes of active values + HasMinMax = 1 << 2, // nodes contain min/max of active values + HasAverage = 1 << 3, // nodes contain averages of active values + HasStdDeviation = 1 << 4, // nodes contain standard deviations of active values + IsBreadthFirst = 1 << 5, // nodes are typically arranged breadth-first in memory + End = 1 << 6, // use End - 1 as a mask for the 5 lower bit flags + StrLen = End + 23,// this entry is used to determine the minimum size of c-string +}; + +/// @brief Retuns a c-string used to describe a GridFlags +/// @param dst destination string of size 23 or larger +/// @param gridFlags GridFlags enum to be converted to a string +__hostdev__ inline const char* toStr(char *dst, GridFlags gridFlags) +{ + switch (gridFlags){ + case GridFlags::HasLongGridName: return util::strcpy(dst, "has long grid name"); + case GridFlags::HasBBox: return util::strcpy(dst, "has bbox"); + case GridFlags::HasMinMax: return util::strcpy(dst, "has min/max"); + case GridFlags::HasAverage: return util::strcpy(dst, "has average"); + case GridFlags::HasStdDeviation: return util::strcpy(dst, "has standard deviation"); + case GridFlags::IsBreadthFirst: return util::strcpy(dst, "is breadth-first"); + default: return util::strcpy(dst, "end"); + } +} + +// --------------------------> MagicType <------------------------------------ + +/// @brief Enums used to identify magic numbers recognized by NanoVDB +enum class MagicType : uint32_t { Unknown = 0,// first 64 bits are neither of the cases below + OpenVDB = 1,// first 32 bits = 0x56444220UL + NanoVDB = 2,// first 64 bits = NANOVDB_MAGIC_NUMB + NanoGrid = 3,// first 64 bits = NANOVDB_MAGIC_GRID + NanoFile = 4,// first 64 bits = NANOVDB_MAGIC_FILE + NanoNode = 
5,// first 64 bits = NANOVDB_MAGIC_NODE + NanoFrag = 6,// first 64 bits = NANOVDB_MAGIC_FRAG + End = 7, + StrLen = End + 25};// this entry is used to determine the minimum size of c-string + +/// @brief maps 64 bits of magic number to enum +__hostdev__ inline MagicType toMagic(uint64_t magic) +{ + switch (magic){ + case NANOVDB_MAGIC_NUMB: return MagicType::NanoVDB; + case NANOVDB_MAGIC_GRID: return MagicType::NanoGrid; + case NANOVDB_MAGIC_FILE: return MagicType::NanoFile; + case NANOVDB_MAGIC_NODE: return MagicType::NanoNode; + case NANOVDB_MAGIC_FRAG: return MagicType::NanoFrag; + default: return (magic & ~uint32_t(0)) == 0x56444220UL ? MagicType::OpenVDB : MagicType::Unknown; + } +} + +/// @brief print 64-bit magic number to string +/// @param dst destination string of size 25 or larger +/// @param magic 64 bit magic number to be printed +/// @return return destination string @c dst +__hostdev__ inline char* toStr(char *dst, MagicType magic) +{ + switch (magic){ + case MagicType::Unknown: return util::strcpy(dst, "unknown"); + case MagicType::NanoVDB: return util::strcpy(dst, "nanovdb"); + case MagicType::NanoGrid: return util::strcpy(dst, "nanovdb::Grid"); + case MagicType::NanoFile: return util::strcpy(dst, "nanovdb::File"); + case MagicType::NanoNode: return util::strcpy(dst, "nanovdb::NodeManager"); + case MagicType::NanoFrag: return util::strcpy(dst, "fragmented nanovdb::Grid"); + case MagicType::OpenVDB: return util::strcpy(dst, "openvdb"); + default: return util::strcpy(dst, "end"); + } +} + +// --------------------------> PointType enums <------------------------------------ + +// Define the type used when the points are encoded as blind data in the output grid +enum class PointType : uint32_t { Disable = 0,// no point information e.g. when BuildT != Point + PointID = 1,// linear index of type uint32_t to points + World64 = 2,// Vec3d in world space + World32 = 3,// Vec3f in world space + Grid64 = 4,// Vec3d in grid space + Grid32 = 5,// Vec3f in grid space + Voxel32 = 6,// Vec3f in voxel space + Voxel16 = 7,// Vec3u16 in voxel space + Voxel8 = 8,// Vec3u8 in voxel space + Default = 9,// output matches input, i.e. Vec3d or Vec3f in world space + End =10 }; + +// --------------------------> GridBlindData enums <------------------------------------ + +/// @brief Blind-data Classes that are currently supported by NanoVDB +enum class GridBlindDataClass : uint32_t { Unknown = 0, + IndexArray = 1, + AttributeArray = 2, + GridName = 3, + ChannelArray = 4, + End = 5 }; + +/// @brief Blind-data Semantics that are currently understood by NanoVDB +enum class GridBlindDataSemantic : uint32_t { Unknown = 0, + PointPosition = 1, // 3D coordinates in an unknown space + PointColor = 2, + PointNormal = 3, + PointRadius = 4, + PointVelocity = 5, + PointId = 6, + WorldCoords = 7, // 3D coordinates in world space, e.g. (0.056, 0.8, 1,8) + GridCoords = 8, // 3D coordinates in grid space, e.g. (1.2, 4.0, 5.7), aka index-space + VoxelCoords = 9, // 3D coordinates in voxel space, e.g. 
(0.2, 0.0, 0.7) + End = 10 }; + +// --------------------------> BuildTraits <------------------------------------ + +/// @brief Define static boolean tests for template build types +template +struct BuildTraits +{ + // check if T is an index type + static constexpr bool is_index = util::is_same::value; + static constexpr bool is_onindex = util::is_same::value; + static constexpr bool is_offindex = util::is_same::value; + static constexpr bool is_indexmask = util::is_same::value; + // check if T is a compressed float type with fixed bit precision + static constexpr bool is_FpX = util::is_same::value; + // check if T is a compressed float type with fixed or variable bit precision + static constexpr bool is_Fp = util::is_same::value; + // check if T is a POD float type, i.e float or double + static constexpr bool is_float = util::is_floating_point::value; + // check if T is a template specialization of LeafData, i.e. has T mValues[512] + static constexpr bool is_special = is_index || is_Fp || util::is_same::value; +}; // BuildTraits + +// --------------------------> BuildToValueMap <------------------------------------ + +/// @brief Maps one type (e.g. the build types above) to other (actual) types +template +struct BuildToValueMap +{ + using Type = T; + using type = T; +}; + +template<> +struct BuildToValueMap +{ + using Type = uint64_t; + using type = uint64_t; +}; + +template<> +struct BuildToValueMap +{ + using Type = uint64_t; + using type = uint64_t; +}; + +template<> +struct BuildToValueMap +{ + using Type = uint64_t; + using type = uint64_t; +}; + +template<> +struct BuildToValueMap +{ + using Type = uint64_t; + using type = uint64_t; +}; + +template<> +struct BuildToValueMap +{ + using Type = bool; + using type = bool; +}; + +template<> +struct BuildToValueMap +{ + using Type = float; + using type = float; +}; + +template<> +struct BuildToValueMap +{ + using Type = float; + using type = float; +}; + +template<> +struct BuildToValueMap +{ + using Type = float; + using type = float; +}; + +template<> +struct BuildToValueMap +{ + using Type = float; + using type = float; +}; + +template<> +struct BuildToValueMap +{ + using Type = float; + using type = float; +}; + +template<> +struct BuildToValueMap +{ + using Type = uint64_t; + using type = uint64_t; +}; + +// --------------------------> utility functions related to alignment <------------------------------------ + +/// @brief return true if the specified pointer is 32 byte aligned +__hostdev__ inline static bool isAligned(const void* p){return uint64_t(p) % NANOVDB_DATA_ALIGNMENT == 0;} + +/// @brief return the smallest number of bytes that when added to the specified pointer results in a 32 byte aligned pointer. +__hostdev__ inline static uint64_t alignmentPadding(const void* p) +{ + NANOVDB_ASSERT(p); + return (NANOVDB_DATA_ALIGNMENT - (uint64_t(p) % NANOVDB_DATA_ALIGNMENT)) % NANOVDB_DATA_ALIGNMENT; +} + +/// @brief offset the specified pointer so it is 32 byte aligned. Works with both const and non-const pointers. 
+template +__hostdev__ inline static T* alignPtr(T* p){return util::PtrAdd(p, alignmentPadding(p));} + +// --------------------------> isFloatingPoint(GridType) <------------------------------------ + +/// @brief return true if the GridType maps to a floating point type +__hostdev__ inline bool isFloatingPoint(GridType gridType) +{ + return gridType == GridType::Float || + gridType == GridType::Double || + gridType == GridType::Half || + gridType == GridType::Fp4 || + gridType == GridType::Fp8 || + gridType == GridType::Fp16 || + gridType == GridType::FpN; +} + +// --------------------------> isFloatingPointVector(GridType) <------------------------------------ + +/// @brief return true if the GridType maps to a floating point vec3. +__hostdev__ inline bool isFloatingPointVector(GridType gridType) +{ + return gridType == GridType::Vec3f || + gridType == GridType::Vec3d || + gridType == GridType::Vec4f || + gridType == GridType::Vec4d; +} + +// --------------------------> isInteger(GridType) <------------------------------------ + +/// @brief Return true if the GridType maps to a POD integer type. +/// @details These types are used to associate a voxel with a POD integer type +__hostdev__ inline bool isInteger(GridType gridType) +{ + return gridType == GridType::Int16 || + gridType == GridType::Int32 || + gridType == GridType::Int64 || + gridType == GridType::UInt32|| + gridType == GridType::UInt8; +} + +// --------------------------> isIndex(GridType) <------------------------------------ + +/// @brief Return true if the GridType maps to a special index type (not a POD integer type). +/// @details These types are used to index from a voxel into an external array of values, e.g. sidecar or blind data. +__hostdev__ inline bool isIndex(GridType gridType) +{ + return gridType == GridType::Index ||// index both active and inactive values + gridType == GridType::OnIndex ||// index active values only + gridType == GridType::IndexMask ||// as Index, but with an additional mask + gridType == GridType::OnIndexMask;// as OnIndex, but with an additional mask +} + +// --------------------------> isValue(GridType, GridClass) <------------------------------------ + +/// @brief return true if the combination of GridType and GridClass is valid. +__hostdev__ inline bool isValid(GridType gridType, GridClass gridClass) +{ + if (gridClass == GridClass::LevelSet || gridClass == GridClass::FogVolume) { + return isFloatingPoint(gridType); + } else if (gridClass == GridClass::Staggered) { + return isFloatingPointVector(gridType); + } else if (gridClass == GridClass::PointIndex || gridClass == GridClass::PointData) { + return gridType == GridType::PointIndex || gridType == GridType::UInt32; + } else if (gridClass == GridClass::Topology) { + return gridType == GridType::Mask; + } else if (gridClass == GridClass::IndexGrid) { + return isIndex(gridType); + } else if (gridClass == GridClass::VoxelVolume) { + return gridType == GridType::RGBA8 || gridType == GridType::Float || + gridType == GridType::Double || gridType == GridType::Vec3f || + gridType == GridType::Vec3d || gridType == GridType::UInt32 || + gridType == GridType::UInt8; + } + return gridClass < GridClass::End && gridType < GridType::End; // any valid combination +} + +// --------------------------> validation of blind data meta data <------------------------------------ + +/// @brief return true if the combination of GridBlindDataClass, GridBlindDataSemantic and GridType is valid. 
+__hostdev__ inline bool isValid(const GridBlindDataClass& blindClass, + const GridBlindDataSemantic& blindSemantics, + const GridType& blindType) +{ + bool test = false; + switch (blindClass) { + case GridBlindDataClass::IndexArray: + test = (blindSemantics == GridBlindDataSemantic::Unknown || + blindSemantics == GridBlindDataSemantic::PointId) && + isInteger(blindType); + break; + case GridBlindDataClass::AttributeArray: + if (blindSemantics == GridBlindDataSemantic::PointPosition || + blindSemantics == GridBlindDataSemantic::WorldCoords) { + test = blindType == GridType::Vec3f || blindType == GridType::Vec3d; + } else if (blindSemantics == GridBlindDataSemantic::GridCoords) { + test = blindType == GridType::Vec3f; + } else if (blindSemantics == GridBlindDataSemantic::VoxelCoords) { + test = blindType == GridType::Vec3f || blindType == GridType::Vec3u8 || blindType == GridType::Vec3u16; + } else { + test = blindSemantics != GridBlindDataSemantic::PointId; + } + break; + case GridBlindDataClass::GridName: + test = blindSemantics == GridBlindDataSemantic::Unknown && blindType == GridType::Unknown; + break; + default: // captures blindClass == Unknown and ChannelArray + test = blindClass < GridBlindDataClass::End && + blindSemantics < GridBlindDataSemantic::End && + blindType < GridType::End; // any valid combination + break; + } + //if (!test) printf("Invalid combination: GridBlindDataClass=%u, GridBlindDataSemantic=%u, GridType=%u\n",(uint32_t)blindClass, (uint32_t)blindSemantics, (uint32_t)blindType); + return test; +} + +// ----------------------------> Version class <------------------------------------- + +/// @brief Bit-compacted representation of all three version numbers +/// +/// @details major is the top 11 bits, minor is the 11 middle bits and patch is the lower 10 bits +class Version +{ + uint32_t mData; // 11 + 11 + 10 bit packing of major + minor + patch +public: + static constexpr uint32_t End = 0, StrLen = 8;// for strlen() + /// @brief Default constructor + __hostdev__ Version() + : mData(uint32_t(NANOVDB_MAJOR_VERSION_NUMBER) << 21 | + uint32_t(NANOVDB_MINOR_VERSION_NUMBER) << 10 | + uint32_t(NANOVDB_PATCH_VERSION_NUMBER)) + { + } + /// @brief Constructor from a raw uint32_t data representation + __hostdev__ Version(uint32_t data) : mData(data) {} + /// @brief Constructor from major.minor.patch version numbers + __hostdev__ Version(uint32_t major, uint32_t minor, uint32_t patch) + : mData(major << 21 | minor << 10 | patch) + { + NANOVDB_ASSERT(major < (1u << 11)); // max value of major is 2047 + NANOVDB_ASSERT(minor < (1u << 11)); // max value of minor is 2047 + NANOVDB_ASSERT(patch < (1u << 10)); // max value of patch is 1023 + } + __hostdev__ bool operator==(const Version& rhs) const { return mData == rhs.mData; } + __hostdev__ bool operator<( const Version& rhs) const { return mData < rhs.mData; } + __hostdev__ bool operator<=(const Version& rhs) const { return mData <= rhs.mData; } + __hostdev__ bool operator>( const Version& rhs) const { return mData > rhs.mData; } + __hostdev__ bool operator>=(const Version& rhs) const { return mData >= rhs.mData; } + __hostdev__ uint32_t id() const { return mData; } + __hostdev__ uint32_t getMajor() const { return (mData >> 21) & ((1u << 11) - 1); } + __hostdev__ uint32_t getMinor() const { return (mData >> 10) & ((1u << 11) - 1); } + __hostdev__ uint32_t getPatch() const { return mData & ((1u << 10) - 1); } + __hostdev__ bool isCompatible() const { return this->getMajor() == uint32_t(NANOVDB_MAJOR_VERSION_NUMBER); } + /// @brief 
Returns the difference between major version of this instance and NANOVDB_MAJOR_VERSION_NUMBER + /// @return return 0 if the major version equals NANOVDB_MAJOR_VERSION_NUMBER, else a negative age if this + /// instance has a smaller major verion (is older), and a positive age if it is newer, i.e. larger. + __hostdev__ int age() const {return int(this->getMajor()) - int(NANOVDB_MAJOR_VERSION_NUMBER);} +}; // Version + +/// @brief print the verion number to a c-string +/// @param dst destination string of size 8 or more +/// @param v version to be printed +/// @return returns destination string @c dst +__hostdev__ inline char* toStr(char *dst, const Version &v) +{ + return util::sprint(dst, v.getMajor(), ".",v.getMinor(), ".",v.getPatch()); +} + +// ----------------------------> TensorTraits <-------------------------------------- + +template::value || util::is_specialization::value || util::is_same::value) ? 1 : 0> +struct TensorTraits; + +template +struct TensorTraits +{ + static const int Rank = 0; // i.e. scalar + static const bool IsScalar = true; + static const bool IsVector = false; + static const int Size = 1; + using ElementType = T; + static T scalar(const T& s) { return s; } +}; + +template +struct TensorTraits +{ + static const int Rank = 1; // i.e. vector + static const bool IsScalar = false; + static const bool IsVector = true; + static const int Size = T::SIZE; + using ElementType = typename T::ValueType; + static ElementType scalar(const T& v) { return v.length(); } +}; + +// ----------------------------> FloatTraits <-------------------------------------- + +template::ElementType)> +struct FloatTraits +{ + using FloatType = float; +}; + +template +struct FloatTraits +{ + using FloatType = double; +}; + +template<> +struct FloatTraits +{ + using FloatType = bool; +}; + +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = uint64_t; +}; + +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = uint64_t; +}; + +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = uint64_t; +}; + +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = uint64_t; +}; + +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = bool; +}; + +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = double; +}; + +// ----------------------------> mapping BuildType -> GridType <-------------------------------------- + +/// @brief Maps from a templated build type to a GridType enum +template +__hostdev__ inline GridType toGridType() +{ + if constexpr(util::is_same::value) { // resolved at compile-time + return GridType::Float; + } else if constexpr(util::is_same::value) { + return GridType::Double; + } else if constexpr(util::is_same::value) { + return GridType::Int16; + } else if constexpr(util::is_same::value) { + return GridType::Int32; + } else if constexpr(util::is_same::value) { + return GridType::Int64; + } else if constexpr(util::is_same::value) { + return GridType::Vec3f; + } else if constexpr(util::is_same::value) { + return GridType::Vec3d; + } else if constexpr(util::is_same::value) { + return GridType::UInt32; + } else if constexpr(util::is_same::value) { + return GridType::Mask; + } else if constexpr(util::is_same::value) { + return GridType::Half; + } else if 
constexpr(util::is_same::value) { + return GridType::Index; + } else if constexpr(util::is_same::value) { + return GridType::OnIndex; + } else if constexpr(util::is_same::value) { + return GridType::IndexMask; + } else if constexpr(util::is_same::value) { + return GridType::OnIndexMask; + } else if constexpr(util::is_same::value) { + return GridType::Boolean; + } else if constexpr(util::is_same::value) { + return GridType::RGBA8; + } else if constexpr(util::is_same::value) { + return GridType::Fp4; + } else if constexpr(util::is_same::value) { + return GridType::Fp8; + } else if constexpr(util::is_same::value) { + return GridType::Fp16; + } else if constexpr(util::is_same::value) { + return GridType::FpN; + } else if constexpr(util::is_same::value) { + return GridType::Vec4f; + } else if constexpr(util::is_same::value) { + return GridType::Vec4d; + } else if constexpr(util::is_same::value) { + return GridType::PointIndex; + } else if constexpr(util::is_same::value) { + return GridType::Vec3u8; + } else if constexpr(util::is_same::value) { + return GridType::Vec3u16; + } else if constexpr(util::is_same::value) { + return GridType::UInt8; + } + return GridType::Unknown; +}// toGridType + +template +[[deprecated("Use toGridType() instead.")]] +__hostdev__ inline GridType mapToGridType(){return toGridType();} + +// ----------------------------> mapping BuildType -> GridClass <-------------------------------------- + +/// @brief Maps from a templated build type to a GridClass enum +template +__hostdev__ inline GridClass toGridClass(GridClass defaultClass = GridClass::Unknown) +{ + if constexpr(util::is_same::value) { + return GridClass::Topology; + } else if constexpr(BuildTraits::is_index) { + return GridClass::IndexGrid; + } else if constexpr(util::is_same::value) { + return GridClass::VoxelVolume; + } else if constexpr(util::is_same::value) { + return GridClass::PointIndex; + } + return defaultClass; +} + +template +[[deprecated("Use toGridClass() instead.")]] +__hostdev__ inline GridClass mapToGridClass(GridClass defaultClass = GridClass::Unknown) +{ + return toGridClass(); +} + +// ----------------------------> BitFlags <-------------------------------------- + +template +struct BitArray; +template<> +struct BitArray<8> +{ + uint8_t mFlags{0}; +}; +template<> +struct BitArray<16> +{ + uint16_t mFlags{0}; +}; +template<> +struct BitArray<32> +{ + uint32_t mFlags{0}; +}; +template<> +struct BitArray<64> +{ + uint64_t mFlags{0}; +}; + +template +class BitFlags : public BitArray +{ +protected: + using BitArray::mFlags; + +public: + using Type = decltype(mFlags); + BitFlags() {} + BitFlags(Type mask) : BitArray{mask} {} + BitFlags(std::initializer_list list) + { + for (auto bit : list) mFlags |= static_cast(1 << bit); + } + template + BitFlags(std::initializer_list list) + { + for (auto mask : list) mFlags |= static_cast(mask); + } + __hostdev__ Type data() const { return mFlags; } + __hostdev__ Type& data() { return mFlags; } + __hostdev__ void initBit(std::initializer_list list) + { + mFlags = 0u; + for (auto bit : list) mFlags |= static_cast(1 << bit); + } + template + __hostdev__ void initMask(std::initializer_list list) + { + mFlags = 0u; + for (auto mask : list) mFlags |= static_cast(mask); + } + //__hostdev__ Type& data() { return mFlags; } + //__hostdev__ Type data() const { return mFlags; } + __hostdev__ Type getFlags() const { return mFlags & (static_cast(GridFlags::End) - 1u); } // mask out everything except relevant bits + + __hostdev__ void setOn() { mFlags = ~Type(0u); } + 
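+
+    // Illustrative sketch (assumed usage) of the mask helpers in this class, here combined
+    // with the GridFlags enum defined earlier in this file:
+    // @code
+    //     BitFlags<32> flags;
+    //     flags.setMaskOn(GridFlags::HasBBox);                            // turn a single mask on
+    //     flags.setMaskOn({GridFlags::HasMinMax, GridFlags::HasAverage}); // or several at once
+    //     const bool hasBBox = flags.isMaskOn(GridFlags::HasBBox);
+    // @endcode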
__hostdev__ void setOff() { mFlags = Type(0u); } + + __hostdev__ void setBitOn(uint8_t bit) { mFlags |= static_cast(1 << bit); } + __hostdev__ void setBitOff(uint8_t bit) { mFlags &= ~static_cast(1 << bit); } + + __hostdev__ void setBitOn(std::initializer_list list) + { + for (auto bit : list) mFlags |= static_cast(1 << bit); + } + __hostdev__ void setBitOff(std::initializer_list list) + { + for (auto bit : list) mFlags &= ~static_cast(1 << bit); + } + + template + __hostdev__ void setMaskOn(MaskT mask) { mFlags |= static_cast(mask); } + template + __hostdev__ void setMaskOff(MaskT mask) { mFlags &= ~static_cast(mask); } + + template + __hostdev__ void setMaskOn(std::initializer_list list) + { + for (auto mask : list) mFlags |= static_cast(mask); + } + template + __hostdev__ void setMaskOff(std::initializer_list list) + { + for (auto mask : list) mFlags &= ~static_cast(mask); + } + + __hostdev__ void setBit(uint8_t bit, bool on) { on ? this->setBitOn(bit) : this->setBitOff(bit); } + template + __hostdev__ void setMask(MaskT mask, bool on) { on ? this->setMaskOn(mask) : this->setMaskOff(mask); } + + __hostdev__ bool isOn() const { return mFlags == ~Type(0u); } + __hostdev__ bool isOff() const { return mFlags == Type(0u); } + __hostdev__ bool isBitOn(uint8_t bit) const { return 0 != (mFlags & static_cast(1 << bit)); } + __hostdev__ bool isBitOff(uint8_t bit) const { return 0 == (mFlags & static_cast(1 << bit)); } + template + __hostdev__ bool isMaskOn(MaskT mask) const { return 0 != (mFlags & static_cast(mask)); } + template + __hostdev__ bool isMaskOff(MaskT mask) const { return 0 == (mFlags & static_cast(mask)); } + /// @brief return true if any of the masks in the list are on + template + __hostdev__ bool isMaskOn(std::initializer_list list) const + { + for (auto mask : list) { + if (0 != (mFlags & static_cast(mask))) return true; + } + return false; + } + /// @brief return true if any of the masks in the list are off + template + __hostdev__ bool isMaskOff(std::initializer_list list) const + { + for (auto mask : list) { + if (0 == (mFlags & static_cast(mask))) return true; + } + return false; + } + /// @brief required for backwards compatibility + __hostdev__ BitFlags& operator=(Type n) + { + mFlags = n; + return *this; + } +}; // BitFlags + +// ----------------------------> Mask <-------------------------------------- + +/// @brief Bit-mask to encode active states and facilitate sequential iterators +/// and a fast codec for I/O compression. 
+template +class Mask +{ +public: + static constexpr uint32_t SIZE = 1U << (3 * LOG2DIM); // Number of bits in mask + static constexpr uint32_t WORD_COUNT = SIZE >> 6; // Number of 64 bit words + + /// @brief Return the memory footprint in bytes of this Mask + __hostdev__ static size_t memUsage() { return sizeof(Mask); } + + /// @brief Return the number of bits available in this Mask + __hostdev__ static uint32_t bitCount() { return SIZE; } + + /// @brief Return the number of machine words used by this Mask + __hostdev__ static uint32_t wordCount() { return WORD_COUNT; } + + /// @brief Return the total number of set bits in this Mask + __hostdev__ uint32_t countOn() const + { + uint32_t sum = 0; + for (const uint64_t *w = mWords, *q = w + WORD_COUNT; w != q; ++w) + sum += util::countOn(*w); + return sum; + } + + /// @brief Return the number of lower set bits in mask up to but excluding the i'th bit + inline __hostdev__ uint32_t countOn(uint32_t i) const + { + uint32_t n = i >> 6, sum = util::countOn(mWords[n] & ((uint64_t(1) << (i & 63u)) - 1u)); + for (const uint64_t* w = mWords; n--; ++w) + sum += util::countOn(*w); + return sum; + } + + template + class Iterator + { + public: + __hostdev__ Iterator() + : mPos(Mask::SIZE) + , mParent(nullptr) + { + } + __hostdev__ Iterator(uint32_t pos, const Mask* parent) + : mPos(pos) + , mParent(parent) + { + } + Iterator& operator=(const Iterator&) = default; + __hostdev__ uint32_t operator*() const { return mPos; } + __hostdev__ uint32_t pos() const { return mPos; } + __hostdev__ operator bool() const { return mPos != Mask::SIZE; } + __hostdev__ Iterator& operator++() + { + mPos = mParent->findNext(mPos + 1); + return *this; + } + __hostdev__ Iterator operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + + private: + uint32_t mPos; + const Mask* mParent; + }; // Member class Iterator + + class DenseIterator + { + public: + __hostdev__ DenseIterator(uint32_t pos = Mask::SIZE) + : mPos(pos) + { + } + DenseIterator& operator=(const DenseIterator&) = default; + __hostdev__ uint32_t operator*() const { return mPos; } + __hostdev__ uint32_t pos() const { return mPos; } + __hostdev__ operator bool() const { return mPos != Mask::SIZE; } + __hostdev__ DenseIterator& operator++() + { + ++mPos; + return *this; + } + __hostdev__ DenseIterator operator++(int) + { + auto tmp = *this; + ++mPos; + return tmp; + } + + private: + uint32_t mPos; + }; // Member class DenseIterator + + using OnIterator = Iterator; + using OffIterator = Iterator; + + __hostdev__ OnIterator beginOn() const { return OnIterator(this->findFirst(), this); } + + __hostdev__ OffIterator beginOff() const { return OffIterator(this->findFirst(), this); } + + __hostdev__ DenseIterator beginAll() const { return DenseIterator(0); } + + /// @brief Initialize all bits to zero. + __hostdev__ Mask() + { + for (uint32_t i = 0; i < WORD_COUNT; ++i) + mWords[i] = 0; + } + __hostdev__ Mask(bool on) + { + const uint64_t v = on ? 
~uint64_t(0) : uint64_t(0); + for (uint32_t i = 0; i < WORD_COUNT; ++i) + mWords[i] = v; + } + + /// @brief Copy constructor + __hostdev__ Mask(const Mask& other) + { + for (uint32_t i = 0; i < WORD_COUNT; ++i) + mWords[i] = other.mWords[i]; + } + + /// @brief Return a pointer to the list of words of the bit mask + __hostdev__ uint64_t* words() { return mWords; } + __hostdev__ const uint64_t* words() const { return mWords; } + + /// @brief Assignment operator that works with openvdb::util::NodeMask + template + __hostdev__ typename util::enable_if::value, Mask&>::type operator=(const MaskT& other) + { + static_assert(sizeof(Mask) == sizeof(MaskT), "Mismatching sizeof"); + static_assert(WORD_COUNT == MaskT::WORD_COUNT, "Mismatching word count"); + static_assert(LOG2DIM == MaskT::LOG2DIM, "Mismatching LOG2DIM"); + auto* src = reinterpret_cast(&other); + for (uint64_t *dst = mWords, *end = dst + WORD_COUNT; dst != end; ++dst) + *dst = *src++; + return *this; + } + + //__hostdev__ Mask& operator=(const Mask& other){return *util::memcpy(this, &other);} + Mask& operator=(const Mask&) = default; + + __hostdev__ bool operator==(const Mask& other) const + { + for (uint32_t i = 0; i < WORD_COUNT; ++i) { + if (mWords[i] != other.mWords[i]) + return false; + } + return true; + } + + __hostdev__ bool operator!=(const Mask& other) const { return !((*this) == other); } + + /// @brief Return true if the given bit is set. + __hostdev__ bool isOn(uint32_t n) const { return 0 != (mWords[n >> 6] & (uint64_t(1) << (n & 63))); } + + /// @brief Return true if the given bit is NOT set. + __hostdev__ bool isOff(uint32_t n) const { return 0 == (mWords[n >> 6] & (uint64_t(1) << (n & 63))); } + + /// @brief Return true if all the bits are set in this Mask. + __hostdev__ bool isOn() const + { + for (uint32_t i = 0; i < WORD_COUNT; ++i) + if (mWords[i] != ~uint64_t(0)) + return false; + return true; + } + + /// @brief Return true if none of the bits are set in this Mask. + __hostdev__ bool isOff() const + { + for (uint32_t i = 0; i < WORD_COUNT; ++i) + if (mWords[i] != uint64_t(0)) + return false; + return true; + } + + /// @brief Set the specified bit on. + __hostdev__ void setOn(uint32_t n) { mWords[n >> 6] |= uint64_t(1) << (n & 63); } + /// @brief Set the specified bit off. + __hostdev__ void setOff(uint32_t n) { mWords[n >> 6] &= ~(uint64_t(1) << (n & 63)); } + +#if defined(__CUDACC__) // the following functions only run on the GPU! + __device__ inline void setOnAtomic(uint32_t n) + { + atomicOr(reinterpret_cast(this) + (n >> 6), 1ull << (n & 63)); + } + __device__ inline void setOffAtomic(uint32_t n) + { + atomicAnd(reinterpret_cast(this) + (n >> 6), ~(1ull << (n & 63))); + } + __device__ inline void setAtomic(uint32_t n, bool on) + { + on ? this->setOnAtomic(n) : this->setOffAtomic(n); + } +#endif + /// @brief Set the specified bit on or off. + __hostdev__ void set(uint32_t n, bool on) + { +#if 1 // switch between branchless + auto& word = mWords[n >> 6]; + n &= 63; + word &= ~(uint64_t(1) << n); + word |= uint64_t(on) << n; +#else + on ? this->setOn(n) : this->setOff(n); +#endif + } + + /// @brief Set all bits on + __hostdev__ void setOn() + { + for (uint32_t i = 0; i < WORD_COUNT; ++i)mWords[i] = ~uint64_t(0); + } + + /// @brief Set all bits off + __hostdev__ void setOff() + { + for (uint32_t i = 0; i < WORD_COUNT; ++i) mWords[i] = uint64_t(0); + } + + /// @brief Set all bits off + __hostdev__ void set(bool on) + { + const uint64_t v = on ? 
~uint64_t(0) : uint64_t(0); + for (uint32_t i = 0; i < WORD_COUNT; ++i) mWords[i] = v; + } + /// brief Toggle the state of all bits in the mask + __hostdev__ void toggle() + { + uint32_t n = WORD_COUNT; + for (auto* w = mWords; n--; ++w) *w = ~*w; + } + __hostdev__ void toggle(uint32_t n) { mWords[n >> 6] ^= uint64_t(1) << (n & 63); } + + /// @brief Bitwise intersection + __hostdev__ Mask& operator&=(const Mask& other) + { + uint64_t* w1 = mWords; + const uint64_t* w2 = other.mWords; + for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 &= *w2; + return *this; + } + /// @brief Bitwise union + __hostdev__ Mask& operator|=(const Mask& other) + { + uint64_t* w1 = mWords; + const uint64_t* w2 = other.mWords; + for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 |= *w2; + return *this; + } + /// @brief Bitwise difference + __hostdev__ Mask& operator-=(const Mask& other) + { + uint64_t* w1 = mWords; + const uint64_t* w2 = other.mWords; + for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 &= ~*w2; + return *this; + } + /// @brief Bitwise XOR + __hostdev__ Mask& operator^=(const Mask& other) + { + uint64_t* w1 = mWords; + const uint64_t* w2 = other.mWords; + for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 ^= *w2; + return *this; + } + + NANOVDB_HOSTDEV_DISABLE_WARNING + template + __hostdev__ uint32_t findFirst() const + { + uint32_t n = 0u; + const uint64_t* w = mWords; + for (; n < WORD_COUNT && !(ON ? *w : ~*w); ++w, ++n); + return n < WORD_COUNT ? (n << 6) + util::findLowestOn(ON ? *w : ~*w) : SIZE; + } + + NANOVDB_HOSTDEV_DISABLE_WARNING + template + __hostdev__ uint32_t findNext(uint32_t start) const + { + uint32_t n = start >> 6; // initiate + if (n >= WORD_COUNT) return SIZE; // check for out of bounds + uint32_t m = start & 63u; + uint64_t b = ON ? mWords[n] : ~mWords[n]; + if (b & (uint64_t(1u) << m)) return start; // simple case: start is on/off + b &= ~uint64_t(0u) << m; // mask out lower bits + while (!b && ++n < WORD_COUNT) b = ON ? mWords[n] : ~mWords[n]; // find next non-zero word + return b ? (n << 6) + util::findLowestOn(b) : SIZE; // catch last word=0 + } + + NANOVDB_HOSTDEV_DISABLE_WARNING + template + __hostdev__ uint32_t findPrev(uint32_t start) const + { + uint32_t n = start >> 6; // initiate + if (n >= WORD_COUNT) return SIZE; // check for out of bounds + uint32_t m = start & 63u; + uint64_t b = ON ? mWords[n] : ~mWords[n]; + if (b & (uint64_t(1u) << m)) return start; // simple case: start is on/off + b &= (uint64_t(1u) << m) - 1u; // mask out higher bits + while (!b && n) b = ON ? mWords[--n] : ~mWords[--n]; // find previous non-zero word + return b ? (n << 6) + util::findHighestOn(b) : SIZE; // catch first word=0 + } + +private: + uint64_t mWords[WORD_COUNT]; +}; // Mask class + +// ----------------------------> Map <-------------------------------------- + +/// @brief Defines an affine transform and its inverse represented as a 3x3 matrix and a vec3 translation +struct Map +{ // 264B (not 32B aligned!) 
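// Editorial sketch (not part of this patch): the Mask<LOG2DIM> class that closes above packs
// SIZE = 2^(3*LOG2DIM) bits into SIZE/64 words of type uint64_t, and every accessor splits a
// bit index n into a word index (n >> 6) and a bit position (n & 63). The toy_* names below are
// hypothetical, assume uint32_t/uint64_t from <cstdint> as used throughout this header, and
// would live at namespace scope rather than inside Map.
inline bool toy_mask_demo()
{
    constexpr uint32_t LOG2DIM = 3;                  // leaf configuration: 8x8x8 = 512 bits
    constexpr uint32_t SIZE = 1u << (3 * LOG2DIM);   // 512
    uint64_t words[SIZE >> 6] = {};                  // 8 x 64-bit words, all bits off
    const uint32_t n = 200;                          // lives in word 200>>6 = 3, bit 200&63 = 8
    words[n >> 6] |= uint64_t(1) << (n & 63);                                      // Mask::setOn(n)
    const bool on  = 0 != (words[n >> 6] & (uint64_t(1) << (n & 63)));             // Mask::isOn(n)
    const bool off = 0 == (words[(n + 1) >> 6] & (uint64_t(1) << ((n + 1) & 63))); // Mask::isOff(n+1)
    return on && off;                                // true
}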
+ float mMatF[9]; // 9*4B <- 3x3 matrix + float mInvMatF[9]; // 9*4B <- 3x3 matrix + float mVecF[3]; // 3*4B <- translation + float mTaperF; // 4B, placeholder for taper value + double mMatD[9]; // 9*8B <- 3x3 matrix + double mInvMatD[9]; // 9*8B <- 3x3 matrix + double mVecD[3]; // 3*8B <- translation + double mTaperD; // 8B, placeholder for taper value + + /// @brief Default constructor for the identity map + __hostdev__ Map() + : mMatF{ 1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f} + , mInvMatF{1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f} + , mVecF{0.0f, 0.0f, 0.0f} + , mTaperF{1.0f} + , mMatD{ 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0} + , mInvMatD{1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0} + , mVecD{0.0, 0.0, 0.0} + , mTaperD{1.0} + { + } + __hostdev__ Map(double s, const Vec3d& t = Vec3d(0.0, 0.0, 0.0)) + : mMatF{float(s), 0.0f, 0.0f, 0.0f, float(s), 0.0f, 0.0f, 0.0f, float(s)} + , mInvMatF{1.0f / float(s), 0.0f, 0.0f, 0.0f, 1.0f / float(s), 0.0f, 0.0f, 0.0f, 1.0f / float(s)} + , mVecF{float(t[0]), float(t[1]), float(t[2])} + , mTaperF{1.0f} + , mMatD{s, 0.0, 0.0, 0.0, s, 0.0, 0.0, 0.0, s} + , mInvMatD{1.0 / s, 0.0, 0.0, 0.0, 1.0 / s, 0.0, 0.0, 0.0, 1.0 / s} + , mVecD{t[0], t[1], t[2]} + , mTaperD{1.0} + { + } + + /// @brief Initialize the member data from 3x3 or 4x4 matrices + /// @note This is not _hostdev__ since then MatT=openvdb::Mat4d will produce warnings + template + void set(const MatT& mat, const MatT& invMat, const Vec3T& translate, double taper = 1.0); + + /// @brief Initialize the member data from 4x4 matrices + /// @note The last (4th) row of invMat is actually ignored. + /// This is not _hostdev__ since then Mat4T=openvdb::Mat4d will produce warnings + template + void set(const Mat4T& mat, const Mat4T& invMat, double taper = 1.0) { this->set(mat, invMat, mat[3], taper); } + + template + void set(double scale, const Vec3T& translation, double taper = 1.0); + + /// @brief Apply the forward affine transformation to a vector using 64bit floating point arithmetics. + /// @note Typically this operation is used for the scale, rotation and translation of index -> world mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return Forward mapping for affine transformation, i.e. (mat x ijk) + translation + template + __hostdev__ Vec3T applyMap(const Vec3T& ijk) const { return math::matMult(mMatD, mVecD, ijk); } + + /// @brief Apply the forward affine transformation to a vector using 32bit floating point arithmetics. + /// @note Typically this operation is used for the scale, rotation and translation of index -> world mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return Forward mapping for affine transformation, i.e. (mat x ijk) + translation + template + __hostdev__ Vec3T applyMapF(const Vec3T& ijk) const { return math::matMult(mMatF, mVecF, ijk); } + + /// @brief Apply the linear forward 3x3 transformation to an input 3d vector using 64bit floating point arithmetics, + /// e.g. scale and rotation WITHOUT translation. 
+ /// @note Typically this operation is used for scale and rotation from index -> world mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return linear forward 3x3 mapping of the input vector + template + __hostdev__ Vec3T applyJacobian(const Vec3T& ijk) const { return math::matMult(mMatD, ijk); } + + /// @brief Apply the linear forward 3x3 transformation to an input 3d vector using 32bit floating point arithmetics, + /// e.g. scale and rotation WITHOUT translation. + /// @note Typically this operation is used for scale and rotation from index -> world mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return linear forward 3x3 mapping of the input vector + template + __hostdev__ Vec3T applyJacobianF(const Vec3T& ijk) const { return math::matMult(mMatF, ijk); } + + /// @brief Apply the inverse affine mapping to a vector using 64bit floating point arithmetics. + /// @note Typically this operation is used for the world -> index mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param xyz 3D vector to be mapped - typically floating point world coordinates + /// @return Inverse affine mapping of the input @c xyz i.e. (xyz - translation) x mat^-1 + template + __hostdev__ Vec3T applyInverseMap(const Vec3T& xyz) const + { + return math::matMult(mInvMatD, Vec3T(xyz[0] - mVecD[0], xyz[1] - mVecD[1], xyz[2] - mVecD[2])); + } + + /// @brief Apply the inverse affine mapping to a vector using 32bit floating point arithmetics. + /// @note Typically this operation is used for the world -> index mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param xyz 3D vector to be mapped - typically floating point world coordinates + /// @return Inverse affine mapping of the input @c xyz i.e. (xyz - translation) x mat^-1 + template + __hostdev__ Vec3T applyInverseMapF(const Vec3T& xyz) const + { + return math::matMult(mInvMatF, Vec3T(xyz[0] - mVecF[0], xyz[1] - mVecF[1], xyz[2] - mVecF[2])); + } + + /// @brief Apply the linear inverse 3x3 transformation to an input 3d vector using 64bit floating point arithmetics, + /// e.g. inverse scale and inverse rotation WITHOUT translation. + /// @note Typically this operation is used for scale and rotation from world -> index mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return linear inverse 3x3 mapping of the input vector i.e. xyz x mat^-1 + template + __hostdev__ Vec3T applyInverseJacobian(const Vec3T& xyz) const { return math::matMult(mInvMatD, xyz); } + + /// @brief Apply the linear inverse 3x3 transformation to an input 3d vector using 32bit floating point arithmetics, + /// e.g. inverse scale and inverse rotation WITHOUT translation. + /// @note Typically this operation is used for scale and rotation from world -> index mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return linear inverse 3x3 mapping of the input vector i.e. 
xyz x mat^-1 + template + __hostdev__ Vec3T applyInverseJacobianF(const Vec3T& xyz) const { return math::matMult(mInvMatF, xyz); } + + /// @brief Apply the transposed inverse 3x3 transformation to an input 3d vector using 64bit floating point arithmetics, + /// e.g. inverse scale and inverse rotation WITHOUT translation. + /// @note Typically this operation is used for scale and rotation from world -> index mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return linear inverse 3x3 mapping of the input vector i.e. xyz x mat^-1 + template + __hostdev__ Vec3T applyIJT(const Vec3T& xyz) const { return math::matMultT(mInvMatD, xyz); } + template + __hostdev__ Vec3T applyIJTF(const Vec3T& xyz) const { return math::matMultT(mInvMatF, xyz); } + + /// @brief Return a voxels size in each coordinate direction, measured at the origin + __hostdev__ Vec3d getVoxelSize() const { return this->applyMap(Vec3d(1)) - this->applyMap(Vec3d(0)); } +}; // Map + +template +inline void Map::set(const MatT& mat, const MatT& invMat, const Vec3T& translate, double taper) +{ + float * mf = mMatF, *vf = mVecF, *mif = mInvMatF; + double *md = mMatD, *vd = mVecD, *mid = mInvMatD; + mTaperF = static_cast(taper); + mTaperD = taper; + for (int i = 0; i < 3; ++i) { + *vd++ = translate[i]; //translation + *vf++ = static_cast(translate[i]); //translation + for (int j = 0; j < 3; ++j) { + *md++ = mat[j][i]; //transposed + *mid++ = invMat[j][i]; + *mf++ = static_cast(mat[j][i]); //transposed + *mif++ = static_cast(invMat[j][i]); + } + } +} + +template +inline void Map::set(double dx, const Vec3T& trans, double taper) +{ + NANOVDB_ASSERT(dx > 0.0); + const double mat[3][3] = { {dx, 0.0, 0.0}, // row 0 + {0.0, dx, 0.0}, // row 1 + {0.0, 0.0, dx} }; // row 2 + const double idx = 1.0 / dx; + const double invMat[3][3] = { {idx, 0.0, 0.0}, // row 0 + {0.0, idx, 0.0}, // row 1 + {0.0, 0.0, idx} }; // row 2 + this->set(mat, invMat, trans, taper); +} + +// ----------------------------> GridBlindMetaData <-------------------------------------- + +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridBlindMetaData +{ // 288 bytes + static const int MaxNameSize = 256; // due to NULL termination the maximum length is one less! + int64_t mDataOffset; // byte offset to the blind data, relative to this GridBlindMetaData. + uint64_t mValueCount; // number of blind values, e.g. point count + uint32_t mValueSize;// byte size of each value, e.g. 4 if mDataType=Float and 1 if mDataType=Unknown since that amounts to char + GridBlindDataSemantic mSemantic; // semantic meaning of the data. + GridBlindDataClass mDataClass; // 4 bytes + GridType mDataType; // 4 bytes + char mName[MaxNameSize]; // note this includes the NULL termination + // no padding required for 32 byte alignment + + // disallow copy-construction since methods like blindData and getBlindData uses the this pointer! + GridBlindMetaData(const GridBlindMetaData&) = delete; + + // disallow copy-assignment since methods like blindData and getBlindData uses the this pointer! 
+ const GridBlindMetaData& operator=(const GridBlindMetaData&) = delete; + + __hostdev__ void setBlindData(void* blindData) { mDataOffset = util::PtrDiff(blindData, this); } + + // unsafe + __hostdev__ const void* blindData() const {return util::PtrAdd(this, mDataOffset);} + + /// @brief Get a const pointer to the blind data represented by this meta data + /// @tparam BlindDataT Expected value type of the blind data. + /// @return Returns NULL if mGridType!=toGridType(), else a const point of type BlindDataT. + /// @note Use mDataType=Unknown if BlindDataT is a custom data type unknown to NanoVDB. + template + __hostdev__ const BlindDataT* getBlindData() const + { + //if (mDataType != toGridType()) printf("getBlindData mismatch\n"); + return mDataType == toGridType() ? util::PtrAdd(this, mDataOffset) : nullptr; + } + + /// @brief return true if this meta data has a valid combination of semantic, class and value tags + __hostdev__ bool isValid() const + { + auto check = [&]()->bool{ + switch (mDataType){ + case GridType::Unknown: return mValueSize==1u;// i.e. we encode data as mValueCount chars + case GridType::Float: return mValueSize==4u; + case GridType::Double: return mValueSize==8u; + case GridType::Int16: return mValueSize==2u; + case GridType::Int32: return mValueSize==4u; + case GridType::Int64: return mValueSize==8u; + case GridType::Vec3f: return mValueSize==12u; + case GridType::Vec3d: return mValueSize==24u; + case GridType::Half: return mValueSize==2u; + case GridType::RGBA8: return mValueSize==4u; + case GridType::Fp8: return mValueSize==1u; + case GridType::Fp16: return mValueSize==2u; + case GridType::Vec4f: return mValueSize==16u; + case GridType::Vec4d: return mValueSize==32u; + case GridType::Vec3u8: return mValueSize==3u; + case GridType::Vec3u16: return mValueSize==6u; + default: return true;}// all other combinations are valid + }; + return nanovdb::isValid(mDataClass, mSemantic, mDataType) && check(); + } + + /// @brief return size in bytes of the blind data represented by this blind meta data + /// @note This size includes possible padding for 32 byte alignment. 
The actual amount + /// of bind data is mValueCount * mValueSize + __hostdev__ uint64_t blindDataSize() const + { + return math::AlignUp(mValueCount * mValueSize); + } +}; // GridBlindMetaData + +// ----------------------------> NodeTrait <-------------------------------------- + +/// @brief Struct to derive node type from its level in a given +/// grid, tree or root while preserving constness +template +struct NodeTrait; + +// Partial template specialization of above Node struct +template +struct NodeTrait +{ + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = typename GridOrTreeOrRootT::LeafNodeType; + using type = typename GridOrTreeOrRootT::LeafNodeType; +}; +template +struct NodeTrait +{ + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = const typename GridOrTreeOrRootT::LeafNodeType; + using type = const typename GridOrTreeOrRootT::LeafNodeType; +}; + +template +struct NodeTrait +{ + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = typename GridOrTreeOrRootT::RootNodeType::ChildNodeType::ChildNodeType; + using type = typename GridOrTreeOrRootT::RootNodeType::ChildNodeType::ChildNodeType; +}; +template +struct NodeTrait +{ + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = const typename GridOrTreeOrRootT::RootNodeType::ChildNodeType::ChildNodeType; + using type = const typename GridOrTreeOrRootT::RootNodeType::ChildNodeType::ChildNodeType; +}; +template +struct NodeTrait +{ + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = typename GridOrTreeOrRootT::RootNodeType::ChildNodeType; + using type = typename GridOrTreeOrRootT::RootNodeType::ChildNodeType; +}; +template +struct NodeTrait +{ + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = const typename GridOrTreeOrRootT::RootNodeType::ChildNodeType; + using type = const typename GridOrTreeOrRootT::RootNodeType::ChildNodeType; +}; +template +struct NodeTrait +{ + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = typename GridOrTreeOrRootT::RootNodeType; + using type = typename GridOrTreeOrRootT::RootNodeType; +}; + +template +struct NodeTrait +{ + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = const typename GridOrTreeOrRootT::RootNodeType; + using type = const typename GridOrTreeOrRootT::RootNodeType; +}; + +// ----------------------------> Froward decelerations of random access methods <-------------------------------------- + +template +struct GetValue; +template +struct SetValue; +template +struct SetVoxel; +template +struct GetState; +template +struct GetDim; +template +struct GetLeaf; +template +struct ProbeValue; +template +struct GetNodeInfo; + +// ----------------------------> CheckMode <---------------------------------- + +/// @brief List of different modes for computing for a checksum +enum class CheckMode : uint32_t { Disable = 0, // no computation + Empty = 0, + Half = 1, + Partial = 1, // fast but approximate + Default = 1, // defaults to Partial + Full = 2, // slow but accurate + End = 3, // marks the end of the enum list + StrLen = 9 + End}; + +/// @brief Prints CheckMode enum to a c-string +/// @param dst Destination c-string +/// @param mode CheckMode enum to be converted 
to string +/// @return destinations string @c dst +__hostdev__ inline char* toStr(char *dst, CheckMode mode) +{ + switch (mode){ + case CheckMode::Half: return util::strcpy(dst, "half"); + case CheckMode::Full: return util::strcpy(dst, "full"); + default: return util::strcpy(dst, "disabled"); + } +} + +// ----------------------------> Checksum <---------------------------------- + +/// @brief Class that encapsulates two CRC32 checksums, one for the Grid, Tree and Root node meta data +/// and one for the remaining grid nodes. +class Checksum +{ + /// Three types of checksums: + /// 1) Empty: all 64 bits are on (used to signify a disabled or undefined checksum) + /// 2) Half: Upper 32 bits are on and not all of lower 32 bits are on (lower 32 bits checksum head of grid) + /// 3) Full: Not all of the 64 bits are one (lower 32 bits checksum head of grid and upper 32 bits checksum tail of grid) + union { uint32_t mCRC32[2]; uint64_t mCRC64; };// mCRC32[0] is checksum of Grid, Tree and Root, and mCRC32[1] is checksum of nodes + +public: + + static constexpr uint32_t EMPTY32 = ~uint32_t{0}; + static constexpr uint64_t EMPTY64 = ~uint64_t(0); + + /// @brief default constructor initiates checksum to EMPTY + __hostdev__ Checksum() : mCRC64{EMPTY64} {} + + /// @brief Constructor that allows the two 32bit checksums to be initiated explicitly + /// @param head Initial 32bit CRC checksum of grid, tree and root data + /// @param tail Initial 32bit CRC checksum of all the nodes and blind data + __hostdev__ Checksum(uint32_t head, uint32_t tail) : mCRC32{head, tail} {} + + /// @brief + /// @param checksum + /// @param mode + __hostdev__ Checksum(uint64_t checksum, CheckMode mode = CheckMode::Full) : mCRC64{mode == CheckMode::Disable ? EMPTY64 : checksum} + { + if (mode == CheckMode::Partial) mCRC32[1] = EMPTY32; + } + + /// @brief return the 64 bit checksum of this instance + [[deprecated("Use Checksum::data instead.")]] + __hostdev__ uint64_t checksum() const { return mCRC64; } + [[deprecated("Use Checksum::head and Ckecksum::tail instead.")]] + __hostdev__ uint32_t& checksum(int i) {NANOVDB_ASSERT(i==0 || i==1); return mCRC32[i]; } + [[deprecated("Use Checksum::head and Ckecksum::tail instead.")]] + __hostdev__ uint32_t checksum(int i) const {NANOVDB_ASSERT(i==0 || i==1); return mCRC32[i]; } + + __hostdev__ uint64_t full() const { return mCRC64; } + __hostdev__ uint64_t& full() { return mCRC64; } + __hostdev__ uint32_t head() const { return mCRC32[0]; } + __hostdev__ uint32_t& head() { return mCRC32[0]; } + __hostdev__ uint32_t tail() const { return mCRC32[1]; } + __hostdev__ uint32_t& tail() { return mCRC32[1]; } + + /// @brief return true if the 64 bit checksum is partial, i.e. of head only + [[deprecated("Use Checksum::isHalf instead.")]] + __hostdev__ bool isPartial() const { return mCRC32[0] != EMPTY32 && mCRC32[1] == EMPTY32; } + __hostdev__ bool isHalf() const { return mCRC32[0] != EMPTY32 && mCRC32[1] == EMPTY32; } + + /// @brief return true if the 64 bit checksum is fill, i.e. of both had and nodes + __hostdev__ bool isFull() const { return mCRC64 != EMPTY64 && mCRC32[1] != EMPTY32; } + + /// @brief return true if the 64 bit checksum is disables (unset) + __hostdev__ bool isEmpty() const { return mCRC64 == EMPTY64; } + + __hostdev__ void disable() { mCRC64 = EMPTY64; } + + /// @brief return the mode of the 64 bit checksum + __hostdev__ CheckMode mode() const + { + return mCRC64 == EMPTY64 ? CheckMode::Disable : + mCRC32[1] == EMPTY32 ? 
CheckMode::Partial : CheckMode::Full; + } + + /// @brief return true if the checksums are identical + /// @param rhs other Checksum + __hostdev__ bool operator==(const Checksum &rhs) const {return mCRC64 == rhs.mCRC64;} + + /// @brief return true if the checksums are not identical + /// @param rhs other Checksum + __hostdev__ bool operator!=(const Checksum &rhs) const {return mCRC64 != rhs.mCRC64;} +};// Checksum + +/// @brief Maps 64 bit checksum to CheckMode enum +/// @param checksum 64 bit checksum with two CRC32 codes +/// @return CheckMode enum +__hostdev__ inline CheckMode toCheckMode(const Checksum &checksum){return checksum.mode();} + +// ----------------------------> Grid <-------------------------------------- + +/* + The following class and comment is for internal use only + + Memory layout: + + Grid -> 39 x double (world bbox and affine transformation) + Tree -> Root 3 x ValueType + int32_t + N x Tiles (background,min,max,tileCount + tileCount x Tiles) + + N2 upper InternalNodes each with 2 bit masks, N2 tiles, and min/max values + + N1 lower InternalNodes each with 2 bit masks, N1 tiles, and min/max values + + N0 LeafNodes each with a bit mask, N0 ValueTypes and min/max + + Example layout: ("---" implies it has a custom offset, "..." implies zero or more) + [GridData][TreeData]---[RootData][ROOT TILES...]---[InternalData<5>]---[InternalData<4>]---[LeafData<3>]---[BLINDMETA...]---[BLIND0]---[BLIND1]---etc. +*/ + +/// @brief Struct with all the member data of the Grid (useful during serialization of an openvdb grid) +/// +/// @note The transform is assumed to be affine (so linear) and have uniform scale! So frustum transforms +/// and non-uniform scaling are not supported (primarily because they complicate ray-tracing in index space) +/// +/// @note No client code should (or can) interface with this struct so it can safely be ignored! +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData +{ // sizeof(GridData) = 672B + static const int MaxNameSize = 256; // due to NULL termination the maximum length is one less + uint64_t mMagic; // 8B (0) magic to validate it is valid grid data. + Checksum mChecksum; // 8B (8). Checksum of grid buffer. + Version mVersion; // 4B (16) major, minor, and patch version numbers + BitFlags<32> mFlags; // 4B (20). flags for grid. + uint32_t mGridIndex; // 4B (24). Index of this grid in the buffer + uint32_t mGridCount; // 4B (28). Total number of grids in the buffer + uint64_t mGridSize; // 8B (32). byte count of this entire grid occupied in the buffer. + char mGridName[MaxNameSize]; // 256B (40) + Map mMap; // 264B (296). affine transformation between index and world space in both single and double precision + Vec3dBBox mWorldBBox; // 48B (560). floating-point AABB of active values in WORLD SPACE (2 x 3 doubles) + Vec3d mVoxelSize; // 24B (608). size of a voxel in world units + GridClass mGridClass; // 4B (632). + GridType mGridType; // 4B (636). + int64_t mBlindMetadataOffset; // 8B (640). offset to beginning of GridBlindMetaData structures that follow this grid. + uint32_t mBlindMetadataCount; // 4B (648). count of GridBlindMetaData structures that follow this grid. 
+ uint32_t mData0; // 4B (652) unused + uint64_t mData1; // 8B (656) is use for the total number of values indexed by an IndexGrid + uint64_t mData2; // 8B (664) padding to 32 B alignment + /// @brief Use this method to initiate most member data + GridData& operator=(const GridData&) = default; + //__hostdev__ GridData& operator=(const GridData& other){return *util::memcpy(this, &other);} + __hostdev__ void init(std::initializer_list list = {GridFlags::IsBreadthFirst}, + uint64_t gridSize = 0u, + const Map& map = Map(), + GridType gridType = GridType::Unknown, + GridClass gridClass = GridClass::Unknown) + { +#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS + mMagic = NANOVDB_MAGIC_GRID; +#else + mMagic = NANOVDB_MAGIC_NUMB; +#endif + mChecksum.disable();// all 64 bits ON means checksum is disabled + mVersion = Version(); + mFlags.initMask(list); + mGridIndex = 0u; + mGridCount = 1u; + mGridSize = gridSize; + mGridName[0] = '\0'; + mMap = map; + mWorldBBox = Vec3dBBox();// invalid bbox + mVoxelSize = map.getVoxelSize(); + mGridClass = gridClass; + mGridType = gridType; + mBlindMetadataOffset = mGridSize; // i.e. no blind data + mBlindMetadataCount = 0u; // i.e. no blind data + mData0 = 0u; // zero padding + mData1 = 0u; // only used for index and point grids + mData2 = NANOVDB_MAGIC_GRID; // since version 32.6.0 (will change in the future) + } + /// @brief return true if the magic number and the version are both valid + __hostdev__ bool isValid() const { + // Before v32.6.0: toMagic(mMagic) = MagicType::NanoVDB and mData2 was undefined + // For v32.6.0: toMagic(mMagic) = MagicType::NanoVDB and toMagic(mData2) = MagicType::NanoGrid + // After v32.7.X: toMagic(mMagic) = MagicType::NanoGrid and mData2 will again be undefined + const MagicType magic = toMagic(mMagic); + if (magic == MagicType::NanoGrid || toMagic(mData2) == MagicType::NanoGrid) return true; + bool test = magic == MagicType::NanoVDB;// could be GridData or io::FileHeader + if (test) test = mVersion.isCompatible(); + if (test) test = mGridCount > 0u && mGridIndex < mGridCount; + if (test) test = mGridClass < GridClass::End && mGridType < GridType::End; + return test; + } + // Set and unset various bit flags + __hostdev__ void setMinMaxOn(bool on = true) { mFlags.setMask(GridFlags::HasMinMax, on); } + __hostdev__ void setBBoxOn(bool on = true) { mFlags.setMask(GridFlags::HasBBox, on); } + __hostdev__ void setLongGridNameOn(bool on = true) { mFlags.setMask(GridFlags::HasLongGridName, on); } + __hostdev__ void setAverageOn(bool on = true) { mFlags.setMask(GridFlags::HasAverage, on); } + __hostdev__ void setStdDeviationOn(bool on = true) { mFlags.setMask(GridFlags::HasStdDeviation, on); } + __hostdev__ bool setGridName(const char* src) + { + const bool success = (util::strncpy(mGridName, src, MaxNameSize)[MaxNameSize-1] == '\0'); + if (!success) mGridName[MaxNameSize-1] = '\0'; + return success; // returns true if input grid name is NOT longer than MaxNameSize characters + } + // Affine transformations based on double precision + template + __hostdev__ Vec3T applyMap(const Vec3T& xyz) const { return mMap.applyMap(xyz); } // Pos: index -> world + template + __hostdev__ Vec3T applyInverseMap(const Vec3T& xyz) const { return mMap.applyInverseMap(xyz); } // Pos: world -> index + template + __hostdev__ Vec3T applyJacobian(const Vec3T& xyz) const { return mMap.applyJacobian(xyz); } // Dir: index -> world + template + __hostdev__ Vec3T applyInverseJacobian(const Vec3T& xyz) const { return mMap.applyInverseJacobian(xyz); } // Dir: world -> index + 
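// Editorial sketch (not part of this patch): for the uniform-scale Map(s, t) constructed earlier,
// the double-precision wrappers used just above reduce to index->world p_w = s*p_i + t
// (applyMap) and world->index p_i = (p_w - t)/s (applyInverseMap). toy_map_round_trip is a
// hypothetical free function that demonstrates the round trip with plain arithmetic.
inline bool toy_map_round_trip()
{
    const double s = 0.5, t[3] = {1.0, 2.0, 3.0};    // voxel size 0.5, translation (1,2,3)
    const double ijk[3] = {10.0, 20.0, 30.0};        // a point in index space
    bool ok = true;
    for (int i = 0; i < 3; ++i) {
        const double world = s * ijk[i] + t[i];      // applyMap  (index -> world)
        const double index = (world - t[i]) / s;     // applyInverseMap (world -> index)
        ok = ok && index == ijk[i];                  // exact for these power-of-two values
    }
    return ok;                                       // true
}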
template + __hostdev__ Vec3T applyIJT(const Vec3T& xyz) const { return mMap.applyIJT(xyz); } + // Affine transformations based on single precision + template + __hostdev__ Vec3T applyMapF(const Vec3T& xyz) const { return mMap.applyMapF(xyz); } // Pos: index -> world + template + __hostdev__ Vec3T applyInverseMapF(const Vec3T& xyz) const { return mMap.applyInverseMapF(xyz); } // Pos: world -> index + template + __hostdev__ Vec3T applyJacobianF(const Vec3T& xyz) const { return mMap.applyJacobianF(xyz); } // Dir: index -> world + template + __hostdev__ Vec3T applyInverseJacobianF(const Vec3T& xyz) const { return mMap.applyInverseJacobianF(xyz); } // Dir: world -> index + template + __hostdev__ Vec3T applyIJTF(const Vec3T& xyz) const { return mMap.applyIJTF(xyz); } + + // @brief Return a non-const void pointer to the tree + __hostdev__ void* treePtr() { return this + 1; }// TreeData is always right after GridData + + // @brief Return a const void pointer to the tree + __hostdev__ const void* treePtr() const { return this + 1; }// TreeData is always right after GridData + + /// @brief Return a non-const void pointer to the first node at @c LEVEL + /// @tparam LEVEL Level of the node. LEVEL 0 means leaf node and LEVEL 3 means root node + template + __hostdev__ const void* nodePtr() const + { + static_assert(LEVEL >= 0 && LEVEL <= 3, "invalid LEVEL template parameter"); + const void *treeData = this + 1;// TreeData is always right after GridData + const uint64_t nodeOffset = *util::PtrAdd(treeData, 8*LEVEL);// skip LEVEL uint64_t + return nodeOffset ? util::PtrAdd(treeData, nodeOffset) : nullptr; + } + + /// @brief Return a non-const void pointer to the first node at @c LEVEL + /// @tparam LEVEL of the node. LEVEL 0 means leaf node and LEVEL 3 means root node + /// @warning If not nodes exist at @c LEVEL NULL is returned + template + __hostdev__ void* nodePtr() + { + static_assert(LEVEL >= 0 && LEVEL <= 3, "invalid LEVEL template parameter"); + void *treeData = this + 1;// TreeData is always right after GridData + const uint64_t nodeOffset = *util::PtrAdd(treeData, 8*LEVEL);// skip LEVEL uint64_t + return nodeOffset ? util::PtrAdd(treeData, nodeOffset) : nullptr; + } + + /// @brief Return number of nodes at @c LEVEL + /// @tparam Level of the node. LEVEL 0 means leaf node and LEVEL 2 means upper node + template + __hostdev__ uint32_t nodeCount() const + { + static_assert(LEVEL >= 0 && LEVEL < 3, "invalid LEVEL template parameter"); + return *util::PtrAdd(this + 1, 4*(8 + LEVEL));// TreeData is always right after GridData + } + + /// @brief Returns a const reference to the blindMetaData at the specified linear offset. + /// + /// @warning The linear offset is assumed to be in the valid range + __hostdev__ const GridBlindMetaData* blindMetaData(uint32_t n) const + { + NANOVDB_ASSERT(n < mBlindMetadataCount); + return util::PtrAdd(this, mBlindMetadataOffset) + n; + } + + __hostdev__ const char* gridName() const + { + if (mFlags.isMaskOn(GridFlags::HasLongGridName)) {// search for first blind meta data that contains a name + NANOVDB_ASSERT(mBlindMetadataCount > 0); + for (uint32_t i = 0; i < mBlindMetadataCount; ++i) { + const auto* metaData = this->blindMetaData(i);// EXTREMELY important to be a pointer + if (metaData->mDataClass == GridBlindDataClass::GridName) { + NANOVDB_ASSERT(metaData->mDataType == GridType::Unknown); + return metaData->template getBlindData(); + } + } + NANOVDB_ASSERT(false); // should never hit this! 
+ } + return mGridName; + } + + /// @brief Return memory usage in bytes for this class only. + __hostdev__ static uint64_t memUsage() { return sizeof(GridData); } + + /// @brief return AABB of active values in world space + __hostdev__ const Vec3dBBox& worldBBox() const { return mWorldBBox; } + + /// @brief return AABB of active values in index space + __hostdev__ const CoordBBox& indexBBox() const {return *(const CoordBBox*)(this->nodePtr<3>());} + + /// @brief return the root table has size + __hostdev__ uint32_t rootTableSize() const + { + const void *root = this->nodePtr<3>(); + return root ? *util::PtrAdd(root, sizeof(CoordBBox)) : 0u; + } + + /// @brief test if the grid is empty, e.i the root table has size 0 + /// @return true if this grid contains not data whatsoever + __hostdev__ bool isEmpty() const {return this->rootTableSize() == 0u;} + + /// @brief return true if RootData follows TreeData in memory without any extra padding + /// @details TreeData is always following right after GridData, but the same might not be true for RootData + __hostdev__ bool isRootConnected() const { return *(const uint64_t*)((const char*)(this + 1) + 24) == 64u;} +}; // GridData + +// Forward declaration of accelerated random access class +template +class ReadAccessor; + +template +using DefaultReadAccessor = ReadAccessor; + +/// @brief Highest level of the data structure. Contains a tree and a world->index +/// transform (that currently only supports uniform scaling and translation). +/// +/// @note This the API of this class to interface with client code +template +class Grid : public GridData +{ +public: + using TreeType = TreeT; + using RootType = typename TreeT::RootType; + using RootNodeType = RootType; + using UpperNodeType = typename RootNodeType::ChildNodeType; + using LowerNodeType = typename UpperNodeType::ChildNodeType; + using LeafNodeType = typename RootType::LeafNodeType; + using DataType = GridData; + using ValueType = typename TreeT::ValueType; + using BuildType = typename TreeT::BuildType; // in rare cases BuildType != ValueType, e.g. then BuildType = ValueMask and ValueType = bool + using CoordType = typename TreeT::CoordType; + using AccessorType = DefaultReadAccessor; + + /// @brief Disallow constructions, copy and assignment + /// + /// @note Only a Serializer, defined elsewhere, can instantiate this class + Grid(const Grid&) = delete; + Grid& operator=(const Grid&) = delete; + ~Grid() = delete; + + __hostdev__ Version version() const { return DataType::mVersion; } + + __hostdev__ DataType* data() { return reinterpret_cast(this); } + + __hostdev__ const DataType* data() const { return reinterpret_cast(this); } + + /// @brief Return memory usage in bytes for this class only. + //__hostdev__ static uint64_t memUsage() { return sizeof(GridData); } + + /// @brief Return the memory footprint of the entire grid, i.e. 
including all nodes and blind data + __hostdev__ uint64_t gridSize() const { return DataType::mGridSize; } + + /// @brief Return index of this grid in the buffer + __hostdev__ uint32_t gridIndex() const { return DataType::mGridIndex; } + + /// @brief Return total number of grids in the buffer + __hostdev__ uint32_t gridCount() const { return DataType::mGridCount; } + + /// @brief @brief Return the total number of values indexed by this IndexGrid + /// + /// @note This method is only defined for IndexGrid = NanoGrid + template + __hostdev__ typename util::enable_if::is_index, const uint64_t&>::type + valueCount() const { return DataType::mData1; } + + /// @brief @brief Return the total number of points indexed by this PointGrid + /// + /// @note This method is only defined for PointGrid = NanoGrid + template + __hostdev__ typename util::enable_if::value, const uint64_t&>::type + pointCount() const { return DataType::mData1; } + + /// @brief Return a const reference to the tree + __hostdev__ const TreeT& tree() const { return *reinterpret_cast(this->treePtr()); } + + /// @brief Return a non-const reference to the tree + __hostdev__ TreeT& tree() { return *reinterpret_cast(this->treePtr()); } + + /// @brief Return a new instance of a ReadAccessor used to access values in this grid + __hostdev__ AccessorType getAccessor() const { return AccessorType(this->tree().root()); } + + /// @brief Return a const reference to the size of a voxel in world units + __hostdev__ const Vec3d& voxelSize() const { return DataType::mVoxelSize; } + + /// @brief Return a const reference to the Map for this grid + __hostdev__ const Map& map() const { return DataType::mMap; } + + /// @brief world to index space transformation + template + __hostdev__ Vec3T worldToIndex(const Vec3T& xyz) const { return this->applyInverseMap(xyz); } + + /// @brief index to world space transformation + template + __hostdev__ Vec3T indexToWorld(const Vec3T& xyz) const { return this->applyMap(xyz); } + + /// @brief transformation from index space direction to world space direction + /// @warning assumes dir to be normalized + template + __hostdev__ Vec3T indexToWorldDir(const Vec3T& dir) const { return this->applyJacobian(dir); } + + /// @brief transformation from world space direction to index space direction + /// @warning assumes dir to be normalized + template + __hostdev__ Vec3T worldToIndexDir(const Vec3T& dir) const { return this->applyInverseJacobian(dir); } + + /// @brief transform the gradient from index space to world space. + /// @details Applies the inverse jacobian transform map. 
+ template + __hostdev__ Vec3T indexToWorldGrad(const Vec3T& grad) const { return this->applyIJT(grad); } + + /// @brief world to index space transformation + template + __hostdev__ Vec3T worldToIndexF(const Vec3T& xyz) const { return this->applyInverseMapF(xyz); } + + /// @brief index to world space transformation + template + __hostdev__ Vec3T indexToWorldF(const Vec3T& xyz) const { return this->applyMapF(xyz); } + + /// @brief transformation from index space direction to world space direction + /// @warning assumes dir to be normalized + template + __hostdev__ Vec3T indexToWorldDirF(const Vec3T& dir) const { return this->applyJacobianF(dir); } + + /// @brief transformation from world space direction to index space direction + /// @warning assumes dir to be normalized + template + __hostdev__ Vec3T worldToIndexDirF(const Vec3T& dir) const { return this->applyInverseJacobianF(dir); } + + /// @brief Transforms the gradient from index space to world space. + /// @details Applies the inverse jacobian transform map. + template + __hostdev__ Vec3T indexToWorldGradF(const Vec3T& grad) const { return DataType::applyIJTF(grad); } + + /// @brief Computes a AABB of active values in world space + //__hostdev__ const Vec3dBBox& worldBBox() const { return DataType::mWorldBBox; } + + /// @brief Computes a AABB of active values in index space + /// + /// @note This method is returning a floating point bounding box and not a CoordBBox. This makes + /// it more useful for clipping rays. + //__hostdev__ const BBox& indexBBox() const { return this->tree().bbox(); } + + /// @brief Return the total number of active voxels in this tree. + __hostdev__ uint64_t activeVoxelCount() const { return this->tree().activeVoxelCount(); } + + /// @brief Methods related to the classification of this grid + __hostdev__ bool isValid() const { return DataType::isValid(); } + __hostdev__ const GridType& gridType() const { return DataType::mGridType; } + __hostdev__ const GridClass& gridClass() const { return DataType::mGridClass; } + __hostdev__ bool isLevelSet() const { return DataType::mGridClass == GridClass::LevelSet; } + __hostdev__ bool isFogVolume() const { return DataType::mGridClass == GridClass::FogVolume; } + __hostdev__ bool isStaggered() const { return DataType::mGridClass == GridClass::Staggered; } + __hostdev__ bool isPointIndex() const { return DataType::mGridClass == GridClass::PointIndex; } + __hostdev__ bool isGridIndex() const { return DataType::mGridClass == GridClass::IndexGrid; } + __hostdev__ bool isPointData() const { return DataType::mGridClass == GridClass::PointData; } + __hostdev__ bool isMask() const { return DataType::mGridClass == GridClass::Topology; } + __hostdev__ bool isUnknown() const { return DataType::mGridClass == GridClass::Unknown; } + __hostdev__ bool hasMinMax() const { return DataType::mFlags.isMaskOn(GridFlags::HasMinMax); } + __hostdev__ bool hasBBox() const { return DataType::mFlags.isMaskOn(GridFlags::HasBBox); } + __hostdev__ bool hasLongGridName() const { return DataType::mFlags.isMaskOn(GridFlags::HasLongGridName); } + __hostdev__ bool hasAverage() const { return DataType::mFlags.isMaskOn(GridFlags::HasAverage); } + __hostdev__ bool hasStdDeviation() const { return DataType::mFlags.isMaskOn(GridFlags::HasStdDeviation); } + __hostdev__ bool isBreadthFirst() const { return DataType::mFlags.isMaskOn(GridFlags::IsBreadthFirst); } + + /// @brief return true if the specified node type is layed out breadth-first in memory and has a fixed size. 
+ /// This allows for sequential access to the nodes. + template + __hostdev__ bool isSequential() const { return NodeT::FIXED_SIZE && this->isBreadthFirst(); } + + /// @brief return true if the specified node level is layed out breadth-first in memory and has a fixed size. + /// This allows for sequential access to the nodes. + template + __hostdev__ bool isSequential() const { return NodeTrait::type::FIXED_SIZE && this->isBreadthFirst(); } + + /// @brief return true if nodes at all levels can safely be accessed with simple linear offsets + __hostdev__ bool isSequential() const { return UpperNodeType::FIXED_SIZE && LowerNodeType::FIXED_SIZE && LeafNodeType::FIXED_SIZE && this->isBreadthFirst(); } + + /// @brief Return a c-string with the name of this grid + __hostdev__ const char* gridName() const { return DataType::gridName(); } + + /// @brief Return a c-string with the name of this grid, truncated to 255 characters + __hostdev__ const char* shortGridName() const { return DataType::mGridName; } + + /// @brief Return checksum of the grid buffer. + __hostdev__ const Checksum& checksum() const { return DataType::mChecksum; } + + /// @brief Return true if this grid is empty, i.e. contains no values or nodes. + //__hostdev__ bool isEmpty() const { return this->tree().isEmpty(); } + + /// @brief Return the count of blind-data encoded in this grid + __hostdev__ uint32_t blindDataCount() const { return DataType::mBlindMetadataCount; } + + /// @brief Return the index of the first blind data with specified name if found, otherwise -1. + __hostdev__ int findBlindData(const char* name) const; + + /// @brief Return the index of the first blind data with specified semantic if found, otherwise -1. + __hostdev__ int findBlindDataForSemantic(GridBlindDataSemantic semantic) const; + + /// @brief Returns a const pointer to the blindData at the specified linear offset. + /// + /// @warning Pointer might be NULL and the linear offset is assumed to be in the valid range + // this method is deprecated !!!! + [[deprecated("Use Grid::getBlindData() instead.")]] + __hostdev__ const void* blindData(uint32_t n) const + { + printf("\nnanovdb::Grid::blindData is unsafe and hence deprecated! 
Please use nanovdb::Grid::getBlindData instead.\n\n"); + NANOVDB_ASSERT(n < DataType::mBlindMetadataCount); + return this->blindMetaData(n).blindData(); + } + + template + __hostdev__ const BlindDataT* getBlindData(uint32_t n) const + { + if (n >= DataType::mBlindMetadataCount) return nullptr;// index is out of bounds + return this->blindMetaData(n).template getBlindData();// NULL if mismatching BlindDataT + } + + template + __hostdev__ BlindDataT* getBlindData(uint32_t n) + { + if (n >= DataType::mBlindMetadataCount) return nullptr;// index is out of bounds + return const_cast(this->blindMetaData(n).template getBlindData());// NULL if mismatching BlindDataT + } + + __hostdev__ const GridBlindMetaData& blindMetaData(uint32_t n) const { return *DataType::blindMetaData(n); } + +private: + static_assert(sizeof(GridData) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(GridData) is misaligned"); +}; // Class Grid + +template +__hostdev__ int Grid::findBlindDataForSemantic(GridBlindDataSemantic semantic) const +{ + for (uint32_t i = 0, n = this->blindDataCount(); i < n; ++i) { + if (this->blindMetaData(i).mSemantic == semantic) + return int(i); + } + return -1; +} + +template +__hostdev__ int Grid::findBlindData(const char* name) const +{ + auto test = [&](int n) { + const char* str = this->blindMetaData(n).mName; + for (int i = 0; i < GridBlindMetaData::MaxNameSize; ++i) { + if (name[i] != str[i]) + return false; + if (name[i] == '\0' && str[i] == '\0') + return true; + } + return true; // all len characters matched + }; + for (int i = 0, n = this->blindDataCount(); i < n; ++i) + if (test(i)) + return i; + return -1; +} + +// ----------------------------> Tree <-------------------------------------- + +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) TreeData +{ // sizeof(TreeData) == 64B + int64_t mNodeOffset[4];// 32B, byte offset from this tree to first leaf, lower, upper and root node. If mNodeCount[N]=0 => mNodeOffset[N]==mNodeOffset[N+1] + uint32_t mNodeCount[3]; // 12B, total number of nodes of type: leaf, lower internal, upper internal + uint32_t mTileCount[3]; // 12B, total number of active tile values at the lower internal, upper internal and root node levels + uint64_t mVoxelCount; // 8B, total number of active voxels in the root and all its child nodes. + // No padding since it's always 32B aligned + //__hostdev__ TreeData& operator=(const TreeData& other){return *util::memcpy(this, &other);} + TreeData& operator=(const TreeData&) = default; + __hostdev__ void setRoot(const void* root) { + NANOVDB_ASSERT(root); + mNodeOffset[3] = util::PtrDiff(root, this); + } + + /// @brief Get a non-const void pointer to the root node (never NULL) + __hostdev__ void* getRoot() { return util::PtrAdd(this, mNodeOffset[3]); } + + /// @brief Get a const void pointer to the root node (never NULL) + __hostdev__ const void* getRoot() const { return util::PtrAdd(this, mNodeOffset[3]); } + + template + __hostdev__ void setFirstNode(const NodeT* node) {mNodeOffset[NodeT::LEVEL] = (node ? util::PtrDiff(node, this) : 0);} + + /// @brief Return true if the root is empty, i.e. has not child nodes or constant tiles + __hostdev__ bool isEmpty() const {return mNodeOffset[3] ? *util::PtrAdd(this, mNodeOffset[3] + sizeof(CoordBBox)) == 0 : true;} + + /// @brief Return the index bounding box of all the active values in this tree, i.e. in all nodes of the tree + __hostdev__ CoordBBox bbox() const {return mNodeOffset[3] ? 
*util::PtrAdd(this, mNodeOffset[3]) : CoordBBox();} + + /// @brief return true if RootData is layout out immediately after TreeData in memory + __hostdev__ bool isRootNext() const {return mNodeOffset[3] ? mNodeOffset[3] == sizeof(TreeData) : false; } +};// TreeData + +// ----------------------------> GridTree <-------------------------------------- + +/// @brief defines a tree type from a grid type while preserving constness +template +struct GridTree +{ + using Type = typename GridT::TreeType; + using type = typename GridT::TreeType; +}; +template +struct GridTree +{ + using Type = const typename GridT::TreeType; + using type = const typename GridT::TreeType; +}; + +// ----------------------------> Tree <-------------------------------------- + +/// @brief VDB Tree, which is a thin wrapper around a RootNode. +template +class Tree : public TreeData +{ + static_assert(RootT::LEVEL == 3, "Tree depth is not supported"); + static_assert(RootT::ChildNodeType::LOG2DIM == 5, "Tree configuration is not supported"); + static_assert(RootT::ChildNodeType::ChildNodeType::LOG2DIM == 4, "Tree configuration is not supported"); + static_assert(RootT::LeafNodeType::LOG2DIM == 3, "Tree configuration is not supported"); + +public: + using DataType = TreeData; + using RootType = RootT; + using RootNodeType = RootT; + using UpperNodeType = typename RootNodeType::ChildNodeType; + using LowerNodeType = typename UpperNodeType::ChildNodeType; + using LeafNodeType = typename RootType::LeafNodeType; + using ValueType = typename RootT::ValueType; + using BuildType = typename RootT::BuildType; // in rare cases BuildType != ValueType, e.g. then BuildType = ValueMask and ValueType = bool + using CoordType = typename RootT::CoordType; + using AccessorType = DefaultReadAccessor; + + using Node3 = RootT; + using Node2 = typename RootT::ChildNodeType; + using Node1 = typename Node2::ChildNodeType; + using Node0 = LeafNodeType; + + /// @brief This class cannot be constructed or deleted + Tree() = delete; + Tree(const Tree&) = delete; + Tree& operator=(const Tree&) = delete; + ~Tree() = delete; + + __hostdev__ DataType* data() { return reinterpret_cast(this); } + + __hostdev__ const DataType* data() const { return reinterpret_cast(this); } + + /// @brief return memory usage in bytes for the class + __hostdev__ static uint64_t memUsage() { return sizeof(DataType); } + + __hostdev__ RootT& root() {return *reinterpret_cast(DataType::getRoot());} + + __hostdev__ const RootT& root() const {return *reinterpret_cast(DataType::getRoot());} + + __hostdev__ AccessorType getAccessor() const { return AccessorType(this->root()); } + + /// @brief Return the value of the given voxel (regardless of state or location in the tree.) + __hostdev__ ValueType getValue(const CoordType& ijk) const { return this->root().getValue(ijk); } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->root().getValue(CoordType(i, j, k)); } + + /// @brief Return the active state of the given voxel (regardless of state or location in the tree.) + __hostdev__ bool isActive(const CoordType& ijk) const { return this->root().isActive(ijk); } + + /// @brief Return true if this tree is empty, i.e. contains no values or nodes + //__hostdev__ bool isEmpty() const { return this->root().isEmpty(); } + + /// @brief Combines the previous two methods in a single call + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->root().probeValue(ijk, v); } + + /// @brief Return a const reference to the background value. 
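// Editorial sketch (not part of this patch): a typical read path through the Tree declared
// above. tree(), getValue, probeValue and getAccessor are the calls declared in this header;
// the accessor's getValue, and the NanoGrid<float>/Coord aliases, are assumed from the rest of
// this header, and toy_tree_read itself is a hypothetical free function.
inline float toy_tree_read(const nanovdb::NanoGrid<float>& grid, const nanovdb::Coord& ijk)
{
    float v = 0.0f;
    const bool active = grid.tree().probeValue(ijk, v);  // value and active state in one call
    auto acc = grid.getAccessor();                        // ReadAccessor caches visited nodes,
                                                          // so prefer it for many nearby lookups
    return active ? v : acc.getValue(ijk);                // both equal grid.tree().getValue(ijk)
}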
+ __hostdev__ const ValueType& background() const { return this->root().background(); } + + /// @brief Sets the extrema values of all the active values in this tree, i.e. in all nodes of the tree + __hostdev__ void extrema(ValueType& min, ValueType& max) const; + + /// @brief Return a const reference to the index bounding box of all the active values in this tree, i.e. in all nodes of the tree + //__hostdev__ const BBox& bbox() const { return this->root().bbox(); } + + /// @brief Return the total number of active voxels in this tree. + __hostdev__ uint64_t activeVoxelCount() const { return DataType::mVoxelCount; } + + /// @brief Return the total number of active tiles at the specified level of the tree. + /// + /// @details level = 1,2,3 corresponds to active tile count in lower internal nodes, upper + /// internal nodes, and the root level. Note active values at the leaf level are + /// referred to as active voxels (see activeVoxelCount defined above). + __hostdev__ const uint32_t& activeTileCount(uint32_t level) const + { + NANOVDB_ASSERT(level > 0 && level <= 3); // 1, 2, or 3 + return DataType::mTileCount[level - 1]; + } + + template + __hostdev__ uint32_t nodeCount() const + { + static_assert(NodeT::LEVEL < 3, "Invalid NodeT"); + return DataType::mNodeCount[NodeT::LEVEL]; + } + + __hostdev__ uint32_t nodeCount(int level) const + { + NANOVDB_ASSERT(level < 3); + return DataType::mNodeCount[level]; + } + + __hostdev__ uint32_t totalNodeCount() const + { + return DataType::mNodeCount[0] + DataType::mNodeCount[1] + DataType::mNodeCount[2]; + } + + /// @brief return a pointer to the first node of the specified type + /// + /// @warning Note it may return NULL if no nodes exist + template + __hostdev__ NodeT* getFirstNode() + { + const int64_t nodeOffset = DataType::mNodeOffset[NodeT::LEVEL]; + return nodeOffset ? util::PtrAdd(this, nodeOffset) : nullptr; + } + + /// @brief return a const pointer to the first node of the specified type + /// + /// @warning Note it may return NULL if no nodes exist + template + __hostdev__ const NodeT* getFirstNode() const + { + const int64_t nodeOffset = DataType::mNodeOffset[NodeT::LEVEL]; + return nodeOffset ? util::PtrAdd(this, nodeOffset) : nullptr; + } + + /// @brief return a pointer to the first node at the specified level + /// + /// @warning Note it may return NULL if no nodes exist + template + __hostdev__ typename NodeTrait::type* getFirstNode() + { + return this->template getFirstNode::type>(); + } + + /// @brief return a const pointer to the first node of the specified level + /// + /// @warning Note it may return NULL if no nodes exist + template + __hostdev__ const typename NodeTrait::type* getFirstNode() const + { + return this->template getFirstNode::type>(); + } + + /// @brief Template specializations of getFirstNode + __hostdev__ LeafNodeType* getFirstLeaf() { return this->getFirstNode(); } + __hostdev__ const LeafNodeType* getFirstLeaf() const { return this->getFirstNode(); } + __hostdev__ typename NodeTrait::type* getFirstLower() { return this->getFirstNode<1>(); } + __hostdev__ const typename NodeTrait::type* getFirstLower() const { return this->getFirstNode<1>(); } + __hostdev__ typename NodeTrait::type* getFirstUpper() { return this->getFirstNode<2>(); } + __hostdev__ const typename NodeTrait::type* getFirstUpper() const { return this->getFirstNode<2>(); } + + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... 
args) const + { + return this->root().template get(ijk, args...); + } + + template + __hostdev__ auto set(const CoordType& ijk, ArgsT&&... args) + { + return this->root().template set(ijk, args...); + } + +private: + static_assert(sizeof(DataType) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(TreeData) is misaligned"); + +}; // Tree class + +template +__hostdev__ void Tree::extrema(ValueType& min, ValueType& max) const +{ + min = this->root().minimum(); + max = this->root().maximum(); +} + +// --------------------------> RootData <------------------------------------ + +/// @brief Struct with all the member data of the RootNode (useful during serialization of an openvdb RootNode) +/// +/// @note No client code should (or can) interface with this struct so it can safely be ignored! +template +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) RootData +{ + using ValueT = typename ChildT::ValueType; + using BuildT = typename ChildT::BuildType; // in rare cases BuildType != ValueType, e.g. then BuildType = ValueMask and ValueType = bool + using CoordT = typename ChildT::CoordType; + using StatsT = typename ChildT::FloatType; + static constexpr bool FIXED_SIZE = false; + + /// @brief Return a key based on the coordinates of a voxel +#ifdef NANOVDB_USE_SINGLE_ROOT_KEY + using KeyT = uint64_t; + template + __hostdev__ static KeyT CoordToKey(const CoordType& ijk) + { + static_assert(sizeof(CoordT) == sizeof(CoordType), "Mismatching sizeof"); + static_assert(32 - ChildT::TOTAL <= 21, "Cannot use 64 bit root keys"); + return (KeyT(uint32_t(ijk[2]) >> ChildT::TOTAL)) | // z is the lower 21 bits + (KeyT(uint32_t(ijk[1]) >> ChildT::TOTAL) << 21) | // y is the middle 21 bits + (KeyT(uint32_t(ijk[0]) >> ChildT::TOTAL) << 42); // x is the upper 21 bits + } + __hostdev__ static CoordT KeyToCoord(const KeyT& key) + { + static constexpr uint64_t MASK = (1u << 21) - 1; // used to mask out 21 lower bits + return CoordT(((key >> 42) & MASK) << ChildT::TOTAL, // x are the upper 21 bits + ((key >> 21) & MASK) << ChildT::TOTAL, // y are the middle 21 bits + (key & MASK) << ChildT::TOTAL); // z are the lower 21 bits + } +#else + using KeyT = CoordT; + __hostdev__ static KeyT CoordToKey(const CoordT& ijk) { return ijk & ~ChildT::MASK; } + __hostdev__ static CoordT KeyToCoord(const KeyT& key) { return key; } +#endif + math::BBox mBBox; // 24B. AABB of active values in index space. + uint32_t mTableSize; // 4B. number of tiles and child pointers in the root node + + ValueT mBackground; // background value, i.e. value of any unset voxel + ValueT mMinimum; // typically 4B, minimum of all the active values + ValueT mMaximum; // typically 4B, maximum of all the active values + StatsT mAverage; // typically 4B, average of all the active values in this node and its child nodes + StatsT mStdDevi; // typically 4B, standard deviation of all the active values in this node and its child nodes + + /// @brief Return padding of this class in bytes, due to aliasing and 32B alignment + /// + /// @note The extra bytes are not necessarily at the end, but can come from aliasing of individual data members. 
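// Editorial sketch (not part of this patch): how the single 64-bit root key above is packed.
// In the default 5-4-3 tree configuration ChildT::TOTAL == 12 (an upper node spans
// 32*16*8 = 4096 voxels per axis), so the low 12 bits of each coordinate are dropped and the
// remaining bits fit the three 21-bit fields. toy_root_key is a hypothetical name; the shift
// amount is passed in so the sketch does not hard-code that configuration.
inline uint64_t toy_root_key(int32_t x, int32_t y, int32_t z, uint32_t total = 12)
{
    return (uint64_t(uint32_t(z) >> total))       |  // z -> bits  0..20
           (uint64_t(uint32_t(y) >> total) << 21) |  // y -> bits 21..41
           (uint64_t(uint32_t(x) >> total) << 42);   // x -> bits 42..62
}
// e.g. toy_root_key(4096, 0, 8192) == (uint64_t(1) << 42) | 2 : x contributes 1 in the top
// field and z contributes 2 in the bottom field.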
+ __hostdev__ static constexpr uint32_t padding() + { + return sizeof(RootData) - (24 + 4 + 3 * sizeof(ValueT) + 2 * sizeof(StatsT)); + } + + struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) Tile + { + template + __hostdev__ void setChild(const CoordType& k, const void* ptr, const RootData* data) + { + key = CoordToKey(k); + state = false; + child = util::PtrDiff(ptr, data); + } + template + __hostdev__ void setValue(const CoordType& k, bool s, const ValueType& v) + { + key = CoordToKey(k); + state = s; + value = v; + child = 0; + } + __hostdev__ bool isChild() const { return child != 0; } + __hostdev__ bool isValue() const { return child == 0; } + __hostdev__ bool isActive() const { return child == 0 && state; } + __hostdev__ CoordT origin() const { return KeyToCoord(key); } + KeyT key; // NANOVDB_USE_SINGLE_ROOT_KEY ? 8B : 12B + int64_t child; // 8B. signed byte offset from this node to the child node. 0 means it is a constant tile, so use value. + uint32_t state; // 4B. state of tile value + ValueT value; // value of tile (i.e. no child node) + }; // Tile + + /// @brief Returns a non-const reference to the tile at the specified linear offset. + /// + /// @warning The linear offset is assumed to be in the valid range + __hostdev__ const Tile* tile(uint32_t n) const + { + NANOVDB_ASSERT(n < mTableSize); + return reinterpret_cast(this + 1) + n; + } + __hostdev__ Tile* tile(uint32_t n) + { + NANOVDB_ASSERT(n < mTableSize); + return reinterpret_cast(this + 1) + n; + } + + __hostdev__ Tile* probeTile(const CoordT& ijk) + { +#if 1 // switch between linear and binary seach + const auto key = CoordToKey(ijk); + for (Tile *p = reinterpret_cast(this + 1), *q = p + mTableSize; p < q; ++p) + if (p->key == key) + return p; + return nullptr; +#else // do not enable binary search if tiles are not guaranteed to be sorted!!!!!! + int32_t low = 0, high = mTableSize; // low is inclusive and high is exclusive + while (low != high) { + int mid = low + ((high - low) >> 1); + const Tile* tile = &tiles[mid]; + if (tile->key == key) { + return tile; + } else if (tile->key < key) { + low = mid + 1; + } else { + high = mid; + } + } + return nullptr; +#endif + } + + __hostdev__ inline const Tile* probeTile(const CoordT& ijk) const + { + return const_cast(this)->probeTile(ijk); + } + + /// @brief Returns a const reference to the child node in the specified tile. 
+ /// + /// @warning A child node is assumed to exist in the specified tile + __hostdev__ ChildT* getChild(const Tile* tile) + { + NANOVDB_ASSERT(tile->child); + return util::PtrAdd(this, tile->child); + } + __hostdev__ const ChildT* getChild(const Tile* tile) const + { + NANOVDB_ASSERT(tile->child); + return util::PtrAdd(this, tile->child); + } + + __hostdev__ const ValueT& getMin() const { return mMinimum; } + __hostdev__ const ValueT& getMax() const { return mMaximum; } + __hostdev__ const StatsT& average() const { return mAverage; } + __hostdev__ const StatsT& stdDeviation() const { return mStdDevi; } + + __hostdev__ void setMin(const ValueT& v) { mMinimum = v; } + __hostdev__ void setMax(const ValueT& v) { mMaximum = v; } + __hostdev__ void setAvg(const StatsT& v) { mAverage = v; } + __hostdev__ void setDev(const StatsT& v) { mStdDevi = v; } + + /// @brief This class cannot be constructed or deleted + RootData() = delete; + RootData(const RootData&) = delete; + RootData& operator=(const RootData&) = delete; + ~RootData() = delete; +}; // RootData + +// --------------------------> RootNode <------------------------------------ + +/// @brief Top-most node of the VDB tree structure. +template +class RootNode : public RootData +{ +public: + using DataType = RootData; + using ChildNodeType = ChildT; + using RootType = RootNode; // this allows RootNode to behave like a Tree + using RootNodeType = RootType; + using UpperNodeType = ChildT; + using LowerNodeType = typename UpperNodeType::ChildNodeType; + using LeafNodeType = typename ChildT::LeafNodeType; + using ValueType = typename DataType::ValueT; + using FloatType = typename DataType::StatsT; + using BuildType = typename DataType::BuildT; // in rare cases BuildType != ValueType, e.g. then BuildType = ValueMask and ValueType = bool + + using CoordType = typename ChildT::CoordType; + using BBoxType = math::BBox; + using AccessorType = DefaultReadAccessor; + using Tile = typename DataType::Tile; + static constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; + + static constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf + + template + class BaseIter + { + protected: + using DataT = typename util::match_const::type; + using TileT = typename util::match_const::type; + DataT* mData; + uint32_t mPos, mSize; + __hostdev__ BaseIter(DataT* data = nullptr, uint32_t n = 0) + : mData(data) + , mPos(0) + , mSize(n) + { + } + + public: + __hostdev__ operator bool() const { return mPos < mSize; } + __hostdev__ uint32_t pos() const { return mPos; } + __hostdev__ void next() { ++mPos; } + __hostdev__ TileT* tile() const { return mData->tile(mPos); } + __hostdev__ CoordType getOrigin() const + { + NANOVDB_ASSERT(*this); + return this->tile()->origin(); + } + __hostdev__ CoordType getCoord() const + { + NANOVDB_ASSERT(*this); + return this->tile()->origin(); + } + }; // Member class BaseIter + + template + class ChildIter : public BaseIter + { + static_assert(util::is_same::type, RootNode>::value, "Invalid RootT"); + using BaseT = BaseIter; + using NodeT = typename util::match_const::type; + + public: + __hostdev__ ChildIter() + : BaseT() + { + } + __hostdev__ ChildIter(RootT* parent) + : BaseT(parent->data(), parent->tileCount()) + { + NANOVDB_ASSERT(BaseT::mData); + while (*this && !this->tile()->isChild()) + this->next(); + } + __hostdev__ NodeT& operator*() const + { + NANOVDB_ASSERT(*this); + return *BaseT::mData->getChild(this->tile()); + } + __hostdev__ NodeT* operator->() const + { + NANOVDB_ASSERT(*this); + return 
BaseT::mData->getChild(this->tile()); + } + __hostdev__ ChildIter& operator++() + { + NANOVDB_ASSERT(BaseT::mData); + this->next(); + while (*this && this->tile()->isValue()) + this->next(); + return *this; + } + __hostdev__ ChildIter operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ChildIter + + using ChildIterator = ChildIter; + using ConstChildIterator = ChildIter; + + __hostdev__ ChildIterator beginChild() { return ChildIterator(this); } + __hostdev__ ConstChildIterator cbeginChild() const { return ConstChildIterator(this); } + + template + class ValueIter : public BaseIter + { + using BaseT = BaseIter; + + public: + __hostdev__ ValueIter() + : BaseT() + { + } + __hostdev__ ValueIter(RootT* parent) + : BaseT(parent->data(), parent->tileCount()) + { + NANOVDB_ASSERT(BaseT::mData); + while (*this && this->tile()->isChild()) + this->next(); + } + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return this->tile()->value; + } + __hostdev__ bool isActive() const + { + NANOVDB_ASSERT(*this); + return this->tile()->state; + } + __hostdev__ ValueIter& operator++() + { + NANOVDB_ASSERT(BaseT::mData); + this->next(); + while (*this && this->tile()->isChild()) + this->next(); + return *this; + } + __hostdev__ ValueIter operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ValueIter + + using ValueIterator = ValueIter; + using ConstValueIterator = ValueIter; + + __hostdev__ ValueIterator beginValue() { return ValueIterator(this); } + __hostdev__ ConstValueIterator cbeginValueAll() const { return ConstValueIterator(this); } + + template + class ValueOnIter : public BaseIter + { + using BaseT = BaseIter; + + public: + __hostdev__ ValueOnIter() + : BaseT() + { + } + __hostdev__ ValueOnIter(RootT* parent) + : BaseT(parent->data(), parent->tileCount()) + { + NANOVDB_ASSERT(BaseT::mData); + while (*this && !this->tile()->isActive()) + ++BaseT::mPos; + } + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return this->tile()->value; + } + __hostdev__ ValueOnIter& operator++() + { + NANOVDB_ASSERT(BaseT::mData); + this->next(); + while (*this && !this->tile()->isActive()) + this->next(); + return *this; + } + __hostdev__ ValueOnIter operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ValueOnIter + + using ValueOnIterator = ValueOnIter; + using ConstValueOnIterator = ValueOnIter; + + __hostdev__ ValueOnIterator beginValueOn() { return ValueOnIterator(this); } + __hostdev__ ConstValueOnIterator cbeginValueOn() const { return ConstValueOnIterator(this); } + + template + class DenseIter : public BaseIter + { + using BaseT = BaseIter; + using NodeT = typename util::match_const::type; + + public: + __hostdev__ DenseIter() + : BaseT() + { + } + __hostdev__ DenseIter(RootT* parent) + : BaseT(parent->data(), parent->tileCount()) + { + NANOVDB_ASSERT(BaseT::mData); + } + __hostdev__ NodeT* probeChild(ValueType& value) const + { + NANOVDB_ASSERT(*this); + NodeT* child = nullptr; + auto* t = this->tile(); + if (t->isChild()) { + child = BaseT::mData->getChild(t); + } else { + value = t->value; + } + return child; + } + __hostdev__ bool isValueOn() const + { + NANOVDB_ASSERT(*this); + return this->tile()->state; + } + __hostdev__ DenseIter& operator++() + { + NANOVDB_ASSERT(BaseT::mData); + this->next(); + return *this; + } + __hostdev__ DenseIter operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class DenseIter + + 
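The ChildIter/ValueIter/ValueOnIter members above all share one pattern: walk the root's tile table linearly and skip entries that fail a predicate (isChild, isValue, isActive). A compact standalone sketch of that skip-ahead iteration over a toy tile array (purely illustrative, not the NanoVDB types):

    #include <cstdint>
    #include <cstdio>

    // A toy root tile: child == 0 means "constant value tile", otherwise it refers
    // to a child node. Mirrors the predicates used by the iterators above.
    struct ToyTile { int64_t child; float value; bool state; };

    // Visit only the tiles satisfying pred, in table order.
    template <typename Pred, typename Visit>
    void forEachTile(const ToyTile* tiles, uint32_t count, Pred pred, Visit visit)
    {
        for (uint32_t pos = 0; pos < count; ++pos)  // "next(): ++mPos"
            if (pred(tiles[pos]))                   // skip tiles that fail the predicate
                visit(pos, tiles[pos]);
    }

    int main()
    {
        const ToyTile tiles[4] = {
            { 64, 0.0f, false}, // child tile
            {  0, 1.5f, true }, // active value tile
            {  0, 0.0f, false}, // inactive value tile
            {128, 0.0f, false}, // child tile
        };
        // equivalent of beginChild(): visits positions 0 and 3
        forEachTile(tiles, 4, [](const ToyTile& t) { return t.child != 0; },
                    [](uint32_t pos, const ToyTile&) { std::printf("child tile at %u\n", pos); });
        // equivalent of beginValueOn(): visits position 1 only
        forEachTile(tiles, 4, [](const ToyTile& t) { return t.child == 0 && t.state; },
                    [](uint32_t pos, const ToyTile& t) { std::printf("active value %.1f at %u\n", t.value, pos); });
        return 0;
    }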
using DenseIterator = DenseIter; + using ConstDenseIterator = DenseIter; + + __hostdev__ DenseIterator beginDense() { return DenseIterator(this); } + __hostdev__ ConstDenseIterator cbeginDense() const { return ConstDenseIterator(this); } + __hostdev__ ConstDenseIterator cbeginChildAll() const { return ConstDenseIterator(this); } + + /// @brief This class cannot be constructed or deleted + RootNode() = delete; + RootNode(const RootNode&) = delete; + RootNode& operator=(const RootNode&) = delete; + ~RootNode() = delete; + + __hostdev__ AccessorType getAccessor() const { return AccessorType(*this); } + + __hostdev__ DataType* data() { return reinterpret_cast(this); } + + __hostdev__ const DataType* data() const { return reinterpret_cast(this); } + + /// @brief Return a const reference to the index bounding box of all the active values in this tree, i.e. in all nodes of the tree + __hostdev__ const BBoxType& bbox() const { return DataType::mBBox; } + + /// @brief Return the total number of active voxels in the root and all its child nodes. + + /// @brief Return a const reference to the background value, i.e. the value associated with + /// any coordinate location that has not been set explicitly. + __hostdev__ const ValueType& background() const { return DataType::mBackground; } + + /// @brief Return the number of tiles encoded in this root node + __hostdev__ const uint32_t& tileCount() const { return DataType::mTableSize; } + __hostdev__ const uint32_t& getTableSize() const { return DataType::mTableSize; } + + /// @brief Return a const reference to the minimum active value encoded in this root node and any of its child nodes + __hostdev__ const ValueType& minimum() const { return DataType::mMinimum; } + + /// @brief Return a const reference to the maximum active value encoded in this root node and any of its child nodes + __hostdev__ const ValueType& maximum() const { return DataType::mMaximum; } + + /// @brief Return a const reference to the average of all the active values encoded in this root node and any of its child nodes + __hostdev__ const FloatType& average() const { return DataType::mAverage; } + + /// @brief Return the variance of all the active values encoded in this root node and any of its child nodes + __hostdev__ FloatType variance() const { return math::Pow2(DataType::mStdDevi); } + + /// @brief Return a const reference to the standard deviation of all the active values encoded in this root node and any of its child nodes + __hostdev__ const FloatType& stdDeviation() const { return DataType::mStdDevi; } + + /// @brief Return the expected memory footprint in bytes with the specified number of tiles + __hostdev__ static uint64_t memUsage(uint32_t tableSize) { return sizeof(RootNode) + tableSize * sizeof(Tile); } + + /// @brief Return the actual memory footprint of this root node + __hostdev__ uint64_t memUsage() const { return sizeof(RootNode) + DataType::mTableSize * sizeof(Tile); } + + /// @brief Return true if this RootNode is empty, i.e. 
contains no values or nodes + __hostdev__ bool isEmpty() const { return DataType::mTableSize == uint32_t(0); } + +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + /// @brief Return the value of the given voxel + __hostdev__ ValueType getValue(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ bool isActive(const CoordType& ijk) const { return this->template get>(ijk); } + /// @brief return the state and updates the value of the specified voxel + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->template get>(ijk, v); } + __hostdev__ const LeafNodeType* probeLeaf(const CoordType& ijk) const { return this->template get>(ijk); } +#else // NANOVDB_NEW_ACCESSOR_METHODS + + /// @brief Return the value of the given voxel + __hostdev__ ValueType getValue(const CoordType& ijk) const + { + if (const Tile* tile = DataType::probeTile(ijk)) { + return tile->isChild() ? this->getChild(tile)->getValue(ijk) : tile->value; + } + return DataType::mBackground; + } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->getValue(CoordType(i, j, k)); } + + __hostdev__ bool isActive(const CoordType& ijk) const + { + if (const Tile* tile = DataType::probeTile(ijk)) { + return tile->isChild() ? this->getChild(tile)->isActive(ijk) : tile->state; + } + return false; + } + + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const + { + if (const Tile* tile = DataType::probeTile(ijk)) { + if (tile->isChild()) { + const auto* child = this->getChild(tile); + return child->probeValue(ijk, v); + } + v = tile->value; + return tile->state; + } + v = DataType::mBackground; + return false; + } + + __hostdev__ const LeafNodeType* probeLeaf(const CoordType& ijk) const + { + const Tile* tile = DataType::probeTile(ijk); + if (tile && tile->isChild()) { + const auto* child = this->getChild(tile); + return child->probeLeaf(ijk); + } + return nullptr; + } + +#endif // NANOVDB_NEW_ACCESSOR_METHODS + + __hostdev__ const ChildNodeType* probeChild(const CoordType& ijk) const + { + const Tile* tile = DataType::probeTile(ijk); + return tile && tile->isChild() ? this->getChild(tile) : nullptr; + } + + __hostdev__ ChildNodeType* probeChild(const CoordType& ijk) + { + const Tile* tile = DataType::probeTile(ijk); + return tile && tile->isChild() ? this->getChild(tile) : nullptr; + } + + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... args) const + { + if (const Tile* tile = this->probeTile(ijk)) { + if (tile->isChild()) + return this->getChild(tile)->template get(ijk, args...); + return OpT::get(*tile, args...); + } + return OpT::get(*this, args...); + } + + template + // __hostdev__ auto // occasionally fails with NVCC + __hostdev__ decltype(OpT::set(util::declval(), util::declval()...)) + set(const CoordType& ijk, ArgsT&&... 
args) + { + if (Tile* tile = DataType::probeTile(ijk)) { + if (tile->isChild()) + return this->getChild(tile)->template set(ijk, args...); + return OpT::set(*tile, args...); + } + return OpT::set(*this, args...); + } + +private: + static_assert(sizeof(DataType) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(RootData) is misaligned"); + static_assert(sizeof(typename DataType::Tile) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(RootData::Tile) is misaligned"); + + template + friend class ReadAccessor; + + template + friend class Tree; +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + /// @brief Private method to return node information and update a ReadAccessor + template + __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(const CoordType& ijk, const AccT& acc) const + { + using NodeInfoT = typename AccT::NodeInfo; + if (const Tile* tile = this->probeTile(ijk)) { + if (tile->isChild()) { + const auto* child = this->getChild(tile); + acc.insert(ijk, child); + return child->getNodeInfoAndCache(ijk, acc); + } + return NodeInfoT{LEVEL, ChildT::dim(), tile->value, tile->value, tile->value, 0, tile->origin(), tile->origin() + CoordType(ChildT::DIM)}; + } + return NodeInfoT{LEVEL, ChildT::dim(), this->minimum(), this->maximum(), this->average(), this->stdDeviation(), this->bbox()[0], this->bbox()[1]}; + } + + /// @brief Private method to return a voxel value and update a ReadAccessor + template + __hostdev__ ValueType getValueAndCache(const CoordType& ijk, const AccT& acc) const + { + if (const Tile* tile = this->probeTile(ijk)) { + if (tile->isChild()) { + const auto* child = this->getChild(tile); + acc.insert(ijk, child); + return child->getValueAndCache(ijk, acc); + } + return tile->value; + } + return DataType::mBackground; + } + + template + __hostdev__ bool isActiveAndCache(const CoordType& ijk, const AccT& acc) const + { + const Tile* tile = this->probeTile(ijk); + if (tile && tile->isChild()) { + const auto* child = this->getChild(tile); + acc.insert(ijk, child); + return child->isActiveAndCache(ijk, acc); + } + return false; + } + + template + __hostdev__ bool probeValueAndCache(const CoordType& ijk, ValueType& v, const AccT& acc) const + { + if (const Tile* tile = this->probeTile(ijk)) { + if (tile->isChild()) { + const auto* child = this->getChild(tile); + acc.insert(ijk, child); + return child->probeValueAndCache(ijk, v, acc); + } + v = tile->value; + return tile->state; + } + v = DataType::mBackground; + return false; + } + + template + __hostdev__ const LeafNodeType* probeLeafAndCache(const CoordType& ijk, const AccT& acc) const + { + const Tile* tile = this->probeTile(ijk); + if (tile && tile->isChild()) { + const auto* child = this->getChild(tile); + acc.insert(ijk, child); + return child->probeLeafAndCache(ijk, acc); + } + return nullptr; + } +#endif // NANOVDB_NEW_ACCESSOR_METHODS + + template + __hostdev__ uint32_t getDimAndCache(const CoordType& ijk, const RayT& ray, const AccT& acc) const + { + if (const Tile* tile = this->probeTile(ijk)) { + if (tile->isChild()) { + const auto* child = this->getChild(tile); + acc.insert(ijk, child); + return child->getDimAndCache(ijk, ray, acc); + } + return 1 << ChildT::TOTAL; //tile value + } + return ChildNodeType::dim(); // background + } + + template + //__hostdev__ decltype(OpT::get(util::declval(), util::declval()...)) + __hostdev__ auto + getAndCache(const CoordType& ijk, const AccT& acc, ArgsT&&... 
args) const + { + if (const Tile* tile = this->probeTile(ijk)) { + if (tile->isChild()) { + const ChildT* child = this->getChild(tile); + acc.insert(ijk, child); + return child->template getAndCache(ijk, acc, args...); + } + return OpT::get(*tile, args...); + } + return OpT::get(*this, args...); + } + + template + // __hostdev__ auto // occasionally fails with NVCC + __hostdev__ decltype(OpT::set(util::declval(), util::declval()...)) + setAndCache(const CoordType& ijk, const AccT& acc, ArgsT&&... args) + { + if (Tile* tile = DataType::probeTile(ijk)) { + if (tile->isChild()) { + ChildT* child = this->getChild(tile); + acc.insert(ijk, child); + return child->template setAndCache(ijk, acc, args...); + } + return OpT::set(*tile, args...); + } + return OpT::set(*this, args...); + } + +}; // RootNode class + +// After the RootNode the memory layout is assumed to be the sorted Tiles + +// --------------------------> InternalNode <------------------------------------ + +/// @brief Struct with all the member data of the InternalNode (useful during serialization of an openvdb InternalNode) +/// +/// @note No client code should (or can) interface with this struct so it can safely be ignored! +template +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData +{ + using ValueT = typename ChildT::ValueType; + using BuildT = typename ChildT::BuildType; // in rare cases BuildType != ValueType, e.g. then BuildType = ValueMask and ValueType = bool + using StatsT = typename ChildT::FloatType; + using CoordT = typename ChildT::CoordType; + using MaskT = typename ChildT::template MaskType; + static constexpr bool FIXED_SIZE = true; + + union Tile + { + ValueT value; + int64_t child; //signed 64 bit byte offset relative to this InternalData, i.e. child-pointer = Tile::child + this + /// @brief This class cannot be constructed or deleted + Tile() = delete; + Tile(const Tile&) = delete; + Tile& operator=(const Tile&) = delete; + ~Tile() = delete; + }; + + math::BBox mBBox; // 24B. node bounding box. | + uint64_t mFlags; // 8B. node flags. | 32B aligned + MaskT mValueMask; // LOG2DIM(5): 4096B, LOG2DIM(4): 512B | 32B aligned + MaskT mChildMask; // LOG2DIM(5): 4096B, LOG2DIM(4): 512B | 32B aligned + + ValueT mMinimum; // typically 4B + ValueT mMaximum; // typically 4B + StatsT mAverage; // typically 4B, average of all the active values in this node and its child nodes + StatsT mStdDevi; // typically 4B, standard deviation of all the active values in this node and its child nodes + // possible padding, e.g. 28 byte padding when ValueType = bool + + /// @brief Return padding of this class in bytes, due to aliasing and 32B alignment + /// + /// @note The extra bytes are not necessarily at the end, but can come from aliasing of individual data members. + __hostdev__ static constexpr uint32_t padding() + { + return sizeof(InternalData) - (24u + 8u + 2 * (sizeof(MaskT) + sizeof(ValueT) + sizeof(StatsT)) + (1u << (3 * LOG2DIM)) * (sizeof(ValueT) > 8u ? sizeof(ValueT) : 8u)); + } + alignas(32) Tile mTable[1u << (3 * LOG2DIM)]; // sizeof(ValueT) x (16*16*16 or 32*32*32) + + __hostdev__ static uint64_t memUsage() { return sizeof(InternalData); } + + __hostdev__ void setChild(uint32_t n, const void* ptr) + { + NANOVDB_ASSERT(mChildMask.isOn(n)); + mTable[n].child = util::PtrDiff(ptr, this); + } + + template + __hostdev__ void setValue(uint32_t n, const ValueT& v) + { + NANOVDB_ASSERT(!mChildMask.isOn(n)); + mTable[n].value = v; + } + + /// @brief Returns a pointer to the child node at the specifed linear offset. 
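Unlike the root, InternalData has a fixed table of 2^(3*LOG2DIM) tiles and uses mChildMask rather than a zero offset to tell value tiles from child tiles: setChild flips the bit and stores a byte offset in the union, setValue leaves the bit off and stores the value. A small standalone sketch of that mask-plus-union bookkeeping (illustrative only, and much smaller than a real node):

    #include <cstdint>
    #include <cassert>

    // A toy fixed-size node: a bitmask says which slots hold child offsets and
    // which hold plain values, mirroring InternalData::setChild/setValue/getValue.
    struct ToyInternal
    {
        union Tile { float value; int64_t child; };
        uint64_t childMask = 0;   // bit n set -> table[n] holds a child offset
        Tile     table[64] = {};  // 4^3 tiles, far fewer than a real node

        void setValue(uint32_t n, float v)        { childMask &= ~(uint64_t(1) << n); table[n].value = v; }
        void setChild(uint32_t n, int64_t offset) { childMask |=  (uint64_t(1) << n); table[n].child = offset; }
        bool isChild(uint32_t n) const            { return (childMask >> n) & 1u; }
        float   getValue(uint32_t n) const        { assert(!isChild(n)); return table[n].value; }
        int64_t getChildOffset(uint32_t n) const  { assert( isChild(n)); return table[n].child; }
    };

    int main()
    {
        ToyInternal node;
        node.setValue(3, 2.5f);
        node.setChild(7, 4096);  // pretend the child node lives 4096 bytes away

        assert(!node.isChild(3) && node.getValue(3) == 2.5f);
        assert( node.isChild(7) && node.getChildOffset(7) == 4096);
        return 0;
    }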
+ __hostdev__ ChildT* getChild(uint32_t n) + { + NANOVDB_ASSERT(mChildMask.isOn(n)); + return util::PtrAdd(this, mTable[n].child); + } + __hostdev__ const ChildT* getChild(uint32_t n) const + { + NANOVDB_ASSERT(mChildMask.isOn(n)); + return util::PtrAdd(this, mTable[n].child); + } + + __hostdev__ ValueT getValue(uint32_t n) const + { + NANOVDB_ASSERT(mChildMask.isOff(n)); + return mTable[n].value; + } + + __hostdev__ bool isActive(uint32_t n) const + { + NANOVDB_ASSERT(mChildMask.isOff(n)); + return mValueMask.isOn(n); + } + + __hostdev__ bool isChild(uint32_t n) const { return mChildMask.isOn(n); } + + template + __hostdev__ void setOrigin(const T& ijk) { mBBox[0] = ijk; } + + __hostdev__ const ValueT& getMin() const { return mMinimum; } + __hostdev__ const ValueT& getMax() const { return mMaximum; } + __hostdev__ const StatsT& average() const { return mAverage; } + __hostdev__ const StatsT& stdDeviation() const { return mStdDevi; } + +#if defined(__GNUC__) && !defined(__APPLE__) && !defined(__llvm__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif + __hostdev__ void setMin(const ValueT& v) { mMinimum = v; } + __hostdev__ void setMax(const ValueT& v) { mMaximum = v; } + __hostdev__ void setAvg(const StatsT& v) { mAverage = v; } + __hostdev__ void setDev(const StatsT& v) { mStdDevi = v; } +#if defined(__GNUC__) && !defined(__APPLE__) && !defined(__llvm__) +#pragma GCC diagnostic pop +#endif + + /// @brief This class cannot be constructed or deleted + InternalData() = delete; + InternalData(const InternalData&) = delete; + InternalData& operator=(const InternalData&) = delete; + ~InternalData() = delete; +}; // InternalData + +/// @brief Internal nodes of a VDB tree +template +class InternalNode : public InternalData +{ +public: + using DataType = InternalData; + using ValueType = typename DataType::ValueT; + using FloatType = typename DataType::StatsT; + using BuildType = typename DataType::BuildT; // in rare cases BuildType != ValueType, e.g. 
then BuildType = ValueMask and ValueType = bool + using LeafNodeType = typename ChildT::LeafNodeType; + using ChildNodeType = ChildT; + using CoordType = typename ChildT::CoordType; + static constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; + template + using MaskType = typename ChildT::template MaskType; + template + using MaskIterT = typename Mask::template Iterator; + + static constexpr uint32_t LOG2DIM = Log2Dim; + static constexpr uint32_t TOTAL = LOG2DIM + ChildT::TOTAL; // dimension in index space + static constexpr uint32_t DIM = 1u << TOTAL; // number of voxels along each axis of this node + static constexpr uint32_t SIZE = 1u << (3 * LOG2DIM); // number of tile values (or child pointers) + static constexpr uint32_t MASK = (1u << TOTAL) - 1u; + static constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf + static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node + + /// @brief Visits child nodes of this node only + template + class ChildIter : public MaskIterT + { + static_assert(util::is_same::type, InternalNode>::value, "Invalid ParentT"); + using BaseT = MaskIterT; + using NodeT = typename util::match_const::type; + ParentT* mParent; + + public: + __hostdev__ ChildIter() + : BaseT() + , mParent(nullptr) + { + } + __hostdev__ ChildIter(ParentT* parent) + : BaseT(parent->mChildMask.beginOn()) + , mParent(parent) + { + } + ChildIter& operator=(const ChildIter&) = default; + __hostdev__ NodeT& operator*() const + { + NANOVDB_ASSERT(*this); + return *mParent->getChild(BaseT::pos()); + } + __hostdev__ NodeT* operator->() const + { + NANOVDB_ASSERT(*this); + return mParent->getChild(BaseT::pos()); + } + __hostdev__ CoordType getOrigin() const + { + NANOVDB_ASSERT(*this); + return (*this)->origin(); + } + __hostdev__ CoordType getCoord() const {return this->getOrigin();} + }; // Member class ChildIter + + using ChildIterator = ChildIter; + using ConstChildIterator = ChildIter; + + __hostdev__ ChildIterator beginChild() { return ChildIterator(this); } + __hostdev__ ConstChildIterator cbeginChild() const { return ConstChildIterator(this); } + + /// @brief Visits all tile values in this node, i.e. 
both inactive and active tiles + class ValueIterator : public MaskIterT + { + using BaseT = MaskIterT; + const InternalNode* mParent; + + public: + __hostdev__ ValueIterator() + : BaseT() + , mParent(nullptr) + { + } + __hostdev__ ValueIterator(const InternalNode* parent) + : BaseT(parent->data()->mChildMask.beginOff()) + , mParent(parent) + { + } + ValueIterator& operator=(const ValueIterator&) = default; + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return mParent->data()->getValue(BaseT::pos()); + } + __hostdev__ CoordType getOrigin() const + { + NANOVDB_ASSERT(*this); + return mParent->offsetToGlobalCoord(BaseT::pos()); + } + __hostdev__ CoordType getCoord() const {return this->getOrigin();} + __hostdev__ bool isActive() const + { + NANOVDB_ASSERT(*this); + return mParent->data()->isActive(BaseT::mPos); + } + }; // Member class ValueIterator + + __hostdev__ ValueIterator beginValue() const { return ValueIterator(this); } + __hostdev__ ValueIterator cbeginValueAll() const { return ValueIterator(this); } + + /// @brief Visits active tile values of this node only + class ValueOnIterator : public MaskIterT + { + using BaseT = MaskIterT; + const InternalNode* mParent; + + public: + __hostdev__ ValueOnIterator() + : BaseT() + , mParent(nullptr) + { + } + __hostdev__ ValueOnIterator(const InternalNode* parent) + : BaseT(parent->data()->mValueMask.beginOn()) + , mParent(parent) + { + } + ValueOnIterator& operator=(const ValueOnIterator&) = default; + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return mParent->data()->getValue(BaseT::pos()); + } + __hostdev__ CoordType getOrigin() const + { + NANOVDB_ASSERT(*this); + return mParent->offsetToGlobalCoord(BaseT::pos()); + } + __hostdev__ CoordType getCoord() const {return this->getOrigin();} + }; // Member class ValueOnIterator + + __hostdev__ ValueOnIterator beginValueOn() const { return ValueOnIterator(this); } + __hostdev__ ValueOnIterator cbeginValueOn() const { return ValueOnIterator(this); } + + /// @brief Visits all tile values and child nodes of this node + class DenseIterator : public Mask::DenseIterator + { + using BaseT = typename Mask::DenseIterator; + const DataType* mParent; + + public: + __hostdev__ DenseIterator() + : BaseT() + , mParent(nullptr) + { + } + __hostdev__ DenseIterator(const InternalNode* parent) + : BaseT(0) + , mParent(parent->data()) + { + } + DenseIterator& operator=(const DenseIterator&) = default; + __hostdev__ const ChildT* probeChild(ValueType& value) const + { + NANOVDB_ASSERT(mParent && bool(*this)); + const ChildT* child = nullptr; + if (mParent->mChildMask.isOn(BaseT::pos())) { + child = mParent->getChild(BaseT::pos()); + } else { + value = mParent->getValue(BaseT::pos()); + } + return child; + } + __hostdev__ bool isValueOn() const + { + NANOVDB_ASSERT(mParent && bool(*this)); + return mParent->isActive(BaseT::pos()); + } + __hostdev__ CoordType getOrigin() const + { + NANOVDB_ASSERT(mParent && bool(*this)); + return mParent->offsetToGlobalCoord(BaseT::pos()); + } + __hostdev__ CoordType getCoord() const {return this->getOrigin();} + }; // Member class DenseIterator + + __hostdev__ DenseIterator beginDense() const { return DenseIterator(this); } + __hostdev__ DenseIterator cbeginChildAll() const { return DenseIterator(this); } // matches openvdb + + /// @brief This class cannot be constructed or deleted + InternalNode() = delete; + InternalNode(const InternalNode&) = delete; + InternalNode& operator=(const InternalNode&) = delete; + ~InternalNode() = 
delete; + + __hostdev__ DataType* data() { return reinterpret_cast(this); } + + __hostdev__ const DataType* data() const { return reinterpret_cast(this); } + + /// @brief Return the dimension, in voxel units, of this internal node (typically 8*16 or 8*16*32) + __hostdev__ static uint32_t dim() { return 1u << TOTAL; } + + /// @brief Return memory usage in bytes for the class + __hostdev__ static size_t memUsage() { return DataType::memUsage(); } + + /// @brief Return a const reference to the bit mask of active voxels in this internal node + __hostdev__ const MaskType& valueMask() const { return DataType::mValueMask; } + __hostdev__ const MaskType& getValueMask() const { return DataType::mValueMask; } + + /// @brief Return a const reference to the bit mask of child nodes in this internal node + __hostdev__ const MaskType& childMask() const { return DataType::mChildMask; } + __hostdev__ const MaskType& getChildMask() const { return DataType::mChildMask; } + + /// @brief Return the origin in index space of this leaf node + __hostdev__ CoordType origin() const { return DataType::mBBox.min() & ~MASK; } + + /// @brief Return a const reference to the minimum active value encoded in this internal node and any of its child nodes + __hostdev__ const ValueType& minimum() const { return this->getMin(); } + + /// @brief Return a const reference to the maximum active value encoded in this internal node and any of its child nodes + __hostdev__ const ValueType& maximum() const { return this->getMax(); } + + /// @brief Return a const reference to the average of all the active values encoded in this internal node and any of its child nodes + __hostdev__ const FloatType& average() const { return DataType::mAverage; } + + /// @brief Return the variance of all the active values encoded in this internal node and any of its child nodes + __hostdev__ FloatType variance() const { return DataType::mStdDevi * DataType::mStdDevi; } + + /// @brief Return a const reference to the standard deviation of all the active values encoded in this internal node and any of its child nodes + __hostdev__ const FloatType& stdDeviation() const { return DataType::mStdDevi; } + + /// @brief Return a const reference to the bounding box in index space of active values in this internal node and any of its child nodes + __hostdev__ const math::BBox& bbox() const { return DataType::mBBox; } + + /// @brief If the first entry in this node's table is a tile, return the tile's value. + /// Otherwise, return the result of calling getFirstValue() on the child. + __hostdev__ ValueType getFirstValue() const + { + return DataType::mChildMask.isOn(0) ? this->getChild(0)->getFirstValue() : DataType::getValue(0); + } + + /// @brief If the last entry in this node's table is a tile, return the tile's value. + /// Otherwise, return the result of calling getLastValue() on the child. + __hostdev__ ValueType getLastValue() const + { + return DataType::mChildMask.isOn(SIZE - 1) ? 
this->getChild(SIZE - 1)->getLastValue() : DataType::getValue(SIZE - 1); + } + +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + /// @brief Return the value of the given voxel + __hostdev__ ValueType getValue(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool isActive(const CoordType& ijk) const { return this->template get>(ijk); } + /// @brief return the state and updates the value of the specified voxel + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->template get>(ijk, v); } + __hostdev__ const LeafNodeType* probeLeaf(const CoordType& ijk) const { return this->template get>(ijk); } +#else // NANOVDB_NEW_ACCESSOR_METHODS + __hostdev__ ValueType getValue(const CoordType& ijk) const + { + const uint32_t n = CoordToOffset(ijk); + return DataType::mChildMask.isOn(n) ? this->getChild(n)->getValue(ijk) : DataType::getValue(n); + } + __hostdev__ bool isActive(const CoordType& ijk) const + { + const uint32_t n = CoordToOffset(ijk); + return DataType::mChildMask.isOn(n) ? this->getChild(n)->isActive(ijk) : DataType::isActive(n); + } + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const + { + const uint32_t n = CoordToOffset(ijk); + if (DataType::mChildMask.isOn(n)) + return this->getChild(n)->probeValue(ijk, v); + v = DataType::getValue(n); + return DataType::isActive(n); + } + __hostdev__ const LeafNodeType* probeLeaf(const CoordType& ijk) const + { + const uint32_t n = CoordToOffset(ijk); + if (DataType::mChildMask.isOn(n)) + return this->getChild(n)->probeLeaf(ijk); + return nullptr; + } + +#endif // NANOVDB_NEW_ACCESSOR_METHODS + + __hostdev__ ChildNodeType* probeChild(const CoordType& ijk) + { + const uint32_t n = CoordToOffset(ijk); + return DataType::mChildMask.isOn(n) ? this->getChild(n) : nullptr; + } + __hostdev__ const ChildNodeType* probeChild(const CoordType& ijk) const + { + const uint32_t n = CoordToOffset(ijk); + return DataType::mChildMask.isOn(n) ? this->getChild(n) : nullptr; + } + + /// @brief Return the linear offset corresponding to the given coordinate + __hostdev__ static uint32_t CoordToOffset(const CoordType& ijk) + { + return (((ijk[0] & MASK) >> ChildT::TOTAL) << (2 * LOG2DIM)) | // note, we're using bitwise OR instead of + + (((ijk[1] & MASK) >> ChildT::TOTAL) << (LOG2DIM)) | + ((ijk[2] & MASK) >> ChildT::TOTAL); + } + + /// @return the local coordinate of the n'th tile or child node + __hostdev__ static Coord OffsetToLocalCoord(uint32_t n) + { + NANOVDB_ASSERT(n < SIZE); + const uint32_t m = n & ((1 << 2 * LOG2DIM) - 1); + return Coord(n >> 2 * LOG2DIM, m >> LOG2DIM, m & ((1 << LOG2DIM) - 1)); + } + + /// @brief modifies local coordinates to global coordinates of a tile or child node + __hostdev__ void localToGlobalCoord(Coord& ijk) const + { + ijk <<= ChildT::TOTAL; + ijk += this->origin(); + } + + __hostdev__ Coord offsetToGlobalCoord(uint32_t n) const + { + Coord ijk = InternalNode::OffsetToLocalCoord(n); + this->localToGlobalCoord(ijk); + return ijk; + } + + /// @brief Return true if this node or any of its child nodes contain active values + __hostdev__ bool isActive() const { return DataType::mFlags & uint32_t(2); } + + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... 
args) const + { + const uint32_t n = CoordToOffset(ijk); + if (this->isChild(n)) + return this->getChild(n)->template get(ijk, args...); + return OpT::get(*this, n, args...); + } + + template + //__hostdev__ auto // occasionally fails with NVCC + __hostdev__ decltype(OpT::set(util::declval(), util::declval(), util::declval()...)) + set(const CoordType& ijk, ArgsT&&... args) + { + const uint32_t n = CoordToOffset(ijk); + if (this->isChild(n)) + return this->getChild(n)->template set(ijk, args...); + return OpT::set(*this, n, args...); + } + +private: + static_assert(sizeof(DataType) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(InternalData) is misaligned"); + + template + friend class ReadAccessor; + + template + friend class RootNode; + template + friend class InternalNode; + +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + /// @brief Private read access method used by the ReadAccessor + template + __hostdev__ ValueType getValueAndCache(const CoordType& ijk, const AccT& acc) const + { + const uint32_t n = CoordToOffset(ijk); + if (DataType::mChildMask.isOff(n)) + return DataType::getValue(n); + const ChildT* child = this->getChild(n); + acc.insert(ijk, child); + return child->getValueAndCache(ijk, acc); + } + template + __hostdev__ bool isActiveAndCache(const CoordType& ijk, const AccT& acc) const + { + const uint32_t n = CoordToOffset(ijk); + if (DataType::mChildMask.isOff(n)) + return DataType::isActive(n); + const ChildT* child = this->getChild(n); + acc.insert(ijk, child); + return child->isActiveAndCache(ijk, acc); + } + template + __hostdev__ bool probeValueAndCache(const CoordType& ijk, ValueType& v, const AccT& acc) const + { + const uint32_t n = CoordToOffset(ijk); + if (DataType::mChildMask.isOff(n)) { + v = DataType::getValue(n); + return DataType::isActive(n); + } + const ChildT* child = this->getChild(n); + acc.insert(ijk, child); + return child->probeValueAndCache(ijk, v, acc); + } + template + __hostdev__ const LeafNodeType* probeLeafAndCache(const CoordType& ijk, const AccT& acc) const + { + const uint32_t n = CoordToOffset(ijk); + if (DataType::mChildMask.isOff(n)) + return nullptr; + const ChildT* child = this->getChild(n); + acc.insert(ijk, child); + return child->probeLeafAndCache(ijk, acc); + } + template + __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(const CoordType& ijk, const AccT& acc) const + { + using NodeInfoT = typename AccT::NodeInfo; + const uint32_t n = CoordToOffset(ijk); + if (DataType::mChildMask.isOff(n)) { + return NodeInfoT{LEVEL, this->dim(), this->minimum(), this->maximum(), this->average(), this->stdDeviation(), this->bbox()[0], this->bbox()[1]}; + } + const ChildT* child = this->getChild(n); + acc.insert(ijk, child); + return child->getNodeInfoAndCache(ijk, acc); + } +#endif // NANOVDB_NEW_ACCESSOR_METHODS + + template + __hostdev__ uint32_t getDimAndCache(const CoordType& ijk, const RayT& ray, const AccT& acc) const + { + if (DataType::mFlags & uint32_t(1u)) + return this->dim(); // skip this node if the 1st bit is set + //if (!ray.intersects( this->bbox() )) return 1<getChild(n); + acc.insert(ijk, child); + return child->getDimAndCache(ijk, ray, acc); + } + return ChildNodeType::dim(); // tile value + } + + template + __hostdev__ auto + //__hostdev__ decltype(OpT::get(util::declval(), util::declval(), util::declval()...)) + getAndCache(const CoordType& ijk, const AccT& acc, ArgsT&&... 
args) const + { + const uint32_t n = CoordToOffset(ijk); + if (DataType::mChildMask.isOff(n)) + return OpT::get(*this, n, args...); + const ChildT* child = this->getChild(n); + acc.insert(ijk, child); + return child->template getAndCache(ijk, acc, args...); + } + + template + //__hostdev__ auto // occasionally fails with NVCC + __hostdev__ decltype(OpT::set(util::declval(), util::declval(), util::declval()...)) + setAndCache(const CoordType& ijk, const AccT& acc, ArgsT&&... args) + { + const uint32_t n = CoordToOffset(ijk); + if (DataType::mChildMask.isOff(n)) + return OpT::set(*this, n, args...); + ChildT* child = this->getChild(n); + acc.insert(ijk, child); + return child->template setAndCache(ijk, acc, args...); + } + +}; // InternalNode class + +// --------------------------> LeafData <------------------------------------ + +/// @brief Stuct with all the member data of the LeafNode (useful during serialization of an openvdb LeafNode) +/// +/// @note No client code should (or can) interface with this struct so it can safely be ignored! +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +{ + static_assert(sizeof(CoordT) == sizeof(Coord), "Mismatching sizeof"); + static_assert(sizeof(MaskT) == sizeof(Mask), "Mismatching sizeof"); + using ValueType = ValueT; + using BuildType = ValueT; + using FloatType = typename FloatTraits::FloatType; + using ArrayType = ValueT; // type used for the internal mValue array + static constexpr bool FIXED_SIZE = true; + + CoordT mBBoxMin; // 12B. + uint8_t mBBoxDif[3]; // 3B. + uint8_t mFlags; // 1B. bit0: skip render?, bit1: has bbox?, bit3: unused, bit4: has stats, bits5,6,7: bit-width for FpN + MaskT mValueMask; // LOG2DIM(3): 64B. + + ValueType mMinimum; // typically 4B + ValueType mMaximum; // typically 4B + FloatType mAverage; // typically 4B, average of all the active values in this node and its child nodes + FloatType mStdDevi; // typically 4B, standard deviation of all the active values in this node and its child nodes + alignas(32) ValueType mValues[1u << 3 * LOG2DIM]; + + /// @brief Return padding of this class in bytes, due to aliasing and 32B alignment + /// + /// @note The extra bytes are not necessarily at the end, but can come from aliasing of individual data members. 
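InternalNode::CoordToOffset above masks each axis to the node's extent, drops the child's TOTAL low bits and concatenates the three LOG2DIM-bit indices (x highest, z lowest); OffsetToLocalCoord inverts it. Here is a standalone round-trip sketch with LOG2DIM = 4 and a child TOTAL of 3, the lower-internal-node defaults (names are illustrative):

    #include <cstdint>
    #include <cassert>

    // Mirror InternalNode::CoordToOffset / OffsetToLocalCoord for a node with
    // LOG2DIM = 4 whose children each span 2^CHILD_TOTAL = 8 voxels per axis.
    constexpr uint32_t LOG2DIM     = 4;
    constexpr uint32_t CHILD_TOTAL = 3;
    constexpr uint32_t TOTAL       = LOG2DIM + CHILD_TOTAL; // 7 -> 128 voxels per axis
    constexpr uint32_t MASK        = (1u << TOTAL) - 1u;    // 127

    uint32_t coordToOffset(int32_t i, int32_t j, int32_t k)
    {
        return (((i & MASK) >> CHILD_TOTAL) << (2 * LOG2DIM)) | // x: highest bits
               (((j & MASK) >> CHILD_TOTAL) << LOG2DIM)       | // y: middle bits
                ((k & MASK) >> CHILD_TOTAL);                     // z: lowest bits
    }

    void offsetToLocalCoord(uint32_t n, uint32_t& x, uint32_t& y, uint32_t& z)
    {
        const uint32_t m = n & ((1u << 2 * LOG2DIM) - 1u);
        x = n >> (2 * LOG2DIM);
        y = m >> LOG2DIM;
        z = m & ((1u << LOG2DIM) - 1u);
    }

    int main()
    {
        // voxel (37, 100, 9) inside a 128^3 node falls in child (4, 12, 1)
        const uint32_t n = coordToOffset(37, 100, 9);
        uint32_t x, y, z;
        offsetToLocalCoord(n, x, y, z);
        assert(x == 4 && y == 12 && z == 1);
        return 0;
    }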
+ __hostdev__ static constexpr uint32_t padding() + { + return sizeof(LeafData) - (12 + 3 + 1 + sizeof(MaskT) + 2 * (sizeof(ValueT) + sizeof(FloatType)) + (1u << (3 * LOG2DIM)) * sizeof(ValueT)); + } + __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } + + __hostdev__ static bool hasStats() { return true; } + + __hostdev__ ValueType getValue(uint32_t i) const { return mValues[i]; } + __hostdev__ void setValueOnly(uint32_t offset, const ValueType& value) { mValues[offset] = value; } + __hostdev__ void setValue(uint32_t offset, const ValueType& value) + { + mValueMask.setOn(offset); + mValues[offset] = value; + } + __hostdev__ void setOn(uint32_t offset) { mValueMask.setOn(offset); } + + __hostdev__ ValueType getMin() const { return mMinimum; } + __hostdev__ ValueType getMax() const { return mMaximum; } + __hostdev__ FloatType getAvg() const { return mAverage; } + __hostdev__ FloatType getDev() const { return mStdDevi; } + + __hostdev__ void setMin(const ValueType& v) { mMinimum = v; } + __hostdev__ void setMax(const ValueType& v) { mMaximum = v; } + __hostdev__ void setAvg(const FloatType& v) { mAverage = v; } + __hostdev__ void setDev(const FloatType& v) { mStdDevi = v; } + + template + __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } + + __hostdev__ void fill(const ValueType& v) + { + for (auto *p = mValues, *q = p + 512; p != q; ++p) + *p = v; + } + + /// @brief This class cannot be constructed or deleted + LeafData() = delete; + LeafData(const LeafData&) = delete; + LeafData& operator=(const LeafData&) = delete; + ~LeafData() = delete; +}; // LeafData + +// --------------------------> LeafFnBase <------------------------------------ + +/// @brief Base-class for quantized float leaf nodes +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafFnBase +{ + static_assert(sizeof(CoordT) == sizeof(Coord), "Mismatching sizeof"); + static_assert(sizeof(MaskT) == sizeof(Mask), "Mismatching sizeof"); + using ValueType = float; + using FloatType = float; + + CoordT mBBoxMin; // 12B. + uint8_t mBBoxDif[3]; // 3B. + uint8_t mFlags; // 1B. bit0: skip render?, bit1: has bbox?, bit3: unused, bit4: has stats, bits5,6,7: bit-width for FpN + MaskT mValueMask; // LOG2DIM(3): 64B. + + float mMinimum; // 4B - minimum of ALL values in this node + float mQuantum; // = (max - min)/15 4B + uint16_t mMin, mMax, mAvg, mDev; // quantized representations of statistics of active values + // no padding since it's always 32B aligned + __hostdev__ static uint64_t memUsage() { return sizeof(LeafFnBase); } + + __hostdev__ static bool hasStats() { return true; } + + /// @brief Return padding of this class in bytes, due to aliasing and 32B alignment + /// + /// @note The extra bytes are not necessarily at the end, but can come from aliasing of individual data members. 
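LeafFnBase keeps one float offset (mMinimum) and one scale (mQuantum = (max - min)/(2^bitWidth - 1)); every per-voxel code and even the node statistics then shrink to small integers that are mapped back as code * mQuantum + mMinimum. A standalone sketch of that encode/decode pair, assuming an 8-bit code purely for illustration:

    #include <cstdint>
    #include <cmath>
    #include <cassert>

    // Linear quantization in the style of LeafFnBase: store (minimum, quantum),
    // encode values as small integer codes, decode as code * quantum + minimum.
    struct Quantizer
    {
        float minimum, quantum;

        void init(float lo, float hi, uint32_t bitWidth)
        {
            minimum = lo;
            quantum = (hi - lo) / float((1u << bitWidth) - 1u);
        }
        uint16_t encode(float v) const { return uint16_t((v - minimum) / quantum + 0.5f); }
        float    decode(uint16_t code) const { return code * quantum + minimum; }
    };

    int main()
    {
        Quantizer q;
        q.init(-1.0f, 3.0f, 8);               // 8-bit codes over [-1, 3]
        const float v = 1.37f;
        const float r = q.decode(q.encode(v));
        // the reconstruction error is bounded by half a quantization step
        assert(std::fabs(r - v) <= 0.5f * q.quantum);
        return 0;
    }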
+ __hostdev__ static constexpr uint32_t padding() + { + return sizeof(LeafFnBase) - (12 + 3 + 1 + sizeof(MaskT) + 2 * 4 + 4 * 2); + } + __hostdev__ void init(float min, float max, uint8_t bitWidth) + { + mMinimum = min; + mQuantum = (max - min) / float((1 << bitWidth) - 1); + } + + __hostdev__ void setOn(uint32_t offset) { mValueMask.setOn(offset); } + + /// @brief return the quantized minimum of the active values in this node + __hostdev__ float getMin() const { return mMin * mQuantum + mMinimum; } + + /// @brief return the quantized maximum of the active values in this node + __hostdev__ float getMax() const { return mMax * mQuantum + mMinimum; } + + /// @brief return the quantized average of the active values in this node + __hostdev__ float getAvg() const { return mAvg * mQuantum + mMinimum; } + /// @brief return the quantized standard deviation of the active values in this node + + /// @note 0 <= StdDev <= max-min or 0 <= StdDev/(max-min) <= 1 + __hostdev__ float getDev() const { return mDev * mQuantum; } + + /// @note min <= X <= max or 0 <= (X-min)/(min-max) <= 1 + __hostdev__ void setMin(float min) { mMin = uint16_t((min - mMinimum) / mQuantum + 0.5f); } + + /// @note min <= X <= max or 0 <= (X-min)/(min-max) <= 1 + __hostdev__ void setMax(float max) { mMax = uint16_t((max - mMinimum) / mQuantum + 0.5f); } + + /// @note min <= avg <= max or 0 <= (avg-min)/(min-max) <= 1 + __hostdev__ void setAvg(float avg) { mAvg = uint16_t((avg - mMinimum) / mQuantum + 0.5f); } + + /// @note 0 <= StdDev <= max-min or 0 <= StdDev/(max-min) <= 1 + __hostdev__ void setDev(float dev) { mDev = uint16_t(dev / mQuantum + 0.5f); } + + template + __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } +}; // LeafFnBase + +// --------------------------> LeafData <------------------------------------ + +/// @brief Stuct with all the member data of the LeafNode (useful during serialization of an openvdb LeafNode) +/// +/// @note No client code should (or can) interface with this struct so it can safely be ignored! +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData + : public LeafFnBase +{ + using BaseT = LeafFnBase; + using BuildType = Fp4; + using ArrayType = uint8_t; // type used for the internal mValue array + static constexpr bool FIXED_SIZE = true; + alignas(32) uint8_t mCode[1u << (3 * LOG2DIM - 1)]; // LeafFnBase is 32B aligned and so is mCode + + __hostdev__ static constexpr uint64_t memUsage() { return sizeof(LeafData); } + __hostdev__ static constexpr uint32_t padding() + { + static_assert(BaseT::padding() == 0, "expected no padding in LeafFnBase"); + return sizeof(LeafData) - sizeof(BaseT) - (1u << (3 * LOG2DIM - 1)); + } + + __hostdev__ static constexpr uint8_t bitWidth() { return 4u; } + __hostdev__ float getValue(uint32_t i) const + { +#if 0 + const uint8_t c = mCode[i>>1]; + return ( (i&1) ? 
c >> 4 : c & uint8_t(15) )*BaseT::mQuantum + BaseT::mMinimum; +#else + return ((mCode[i >> 1] >> ((i & 1) << 2)) & uint8_t(15)) * BaseT::mQuantum + BaseT::mMinimum; +#endif + } + + /// @brief This class cannot be constructed or deleted + LeafData() = delete; + LeafData(const LeafData&) = delete; + LeafData& operator=(const LeafData&) = delete; + ~LeafData() = delete; +}; // LeafData + +// --------------------------> LeafBase <------------------------------------ + +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData + : public LeafFnBase +{ + using BaseT = LeafFnBase; + using BuildType = Fp8; + using ArrayType = uint8_t; // type used for the internal mValue array + static constexpr bool FIXED_SIZE = true; + alignas(32) uint8_t mCode[1u << 3 * LOG2DIM]; + __hostdev__ static constexpr int64_t memUsage() { return sizeof(LeafData); } + __hostdev__ static constexpr uint32_t padding() + { + static_assert(BaseT::padding() == 0, "expected no padding in LeafFnBase"); + return sizeof(LeafData) - sizeof(BaseT) - (1u << 3 * LOG2DIM); + } + + __hostdev__ static constexpr uint8_t bitWidth() { return 8u; } + __hostdev__ float getValue(uint32_t i) const + { + return mCode[i] * BaseT::mQuantum + BaseT::mMinimum; // code * (max-min)/255 + min + } + /// @brief This class cannot be constructed or deleted + LeafData() = delete; + LeafData(const LeafData&) = delete; + LeafData& operator=(const LeafData&) = delete; + ~LeafData() = delete; +}; // LeafData + +// --------------------------> LeafData <------------------------------------ + +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData + : public LeafFnBase +{ + using BaseT = LeafFnBase; + using BuildType = Fp16; + using ArrayType = uint16_t; // type used for the internal mValue array + static constexpr bool FIXED_SIZE = true; + alignas(32) uint16_t mCode[1u << 3 * LOG2DIM]; + + __hostdev__ static constexpr uint64_t memUsage() { return sizeof(LeafData); } + __hostdev__ static constexpr uint32_t padding() + { + static_assert(BaseT::padding() == 0, "expected no padding in LeafFnBase"); + return sizeof(LeafData) - sizeof(BaseT) - 2 * (1u << 3 * LOG2DIM); + } + + __hostdev__ static constexpr uint8_t bitWidth() { return 16u; } + __hostdev__ float getValue(uint32_t i) const + { + return mCode[i] * BaseT::mQuantum + BaseT::mMinimum; // code * (max-min)/65535 + min + } + + /// @brief This class cannot be constructed or deleted + LeafData() = delete; + LeafData(const LeafData&) = delete; + LeafData& operator=(const LeafData&) = delete; + ~LeafData() = delete; +}; // LeafData + +// --------------------------> LeafData <------------------------------------ + +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData + : public LeafFnBase +{ // this class has no additional data members, however every instance is immediately followed by + // bitWidth*64 bytes. 
Since its base class is 32B aligned so are the bitWidth*64 bytes + using BaseT = LeafFnBase; + using BuildType = FpN; + static constexpr bool FIXED_SIZE = false; + __hostdev__ static constexpr uint32_t padding() + { + static_assert(BaseT::padding() == 0, "expected no padding in LeafFnBase"); + return 0; + } + + __hostdev__ uint8_t bitWidth() const { return 1 << (BaseT::mFlags >> 5); } // 4,8,16,32 = 2^(2,3,4,5) + __hostdev__ size_t memUsage() const { return sizeof(*this) + this->bitWidth() * 64; } + __hostdev__ static size_t memUsage(uint32_t bitWidth) { return 96u + bitWidth * 64; } + __hostdev__ float getValue(uint32_t i) const + { +#ifdef NANOVDB_FPN_BRANCHLESS // faster + const int b = BaseT::mFlags >> 5; // b = 0, 1, 2, 3, 4 corresponding to 1, 2, 4, 8, 16 bits +#if 0 // use LUT + uint16_t code = reinterpret_cast(this + 1)[i >> (4 - b)]; + const static uint8_t shift[5] = {15, 7, 3, 1, 0}; + const static uint16_t mask[5] = {1, 3, 15, 255, 65535}; + code >>= (i & shift[b]) << b; + code &= mask[b]; +#else // no LUT + uint32_t code = reinterpret_cast(this + 1)[i >> (5 - b)]; + code >>= (i & ((32 >> b) - 1)) << b; + code &= (1 << (1 << b)) - 1; +#endif +#else // use branched version (slow) + float code; + auto* values = reinterpret_cast(this + 1); + switch (BaseT::mFlags >> 5) { + case 0u: // 1 bit float + code = float((values[i >> 3] >> (i & 7)) & uint8_t(1)); + break; + case 1u: // 2 bits float + code = float((values[i >> 2] >> ((i & 3) << 1)) & uint8_t(3)); + break; + case 2u: // 4 bits float + code = float((values[i >> 1] >> ((i & 1) << 2)) & uint8_t(15)); + break; + case 3u: // 8 bits float + code = float(values[i]); + break; + default: // 16 bits float + code = float(reinterpret_cast(values)[i]); + } +#endif + return float(code) * BaseT::mQuantum + BaseT::mMinimum; // code * (max-min)/UNITS + min + } + + /// @brief This class cannot be constructed or deleted + LeafData() = delete; + LeafData(const LeafData&) = delete; + LeafData& operator=(const LeafData&) = delete; + ~LeafData() = delete; +}; // LeafData + +// --------------------------> LeafData <------------------------------------ + +// Partial template specialization of LeafData with bool +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +{ + static_assert(sizeof(CoordT) == sizeof(Coord), "Mismatching sizeof"); + static_assert(sizeof(MaskT) == sizeof(Mask), "Mismatching sizeof"); + using ValueType = bool; + using BuildType = bool; + using FloatType = bool; // dummy value type + using ArrayType = MaskT; // type used for the internal mValue array + static constexpr bool FIXED_SIZE = true; + + CoordT mBBoxMin; // 12B. + uint8_t mBBoxDif[3]; // 3B. + uint8_t mFlags; // 1B. bit0: skip render?, bit1: has bbox?, bit3: unused, bit4: has stats, bits5,6,7: bit-width for FpN + MaskT mValueMask; // LOG2DIM(3): 64B. + MaskT mValues; // LOG2DIM(3): 64B. 
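LeafData<FpN>::getValue above reads the per-leaf bit width from the top three bits of mFlags and extracts codes word-wise: load the 32-bit word holding code i, shift by (i mod codes-per-word) * bits and mask with (1 << bits) - 1. A standalone sketch of that packed-code read/write for a caller-chosen width, shown here with b = 2 (4-bit codes, the same layout Fp4 uses); the function names are illustrative:

    #include <cstdint>
    #include <cstring>
    #include <cassert>

    // Read/write the i-th code in an array of 2^b-bit codes (b = 0..4 -> 1,2,4,8,16 bits),
    // following the branchless "no LUT" path of LeafData<FpN>::getValue above.
    uint32_t getCode(const uint8_t* data, uint32_t i, uint32_t b)
    {
        uint32_t word;
        std::memcpy(&word, data + 4 * (i >> (5 - b)), 4); // 32-bit word holding code i
        word >>= (i & ((32u >> b) - 1u)) << b;            // shift code down to bit 0
        return word & ((1u << (1u << b)) - 1u);           // keep 2^b bits
    }

    void setCode(uint8_t* data, uint32_t i, uint32_t code, uint32_t b)
    {
        uint8_t* p = data + 4 * (i >> (5 - b));
        uint32_t word;
        std::memcpy(&word, p, 4);
        const uint32_t shift = (i & ((32u >> b) - 1u)) << b;
        const uint32_t mask  = (1u << (1u << b)) - 1u;
        word = (word & ~(mask << shift)) | ((code & mask) << shift);
        std::memcpy(p, &word, 4);
    }

    int main()
    {
        uint8_t data[64] = {};           // room for 128 four-bit codes
        const uint32_t b = 2;            // 2^2 = 4-bit codes, i.e. the Fp4 layout
        for (uint32_t i = 0; i < 16; ++i)
            setCode(data, i, i, b);
        assert(getCode(data,  3, b) ==  3);
        assert(getCode(data, 10, b) == 10);
        // a dequantized value would then be code * quantum + minimum, as in getValue above
        return 0;
    }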
+ uint64_t mPadding[2]; // 16B padding to 32B alignment + + __hostdev__ static constexpr uint32_t padding() { return sizeof(LeafData) - 12u - 3u - 1u - 2 * sizeof(MaskT) - 16u; } + __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } + __hostdev__ static bool hasStats() { return false; } + __hostdev__ bool getValue(uint32_t i) const { return mValues.isOn(i); } + __hostdev__ bool getMin() const { return false; } // dummy + __hostdev__ bool getMax() const { return false; } // dummy + __hostdev__ bool getAvg() const { return false; } // dummy + __hostdev__ bool getDev() const { return false; } // dummy + __hostdev__ void setValue(uint32_t offset, bool v) + { + mValueMask.setOn(offset); + mValues.set(offset, v); + } + __hostdev__ void setOn(uint32_t offset) { mValueMask.setOn(offset); } + __hostdev__ void setMin(const bool&) {} // no-op + __hostdev__ void setMax(const bool&) {} // no-op + __hostdev__ void setAvg(const bool&) {} // no-op + __hostdev__ void setDev(const bool&) {} // no-op + + template + __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } + + /// @brief This class cannot be constructed or deleted + LeafData() = delete; + LeafData(const LeafData&) = delete; + LeafData& operator=(const LeafData&) = delete; + ~LeafData() = delete; +}; // LeafData + +// --------------------------> LeafData <------------------------------------ + +// Partial template specialization of LeafData with ValueMask +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +{ + static_assert(sizeof(CoordT) == sizeof(Coord), "Mismatching sizeof"); + static_assert(sizeof(MaskT) == sizeof(Mask), "Mismatching sizeof"); + using ValueType = bool; + using BuildType = ValueMask; + using FloatType = bool; // dummy value type + using ArrayType = void; // type used for the internal mValue array - void means missing + static constexpr bool FIXED_SIZE = true; + + CoordT mBBoxMin; // 12B. + uint8_t mBBoxDif[3]; // 3B. + uint8_t mFlags; // 1B. bit0: skip render?, bit1: has bbox?, bit3: unused, bit4: has stats, bits5,6,7: bit-width for FpN + MaskT mValueMask; // LOG2DIM(3): 64B. 
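The bool specialization above needs no value array at all: one 512-bit mask (mValueMask) records which voxels are active and a second (mValues) stores the boolean payloads, so getValue is just a bit test. A tiny standalone sketch of that two-bitset layout (illustrative, fixed at 512 voxels):

    #include <cstdint>
    #include <cassert>

    // A toy bool leaf: one bitset for "is this voxel active", one for its value,
    // mirroring LeafData<bool>::setValue/getValue above.
    struct ToyBoolLeaf
    {
        uint64_t valueMask[8] = {}; // 512 bits: active states
        uint64_t values[8]    = {}; // 512 bits: boolean payloads

        static void setBit(uint64_t* w, uint32_t i, bool on)
        {
            const uint64_t bit = uint64_t(1) << (i & 63u);
            on ? (w[i >> 6] |= bit) : (w[i >> 6] &= ~bit);
        }
        static bool getBit(const uint64_t* w, uint32_t i) { return (w[i >> 6] >> (i & 63u)) & 1u; }

        void setValue(uint32_t i, bool v) { setBit(valueMask, i, true); setBit(values, i, v); }
        bool getValue(uint32_t i) const   { return getBit(values, i); }
        bool isActive(uint32_t i) const   { return getBit(valueMask, i); }
    };

    int main()
    {
        ToyBoolLeaf leaf;
        leaf.setValue(7, true);
        leaf.setValue(300, false);  // active voxel whose value is false
        assert(leaf.isActive(7)   && leaf.getValue(7));
        assert(leaf.isActive(300) && !leaf.getValue(300));
        assert(!leaf.isActive(8));
        return 0;
    }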
+ uint64_t mPadding[2]; // 16B padding to 32B alignment + + __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } + __hostdev__ static bool hasStats() { return false; } + __hostdev__ static constexpr uint32_t padding() + { + return sizeof(LeafData) - (12u + 3u + 1u + sizeof(MaskT) + 2 * 8u); + } + + __hostdev__ bool getValue(uint32_t i) const { return mValueMask.isOn(i); } + __hostdev__ bool getMin() const { return false; } // dummy + __hostdev__ bool getMax() const { return false; } // dummy + __hostdev__ bool getAvg() const { return false; } // dummy + __hostdev__ bool getDev() const { return false; } // dummy + __hostdev__ void setValue(uint32_t offset, bool) { mValueMask.setOn(offset); } + __hostdev__ void setOn(uint32_t offset) { mValueMask.setOn(offset); } + __hostdev__ void setMin(const ValueType&) {} // no-op + __hostdev__ void setMax(const ValueType&) {} // no-op + __hostdev__ void setAvg(const FloatType&) {} // no-op + __hostdev__ void setDev(const FloatType&) {} // no-op + + template + __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } + + /// @brief This class cannot be constructed or deleted + LeafData() = delete; + LeafData(const LeafData&) = delete; + LeafData& operator=(const LeafData&) = delete; + ~LeafData() = delete; +}; // LeafData + +// --------------------------> LeafIndexBase <------------------------------------ + +// Partial template specialization of LeafData with ValueIndex +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafIndexBase +{ + static_assert(sizeof(CoordT) == sizeof(Coord), "Mismatching sizeof"); + static_assert(sizeof(MaskT) == sizeof(Mask), "Mismatching sizeof"); + using ValueType = uint64_t; + using FloatType = uint64_t; + using ArrayType = void; // type used for the internal mValue array - void means missing + static constexpr bool FIXED_SIZE = true; + + CoordT mBBoxMin; // 12B. + uint8_t mBBoxDif[3]; // 3B. + uint8_t mFlags; // 1B. bit0: skip render?, bit1: has bbox?, bit3: unused, bit4: has stats, bits5,6,7: bit-width for FpN + MaskT mValueMask; // LOG2DIM(3): 64B. 
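LeafIndexBase declares mOffset (the index of this leaf's first value in an external array) and mPrefixSum just below; the ValueOnIndex specialization further down turns a voxel number into a value index by popcounting the active bits below it in its 64-bit mask word and adding the packed 9-bit running count of the earlier words. A standalone sketch of that lookup, with the mask words, prefix sum and offset built by hand (names are illustrative):

    #include <cstdint>
    #include <cassert>

    #if defined(__GNUC__) || defined(__clang__)
    inline uint32_t countOn(uint64_t w) { return uint32_t(__builtin_popcountll(w)); }
    #else
    inline uint32_t countOn(uint64_t w)
    {
        uint32_t n = 0;
        for (; w; w &= w - 1) ++n; // Kernighan popcount fallback
        return n;
    }
    #endif

    // words: 512-bit active mask (8 x 64). prefixSum: packed 9-bit running counts of
    // words 0..6, as in LeafData<ValueOnIndex>. offset: index of this leaf's first
    // active value. Returns 0 for inactive voxels (the background slot).
    uint64_t sparseIndex(const uint64_t words[8], uint64_t prefixSum, uint64_t offset, uint32_t i)
    {
        uint32_t n = i >> 6;
        const uint64_t w = words[n], bit = uint64_t(1) << (i & 63u);
        if (!(w & bit)) return 0;                         // inactive voxel
        uint64_t sum = offset + countOn(w & (bit - 1u));  // active bits below i in this word
        if (n--) sum += (prefixSum >> (9u * n)) & 511u;   // actives in all earlier words
        return sum;
    }

    int main()
    {
        uint64_t words[8] = {};
        words[0] = 0b1011;            // voxels 0, 1, 3 active
        words[2] = uint64_t(1) << 10; // voxel 138 active
        // pack the running active counts of words 0..6, 9 bits per word
        uint64_t prefix = 0;
        uint32_t running = 0;
        for (uint32_t w = 0; w < 7; ++w) {
            running += countOn(words[w]);
            prefix |= uint64_t(running) << (9 * w);
        }

        assert(sparseIndex(words, prefix, 100, 0)   == 100); // 1st active value
        assert(sparseIndex(words, prefix, 100, 3)   == 102); // 3rd active value
        assert(sparseIndex(words, prefix, 100, 138) == 103); // 4th active value
        assert(sparseIndex(words, prefix, 100, 2)   == 0);   // inactive -> background
        return 0;
    }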
+ uint64_t mOffset, mPrefixSum; // 8B offset to first value in this leaf node and 9-bit prefix sum + __hostdev__ static constexpr uint32_t padding() + { + return sizeof(LeafIndexBase) - (12u + 3u + 1u + sizeof(MaskT) + 2 * 8u); + } + __hostdev__ static uint64_t memUsage() { return sizeof(LeafIndexBase); } + __hostdev__ bool hasStats() const { return mFlags & (uint8_t(1) << 4); } + // return the offset to the first value indexed by this leaf node + __hostdev__ const uint64_t& firstOffset() const { return mOffset; } + __hostdev__ void setMin(const ValueType&) {} // no-op + __hostdev__ void setMax(const ValueType&) {} // no-op + __hostdev__ void setAvg(const FloatType&) {} // no-op + __hostdev__ void setDev(const FloatType&) {} // no-op + __hostdev__ void setOn(uint32_t offset) { mValueMask.setOn(offset); } + template + __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } + +protected: + /// @brief This class should be used as an abstract class and only constructed or deleted via child classes + LeafIndexBase() = default; + LeafIndexBase(const LeafIndexBase&) = default; + LeafIndexBase& operator=(const LeafIndexBase&) = default; + ~LeafIndexBase() = default; +}; // LeafIndexBase + +// --------------------------> LeafData <------------------------------------ + +// Partial template specialization of LeafData with ValueIndex +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData + : public LeafIndexBase +{ + using BaseT = LeafIndexBase; + using BuildType = ValueIndex; + // return the total number of values indexed by this leaf node, excluding the optional 4 stats + __hostdev__ static uint32_t valueCount() { return uint32_t(512); } // 8^3 = 2^9 + // return the offset to the last value indexed by this leaf node (disregarding optional stats) + __hostdev__ uint64_t lastOffset() const { return BaseT::mOffset + 511u; } // 2^9 - 1 + // if stats are available, they are always placed after the last voxel value in this leaf node + __hostdev__ uint64_t getMin() const { return this->hasStats() ? BaseT::mOffset + 512u : 0u; } + __hostdev__ uint64_t getMax() const { return this->hasStats() ? BaseT::mOffset + 513u : 0u; } + __hostdev__ uint64_t getAvg() const { return this->hasStats() ? BaseT::mOffset + 514u : 0u; } + __hostdev__ uint64_t getDev() const { return this->hasStats() ? BaseT::mOffset + 515u : 0u; } + __hostdev__ uint64_t getValue(uint32_t i) const { return BaseT::mOffset + i; } // dense leaf node with active and inactive voxels +}; // LeafData + +// --------------------------> LeafData <------------------------------------ + +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData + : public LeafIndexBase +{ + using BaseT = LeafIndexBase; + using BuildType = ValueOnIndex; + __hostdev__ uint32_t valueCount() const + { + return util::countOn(BaseT::mValueMask.words()[7]) + (BaseT::mPrefixSum >> 54u & 511u); // last 9 bits of mPrefixSum do not account for the last word in mValueMask + } + __hostdev__ uint64_t lastOffset() const { return BaseT::mOffset + this->valueCount() - 1u; } + __hostdev__ uint64_t getMin() const { return this->hasStats() ? this->lastOffset() + 1u : 0u; } + __hostdev__ uint64_t getMax() const { return this->hasStats() ? this->lastOffset() + 2u : 0u; } + __hostdev__ uint64_t getAvg() const { return this->hasStats() ? this->lastOffset() + 3u : 0u; } + __hostdev__ uint64_t getDev() const { return this->hasStats() ? 
this->lastOffset() + 4u : 0u; } + __hostdev__ uint64_t getValue(uint32_t i) const + { + //return mValueMask.isOn(i) ? mOffset + mValueMask.countOn(i) : 0u;// for debugging + uint32_t n = i >> 6; + const uint64_t w = BaseT::mValueMask.words()[n], mask = uint64_t(1) << (i & 63u); + if (!(w & mask)) return uint64_t(0); // if i'th value is inactive return offset to background value + uint64_t sum = BaseT::mOffset + util::countOn(w & (mask - 1u)); + if (n--) sum += BaseT::mPrefixSum >> (9u * n) & 511u; + return sum; + } +}; // LeafData + +// --------------------------> LeafData <------------------------------------ + +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData + : public LeafData +{ + using BuildType = ValueIndexMask; + MaskT mMask; + __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } + __hostdev__ bool isMaskOn(uint32_t offset) const { return mMask.isOn(offset); } + __hostdev__ void setMask(uint32_t offset, bool v) { mMask.set(offset, v); } +}; // LeafData + +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData + : public LeafData +{ + using BuildType = ValueOnIndexMask; + MaskT mMask; + __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } + __hostdev__ bool isMaskOn(uint32_t offset) const { return mMask.isOn(offset); } + __hostdev__ void setMask(uint32_t offset, bool v) { mMask.set(offset, v); } +}; // LeafData + +// --------------------------> LeafData <------------------------------------ + +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +{ + static_assert(sizeof(CoordT) == sizeof(Coord), "Mismatching sizeof"); + static_assert(sizeof(MaskT) == sizeof(Mask), "Mismatching sizeof"); + using ValueType = uint64_t; + using BuildType = Point; + using FloatType = typename FloatTraits::FloatType; + using ArrayType = uint16_t; // type used for the internal mValue array + static constexpr bool FIXED_SIZE = true; + + CoordT mBBoxMin; // 12B. + uint8_t mBBoxDif[3]; // 3B. + uint8_t mFlags; // 1B. bit0: skip render?, bit1: has bbox?, bit3: unused, bit4: has stats, bits5,6,7: bit-width for FpN + MaskT mValueMask; // LOG2DIM(3): 64B. + + uint64_t mOffset; // 8B + uint64_t mPointCount; // 8B + alignas(32) uint16_t mValues[1u << 3 * LOG2DIM]; // 1KB + // no padding + + /// @brief Return padding of this class in bytes, due to aliasing and 32B alignment + /// + /// @note The extra bytes are not necessarily at the end, but can come from aliasing of individual data members. + __hostdev__ static constexpr uint32_t padding() + { + return sizeof(LeafData) - (12u + 3u + 1u + sizeof(MaskT) + 2 * 8u + (1u << 3 * LOG2DIM) * 2u); + } + __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } + + __hostdev__ uint64_t offset() const { return mOffset; } + __hostdev__ uint64_t pointCount() const { return mPointCount; } + __hostdev__ uint64_t first(uint32_t i) const { return i ? 
uint64_t(mValues[i - 1u]) + mOffset : mOffset; } + __hostdev__ uint64_t last(uint32_t i) const { return uint64_t(mValues[i]) + mOffset; } + __hostdev__ uint64_t getValue(uint32_t i) const { return uint64_t(mValues[i]); } + __hostdev__ void setValueOnly(uint32_t offset, uint16_t value) { mValues[offset] = value; } + __hostdev__ void setValue(uint32_t offset, uint16_t value) + { + mValueMask.setOn(offset); + mValues[offset] = value; + } + __hostdev__ void setOn(uint32_t offset) { mValueMask.setOn(offset); } + + __hostdev__ ValueType getMin() const { return mOffset; } + __hostdev__ ValueType getMax() const { return mPointCount; } + __hostdev__ FloatType getAvg() const { return 0.0f; } + __hostdev__ FloatType getDev() const { return 0.0f; } + + __hostdev__ void setMin(const ValueType&) {} + __hostdev__ void setMax(const ValueType&) {} + __hostdev__ void setAvg(const FloatType&) {} + __hostdev__ void setDev(const FloatType&) {} + + template + __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } + + /// @brief This class cannot be constructed or deleted + LeafData() = delete; + LeafData(const LeafData&) = delete; + LeafData& operator=(const LeafData&) = delete; + ~LeafData() = delete; +}; // LeafData + +// --------------------------> LeafNode <------------------------------------ + +/// @brief Leaf nodes of the VDB tree. (defaults to 8x8x8 = 512 voxels) +template class MaskT = Mask, + uint32_t Log2Dim = 3> +class LeafNode : public LeafData +{ +public: + struct ChildNodeType + { + static constexpr uint32_t TOTAL = 0; + static constexpr uint32_t DIM = 1; + __hostdev__ static uint32_t dim() { return 1u; } + }; // Voxel + using LeafNodeType = LeafNode; + using DataType = LeafData; + using ValueType = typename DataType::ValueType; + using FloatType = typename DataType::FloatType; + using BuildType = typename DataType::BuildType; + using CoordType = CoordT; + static constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; + template + using MaskType = MaskT; + template + using MaskIterT = typename Mask::template Iterator; + + /// @brief Visits all active values in a leaf node + class ValueOnIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode* mParent; + + public: + __hostdev__ ValueOnIterator() + : BaseT() + , mParent(nullptr) + { + } + __hostdev__ ValueOnIterator(const LeafNode* parent) + : BaseT(parent->data()->mValueMask.beginOn()) + , mParent(parent) + { + } + ValueOnIterator& operator=(const ValueOnIterator&) = default; + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return mParent->getValue(BaseT::pos()); + } + __hostdev__ CoordT getCoord() const + { + NANOVDB_ASSERT(*this); + return mParent->offsetToGlobalCoord(BaseT::pos()); + } + }; // Member class ValueOnIterator + + __hostdev__ ValueOnIterator beginValueOn() const { return ValueOnIterator(this); } + __hostdev__ ValueOnIterator cbeginValueOn() const { return ValueOnIterator(this); } + + /// @brief Visits all inactive values in a leaf node + class ValueOffIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode* mParent; + + public: + __hostdev__ ValueOffIterator() + : BaseT() + , mParent(nullptr) + { + } + __hostdev__ ValueOffIterator(const LeafNode* parent) + : BaseT(parent->data()->mValueMask.beginOff()) + , mParent(parent) + { + } + ValueOffIterator& operator=(const ValueOffIterator&) = default; + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return mParent->getValue(BaseT::pos()); + } + __hostdev__ CoordT getCoord() const + { + 
NANOVDB_ASSERT(*this); + return mParent->offsetToGlobalCoord(BaseT::pos()); + } + }; // Member class ValueOffIterator + + __hostdev__ ValueOffIterator beginValueOff() const { return ValueOffIterator(this); } + __hostdev__ ValueOffIterator cbeginValueOff() const { return ValueOffIterator(this); } + + /// @brief Visits all values in a leaf node, i.e. both active and inactive values + class ValueIterator + { + const LeafNode* mParent; + uint32_t mPos; + + public: + __hostdev__ ValueIterator() + : mParent(nullptr) + , mPos(1u << 3 * Log2Dim) + { + } + __hostdev__ ValueIterator(const LeafNode* parent) + : mParent(parent) + , mPos(0) + { + NANOVDB_ASSERT(parent); + } + ValueIterator& operator=(const ValueIterator&) = default; + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return mParent->getValue(mPos); + } + __hostdev__ CoordT getCoord() const + { + NANOVDB_ASSERT(*this); + return mParent->offsetToGlobalCoord(mPos); + } + __hostdev__ bool isActive() const + { + NANOVDB_ASSERT(*this); + return mParent->isActive(mPos); + } + __hostdev__ operator bool() const { return mPos < (1u << 3 * Log2Dim); } + __hostdev__ ValueIterator& operator++() + { + ++mPos; + return *this; + } + __hostdev__ ValueIterator operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ValueIterator + + __hostdev__ ValueIterator beginValue() const { return ValueIterator(this); } + __hostdev__ ValueIterator cbeginValueAll() const { return ValueIterator(this); } + + static_assert(util::is_same::Type>::value, "Mismatching BuildType"); + static constexpr uint32_t LOG2DIM = Log2Dim; + static constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes + static constexpr uint32_t DIM = 1u << TOTAL; // number of voxels along each axis of this node + static constexpr uint32_t SIZE = 1u << 3 * LOG2DIM; // total number of voxels represented by this node + static constexpr uint32_t MASK = (1u << LOG2DIM) - 1u; // mask for bit operations + static constexpr uint32_t LEVEL = 0; // level 0 = leaf + static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node + + __hostdev__ DataType* data() { return reinterpret_cast(this); } + + __hostdev__ const DataType* data() const { return reinterpret_cast(this); } + + /// @brief Return a const reference to the bit mask of active voxels in this leaf node + __hostdev__ const MaskType& valueMask() const { return DataType::mValueMask; } + __hostdev__ const MaskType& getValueMask() const { return DataType::mValueMask; } + + /// @brief Return a const reference to the minimum active value encoded in this leaf node + __hostdev__ ValueType minimum() const { return DataType::getMin(); } + + /// @brief Return a const reference to the maximum active value encoded in this leaf node + __hostdev__ ValueType maximum() const { return DataType::getMax(); } + + /// @brief Return a const reference to the average of all the active values encoded in this leaf node + __hostdev__ FloatType average() const { return DataType::getAvg(); } + + /// @brief Return the variance of all the active values encoded in this leaf node + __hostdev__ FloatType variance() const { return Pow2(DataType::getDev()); } + + /// @brief Return a const reference to the standard deviation of all the active values encoded in this leaf node + __hostdev__ FloatType stdDeviation() const { return DataType::getDev(); } + + __hostdev__ uint8_t flags() const { return DataType::mFlags; } + + /// @brief Return the origin in index space of this leaf 
node + __hostdev__ CoordT origin() const { return DataType::mBBoxMin & ~MASK; } + + /// @brief Compute the local coordinates from a linear offset + /// @param n Linear offset into this nodes dense table + /// @return Local (vs global) 3D coordinates + __hostdev__ static CoordT OffsetToLocalCoord(uint32_t n) + { + NANOVDB_ASSERT(n < SIZE); + const uint32_t m = n & ((1 << 2 * LOG2DIM) - 1); + return CoordT(n >> 2 * LOG2DIM, m >> LOG2DIM, m & MASK); + } + + /// @brief Converts (in place) a local index coordinate to a global index coordinate + __hostdev__ void localToGlobalCoord(Coord& ijk) const { ijk += this->origin(); } + + __hostdev__ CoordT offsetToGlobalCoord(uint32_t n) const + { + return OffsetToLocalCoord(n) + this->origin(); + } + + /// @brief Return the dimension, in index space, of this leaf node (typically 8 as for openvdb leaf nodes!) + __hostdev__ static uint32_t dim() { return 1u << LOG2DIM; } + + /// @brief Return the bounding box in index space of active values in this leaf node + __hostdev__ math::BBox bbox() const + { + math::BBox bbox(DataType::mBBoxMin, DataType::mBBoxMin); + if (this->hasBBox()) { + bbox.max()[0] += DataType::mBBoxDif[0]; + bbox.max()[1] += DataType::mBBoxDif[1]; + bbox.max()[2] += DataType::mBBoxDif[2]; + } else { // very rare case + bbox = math::BBox(); // invalid + } + return bbox; + } + + /// @brief Return the total number of voxels (e.g. values) encoded in this leaf node + __hostdev__ static uint32_t voxelCount() { return 1u << (3 * LOG2DIM); } + + __hostdev__ static uint32_t padding() { return DataType::padding(); } + + /// @brief return memory usage in bytes for the leaf node + __hostdev__ uint64_t memUsage() const { return DataType::memUsage(); } + + /// @brief This class cannot be constructed or deleted + LeafNode() = delete; + LeafNode(const LeafNode&) = delete; + LeafNode& operator=(const LeafNode&) = delete; + ~LeafNode() = delete; + + /// @brief Return the voxel value at the given offset. + __hostdev__ ValueType getValue(uint32_t offset) const { return DataType::getValue(offset); } + + /// @brief Return the voxel value at the given coordinate. + __hostdev__ ValueType getValue(const CoordT& ijk) const { return DataType::getValue(CoordToOffset(ijk)); } + + /// @brief Return the first value in this leaf node. + __hostdev__ ValueType getFirstValue() const { return this->getValue(0); } + /// @brief Return the last value in this leaf node. + __hostdev__ ValueType getLastValue() const { return this->getValue(SIZE - 1); } + + /// @brief Sets the value at the specified location and activate its state. + /// + /// @note This is safe since it does not change the topology of the tree (unlike setValue methods on the other nodes) + __hostdev__ void setValue(const CoordT& ijk, const ValueType& v) { DataType::setValue(CoordToOffset(ijk), v); } + + /// @brief Sets the value at the specified location but leaves its state unchanged. + /// + /// @note This is safe since it does not change the topology of the tree (unlike setValue methods on the other nodes) + __hostdev__ void setValueOnly(uint32_t offset, const ValueType& v) { DataType::setValueOnly(offset, v); } + __hostdev__ void setValueOnly(const CoordT& ijk, const ValueType& v) { DataType::setValueOnly(CoordToOffset(ijk), v); } + + /// @brief Return @c true if the voxel value at the given coordinate is active. 
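+    ///
+    /// @details The coordinate is first linearized with CoordToOffset() (defined below).
+    ///          For the default Log2Dim = 3, i.e. an 8x8x8 node with MASK = 7, this is
+    ///          offset = ((i & 7) << 6) | ((j & 7) << 3) | (k & 7); e.g. ijk = (9,2,5)
+    ///          maps to (1 << 6) | (2 << 3) | 5 = 85.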
+ __hostdev__ bool isActive(const CoordT& ijk) const { return DataType::mValueMask.isOn(CoordToOffset(ijk)); } + __hostdev__ bool isActive(uint32_t n) const { return DataType::mValueMask.isOn(n); } + + /// @brief Return @c true if any of the voxel value are active in this leaf node. + __hostdev__ bool isActive() const + { + //NANOVDB_ASSERT( bool(DataType::mFlags & uint8_t(2)) != DataType::mValueMask.isOff() ); + //return DataType::mFlags & uint8_t(2); + return !DataType::mValueMask.isOff(); + } + + __hostdev__ bool hasBBox() const { return DataType::mFlags & uint8_t(2); } + + /// @brief Return @c true if the voxel value at the given coordinate is active and updates @c v with the value. + __hostdev__ bool probeValue(const CoordT& ijk, ValueType& v) const + { + const uint32_t n = CoordToOffset(ijk); + v = DataType::getValue(n); + return DataType::mValueMask.isOn(n); + } + + __hostdev__ const LeafNode* probeLeaf(const CoordT&) const { return this; } + + /// @brief Return the linear offset corresponding to the given coordinate + __hostdev__ static uint32_t CoordToOffset(const CoordT& ijk) + { + return ((ijk[0] & MASK) << (2 * LOG2DIM)) | ((ijk[1] & MASK) << LOG2DIM) | (ijk[2] & MASK); + } + + /// @brief Updates the local bounding box of active voxels in this node. Return true if bbox was updated. + /// + /// @warning It assumes that the origin and value mask have already been set. + /// + /// @details This method is based on few (intrinsic) bit operations and hence is relatively fast. + /// However, it should only only be called if either the value mask has changed or if the + /// active bounding box is still undefined. e.g. during construction of this node. + __hostdev__ bool updateBBox(); + + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... args) const + { + return OpT::get(*this, CoordToOffset(ijk), args...); + } + + template + __hostdev__ auto get(const uint32_t n, ArgsT&&... args) const + { + return OpT::get(*this, n, args...); + } + + template + __hostdev__ auto set(const CoordType& ijk, ArgsT&&... args) + { + return OpT::set(*this, CoordToOffset(ijk), args...); + } + + template + __hostdev__ auto set(const uint32_t n, ArgsT&&... args) + { + return OpT::set(*this, n, args...); + } + +private: + static_assert(sizeof(DataType) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(LeafData) is misaligned"); + + template + friend class ReadAccessor; + + template + friend class RootNode; + template + friend class InternalNode; + +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + /// @brief Private method to return a voxel value and update a (dummy) ReadAccessor + template + __hostdev__ ValueType getValueAndCache(const CoordT& ijk, const AccT&) const { return this->getValue(ijk); } + + /// @brief Return the node information. 
+ template + __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(const CoordType& /*ijk*/, const AccT& /*acc*/) const + { + using NodeInfoT = typename AccT::NodeInfo; + return NodeInfoT{LEVEL, this->dim(), this->minimum(), this->maximum(), this->average(), this->stdDeviation(), this->bbox()[0], this->bbox()[1]}; + } + + template + __hostdev__ bool isActiveAndCache(const CoordT& ijk, const AccT&) const { return this->isActive(ijk); } + + template + __hostdev__ bool probeValueAndCache(const CoordT& ijk, ValueType& v, const AccT&) const { return this->probeValue(ijk, v); } + + template + __hostdev__ const LeafNode* probeLeafAndCache(const CoordT&, const AccT&) const { return this; } +#endif + + template + __hostdev__ uint32_t getDimAndCache(const CoordT&, const RayT& /*ray*/, const AccT&) const + { + if (DataType::mFlags & uint8_t(1u)) + return this->dim(); // skip this node if the 1st bit is set + + //if (!ray.intersects( this->bbox() )) return 1 << LOG2DIM; + return ChildNodeType::dim(); + } + + template + __hostdev__ auto + //__hostdev__ decltype(OpT::get(util::declval(), util::declval(), util::declval()...)) + getAndCache(const CoordType& ijk, const AccT&, ArgsT&&... args) const + { + return OpT::get(*this, CoordToOffset(ijk), args...); + } + + template + //__hostdev__ auto // occasionally fails with NVCC + __hostdev__ decltype(OpT::set(util::declval(), util::declval(), util::declval()...)) + setAndCache(const CoordType& ijk, const AccT&, ArgsT&&... args) + { + return OpT::set(*this, CoordToOffset(ijk), args...); + } + +}; // LeafNode class + +// --------------------------> LeafNode::updateBBox <------------------------------------ + +template class MaskT, uint32_t LOG2DIM> +__hostdev__ inline bool LeafNode::updateBBox() +{ + static_assert(LOG2DIM == 3, "LeafNode::updateBBox: only supports LOGDIM = 3!"); + if (DataType::mValueMask.isOff()) { + DataType::mFlags &= ~uint8_t(2); // set 2nd bit off, which indicates that this nodes has no bbox + return false; + } + auto update = [&](uint32_t min, uint32_t max, int axis) { + NANOVDB_ASSERT(min <= max && max < 8); + DataType::mBBoxMin[axis] = (DataType::mBBoxMin[axis] & ~MASK) + int(min); + DataType::mBBoxDif[axis] = uint8_t(max - min); + }; + uint64_t *w = DataType::mValueMask.words(), word64 = *w; + uint32_t Xmin = word64 ? 
0u : 8u, Xmax = Xmin; + for (int i = 1; i < 8; ++i) { // last loop over 8 64 bit words + if (w[i]) { // skip if word has no set bits + word64 |= w[i]; // union 8 x 64 bits words into one 64 bit word + if (Xmin == 8) + Xmin = i; // only set once + Xmax = i; + } + } + NANOVDB_ASSERT(word64); + update(Xmin, Xmax, 0); + update(util::findLowestOn(word64) >> 3, util::findHighestOn(word64) >> 3, 1); + const uint32_t *p = reinterpret_cast(&word64), word32 = p[0] | p[1]; + const uint16_t *q = reinterpret_cast(&word32), word16 = q[0] | q[1]; + const uint8_t *b = reinterpret_cast(&word16), byte = b[0] | b[1]; + NANOVDB_ASSERT(byte); + update(util::findLowestOn(static_cast(byte)), util::findHighestOn(static_cast(byte)), 2); + DataType::mFlags |= uint8_t(2); // set 2nd bit on, which indicates that this nodes has a bbox + return true; +} // LeafNode::updateBBox + +// --------------------------> Template specializations and traits <------------------------------------ + +/// @brief Template specializations to the default configuration used in OpenVDB: +/// Root -> 32^3 -> 16^3 -> 8^3 +template +using NanoLeaf = LeafNode; +template +using NanoLower = InternalNode, 4>; +template +using NanoUpper = InternalNode, 5>; +template +using NanoRoot = RootNode>; +template +using NanoTree = Tree>; +template +using NanoGrid = Grid>; + +/// @brief Trait to map from LEVEL to node type +template +struct NanoNode; + +// Partial template specialization of above Node struct +template +struct NanoNode +{ + using Type = NanoLeaf; + using type = NanoLeaf; +}; +template +struct NanoNode +{ + using Type = NanoLower; + using type = NanoLower; +}; +template +struct NanoNode +{ + using Type = NanoUpper; + using type = NanoUpper; +}; +template +struct NanoNode +{ + using Type = NanoRoot; + using type = NanoRoot; +}; + +using FloatTree = NanoTree; +using Fp4Tree = NanoTree; +using Fp8Tree = NanoTree; +using Fp16Tree = NanoTree; +using FpNTree = NanoTree; +using DoubleTree = NanoTree; +using Int32Tree = NanoTree; +using UInt32Tree = NanoTree; +using Int64Tree = NanoTree; +using Vec3fTree = NanoTree; +using Vec3dTree = NanoTree; +using Vec4fTree = NanoTree; +using Vec4dTree = NanoTree; +using Vec3ITree = NanoTree; +using MaskTree = NanoTree; +using BoolTree = NanoTree; +using IndexTree = NanoTree; +using OnIndexTree = NanoTree; +using IndexMaskTree = NanoTree; +using OnIndexMaskTree = NanoTree; + +using FloatGrid = Grid; +using Fp4Grid = Grid; +using Fp8Grid = Grid; +using Fp16Grid = Grid; +using FpNGrid = Grid; +using DoubleGrid = Grid; +using Int32Grid = Grid; +using UInt32Grid = Grid; +using Int64Grid = Grid; +using Vec3fGrid = Grid; +using Vec3dGrid = Grid; +using Vec4fGrid = Grid; +using Vec4dGrid = Grid; +using Vec3IGrid = Grid; +using MaskGrid = Grid; +using BoolGrid = Grid; +using PointGrid = Grid; +using IndexGrid = Grid; +using OnIndexGrid = Grid; +using IndexMaskGrid = Grid; +using OnIndexMaskGrid = Grid; + +// --------------------------> callNanoGrid <------------------------------------ + +/** +* @brief Below is an example of the struct used for generic programming with callNanoGrid +* @details For an example see "struct Crc32TailOld" in nanovdb/tools/GridChecksum.h or +* "struct IsNanoGridValid" in nanovdb/tools/GridValidator.h +* @code +* struct OpT { + // define these two static functions with non-const GridData +* template +* static auto known( GridData *gridData, args...); +* static auto unknown( GridData *gridData, args...); +* // or alternatively these two static functions with const GridData +* template +* static 
auto known(const GridData *gridData, args...); +* static auto unknown(const GridData *gridData, args...); +* }; +* @endcode +* +* @brief Here is an example of how to use callNanoGrid in client code +* @code +* return callNanoGrid(gridData, args...); +* @endcode +*/ + +/// @brief Use this function, which depends a pointer to GridData, to call +/// other functions that depend on a NanoGrid of a known ValueType. +/// @details This function allows for generic programming by converting GridData +/// to a NanoGrid of the type encoded in GridData::mGridType. +template +auto callNanoGrid(GridDataT *gridData, ArgsT&&... args) +{ + static_assert(util::is_same::value, "Expected gridData to be of type GridData* or const GridData*"); + switch (gridData->mGridType){ + case GridType::Float: + return OpT::template known(gridData, args...); + case GridType::Double: + return OpT::template known(gridData, args...); + case GridType::Int16: + return OpT::template known(gridData, args...); + case GridType::Int32: + return OpT::template known(gridData, args...); + case GridType::Int64: + return OpT::template known(gridData, args...); + case GridType::Vec3f: + return OpT::template known(gridData, args...); + case GridType::Vec3d: + return OpT::template known(gridData, args...); + case GridType::UInt32: + return OpT::template known(gridData, args...); + case GridType::Mask: + return OpT::template known(gridData, args...); + case GridType::Index: + return OpT::template known(gridData, args...); + case GridType::OnIndex: + return OpT::template known(gridData, args...); + case GridType::IndexMask: + return OpT::template known(gridData, args...); + case GridType::OnIndexMask: + return OpT::template known(gridData, args...); + case GridType::Boolean: + return OpT::template known(gridData, args...); + case GridType::RGBA8: + return OpT::template known(gridData, args...); + case GridType::Fp4: + return OpT::template known(gridData, args...); + case GridType::Fp8: + return OpT::template known(gridData, args...); + case GridType::Fp16: + return OpT::template known(gridData, args...); + case GridType::FpN: + return OpT::template known(gridData, args...); + case GridType::Vec4f: + return OpT::template known(gridData, args...); + case GridType::Vec4d: + return OpT::template known(gridData, args...); + case GridType::UInt8: + return OpT::template known(gridData, args...); + default: + return OpT::unknown(gridData, args...); + } +}// callNanoGrid + +// --------------------------> ReadAccessor <------------------------------------ + +/// @brief A read-only value accessor with three levels of node caching. This allows for +/// inverse tree traversal during lookup, which is on average significantly faster +/// than calling the equivalent method on the tree (i.e. top-down traversal). +/// +/// @note By virtue of the fact that a value accessor accelerates random access operations +/// by re-using cached access patterns, this access should be reused for multiple access +/// operations. In other words, never create an instance of this accessor for a single +/// access only. In general avoid single access operations with this accessor, and +/// if that is not possible call the corresponding method on the tree instead. +/// +/// @warning Since this ReadAccessor internally caches raw pointers to the nodes of the tree +/// structure, it is not safe to copy between host and device, or even to share among +/// multiple threads on the same host or device. 
However, it is light-weight so simple +/// instantiate one per thread (on the host and/or device). +/// +/// @details Used to accelerated random access into a VDB tree. Provides on average +/// O(1) random access operations by means of inverse tree traversal, +/// which amortizes the non-const time complexity of the root node. + +template +class ReadAccessor +{ + using GridT = NanoGrid; // grid + using TreeT = NanoTree; // tree + using RootT = NanoRoot; // root node + using LeafT = NanoLeaf; // Leaf node + using FloatType = typename RootT::FloatType; + using CoordValueType = typename RootT::CoordType::ValueType; + + mutable const RootT* mRoot; // 8 bytes (mutable to allow for access methods to be const) +public: + using BuildType = BuildT; + using ValueType = typename RootT::ValueType; + using CoordType = typename RootT::CoordType; + + static const int CacheLevels = 0; +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + struct NodeInfo + { + uint32_t mLevel; // 4B + uint32_t mDim; // 4B + ValueType mMinimum; // typically 4B + ValueType mMaximum; // typically 4B + FloatType mAverage; // typically 4B + FloatType mStdDevi; // typically 4B + CoordType mBBoxMin; // 3*4B + CoordType mBBoxMax; // 3*4B + }; +#endif + /// @brief Constructor from a root node + __hostdev__ ReadAccessor(const RootT& root) + : mRoot{&root} + { + } + + /// @brief Constructor from a grid + __hostdev__ ReadAccessor(const GridT& grid) + : ReadAccessor(grid.tree().root()) + { + } + + /// @brief Constructor from a tree + __hostdev__ ReadAccessor(const TreeT& tree) + : ReadAccessor(tree.root()) + { + } + + /// @brief Reset this access to its initial state, i.e. with an empty cache + /// @node Noop since this template specialization has no cache + __hostdev__ void clear() {} + + __hostdev__ const RootT& root() const { return *mRoot; } + + /// @brief Defaults constructors + ReadAccessor(const ReadAccessor&) = default; + ~ReadAccessor() = default; + ReadAccessor& operator=(const ReadAccessor&) = default; +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + __hostdev__ ValueType getValue(const CoordType& ijk) const + { + return this->template get>(ijk); + } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ ValueType operator()(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ ValueType operator()(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ auto getNodeInfo(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool isActive(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->template get>(ijk, v); } + __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const { return this->template get>(ijk); } +#else // NANOVDB_NEW_ACCESSOR_METHODS + __hostdev__ ValueType getValue(const CoordType& ijk) const + { + return mRoot->getValueAndCache(ijk, *this); + } + __hostdev__ ValueType getValue(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } + __hostdev__ ValueType operator()(const CoordType& ijk) const + { + return this->getValue(ijk); + } + __hostdev__ ValueType operator()(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } + + __hostdev__ NodeInfo getNodeInfo(const CoordType& ijk) const + { + return mRoot->getNodeInfoAndCache(ijk, *this); + } + + __hostdev__ bool isActive(const CoordType& ijk) const + { + return 
mRoot->isActiveAndCache(ijk, *this); + } + + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const + { + return mRoot->probeValueAndCache(ijk, v, *this); + } + + __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const + { + return mRoot->probeLeafAndCache(ijk, *this); + } +#endif // NANOVDB_NEW_ACCESSOR_METHODS + template + __hostdev__ uint32_t getDim(const CoordType& ijk, const RayT& ray) const + { + return mRoot->getDimAndCache(ijk, ray, *this); + } + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... args) const + { + return mRoot->template get(ijk, args...); + } + + template + __hostdev__ auto set(const CoordType& ijk, ArgsT&&... args) const + { + return const_cast(mRoot)->template set(ijk, args...); + } + +private: + /// @brief Allow nodes to insert themselves into the cache. + template + friend class RootNode; + template + friend class InternalNode; + template class, uint32_t> + friend class LeafNode; + + /// @brief No-op + template + __hostdev__ void insert(const CoordType&, const NodeT*) const {} +}; // ReadAccessor class + +/// @brief Node caching at a single tree level +template +class ReadAccessor //e.g. 0, 1, 2 +{ + static_assert(LEVEL0 >= 0 && LEVEL0 <= 2, "LEVEL0 should be 0, 1, or 2"); + + using GridT = NanoGrid; // grid + using TreeT = NanoTree; + using RootT = NanoRoot; // root node + using LeafT = NanoLeaf; // Leaf node + using NodeT = typename NodeTrait::type; + using CoordT = typename RootT::CoordType; + using ValueT = typename RootT::ValueType; + + using FloatType = typename RootT::FloatType; + using CoordValueType = typename RootT::CoordT::ValueType; + + // All member data are mutable to allow for access methods to be const + mutable CoordT mKey; // 3*4 = 12 bytes + mutable const RootT* mRoot; // 8 bytes + mutable const NodeT* mNode; // 8 bytes + +public: + using BuildType = BuildT; + using ValueType = ValueT; + using CoordType = CoordT; + + static const int CacheLevels = 1; +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + using NodeInfo = typename ReadAccessor::NodeInfo; +#endif + /// @brief Constructor from a root node + __hostdev__ ReadAccessor(const RootT& root) + : mKey(CoordType::max()) + , mRoot(&root) + , mNode(nullptr) + { + } + + /// @brief Constructor from a grid + __hostdev__ ReadAccessor(const GridT& grid) + : ReadAccessor(grid.tree().root()) + { + } + + /// @brief Constructor from a tree + __hostdev__ ReadAccessor(const TreeT& tree) + : ReadAccessor(tree.root()) + { + } + + /// @brief Reset this access to its initial state, i.e. 
with an empty cache + __hostdev__ void clear() + { + mKey = CoordType::max(); + mNode = nullptr; + } + + __hostdev__ const RootT& root() const { return *mRoot; } + + /// @brief Defaults constructors + ReadAccessor(const ReadAccessor&) = default; + ~ReadAccessor() = default; + ReadAccessor& operator=(const ReadAccessor&) = default; + + __hostdev__ bool isCached(const CoordType& ijk) const + { + return (ijk[0] & int32_t(~NodeT::MASK)) == mKey[0] && + (ijk[1] & int32_t(~NodeT::MASK)) == mKey[1] && + (ijk[2] & int32_t(~NodeT::MASK)) == mKey[2]; + } + +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + __hostdev__ ValueType getValue(const CoordType& ijk) const + { + return this->template get>(ijk); + } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ ValueType operator()(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ ValueType operator()(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ auto getNodeInfo(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool isActive(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->template get>(ijk, v); } + __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const { return this->template get>(ijk); } +#else // NANOVDB_NEW_ACCESSOR_METHODS + __hostdev__ ValueType getValue(const CoordType& ijk) const + { + if (this->isCached(ijk)) + return mNode->getValueAndCache(ijk, *this); + return mRoot->getValueAndCache(ijk, *this); + } + __hostdev__ ValueType getValue(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } + __hostdev__ ValueType operator()(const CoordType& ijk) const + { + return this->getValue(ijk); + } + __hostdev__ ValueType operator()(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } + + __hostdev__ NodeInfo getNodeInfo(const CoordType& ijk) const + { + if (this->isCached(ijk)) + return mNode->getNodeInfoAndCache(ijk, *this); + return mRoot->getNodeInfoAndCache(ijk, *this); + } + + __hostdev__ bool isActive(const CoordType& ijk) const + { + if (this->isCached(ijk)) + return mNode->isActiveAndCache(ijk, *this); + return mRoot->isActiveAndCache(ijk, *this); + } + + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const + { + if (this->isCached(ijk)) + return mNode->probeValueAndCache(ijk, v, *this); + return mRoot->probeValueAndCache(ijk, v, *this); + } + + __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const + { + if (this->isCached(ijk)) + return mNode->probeLeafAndCache(ijk, *this); + return mRoot->probeLeafAndCache(ijk, *this); + } +#endif // NANOVDB_NEW_ACCESSOR_METHODS + template + __hostdev__ uint32_t getDim(const CoordType& ijk, const RayT& ray) const + { + if (this->isCached(ijk)) + return mNode->getDimAndCache(ijk, ray, *this); + return mRoot->getDimAndCache(ijk, ray, *this); + } + + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... args) const + { + if (this->isCached(ijk)) + return mNode->template getAndCache(ijk, *this, args...); + return mRoot->template getAndCache(ijk, *this, args...); + } + + template + __hostdev__ auto set(const CoordType& ijk, ArgsT&&... 
args) const + { + if (this->isCached(ijk)) + return const_cast(mNode)->template setAndCache(ijk, *this, args...); + return const_cast(mRoot)->template setAndCache(ijk, *this, args...); + } + +private: + /// @brief Allow nodes to insert themselves into the cache. + template + friend class RootNode; + template + friend class InternalNode; + template class, uint32_t> + friend class LeafNode; + + /// @brief Inserts a leaf node and key pair into this ReadAccessor + __hostdev__ void insert(const CoordType& ijk, const NodeT* node) const + { + mKey = ijk & ~NodeT::MASK; + mNode = node; + } + + // no-op + template + __hostdev__ void insert(const CoordType&, const OtherNodeT*) const {} + +}; // ReadAccessor + +template +class ReadAccessor //e.g. (0,1), (1,2), (0,2) +{ + static_assert(LEVEL0 >= 0 && LEVEL0 <= 2, "LEVEL0 must be 0, 1, 2"); + static_assert(LEVEL1 >= 0 && LEVEL1 <= 2, "LEVEL1 must be 0, 1, 2"); + static_assert(LEVEL0 < LEVEL1, "Level 0 must be lower than level 1"); + using GridT = NanoGrid; // grid + using TreeT = NanoTree; + using RootT = NanoRoot; + using LeafT = NanoLeaf; + using Node1T = typename NodeTrait::type; + using Node2T = typename NodeTrait::type; + using CoordT = typename RootT::CoordType; + using ValueT = typename RootT::ValueType; + using FloatType = typename RootT::FloatType; + using CoordValueType = typename RootT::CoordT::ValueType; + + // All member data are mutable to allow for access methods to be const +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY // 44 bytes total + mutable CoordT mKey; // 3*4 = 12 bytes +#else // 68 bytes total + mutable CoordT mKeys[2]; // 2*3*4 = 24 bytes +#endif + mutable const RootT* mRoot; + mutable const Node1T* mNode1; + mutable const Node2T* mNode2; + +public: + using BuildType = BuildT; + using ValueType = ValueT; + using CoordType = CoordT; + + static const int CacheLevels = 2; +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + using NodeInfo = typename ReadAccessor::NodeInfo; +#endif + /// @brief Constructor from a root node + __hostdev__ ReadAccessor(const RootT& root) +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + : mKey(CoordType::max()) +#else + : mKeys{CoordType::max(), CoordType::max()} +#endif + , mRoot(&root) + , mNode1(nullptr) + , mNode2(nullptr) + { + } + + /// @brief Constructor from a grid + __hostdev__ ReadAccessor(const GridT& grid) + : ReadAccessor(grid.tree().root()) + { + } + + /// @brief Constructor from a tree + __hostdev__ ReadAccessor(const TreeT& tree) + : ReadAccessor(tree.root()) + { + } + + /// @brief Reset this access to its initial state, i.e. 
with an empty cache + __hostdev__ void clear() + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + mKey = CoordType::max(); +#else + mKeys[0] = mKeys[1] = CoordType::max(); +#endif + mNode1 = nullptr; + mNode2 = nullptr; + } + + __hostdev__ const RootT& root() const { return *mRoot; } + + /// @brief Defaults constructors + ReadAccessor(const ReadAccessor&) = default; + ~ReadAccessor() = default; + ReadAccessor& operator=(const ReadAccessor&) = default; + +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + __hostdev__ bool isCached1(CoordValueType dirty) const + { + if (!mNode1) + return false; + if (dirty & int32_t(~Node1T::MASK)) { + mNode1 = nullptr; + return false; + } + return true; + } + __hostdev__ bool isCached2(CoordValueType dirty) const + { + if (!mNode2) + return false; + if (dirty & int32_t(~Node2T::MASK)) { + mNode2 = nullptr; + return false; + } + return true; + } + __hostdev__ CoordValueType computeDirty(const CoordType& ijk) const + { + return (ijk[0] ^ mKey[0]) | (ijk[1] ^ mKey[1]) | (ijk[2] ^ mKey[2]); + } +#else + __hostdev__ bool isCached1(const CoordType& ijk) const + { + return (ijk[0] & int32_t(~Node1T::MASK)) == mKeys[0][0] && + (ijk[1] & int32_t(~Node1T::MASK)) == mKeys[0][1] && + (ijk[2] & int32_t(~Node1T::MASK)) == mKeys[0][2]; + } + __hostdev__ bool isCached2(const CoordType& ijk) const + { + return (ijk[0] & int32_t(~Node2T::MASK)) == mKeys[1][0] && + (ijk[1] & int32_t(~Node2T::MASK)) == mKeys[1][1] && + (ijk[2] & int32_t(~Node2T::MASK)) == mKeys[1][2]; + } +#endif + +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + __hostdev__ ValueType getValue(const CoordType& ijk) const + { + return this->template get>(ijk); + } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ ValueType operator()(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ ValueType operator()(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ auto getNodeInfo(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool isActive(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->template get>(ijk, v); } + __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const { return this->template get>(ijk); } +#else // NANOVDB_NEW_ACCESSOR_METHODS + + __hostdev__ ValueType getValue(const CoordType& ijk) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached1(dirty)) { + return mNode1->getValueAndCache(ijk, *this); + } else if (this->isCached2(dirty)) { + return mNode2->getValueAndCache(ijk, *this); + } + return mRoot->getValueAndCache(ijk, *this); + } + __hostdev__ ValueType operator()(const CoordType& ijk) const + { + return this->getValue(ijk); + } + __hostdev__ ValueType operator()(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } + __hostdev__ ValueType getValue(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } + __hostdev__ NodeInfo getNodeInfo(const CoordType& ijk) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached1(dirty)) { + return mNode1->getNodeInfoAndCache(ijk, *this); + } else if (this->isCached2(dirty)) { + return mNode2->getNodeInfoAndCache(ijk, *this); + } + return 
mRoot->getNodeInfoAndCache(ijk, *this); + } + + __hostdev__ bool isActive(const CoordType& ijk) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached1(dirty)) { + return mNode1->isActiveAndCache(ijk, *this); + } else if (this->isCached2(dirty)) { + return mNode2->isActiveAndCache(ijk, *this); + } + return mRoot->isActiveAndCache(ijk, *this); + } + + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached1(dirty)) { + return mNode1->probeValueAndCache(ijk, v, *this); + } else if (this->isCached2(dirty)) { + return mNode2->probeValueAndCache(ijk, v, *this); + } + return mRoot->probeValueAndCache(ijk, v, *this); + } + + __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached1(dirty)) { + return mNode1->probeLeafAndCache(ijk, *this); + } else if (this->isCached2(dirty)) { + return mNode2->probeLeafAndCache(ijk, *this); + } + return mRoot->probeLeafAndCache(ijk, *this); + } +#endif // NANOVDB_NEW_ACCESSOR_METHODS + + template + __hostdev__ uint32_t getDim(const CoordType& ijk, const RayT& ray) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached1(dirty)) { + return mNode1->getDimAndCache(ijk, ray, *this); + } else if (this->isCached2(dirty)) { + return mNode2->getDimAndCache(ijk, ray, *this); + } + return mRoot->getDimAndCache(ijk, ray, *this); + } + + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... args) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached1(dirty)) { + return mNode1->template getAndCache(ijk, *this, args...); + } else if (this->isCached2(dirty)) { + return mNode2->template getAndCache(ijk, *this, args...); + } + return mRoot->template getAndCache(ijk, *this, args...); + } + + template + __hostdev__ auto set(const CoordType& ijk, ArgsT&&... args) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached1(dirty)) { + return const_cast(mNode1)->template setAndCache(ijk, *this, args...); + } else if (this->isCached2(dirty)) { + return const_cast(mNode2)->template setAndCache(ijk, *this, args...); + } + return const_cast(mRoot)->template setAndCache(ijk, *this, args...); + } + +private: + /// @brief Allow nodes to insert themselves into the cache. 
+ template + friend class RootNode; + template + friend class InternalNode; + template class, uint32_t> + friend class LeafNode; + + /// @brief Inserts a leaf node and key pair into this ReadAccessor + __hostdev__ void insert(const CoordType& ijk, const Node1T* node) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + mKey = ijk; +#else + mKeys[0] = ijk & ~Node1T::MASK; +#endif + mNode1 = node; + } + __hostdev__ void insert(const CoordType& ijk, const Node2T* node) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + mKey = ijk; +#else + mKeys[1] = ijk & ~Node2T::MASK; +#endif + mNode2 = node; + } + template + __hostdev__ void insert(const CoordType&, const OtherNodeT*) const {} +}; // ReadAccessor + +/// @brief Node caching at all (three) tree levels +template +class ReadAccessor +{ + using GridT = NanoGrid; // grid + using TreeT = NanoTree; + using RootT = NanoRoot; // root node + using NodeT2 = NanoUpper; // upper internal node + using NodeT1 = NanoLower; // lower internal node + using LeafT = NanoLeaf; // Leaf node + using CoordT = typename RootT::CoordType; + using ValueT = typename RootT::ValueType; + + using FloatType = typename RootT::FloatType; + using CoordValueType = typename RootT::CoordT::ValueType; + + // All member data are mutable to allow for access methods to be const +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY // 44 bytes total + mutable CoordT mKey; // 3*4 = 12 bytes +#else // 68 bytes total + mutable CoordT mKeys[3]; // 3*3*4 = 36 bytes +#endif + mutable const RootT* mRoot; + mutable const void* mNode[3]; // 4*8 = 32 bytes + +public: + using BuildType = BuildT; + using ValueType = ValueT; + using CoordType = CoordT; + + static const int CacheLevels = 3; +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + using NodeInfo = typename ReadAccessor::NodeInfo; +#endif + /// @brief Constructor from a root node + __hostdev__ ReadAccessor(const RootT& root) +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + : mKey(CoordType::max()) +#else + : mKeys{CoordType::max(), CoordType::max(), CoordType::max()} +#endif + , mRoot(&root) + , mNode{nullptr, nullptr, nullptr} + { + } + + /// @brief Constructor from a grid + __hostdev__ ReadAccessor(const GridT& grid) + : ReadAccessor(grid.tree().root()) + { + } + + /// @brief Constructor from a tree + __hostdev__ ReadAccessor(const TreeT& tree) + : ReadAccessor(tree.root()) + { + } + + __hostdev__ const RootT& root() const { return *mRoot; } + + /// @brief Defaults constructors + ReadAccessor(const ReadAccessor&) = default; + ~ReadAccessor() = default; + ReadAccessor& operator=(const ReadAccessor&) = default; + + /// @brief Return a const point to the cached node of the specified type + /// + /// @warning The return value could be NULL. + template + __hostdev__ const NodeT* getNode() const + { + using T = typename NodeTrait::type; + static_assert(util::is_same::value, "ReadAccessor::getNode: Invalid node type"); + return reinterpret_cast(mNode[NodeT::LEVEL]); + } + + template + __hostdev__ const typename NodeTrait::type* getNode() const + { + using T = typename NodeTrait::type; + static_assert(LEVEL >= 0 && LEVEL <= 2, "ReadAccessor::getNode: Invalid node type"); + return reinterpret_cast(mNode[LEVEL]); + } + + /// @brief Reset this access to its initial state, i.e. 
with an empty cache + __hostdev__ void clear() + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + mKey = CoordType::max(); +#else + mKeys[0] = mKeys[1] = mKeys[2] = CoordType::max(); +#endif + mNode[0] = mNode[1] = mNode[2] = nullptr; + } + +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + template + __hostdev__ bool isCached(CoordValueType dirty) const + { + if (!mNode[NodeT::LEVEL]) + return false; + if (dirty & int32_t(~NodeT::MASK)) { + mNode[NodeT::LEVEL] = nullptr; + return false; + } + return true; + } + + __hostdev__ CoordValueType computeDirty(const CoordType& ijk) const + { + return (ijk[0] ^ mKey[0]) | (ijk[1] ^ mKey[1]) | (ijk[2] ^ mKey[2]); + } +#else + template + __hostdev__ bool isCached(const CoordType& ijk) const + { + return (ijk[0] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][0] && + (ijk[1] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][1] && + (ijk[2] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][2]; + } +#endif + +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + __hostdev__ ValueType getValue(const CoordType& ijk) const + { + return this->template get>(ijk); + } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ ValueType operator()(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ ValueType operator()(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ auto getNodeInfo(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool isActive(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->template get>(ijk, v); } + __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const { return this->template get>(ijk); } +#else // NANOVDB_NEW_ACCESSOR_METHODS + + __hostdev__ ValueType getValue(const CoordType& ijk) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((LeafT*)mNode[0])->getValue(ijk); + } else if (this->isCached(dirty)) { + return ((NodeT1*)mNode[1])->getValueAndCache(ijk, *this); + } else if (this->isCached(dirty)) { + return ((NodeT2*)mNode[2])->getValueAndCache(ijk, *this); + } + return mRoot->getValueAndCache(ijk, *this); + } + __hostdev__ ValueType operator()(const CoordType& ijk) const + { + return this->getValue(ijk); + } + __hostdev__ ValueType operator()(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } + __hostdev__ ValueType getValue(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } + + __hostdev__ NodeInfo getNodeInfo(const CoordType& ijk) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((LeafT*)mNode[0])->getNodeInfoAndCache(ijk, *this); + } else if (this->isCached(dirty)) { + return ((NodeT1*)mNode[1])->getNodeInfoAndCache(ijk, *this); + } else if (this->isCached(dirty)) { + return ((NodeT2*)mNode[2])->getNodeInfoAndCache(ijk, *this); + } + return mRoot->getNodeInfoAndCache(ijk, *this); + } + + __hostdev__ bool isActive(const CoordType& ijk) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((LeafT*)mNode[0])->isActive(ijk); + } else if 
(this->isCached(dirty)) { + return ((NodeT1*)mNode[1])->isActiveAndCache(ijk, *this); + } else if (this->isCached(dirty)) { + return ((NodeT2*)mNode[2])->isActiveAndCache(ijk, *this); + } + return mRoot->isActiveAndCache(ijk, *this); + } + + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((LeafT*)mNode[0])->probeValue(ijk, v); + } else if (this->isCached(dirty)) { + return ((NodeT1*)mNode[1])->probeValueAndCache(ijk, v, *this); + } else if (this->isCached(dirty)) { + return ((NodeT2*)mNode[2])->probeValueAndCache(ijk, v, *this); + } + return mRoot->probeValueAndCache(ijk, v, *this); + } + __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((LeafT*)mNode[0]); + } else if (this->isCached(dirty)) { + return ((NodeT1*)mNode[1])->probeLeafAndCache(ijk, *this); + } else if (this->isCached(dirty)) { + return ((NodeT2*)mNode[2])->probeLeafAndCache(ijk, *this); + } + return mRoot->probeLeafAndCache(ijk, *this); + } +#endif // NANOVDB_NEW_ACCESSOR_METHODS + + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... args) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((const LeafT*)mNode[0])->template getAndCache(ijk, *this, args...); + } else if (this->isCached(dirty)) { + return ((const NodeT1*)mNode[1])->template getAndCache(ijk, *this, args...); + } else if (this->isCached(dirty)) { + return ((const NodeT2*)mNode[2])->template getAndCache(ijk, *this, args...); + } + return mRoot->template getAndCache(ijk, *this, args...); + } + + template + __hostdev__ auto set(const CoordType& ijk, ArgsT&&... args) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((LeafT*)mNode[0])->template setAndCache(ijk, *this, args...); + } else if (this->isCached(dirty)) { + return ((NodeT1*)mNode[1])->template setAndCache(ijk, *this, args...); + } else if (this->isCached(dirty)) { + return ((NodeT2*)mNode[2])->template setAndCache(ijk, *this, args...); + } + return ((RootT*)mRoot)->template setAndCache(ijk, *this, args...); + } + + template + __hostdev__ uint32_t getDim(const CoordType& ijk, const RayT& ray) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((LeafT*)mNode[0])->getDimAndCache(ijk, ray, *this); + } else if (this->isCached(dirty)) { + return ((NodeT1*)mNode[1])->getDimAndCache(ijk, ray, *this); + } else if (this->isCached(dirty)) { + return ((NodeT2*)mNode[2])->getDimAndCache(ijk, ray, *this); + } + return mRoot->getDimAndCache(ijk, ray, *this); + } + +private: + /// @brief Allow nodes to insert themselves into the cache. 
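+    // For reference: the *AndCache calls above pass *this down the tree so that the
+    // friend node classes declared below can call insert(ijk, node) on this accessor;
+    // a later query that lands in the same node is then served directly from
+    // mNode[NodeT::LEVEL] instead of restarting at the root.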
+ template + friend class RootNode; + template + friend class InternalNode; + template class, uint32_t> + friend class LeafNode; + + /// @brief Inserts a leaf node and key pair into this ReadAccessor + template + __hostdev__ void insert(const CoordType& ijk, const NodeT* node) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + mKey = ijk; +#else + mKeys[NodeT::LEVEL] = ijk & ~NodeT::MASK; +#endif + mNode[NodeT::LEVEL] = node; + } +}; // ReadAccessor + +////////////////////////////////////////////////// + +/// @brief Free-standing function for convenient creation of a ReadAccessor with +/// optional and customizable node caching. +/// +/// @details createAccessor<>(grid): No caching of nodes and hence it's thread-safe but slow +/// createAccessor<0>(grid): Caching of leaf nodes only +/// createAccessor<1>(grid): Caching of lower internal nodes only +/// createAccessor<2>(grid): Caching of upper internal nodes only +/// createAccessor<0,1>(grid): Caching of leaf and lower internal nodes +/// createAccessor<0,2>(grid): Caching of leaf and upper internal nodes +/// createAccessor<1,2>(grid): Caching of lower and upper internal nodes +/// createAccessor<0,1,2>(grid): Caching of all nodes at all tree levels + +template +ReadAccessor createAccessor(const NanoGrid& grid) +{ + return ReadAccessor(grid); +} + +template +ReadAccessor createAccessor(const NanoTree& tree) +{ + return ReadAccessor(tree); +} + +template +ReadAccessor createAccessor(const NanoRoot& root) +{ + return ReadAccessor(root); +} + +////////////////////////////////////////////////// + +/// @brief This is a convenient class that allows for access to grid meta-data +/// that are independent of the value type of a grid. That is, this class +/// can be used to get information about a grid without actually knowing +/// its ValueType. +class GridMetaData +{ // 768 bytes (32 byte aligned) + GridData mGridData; // 672B + TreeData mTreeData; // 64B + CoordBBox mIndexBBox; // 24B. AABB of active values in index space. + uint32_t mRootTableSize, mPadding{0}; // 8B + +public: + template + GridMetaData(const NanoGrid& grid) + { + mGridData = *grid.data(); + mTreeData = *grid.tree().data(); + mIndexBBox = grid.indexBBox(); + mRootTableSize = grid.tree().root().getTableSize(); + } + GridMetaData(const GridData* gridData) + { + if (GridMetaData::safeCast(gridData)) { + *this = *reinterpret_cast(gridData); + //util::memcpy(this, (const GridMetaData*)gridData); + } else {// otherwise copy each member individually + mGridData = *gridData; + mTreeData = *reinterpret_cast(gridData->treePtr()); + mIndexBBox = gridData->indexBBox(); + mRootTableSize = gridData->rootTableSize(); + } + } + GridMetaData& operator=(const GridMetaData&) = default; + /// @brief return true if the RootData follows right after the TreeData. + /// If so, this implies that it's safe to cast the grid from which + /// this instance was constructed to a GridMetaData + __hostdev__ bool safeCast() const { return mTreeData.isRootNext(); } + + /// @brief return true if it is safe to cast the grid to a pointer + /// of type GridMetaData, i.e. construction can be avoided. + __hostdev__ static bool safeCast(const GridData *gridData){ + NANOVDB_ASSERT(gridData && gridData->isValid()); + return gridData->isRootConnected(); + } + /// @brief return true if it is safe to cast the grid to a pointer + /// of type GridMetaData, i.e. construction can be avoided. 
+ template + __hostdev__ static bool safeCast(const NanoGrid& grid){return grid.tree().isRootNext();} + __hostdev__ bool isValid() const { return mGridData.isValid(); } + __hostdev__ const GridType& gridType() const { return mGridData.mGridType; } + __hostdev__ const GridClass& gridClass() const { return mGridData.mGridClass; } + __hostdev__ bool isLevelSet() const { return mGridData.mGridClass == GridClass::LevelSet; } + __hostdev__ bool isFogVolume() const { return mGridData.mGridClass == GridClass::FogVolume; } + __hostdev__ bool isStaggered() const { return mGridData.mGridClass == GridClass::Staggered; } + __hostdev__ bool isPointIndex() const { return mGridData.mGridClass == GridClass::PointIndex; } + __hostdev__ bool isGridIndex() const { return mGridData.mGridClass == GridClass::IndexGrid; } + __hostdev__ bool isPointData() const { return mGridData.mGridClass == GridClass::PointData; } + __hostdev__ bool isMask() const { return mGridData.mGridClass == GridClass::Topology; } + __hostdev__ bool isUnknown() const { return mGridData.mGridClass == GridClass::Unknown; } + __hostdev__ bool hasMinMax() const { return mGridData.mFlags.isMaskOn(GridFlags::HasMinMax); } + __hostdev__ bool hasBBox() const { return mGridData.mFlags.isMaskOn(GridFlags::HasBBox); } + __hostdev__ bool hasLongGridName() const { return mGridData.mFlags.isMaskOn(GridFlags::HasLongGridName); } + __hostdev__ bool hasAverage() const { return mGridData.mFlags.isMaskOn(GridFlags::HasAverage); } + __hostdev__ bool hasStdDeviation() const { return mGridData.mFlags.isMaskOn(GridFlags::HasStdDeviation); } + __hostdev__ bool isBreadthFirst() const { return mGridData.mFlags.isMaskOn(GridFlags::IsBreadthFirst); } + __hostdev__ uint64_t gridSize() const { return mGridData.mGridSize; } + __hostdev__ uint32_t gridIndex() const { return mGridData.mGridIndex; } + __hostdev__ uint32_t gridCount() const { return mGridData.mGridCount; } + __hostdev__ const char* shortGridName() const { return mGridData.mGridName; } + __hostdev__ const Map& map() const { return mGridData.mMap; } + __hostdev__ const Vec3dBBox& worldBBox() const { return mGridData.mWorldBBox; } + __hostdev__ const CoordBBox& indexBBox() const { return mIndexBBox; } + __hostdev__ Vec3d voxelSize() const { return mGridData.mVoxelSize; } + __hostdev__ int blindDataCount() const { return mGridData.mBlindMetadataCount; } + __hostdev__ uint64_t activeVoxelCount() const { return mTreeData.mVoxelCount; } + __hostdev__ const uint32_t& activeTileCount(uint32_t level) const { return mTreeData.mTileCount[level - 1]; } + __hostdev__ uint32_t nodeCount(uint32_t level) const { return mTreeData.mNodeCount[level]; } + __hostdev__ const Checksum& checksum() const { return mGridData.mChecksum; } + __hostdev__ uint32_t rootTableSize() const { return mRootTableSize; } + __hostdev__ bool isEmpty() const { return mRootTableSize == 0; } + __hostdev__ Version version() const { return mGridData.mVersion; } +}; // GridMetaData + +/// @brief Class to access points at a specific voxel location +/// +/// @note If GridClass::PointIndex AttT should be uint32_t and if GridClass::PointData Vec3f +template +class PointAccessor : public DefaultReadAccessor +{ + using AccT = DefaultReadAccessor; + const NanoGrid& mGrid; + const AttT* mData; + +public: + PointAccessor(const NanoGrid& grid) + : AccT(grid.tree().root()) + , mGrid(grid) + , mData(grid.template getBlindData(0)) + { + NANOVDB_ASSERT(grid.gridType() == toGridType()); + NANOVDB_ASSERT((grid.gridClass() == GridClass::PointIndex && util::is_same::value) 
|| + (grid.gridClass() == GridClass::PointData && util::is_same::value)); + } + + /// @brief return true if this access was initialized correctly + __hostdev__ operator bool() const { return mData != nullptr; } + + __hostdev__ const NanoGrid& grid() const { return mGrid; } + + /// @brief Return the total number of point in the grid and set the + /// iterators to the complete range of points. + __hostdev__ uint64_t gridPoints(const AttT*& begin, const AttT*& end) const + { + const uint64_t count = mGrid.blindMetaData(0u).mValueCount; + begin = mData; + end = begin + count; + return count; + } + /// @brief Return the number of points in the leaf node containing the coordinate @a ijk. + /// If this return value is larger than zero then the iterators @a begin and @a end + /// will point to all the attributes contained within that leaf node. + __hostdev__ uint64_t leafPoints(const Coord& ijk, const AttT*& begin, const AttT*& end) const + { + auto* leaf = this->probeLeaf(ijk); + if (leaf == nullptr) { + return 0; + } + begin = mData + leaf->minimum(); + end = begin + leaf->maximum(); + return leaf->maximum(); + } + + /// @brief get iterators over attributes to points at a specific voxel location + __hostdev__ uint64_t voxelPoints(const Coord& ijk, const AttT*& begin, const AttT*& end) const + { + begin = end = nullptr; + if (auto* leaf = this->probeLeaf(ijk)) { + const uint32_t offset = NanoLeaf::CoordToOffset(ijk); + if (leaf->isActive(offset)) { + begin = mData + leaf->minimum(); + end = begin + leaf->getValue(offset); + if (offset > 0u) + begin += leaf->getValue(offset - 1); + } + } + return end - begin; + } +}; // PointAccessor + +template +class PointAccessor : public DefaultReadAccessor +{ + using AccT = DefaultReadAccessor; + const NanoGrid& mGrid; + const AttT* mData; + +public: + PointAccessor(const NanoGrid& grid) + : AccT(grid.tree().root()) + , mGrid(grid) + , mData(grid.template getBlindData(0)) + { + NANOVDB_ASSERT(mData); + NANOVDB_ASSERT(grid.gridType() == GridType::PointIndex); + NANOVDB_ASSERT((grid.gridClass() == GridClass::PointIndex && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value)); + } + + /// @brief return true if this access was initialized correctly + __hostdev__ operator bool() const { return mData != nullptr; } + + __hostdev__ const NanoGrid& grid() const { return mGrid; } + + /// @brief Return the total number of point in the grid and set the + /// iterators to the complete range of points. + __hostdev__ uint64_t gridPoints(const AttT*& begin, const AttT*& end) const + { + const uint64_t count = mGrid.blindMetaData(0u).mValueCount; + begin = mData; + end = begin + count; + return count; + } + /// @brief Return the number of points in the leaf node containing the coordinate @a ijk. + /// If this return value is larger than zero then the iterators @a begin and @a end + /// will point to all the attributes contained within that leaf node. 
+ __hostdev__ uint64_t leafPoints(const Coord& ijk, const AttT*& begin, const AttT*& end) const + { + auto* leaf = this->probeLeaf(ijk); + if (leaf == nullptr) + return 0; + begin = mData + leaf->offset(); + end = begin + leaf->pointCount(); + return leaf->pointCount(); + } + + /// @brief get iterators over attributes to points at a specific voxel location + __hostdev__ uint64_t voxelPoints(const Coord& ijk, const AttT*& begin, const AttT*& end) const + { + if (auto* leaf = this->probeLeaf(ijk)) { + const uint32_t n = NanoLeaf::CoordToOffset(ijk); + if (leaf->isActive(n)) { + begin = mData + leaf->first(n); + end = mData + leaf->last(n); + return end - begin; + } + } + begin = end = nullptr; + return 0u; // no leaf or inactive voxel + } +}; // PointAccessor + +/// @brief Class to access values in channels at a specific voxel location. +/// +/// @note The ChannelT template parameter can be either const and non-const. +template +class ChannelAccessor : public DefaultReadAccessor +{ + static_assert(BuildTraits::is_index, "Expected an index build type"); + using BaseT = DefaultReadAccessor; + + const NanoGrid& mGrid; + ChannelT* mChannel; + +public: + using ValueType = ChannelT; + using TreeType = NanoTree; + using AccessorType = ChannelAccessor; + + /// @brief Ctor from an IndexGrid and an integer ID of an internal channel + /// that is assumed to exist as blind data in the IndexGrid. + __hostdev__ ChannelAccessor(const NanoGrid& grid, uint32_t channelID = 0u) + : BaseT(grid.tree().root()) + , mGrid(grid) + , mChannel(nullptr) + { + NANOVDB_ASSERT(isIndex(grid.gridType())); + NANOVDB_ASSERT(grid.gridClass() == GridClass::IndexGrid); + this->setChannel(channelID); + } + + /// @brief Ctor from an IndexGrid and an external channel + __hostdev__ ChannelAccessor(const NanoGrid& grid, ChannelT* channelPtr) + : BaseT(grid.tree().root()) + , mGrid(grid) + , mChannel(channelPtr) + { + NANOVDB_ASSERT(isIndex(grid.gridType())); + NANOVDB_ASSERT(grid.gridClass() == GridClass::IndexGrid); + } + + /// @brief return true if this access was initialized correctly + __hostdev__ operator bool() const { return mChannel != nullptr; } + + /// @brief Return a const reference to the IndexGrid + __hostdev__ const NanoGrid& grid() const { return mGrid; } + + /// @brief Return a const reference to the tree of the IndexGrid + __hostdev__ const TreeType& tree() const { return mGrid.tree(); } + + /// @brief Return a vector of the axial voxel sizes + __hostdev__ const Vec3d& voxelSize() const { return mGrid.voxelSize(); } + + /// @brief Return total number of values indexed by the IndexGrid + __hostdev__ const uint64_t& valueCount() const { return mGrid.valueCount(); } + + /// @brief Change to an external channel + /// @return Pointer to channel data + __hostdev__ ChannelT* setChannel(ChannelT* channelPtr) {return mChannel = channelPtr;} + + /// @brief Change to an internal channel, assuming it exists as as blind data + /// in the IndexGrid. 
+ /// @return Pointer to channel data, which could be NULL if channelID is out of range or + /// if ChannelT does not match the value type of the blind data + __hostdev__ ChannelT* setChannel(uint32_t channelID) + { + return mChannel = const_cast(mGrid.template getBlindData(channelID)); + } + + /// @brief Return the linear offset into a channel that maps to the specified coordinate + __hostdev__ uint64_t getIndex(const math::Coord& ijk) const { return BaseT::getValue(ijk); } + __hostdev__ uint64_t idx(int i, int j, int k) const { return BaseT::getValue(math::Coord(i, j, k)); } + + /// @brief Return the value from a cached channel that maps to the specified coordinate + __hostdev__ ChannelT& getValue(const math::Coord& ijk) const { return mChannel[BaseT::getValue(ijk)]; } + __hostdev__ ChannelT& operator()(const math::Coord& ijk) const { return this->getValue(ijk); } + __hostdev__ ChannelT& operator()(int i, int j, int k) const { return this->getValue(math::Coord(i, j, k)); } + + /// @brief return the state and updates the value of the specified voxel + __hostdev__ bool probeValue(const math::Coord& ijk, typename util::remove_const::type& v) const + { + uint64_t idx; + const bool isActive = BaseT::probeValue(ijk, idx); + v = mChannel[idx]; + return isActive; + } + /// @brief Return the value from a specified channel that maps to the specified coordinate + /// + /// @note The template parameter can be either const or non-const + template + __hostdev__ T& getValue(const math::Coord& ijk, T* channelPtr) const { return channelPtr[BaseT::getValue(ijk)]; } + +}; // ChannelAccessor + +#if 0 +// This MiniGridHandle class is only included as a stand-alone example. Note that aligned_alloc is a C++17 feature! +// Normally we recommend using GridHandle defined in util/GridHandle.h but this minimal implementation could be an +// alternative when using the IO methods defined below. +struct MiniGridHandle { + struct BufferType { + uint8_t *data; + uint64_t size; + BufferType(uint64_t n=0) : data(std::aligned_alloc(NANOVDB_DATA_ALIGNMENT, n)), size(n) {assert(isValid(data));} + BufferType(BufferType &&other) : data(other.data), size(other.size) {other.data=nullptr; other.size=0;} + ~BufferType() {std::free(data);} + BufferType& operator=(const BufferType &other) = delete; + BufferType& operator=(BufferType &&other){data=other.data; size=other.size; other.data=nullptr; other.size=0; return *this;} + static BufferType create(size_t n, BufferType* dummy = nullptr) {return BufferType(n);} + } buffer; + MiniGridHandle(BufferType &&buf) : buffer(std::move(buf)) {} + const uint8_t* data() const {return buffer.data;} +};// MiniGridHandle +#endif + +namespace io { + +/// @brief Define compression codecs +/// +/// @note NONE is the default, ZIP is slow but compact and BLOSC offers a great balance. +/// +/// @throw NanoVDB optionally supports ZIP and BLOSC compression and will throw an exception +/// if its support is required but missing. 
+enum class Codec : uint16_t { NONE = 0, + ZIP = 1, + BLOSC = 2, + End = 3, + StrLen = 6 + End }; + +__hostdev__ inline const char* toStr(char *dst, Codec codec) +{ + switch (codec){ + case Codec::NONE: return util::strcpy(dst, "NONE"); + case Codec::ZIP: return util::strcpy(dst, "ZIP"); + case Codec::BLOSC : return util::strcpy(dst, "BLOSC"); + default: return util::strcpy(dst, "END"); + } +} + +__hostdev__ inline Codec toCodec(const char *str) +{ + if (util::streq(str, "none")) return Codec::NONE; + if (util::streq(str, "zip")) return Codec::ZIP; + if (util::streq(str, "blosc")) return Codec::BLOSC; + return Codec::End; +} + +/// @brief Data encoded at the head of each segment of a file or stream. +/// +/// @note A file or stream is composed of one or more segments that each contain +// one or more grids. +struct FileHeader {// 16 bytes + uint64_t magic;// 8 bytes + Version version;// 4 bytes version numbers + uint16_t gridCount;// 2 bytes + Codec codec;// 2 bytes + bool isValid() const {return magic == NANOVDB_MAGIC_NUMB || magic == NANOVDB_MAGIC_FILE;} +}; // FileHeader ( 16 bytes = 2 words ) + +// @brief Data encoded for each of the grids associated with a segment. +// Grid size in memory (uint64_t) | +// Grid size on disk (uint64_t) | +// Grid name hash key (uint64_t) | +// Numer of active voxels (uint64_t) | +// Grid type (uint32_t) | +// Grid class (uint32_t) | +// Characters in grid name (uint32_t) | +// AABB in world space (2*3*double) | one per grid in file +// AABB in index space (2*3*int) | +// Size of a voxel in world units (3*double) | +// Byte size of the grid name (uint32_t) | +// Number of nodes per level (4*uint32_t) | +// Numer of active tiles per level (3*uint32_t) | +// Codec for file compression (uint16_t) | +// Padding due to 8B alignment (uint16_t) | +// Version number (uint32_t) | +struct FileMetaData +{// 176 bytes + uint64_t gridSize, fileSize, nameKey, voxelCount; // 4 * 8 = 32B. + GridType gridType; // 4B. + GridClass gridClass; // 4B. + Vec3dBBox worldBBox; // 2 * 3 * 8 = 48B. + CoordBBox indexBBox; // 2 * 3 * 4 = 24B. + Vec3d voxelSize; // 24B. + uint32_t nameSize; // 4B. + uint32_t nodeCount[4]; //4 x 4 = 16B + uint32_t tileCount[3];// 3 x 4 = 12B + Codec codec; // 2B + uint16_t padding;// 2B, due to 8B alignment from uint64_t + Version version;// 4B +}; // FileMetaData + +// the following code block uses std and therefore needs to be ignored by CUDA and HIP +#if !defined(__CUDA_ARCH__) && !defined(__HIP__) + +// Note that starting with version 32.6.0 it is possible to write and read raw grid buffers to +// files, e.g. os.write((const char*)&buffer.data(), buffer.size()) or more conveniently as +// handle.write(fileName). In addition to this simple approach we offer the methods below to +// write traditional uncompressed nanovdb files that unlike raw files include metadata that +// is used for tools like nanovdb_print. + +/// +/// @brief This is a standalone alternative to io::writeGrid(...,Codec::NONE) defined in util/IO.h +/// Unlike the latter this function has no dependencies at all, not even NanoVDB.h, so it also +/// works if client code only includes PNanoVDB.h! +/// +/// @details Writes a raw NanoVDB buffer, possibly with multiple grids, to a stream WITHOUT compression. +/// It follows all the conventions in util/IO.h so the stream can be read by all existing client +/// code of NanoVDB. +/// +/// @note This method will always write uncompressed grids to the stream, i.e. Blosc or ZIP compression +/// is never applied! 
This is a fundamental limitation and feature of this standalone function. +/// +/// @throw std::invalid_argument if buffer does not point to a valid NanoVDB grid. +/// +/// @warning This is pretty ugly code that involves lots of pointer and bit manipulations - not for the faint of heart :) +template // StreamT class must support: "void write(const char*, size_t)" +void writeUncompressedGrid(StreamT& os, const GridData* gridData, bool raw = false) +{ + NANOVDB_ASSERT(gridData->mMagic == NANOVDB_MAGIC_NUMB || gridData->mMagic == NANOVDB_MAGIC_GRID); + NANOVDB_ASSERT(gridData->mVersion.isCompatible()); + if (!raw) {// segment with a single grid: FileHeader, FileMetaData, gridName, Grid +#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS + FileHeader head{NANOVDB_MAGIC_FILE, gridData->mVersion, 1u, Codec::NONE}; +#else + FileHeader head{NANOVDB_MAGIC_NUMB, gridData->mVersion, 1u, Codec::NONE}; +#endif + const char* gridName = gridData->gridName(); + const uint32_t nameSize = util::strlen(gridName) + 1;// include '\0' + const TreeData* treeData = (const TreeData*)(gridData->treePtr()); + FileMetaData meta{gridData->mGridSize, gridData->mGridSize, 0u, treeData->mVoxelCount, + gridData->mGridType, gridData->mGridClass, gridData->mWorldBBox, + treeData->bbox(), gridData->mVoxelSize, nameSize, + {treeData->mNodeCount[0], treeData->mNodeCount[1], treeData->mNodeCount[2], 1u}, + {treeData->mTileCount[0], treeData->mTileCount[1], treeData->mTileCount[2]}, + Codec::NONE, 0u, gridData->mVersion }; // FileMetaData + os.write((const char*)&head, sizeof(FileHeader)); // write header + os.write((const char*)&meta, sizeof(FileMetaData)); // write meta data + os.write(gridName, nameSize); // write grid name + } + os.write((const char*)gridData, gridData->mGridSize);// write the grid +}// writeUncompressedGrid + +/// @brief write multiple NanoVDB grids to a single file, without compression. 
+/// @note To write all grids in a single GridHandle simply use handle.write("fieNane") +template class VecT> +void writeUncompressedGrids(const char* fileName, const VecT& handles, bool raw = false) +{ +#ifdef NANOVDB_USE_IOSTREAMS // use this to switch between std::ofstream or FILE implementations + std::ofstream os(fileName, std::ios::out | std::ios::binary | std::ios::trunc); +#else + struct StreamT { + FILE* fptr; + StreamT(const char* name) { fptr = fopen(name, "wb"); } + ~StreamT() { fclose(fptr); } + void write(const char* data, size_t n) { fwrite(data, 1, n, fptr); } + bool is_open() const { return fptr != NULL; } + } os(fileName); +#endif + if (!os.is_open()) { + fprintf(stderr, "nanovdb::writeUncompressedGrids: Unable to open file \"%s\"for output\n", fileName); + exit(EXIT_FAILURE); + } + for (auto& h : handles) { + for (uint32_t n=0; n class VecT> +VecT readUncompressedGrids(StreamT& is, const typename GridHandleT::BufferType& pool = typename GridHandleT::BufferType()) +{ + VecT handles; + GridData data; + is.read((char*)&data, sizeof(GridData)); + if (data.isValid()) {// stream contains a raw grid buffer + uint64_t size = data.mGridSize, sum = 0u; + while(data.mGridIndex + 1u < data.mGridCount) { + is.skip(data.mGridSize - sizeof(GridData));// skip grid + is.read((char*)&data, sizeof(GridData));// read sizeof(GridData) bytes + sum += data.mGridSize; + } + is.skip(-int64_t(sum + sizeof(GridData)));// rewind to start + auto buffer = GridHandleT::BufferType::create(size + sum, &pool); + is.read((char*)(buffer.data()), buffer.size()); + handles.emplace_back(std::move(buffer)); + } else {// Header0, MetaData0, gridName0, Grid0...HeaderN, MetaDataN, gridNameN, GridN + is.skip(-sizeof(GridData));// rewind + FileHeader head; + while(is.read((char*)&head, sizeof(FileHeader))) { + if (!head.isValid()) { + fprintf(stderr, "nanovdb::readUncompressedGrids: invalid magic number = \"%s\"\n", (const char*)&(head.magic)); + exit(EXIT_FAILURE); + } else if (!head.version.isCompatible()) { + char str[20]; + fprintf(stderr, "nanovdb::readUncompressedGrids: invalid major version = \"%s\"\n", toStr(str, head.version)); + exit(EXIT_FAILURE); + } else if (head.codec != Codec::NONE) { + char str[8]; + fprintf(stderr, "nanovdb::readUncompressedGrids: invalid codec = \"%s\"\n", toStr(str, head.codec)); + exit(EXIT_FAILURE); + } + FileMetaData meta; + for (uint16_t i = 0; i < head.gridCount; ++i) { // read all grids in segment + is.read((char*)&meta, sizeof(FileMetaData));// read meta data + is.skip(meta.nameSize); // skip grid name + auto buffer = GridHandleT::BufferType::create(meta.gridSize, &pool); + is.read((char*)buffer.data(), meta.gridSize);// read grid + handles.emplace_back(std::move(buffer)); + }// loop over grids in segment + }// loop over segments + } + return handles; +} // readUncompressedGrids + +/// @brief Read a multiple un-compressed NanoVDB grids from a file and return them as a vector. 
+template class VecT> +VecT readUncompressedGrids(const char* fileName, const typename GridHandleT::BufferType& buffer = typename GridHandleT::BufferType()) +{ +#ifdef NANOVDB_USE_IOSTREAMS // use this to switch between std::ifstream or FILE implementations + struct StreamT : public std::ifstream { + StreamT(const char* name) : std::ifstream(name, std::ios::in | std::ios::binary){} + void skip(int64_t off) { this->seekg(off, std::ios_base::cur); } + }; +#else + struct StreamT { + FILE* fptr; + StreamT(const char* name) { fptr = fopen(name, "rb"); } + ~StreamT() { fclose(fptr); } + bool read(char* data, size_t n) { + size_t m = fread(data, 1, n, fptr); + return n == m; + } + void skip(int64_t off) { fseek(fptr, (long int)off, SEEK_CUR); } + bool is_open() const { return fptr != NULL; } + }; +#endif + StreamT is(fileName); + if (!is.is_open()) { + fprintf(stderr, "nanovdb::readUncompressedGrids: Unable to open file \"%s\"for input\n", fileName); + exit(EXIT_FAILURE); + } + return readUncompressedGrids(is, buffer); +} // readUncompressedGrids + +#endif // if !defined(__CUDA_ARCH__) && !defined(__HIP__) + +} // namespace io + +// ----------------------------> Implementations of random access methods <-------------------------------------- + +/// @brief Implements Tree::getValue(math::Coord), i.e. return the value associated with a specific coordinate @c ijk. +/// @tparam BuildT Build type of the grid being called +/// @details The value at a coordinate maps to the background, a tile value or a leaf value. +template +struct GetValue +{ + __hostdev__ static auto get(const NanoRoot& root) { return root.mBackground; } + __hostdev__ static auto get(const typename NanoRoot::Tile& tile) { return tile.value; } + __hostdev__ static auto get(const NanoUpper& node, uint32_t n) { return node.mTable[n].value; } + __hostdev__ static auto get(const NanoLower& node, uint32_t n) { return node.mTable[n].value; } + __hostdev__ static auto get(const NanoLeaf& leaf, uint32_t n) { return leaf.getValue(n); } // works with all build types +}; // GetValue + +template +struct SetValue +{ + static_assert(!BuildTraits::is_special, "SetValue does not support special value types"); + using ValueT = typename NanoLeaf::ValueType; + __hostdev__ static auto set(NanoRoot&, const ValueT&) {} // no-op + __hostdev__ static auto set(typename NanoRoot::Tile& tile, const ValueT& v) { tile.value = v; } + __hostdev__ static auto set(NanoUpper& node, uint32_t n, const ValueT& v) { node.mTable[n].value = v; } + __hostdev__ static auto set(NanoLower& node, uint32_t n, const ValueT& v) { node.mTable[n].value = v; } + __hostdev__ static auto set(NanoLeaf& leaf, uint32_t n, const ValueT& v) { leaf.mValues[n] = v; } +}; // SetValue + +template +struct SetVoxel +{ + static_assert(!BuildTraits::is_special, "SetVoxel does not support special value types"); + using ValueT = typename NanoLeaf::ValueType; + __hostdev__ static auto set(NanoRoot&, const ValueT&) {} // no-op + __hostdev__ static auto set(typename NanoRoot::Tile&, const ValueT&) {} // no-op + __hostdev__ static auto set(NanoUpper&, uint32_t, const ValueT&) {} // no-op + __hostdev__ static auto set(NanoLower&, uint32_t, const ValueT&) {} // no-op + __hostdev__ static auto set(NanoLeaf& leaf, uint32_t n, const ValueT& v) { leaf.mValues[n] = v; } +}; // SetVoxel + +/// @brief Implements Tree::isActive(math::Coord) +/// @tparam BuildT Build type of the grid being called +template +struct GetState +{ + __hostdev__ static auto get(const NanoRoot&) { return false; } + __hostdev__ static auto 
get(const typename NanoRoot::Tile& tile) { return tile.state > 0; } + __hostdev__ static auto get(const NanoUpper& node, uint32_t n) { return node.mValueMask.isOn(n); } + __hostdev__ static auto get(const NanoLower& node, uint32_t n) { return node.mValueMask.isOn(n); } + __hostdev__ static auto get(const NanoLeaf& leaf, uint32_t n) { return leaf.mValueMask.isOn(n); } +}; // GetState + +/// @brief Implements Tree::getDim(math::Coord) +/// @tparam BuildT Build type of the grid being called +template +struct GetDim +{ + __hostdev__ static uint32_t get(const NanoRoot&) { return 0u; } // background + __hostdev__ static uint32_t get(const typename NanoRoot::Tile&) { return 4096u; } + __hostdev__ static uint32_t get(const NanoUpper&, uint32_t) { return 128u; } + __hostdev__ static uint32_t get(const NanoLower&, uint32_t) { return 8u; } + __hostdev__ static uint32_t get(const NanoLeaf&, uint32_t) { return 1u; } +}; // GetDim + +/// @brief Return the pointer to the leaf node that contains math::Coord. Implements Tree::probeLeaf(math::Coord) +/// @tparam BuildT Build type of the grid being called +template +struct GetLeaf +{ + __hostdev__ static const NanoLeaf* get(const NanoRoot&) { return nullptr; } + __hostdev__ static const NanoLeaf* get(const typename NanoRoot::Tile&) { return nullptr; } + __hostdev__ static const NanoLeaf* get(const NanoUpper&, uint32_t) { return nullptr; } + __hostdev__ static const NanoLeaf* get(const NanoLower&, uint32_t) { return nullptr; } + __hostdev__ static const NanoLeaf* get(const NanoLeaf& leaf, uint32_t) { return &leaf; } +}; // GetLeaf + +/// @brief Return point to the lower internal node where math::Coord maps to one of its values, i.e. terminates +/// @tparam BuildT Build type of the grid being called +template +struct GetLower +{ + __hostdev__ static const NanoLower* get(const NanoRoot&) { return nullptr; } + __hostdev__ static const NanoLower* get(const typename NanoRoot::Tile&) { return nullptr; } + __hostdev__ static const NanoLower* get(const NanoUpper&, uint32_t) { return nullptr; } + __hostdev__ static const NanoLower* get(const NanoLower& node, uint32_t) { return &node; } + __hostdev__ static const NanoLower* get(const NanoLeaf&, uint32_t) { return nullptr; } +}; // GetLower + +/// @brief Return point to the upper internal node where math::Coord maps to one of its values, i.e. 
terminates +/// @tparam BuildT Build type of the grid being called +template +struct GetUpper +{ + __hostdev__ static const NanoUpper* get(const NanoRoot&) { return nullptr; } + __hostdev__ static const NanoUpper* get(const typename NanoRoot::Tile&) { return nullptr; } + __hostdev__ static const NanoUpper* get(const NanoUpper& node, uint32_t) { return &node; } + __hostdev__ static const NanoUpper* get(const NanoLower& node, uint32_t) { return nullptr; } + __hostdev__ static const NanoUpper* get(const NanoLeaf&, uint32_t) { return nullptr; } +}; // GetUpper + +/// @brief Implements Tree::probeLeaf(math::Coord) +/// @tparam BuildT Build type of the grid being called +template +struct ProbeValue +{ + using ValueT = typename BuildToValueMap::Type; + __hostdev__ static bool get(const NanoRoot& root, ValueT& v) + { + v = root.mBackground; + return false; + } + __hostdev__ static bool get(const typename NanoRoot::Tile& tile, ValueT& v) + { + v = tile.value; + return tile.state > 0u; + } + __hostdev__ static bool get(const NanoUpper& node, uint32_t n, ValueT& v) + { + v = node.mTable[n].value; + return node.mValueMask.isOn(n); + } + __hostdev__ static bool get(const NanoLower& node, uint32_t n, ValueT& v) + { + v = node.mTable[n].value; + return node.mValueMask.isOn(n); + } + __hostdev__ static bool get(const NanoLeaf& leaf, uint32_t n, ValueT& v) + { + v = leaf.getValue(n); + return leaf.mValueMask.isOn(n); + } +}; // ProbeValue + +/// @brief Implements Tree::getNodeInfo(math::Coord) +/// @tparam BuildT Build type of the grid being called +template +struct GetNodeInfo +{ + using ValueType = typename NanoLeaf::ValueType; + using FloatType = typename NanoLeaf::FloatType; + struct NodeInfo + { + uint32_t level, dim; + ValueType minimum, maximum; + FloatType average, stdDevi; + CoordBBox bbox; + }; + __hostdev__ static NodeInfo get(const NanoRoot& root) + { + return NodeInfo{3u, NanoUpper::DIM, root.minimum(), root.maximum(), root.average(), root.stdDeviation(), root.bbox()}; + } + __hostdev__ static NodeInfo get(const typename NanoRoot::Tile& tile) + { + return NodeInfo{3u, NanoUpper::DIM, tile.value, tile.value, static_cast(tile.value), 0, CoordBBox::createCube(tile.origin(), NanoUpper::DIM)}; + } + __hostdev__ static NodeInfo get(const NanoUpper& node, uint32_t n) + { + return NodeInfo{2u, node.dim(), node.minimum(), node.maximum(), node.average(), node.stdDeviation(), node.bbox()}; + } + __hostdev__ static NodeInfo get(const NanoLower& node, uint32_t n) + { + return NodeInfo{1u, node.dim(), node.minimum(), node.maximum(), node.average(), node.stdDeviation(), node.bbox()}; + } + __hostdev__ static NodeInfo get(const NanoLeaf& leaf, uint32_t n) + { + return NodeInfo{0u, leaf.dim(), leaf.minimum(), leaf.maximum(), leaf.average(), leaf.stdDeviation(), leaf.bbox()}; + } +}; // GetNodeInfo + +} // namespace nanovdb =================================================================== + +#endif // end of NANOVDB_NANOVDB_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/NodeManager.h b/external/nanovdb/NodeManager.h new file mode 100644 index 00000000..0d7686eb --- /dev/null +++ b/external/nanovdb/NodeManager.h @@ -0,0 +1,327 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/NodeManager.h + + \author Ken Museth + + \date February 12, 2021 + + \brief This class allows for sequential access to nodes + in a NanoVDB tree on both the host and device. + + \details The ordering of the sequential access to nodes is always breadth-first! 
+*/ + +#include // for NanoGrid etc +#include // for HostBuffer + +#ifndef NANOVDB_NODEMANAGER_H_HAS_BEEN_INCLUDED +#define NANOVDB_NODEMANAGER_H_HAS_BEEN_INCLUDED + +namespace nanovdb { + +/// @brief NodeManager allows for sequential access to nodes +template +class NodeManager; + +/// @brief NodeManagerHandle manages the memory of a NodeManager +template +class NodeManagerHandle; + +/// @brief brief Construct a NodeManager and return its handle +/// +/// @param grid grid whose nodes will be accessed sequentially +/// @param buffer buffer from which to allocate the output handle +/// +/// @note This is the only way to create a NodeManager since it's using +/// managed memory pointed to by a NodeManagerHandle. +template +NodeManagerHandle createNodeManager(const NanoGrid &grid, + const BufferT& buffer = BufferT()); + +struct NodeManagerData +{// 48B = 6*8B + uint64_t mMagic;// 8B + union {int64_t mPadding; uint8_t mLinear;};// 8B of which 1B is used for a binary flag + void *mGrid;// 8B pointer to either host or device grid + union {int64_t *mPtr[3], mOff[3];};// 24B, use mOff if mLinear!=0 +}; + +/// @brief This class serves to manage a raw memory buffer of a NanoVDB NodeManager or LeafManager. +template +class NodeManagerHandle +{ + GridType mGridType{GridType::Unknown}; + BufferT mBuffer; + + template + const NodeManager* getMgr() const { + return mGridType == toGridType() ? (const NodeManager*)mBuffer.data() : nullptr; + } + + template + typename util::enable_if::hasDeviceDual, const NodeManager*>::type + getDeviceMgr() const { + return mGridType == toGridType() ? (const NodeManager*)mBuffer.deviceData() : nullptr; + } + + template + static T* no_const(const T* ptr) { return const_cast(ptr); } + +public: + /// @brief Move constructor from a buffer + NodeManagerHandle(GridType gridType, BufferT&& buffer) : mGridType(gridType) { mBuffer = std::move(buffer); } + /// @brief Empty ctor + NodeManagerHandle() = default; + /// @brief Disallow copy-construction + NodeManagerHandle(const NodeManagerHandle&) = delete; + /// @brief Disallow copy assignment operation + NodeManagerHandle& operator=(const NodeManagerHandle&) = delete; + /// @brief Move copy assignment operation + NodeManagerHandle& operator=(NodeManagerHandle&& other) noexcept { + mGridType = other.mGridType; + mBuffer = std::move(other.mBuffer); + other.mGridType = GridType::Unknown; + return *this; + } + /// @brief Move copy-constructor + NodeManagerHandle(NodeManagerHandle&& other) noexcept { + mGridType = other.mGridType; + mBuffer = std::move(other.mBuffer); + other.mGridType = GridType::Unknown; + } + /// @brief Default destructor + ~NodeManagerHandle() { this->reset(); } + /// @brief clear the buffer + void reset() { mBuffer.clear(); } + + /// @brief Return a reference to the buffer + BufferT& buffer() { return mBuffer; } + + /// @brief Return a const reference to the buffer + const BufferT& buffer() const { return mBuffer; } + + /// @brief Returns a non-const pointer to the data. + /// + /// @warning Note that the return pointer can be NULL if the NodeManagerHandle was not initialized + void* data() { return mBuffer.data(); } + + /// @brief Returns a const pointer to the data. + /// + /// @warning Note that the return pointer can be NULL if the NodeManagerHandle was not initialized + const void* data() const { return mBuffer.data(); } + + /// @brief Returns the size in bytes of the raw memory buffer managed by this NodeManagerHandle's allocator. 
+ uint64_t size() const { return mBuffer.size(); } + + /// @brief Returns a const pointer to the NodeManager encoded in this NodeManagerHandle. + /// + /// @warning Note that the return pointer can be NULL if the template parameter does not match the specified grid! + template + const NodeManager* mgr() const { return this->template getMgr(); } + + /// @brief Returns a pointer to the NodeManager encoded in this NodeManagerHandle. + /// + /// @warning Note that the return pointer can be NULL if the template parameter does not match the specified grid! + template + NodeManager* mgr() { return no_const(this->template getMgr()); } + + /// @brief Return a const pointer to the NodeManager encoded in this NodeManagerHandle on the device, e.g. GPU + /// + /// @warning Note that the return pointer can be NULL if the template parameter does not match the specified grid! + template + typename util::enable_if::hasDeviceDual, const NodeManager*>::type + deviceMgr() const { return this->template getDeviceMgr(); } + + /// @brief Return a const pointer to the NodeManager encoded in this NodeManagerHandle on the device, e.g. GPU + /// + /// @warning Note that the return pointer can be NULL if the template parameter does not match the specified grid! + template + typename util::enable_if::hasDeviceDual, NodeManager*>::type + deviceMgr() { return no_const(this->template getDeviceMgr()); } + + /// @brief Upload the NodeManager to the device, e.g. from CPU to GPU + /// + /// @note This method is only available if the buffer supports devices + template + typename util::enable_if::hasDeviceDual, void>::type + deviceUpload(void* deviceGrid, void* stream = nullptr, bool sync = true) + { + assert(deviceGrid); + auto *data = reinterpret_cast(mBuffer.data()); + void *tmp = data->mGrid; + data->mGrid = deviceGrid; + mBuffer.deviceUpload(stream, sync); + data->mGrid = tmp; + } + + /// @brief Download the NodeManager to from the device, e.g. from GPU to CPU + /// + /// @note This method is only available if the buffer supports devices + template + typename util::enable_if::hasDeviceDual, void>::type + deviceDownload(void* stream = nullptr, bool sync = true) + { + auto *data = reinterpret_cast(mBuffer.data()); + void *tmp = data->mGrid; + mBuffer.deviceDownload(stream, sync); + data->mGrid = tmp; + } +};// NodeManagerHandle + +/// @brief This class allows for sequential access to nodes in a NanoVDB tree +/// +/// @details Nodes are always arranged breadth first during sequential access of nodes +/// at a particular level. +template +class NodeManager : private NodeManagerData +{ + using DataT = NodeManagerData; + using GridT = NanoGrid; + using TreeT = typename GridTree::type; + template + using NodeT = typename NodeTrait::type; + using RootT = NodeT<3>;// root node + using Node2 = NodeT<2>;// upper internal node + using Node1 = NodeT<1>;// lower internal node + using Node0 = NodeT<0>;// leaf node + +public: + static constexpr bool FIXED_SIZE = Node0::FIXED_SIZE && Node1::FIXED_SIZE && Node2::FIXED_SIZE; + + NodeManager(const NodeManager&) = delete; + NodeManager(NodeManager&&) = delete; + NodeManager& operator=(const NodeManager&) = delete; + NodeManager& operator=(NodeManager&&) = delete; + ~NodeManager() = delete; + + /// @brief return true if the nodes have both fixed size and are arranged breadth-first in memory. + /// This allows for direct and memory-efficient linear access to nodes. 
+ __hostdev__ static bool isLinear(const GridT &grid) {return FIXED_SIZE && grid.isBreadthFirst();} + + /// @brief return true if the nodes have both fixed size and are arranged breadth-first in memory. + /// This allows for direct and memory-efficient linear access to nodes. + __hostdev__ bool isLinear() const {return DataT::mLinear!=0u;} + + /// @brief Return the memory footprint in bytes of the NodeManager derived from the specified grid + __hostdev__ static uint64_t memUsage(const GridT &grid) { + uint64_t size = sizeof(NodeManagerData); + if (!NodeManager::isLinear(grid)) { + const uint32_t *p = grid.tree().mNodeCount; + size += sizeof(int64_t)*(p[0]+p[1]+p[2]); + } + return size; + } + + /// @brief Return the memory footprint in bytes of this instance + __hostdev__ uint64_t memUsage() const {return NodeManager::memUsage(this->grid());} + + /// @brief Return a reference to the grid + __hostdev__ GridT& grid() { return *reinterpret_cast(DataT::mGrid); } + __hostdev__ const GridT& grid() const { return *reinterpret_cast(DataT::mGrid); } + + /// @brief Return a reference to the tree + __hostdev__ TreeT& tree() { return this->grid().tree(); } + __hostdev__ const TreeT& tree() const { return this->grid().tree(); } + + /// @brief Return a reference to the root + __hostdev__ RootT& root() { return this->tree().root(); } + __hostdev__ const RootT& root() const { return this->tree().root(); } + + /// @brief Return the number of tree nodes at the specified level + /// @details 0 is leaf, 1 is lower internal, and 2 is upper internal level + __hostdev__ uint64_t nodeCount(int level) const { return this->tree().nodeCount(level); } + + __hostdev__ uint64_t leafCount() const { return this->tree().nodeCount(0); } + __hostdev__ uint64_t lowerCount() const { return this->tree().nodeCount(1); } + __hostdev__ uint64_t upperCount() const { return this->tree().nodeCount(2); } + + /// @brief Return the i'th leaf node with respect to breadth-first ordering + template + __hostdev__ const NodeT& node(uint32_t i) const { + NANOVDB_ASSERT(i < this->nodeCount(LEVEL)); + const NodeT* ptr = nullptr; + if (DataT::mLinear) { + ptr = util::PtrAdd>(DataT::mGrid, DataT::mOff[LEVEL]) + i; + } else { + ptr = util::PtrAdd>(DataT::mGrid, DataT::mPtr[LEVEL][i]); + } + NANOVDB_ASSERT(ptr && isAligned(ptr)); + return *ptr; + } + + /// @brief Return the i'th node with respect to breadth-first ordering + template + __hostdev__ NodeT& node(uint32_t i) { + NANOVDB_ASSERT(i < this->nodeCount(LEVEL)); + NodeT* ptr = nullptr; + if (DataT::mLinear) { + ptr = util::PtrAdd>(DataT::mGrid, DataT::mOff[LEVEL]) + i; + } else { + ptr = util::PtrAdd>(DataT::mGrid, DataT::mPtr[LEVEL][i]); + } + NANOVDB_ASSERT(ptr && isAligned(ptr)); + return *ptr; + } + + /// @brief Return the i'th leaf node with respect to breadth-first ordering + __hostdev__ const Node0& leaf(uint32_t i) const { return this->node<0>(i); } + __hostdev__ Node0& leaf(uint32_t i) { return this->node<0>(i); } + + /// @brief Return the i'th lower internal node with respect to breadth-first ordering + __hostdev__ const Node1& lower(uint32_t i) const { return this->node<1>(i); } + __hostdev__ Node1& lower(uint32_t i) { return this->node<1>(i); } + + /// @brief Return the i'th upper internal node with respect to breadth-first ordering + __hostdev__ const Node2& upper(uint32_t i) const { return this->node<2>(i); } + __hostdev__ Node2& upper(uint32_t i) { return this->node<2>(i); } + +}; // NodeManager class + +template +NodeManagerHandle createNodeManager(const NanoGrid &grid, + const 
BufferT& buffer) +{ + NodeManagerHandle handle(toGridType(), BufferT::create(NodeManager::memUsage(grid), &buffer)); + auto *data = reinterpret_cast(handle.data()); + NANOVDB_ASSERT(data && isAligned(data)); + NANOVDB_ASSERT(toGridType() == grid.gridType()); +#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS + *data = NodeManagerData{NANOVDB_MAGIC_NODE, {0u}, (void*)&grid, {{0u,0u,0u}}}; +#else + *data = NodeManagerData{NANOVDB_MAGIC_NUMB, {0u}, (void*)&grid, {{0u,0u,0u}}}; +#endif + + if (NodeManager::isLinear(grid)) { + data->mLinear = uint8_t(1u); + data->mOff[0] = util::PtrDiff(grid.tree().template getFirstNode<0>(), &grid); + data->mOff[1] = util::PtrDiff(grid.tree().template getFirstNode<1>(), &grid); + data->mOff[2] = util::PtrDiff(grid.tree().template getFirstNode<2>(), &grid); + } else { + int64_t *ptr0 = data->mPtr[0] = reinterpret_cast(data + 1); + int64_t *ptr1 = data->mPtr[1] = data->mPtr[0] + grid.tree().nodeCount(0); + int64_t *ptr2 = data->mPtr[2] = data->mPtr[1] + grid.tree().nodeCount(1); + // Performs depth first traversal but breadth first insertion + for (auto it2 = grid.tree().root().cbeginChild(); it2; ++it2) { + *ptr2++ = util::PtrDiff(&*it2, &grid); + for (auto it1 = it2->cbeginChild(); it1; ++it1) { + *ptr1++ = util::PtrDiff(&*it1, &grid); + for (auto it0 = it1->cbeginChild(); it0; ++it0) { + *ptr0++ = util::PtrDiff(&*it0, &grid); + }// loop over child nodes of the lower internal node + }// loop over child nodes of the upper internal node + }// loop over child nodes of the root node + } + + return handle;// // is converted to r-value so return value is move constructed! +} + +} // namespace nanovdb + +#if defined(__CUDACC__) +#include +#endif// defined(__CUDACC__) + +#endif // NANOVDB_NODEMANAGER_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/PNanoVDB.h b/external/nanovdb/PNanoVDB.h new file mode 100644 index 00000000..3e7b306b --- /dev/null +++ b/external/nanovdb/PNanoVDB.h @@ -0,0 +1,3390 @@ + +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/PNanoVDB.h + + \author Andrew Reidmeyer + + \brief This file is a portable (e.g. pointer-less) C99/GLSL/HLSL port + of NanoVDB.h, which is compatible with most graphics APIs. 
+*/ + +#ifndef NANOVDB_PNANOVDB_H_HAS_BEEN_INCLUDED +#define NANOVDB_PNANOVDB_H_HAS_BEEN_INCLUDED + +// ------------------------------------------------ Configuration ----------------------------------------------------------- + +// platforms +//#define PNANOVDB_C +//#define PNANOVDB_HLSL +//#define PNANOVDB_GLSL + +// addressing mode +// PNANOVDB_ADDRESS_32 +// PNANOVDB_ADDRESS_64 +#if defined(PNANOVDB_C) +#ifndef PNANOVDB_ADDRESS_32 +#define PNANOVDB_ADDRESS_64 +#endif +#elif defined(PNANOVDB_HLSL) +#ifndef PNANOVDB_ADDRESS_64 +#define PNANOVDB_ADDRESS_32 +#endif +#elif defined(PNANOVDB_GLSL) +#ifndef PNANOVDB_ADDRESS_64 +#define PNANOVDB_ADDRESS_32 +#endif +#endif + +// bounds checking +//#define PNANOVDB_BUF_BOUNDS_CHECK + +// enable HDDA by default on HLSL/GLSL, make explicit on C +#if defined(PNANOVDB_C) +//#define PNANOVDB_HDDA +#ifdef PNANOVDB_HDDA +#ifndef PNANOVDB_CMATH +#define PNANOVDB_CMATH +#endif +#endif +#elif defined(PNANOVDB_HLSL) +#define PNANOVDB_HDDA +#elif defined(PNANOVDB_GLSL) +#define PNANOVDB_HDDA +#endif + +#ifdef PNANOVDB_CMATH +#ifndef __CUDACC_RTC__ +#include +#endif +#endif + +// ------------------------------------------------ Buffer ----------------------------------------------------------- + +#if defined(PNANOVDB_BUF_CUSTOM) +// NOP +#elif defined(PNANOVDB_C) +#define PNANOVDB_BUF_C +#elif defined(PNANOVDB_HLSL) +#define PNANOVDB_BUF_HLSL +#elif defined(PNANOVDB_GLSL) +#define PNANOVDB_BUF_GLSL +#endif + +#if defined(PNANOVDB_BUF_C) +#ifndef __CUDACC_RTC__ +#include +#endif +#if defined(__CUDACC__) +#define PNANOVDB_BUF_FORCE_INLINE static __host__ __device__ __forceinline__ +#elif defined(_WIN32) +#define PNANOVDB_BUF_FORCE_INLINE static inline __forceinline +#else +#define PNANOVDB_BUF_FORCE_INLINE static inline __attribute__((always_inline)) +#endif +typedef struct pnanovdb_buf_t +{ + uint32_t* data; +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + uint64_t size_in_words; +#endif +}pnanovdb_buf_t; +PNANOVDB_BUF_FORCE_INLINE pnanovdb_buf_t pnanovdb_make_buf(uint32_t* data, uint64_t size_in_words) +{ + pnanovdb_buf_t ret; + ret.data = data; +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + ret.size_in_words = size_in_words; +#endif + return ret; +} +#if defined(PNANOVDB_ADDRESS_32) +PNANOVDB_BUF_FORCE_INLINE uint32_t pnanovdb_buf_read_uint32(pnanovdb_buf_t buf, uint32_t byte_offset) +{ + uint32_t wordaddress = (byte_offset >> 2u); +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + return wordaddress < buf.size_in_words ? buf.data[wordaddress] : 0u; +#else + return buf.data[wordaddress]; +#endif +} +PNANOVDB_BUF_FORCE_INLINE uint64_t pnanovdb_buf_read_uint64(pnanovdb_buf_t buf, uint32_t byte_offset) +{ + uint64_t* data64 = (uint64_t*)buf.data; + uint32_t wordaddress64 = (byte_offset >> 3u); +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + uint64_t size_in_words64 = buf.size_in_words >> 1u; + return wordaddress64 < size_in_words64 ? 
data64[wordaddress64] : 0llu; +#else + return data64[wordaddress64]; +#endif +} +PNANOVDB_BUF_FORCE_INLINE void pnanovdb_buf_write_uint32(pnanovdb_buf_t buf, uint32_t byte_offset, uint32_t value) +{ + uint32_t wordaddress = (byte_offset >> 2u); +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + if (wordaddress < buf.size_in_words) + { + buf.data[wordaddress] = value; +} +#else + buf.data[wordaddress] = value; +#endif +} +PNANOVDB_BUF_FORCE_INLINE void pnanovdb_buf_write_uint64(pnanovdb_buf_t buf, uint32_t byte_offset, uint64_t value) +{ + uint64_t* data64 = (uint64_t*)buf.data; + uint32_t wordaddress64 = (byte_offset >> 3u); +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + uint64_t size_in_words64 = buf.size_in_words >> 1u; + if (wordaddress64 < size_in_words64) + { + data64[wordaddress64] = value; + } +#else + data64[wordaddress64] = value; +#endif +} +#elif defined(PNANOVDB_ADDRESS_64) +PNANOVDB_BUF_FORCE_INLINE uint32_t pnanovdb_buf_read_uint32(pnanovdb_buf_t buf, uint64_t byte_offset) +{ + uint64_t wordaddress = (byte_offset >> 2u); +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + return wordaddress < buf.size_in_words ? buf.data[wordaddress] : 0u; +#else + return buf.data[wordaddress]; +#endif +} +PNANOVDB_BUF_FORCE_INLINE uint64_t pnanovdb_buf_read_uint64(pnanovdb_buf_t buf, uint64_t byte_offset) +{ + uint64_t* data64 = (uint64_t*)buf.data; + uint64_t wordaddress64 = (byte_offset >> 3u); +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + uint64_t size_in_words64 = buf.size_in_words >> 1u; + return wordaddress64 < size_in_words64 ? data64[wordaddress64] : 0llu; +#else + return data64[wordaddress64]; +#endif +} +PNANOVDB_BUF_FORCE_INLINE void pnanovdb_buf_write_uint32(pnanovdb_buf_t buf, uint64_t byte_offset, uint32_t value) +{ + uint64_t wordaddress = (byte_offset >> 2u); +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + if (wordaddress < buf.size_in_words) + { + buf.data[wordaddress] = value; + } +#else + buf.data[wordaddress] = value; +#endif +} +PNANOVDB_BUF_FORCE_INLINE void pnanovdb_buf_write_uint64(pnanovdb_buf_t buf, uint64_t byte_offset, uint64_t value) +{ + uint64_t* data64 = (uint64_t*)buf.data; + uint64_t wordaddress64 = (byte_offset >> 3u); +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + uint64_t size_in_words64 = buf.size_in_words >> 1u; + if (wordaddress64 < size_in_words64) + { + data64[wordaddress64] = value; + } +#else + data64[wordaddress64] = value; +#endif +} +#endif +typedef uint32_t pnanovdb_grid_type_t; +#define PNANOVDB_GRID_TYPE_GET(grid_typeIn, nameIn) pnanovdb_grid_type_constants[grid_typeIn].nameIn +#elif defined(PNANOVDB_BUF_HLSL) +#if defined(PNANOVDB_ADDRESS_32) +#define pnanovdb_buf_t StructuredBuffer +uint pnanovdb_buf_read_uint32(pnanovdb_buf_t buf, uint byte_offset) +{ + return buf[(byte_offset >> 2u)]; +} +uint2 pnanovdb_buf_read_uint64(pnanovdb_buf_t buf, uint byte_offset) +{ + uint2 ret; + ret.x = pnanovdb_buf_read_uint32(buf, byte_offset + 0u); + ret.y = pnanovdb_buf_read_uint32(buf, byte_offset + 4u); + return ret; +} +void pnanovdb_buf_write_uint32(pnanovdb_buf_t buf, uint byte_offset, uint value) +{ + // NOP, by default no write in HLSL +} +void pnanovdb_buf_write_uint64(pnanovdb_buf_t buf, uint byte_offset, uint2 value) +{ + // NOP, by default no write in HLSL +} +#elif defined(PNANOVDB_ADDRESS_64) +#define pnanovdb_buf_t StructuredBuffer +uint pnanovdb_buf_read_uint32(pnanovdb_buf_t buf, uint64_t byte_offset) +{ + return buf[uint(byte_offset >> 2u)]; +} +uint64_t pnanovdb_buf_read_uint64(pnanovdb_buf_t buf, uint64_t byte_offset) +{ + uint64_t ret; + ret = pnanovdb_buf_read_uint32(buf, byte_offset + 0u); + ret = ret + 
(uint64_t(pnanovdb_buf_read_uint32(buf, byte_offset + 4u)) << 32u); + return ret; +} +void pnanovdb_buf_write_uint32(pnanovdb_buf_t buf, uint64_t byte_offset, uint value) +{ + // NOP, by default no write in HLSL +} +void pnanovdb_buf_write_uint64(pnanovdb_buf_t buf, uint64_t byte_offset, uint64_t value) +{ + // NOP, by default no write in HLSL +} +#endif +#define pnanovdb_grid_type_t uint +#define PNANOVDB_GRID_TYPE_GET(grid_typeIn, nameIn) pnanovdb_grid_type_constants[grid_typeIn].nameIn +#elif defined(PNANOVDB_BUF_GLSL) +struct pnanovdb_buf_t +{ + uint unused; // to satisfy min struct size? +}; +uint pnanovdb_buf_read_uint32(pnanovdb_buf_t buf, uint byte_offset) +{ + return pnanovdb_buf_data[(byte_offset >> 2u)]; +} +uvec2 pnanovdb_buf_read_uint64(pnanovdb_buf_t buf, uint byte_offset) +{ + uvec2 ret; + ret.x = pnanovdb_buf_read_uint32(buf, byte_offset + 0u); + ret.y = pnanovdb_buf_read_uint32(buf, byte_offset + 4u); + return ret; +} +void pnanovdb_buf_write_uint32(pnanovdb_buf_t buf, uint byte_offset, uint value) +{ + // NOP, by default no write in HLSL +} +void pnanovdb_buf_write_uint64(pnanovdb_buf_t buf, uint byte_offset, uvec2 value) +{ + // NOP, by default no write in HLSL +} +#define pnanovdb_grid_type_t uint +#define PNANOVDB_GRID_TYPE_GET(grid_typeIn, nameIn) pnanovdb_grid_type_constants[grid_typeIn].nameIn +#endif + +// ------------------------------------------------ Basic Types ----------------------------------------------------------- + +// force inline +#if defined(PNANOVDB_C) +#if defined(__CUDACC__) +#define PNANOVDB_FORCE_INLINE static __host__ __device__ __forceinline__ +#elif defined(_WIN32) +#define PNANOVDB_FORCE_INLINE static inline __forceinline +#else +#define PNANOVDB_FORCE_INLINE static inline __attribute__((always_inline)) +#endif +#elif defined(PNANOVDB_HLSL) +#define PNANOVDB_FORCE_INLINE +#elif defined(PNANOVDB_GLSL) +#define PNANOVDB_FORCE_INLINE +#endif + +// struct typedef, static const, inout +#if defined(PNANOVDB_C) +#define PNANOVDB_STRUCT_TYPEDEF(X) typedef struct X X; +#if defined(__CUDA_ARCH__) +#define PNANOVDB_STATIC_CONST constexpr __constant__ +#else +#define PNANOVDB_STATIC_CONST static const +#endif +#define PNANOVDB_INOUT(X) X* +#define PNANOVDB_IN(X) const X* +#define PNANOVDB_DEREF(X) (*X) +#define PNANOVDB_REF(X) &X +#elif defined(PNANOVDB_HLSL) +#define PNANOVDB_STRUCT_TYPEDEF(X) +#define PNANOVDB_STATIC_CONST static const +#define PNANOVDB_INOUT(X) inout X +#define PNANOVDB_IN(X) X +#define PNANOVDB_DEREF(X) X +#define PNANOVDB_REF(X) X +#elif defined(PNANOVDB_GLSL) +#define PNANOVDB_STRUCT_TYPEDEF(X) +#define PNANOVDB_STATIC_CONST const +#define PNANOVDB_INOUT(X) inout X +#define PNANOVDB_IN(X) X +#define PNANOVDB_DEREF(X) X +#define PNANOVDB_REF(X) X +#endif + +// basic types, type conversion +#if defined(PNANOVDB_C) +#define PNANOVDB_NATIVE_64 +#ifndef __CUDACC_RTC__ +#include +#endif +#if !defined(PNANOVDB_MEMCPY_CUSTOM) +#ifndef __CUDACC_RTC__ +#include +#endif +#define pnanovdb_memcpy memcpy +#endif +typedef uint32_t pnanovdb_uint32_t; +typedef int32_t pnanovdb_int32_t; +typedef int32_t pnanovdb_bool_t; +#define PNANOVDB_FALSE 0 +#define PNANOVDB_TRUE 1 +typedef uint64_t pnanovdb_uint64_t; +typedef int64_t pnanovdb_int64_t; +typedef struct pnanovdb_coord_t +{ + pnanovdb_int32_t x, y, z; +}pnanovdb_coord_t; +typedef struct pnanovdb_vec3_t +{ + float x, y, z; +}pnanovdb_vec3_t; +PNANOVDB_FORCE_INLINE pnanovdb_int32_t pnanovdb_uint32_as_int32(pnanovdb_uint32_t v) { return (pnanovdb_int32_t)v; } +PNANOVDB_FORCE_INLINE pnanovdb_int64_t 
pnanovdb_uint64_as_int64(pnanovdb_uint64_t v) { return (pnanovdb_int64_t)v; } +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_int64_as_uint64(pnanovdb_int64_t v) { return (pnanovdb_uint64_t)v; } +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_int32_as_uint32(pnanovdb_int32_t v) { return (pnanovdb_uint32_t)v; } +PNANOVDB_FORCE_INLINE float pnanovdb_uint32_as_float(pnanovdb_uint32_t v) { float vf; pnanovdb_memcpy(&vf, &v, sizeof(vf)); return vf; } +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_float_as_uint32(float v) { return *((pnanovdb_uint32_t*)(&v)); } +PNANOVDB_FORCE_INLINE double pnanovdb_uint64_as_double(pnanovdb_uint64_t v) { double vf; pnanovdb_memcpy(&vf, &v, sizeof(vf)); return vf; } +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_double_as_uint64(double v) { return *((pnanovdb_uint64_t*)(&v)); } +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_uint64_low(pnanovdb_uint64_t v) { return (pnanovdb_uint32_t)v; } +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_uint64_high(pnanovdb_uint64_t v) { return (pnanovdb_uint32_t)(v >> 32u); } +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint32_as_uint64(pnanovdb_uint32_t x, pnanovdb_uint32_t y) { return ((pnanovdb_uint64_t)x) | (((pnanovdb_uint64_t)y) << 32u); } +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint32_as_uint64_low(pnanovdb_uint32_t x) { return ((pnanovdb_uint64_t)x); } +PNANOVDB_FORCE_INLINE pnanovdb_int32_t pnanovdb_uint64_is_equal(pnanovdb_uint64_t a, pnanovdb_uint64_t b) { return a == b; } +PNANOVDB_FORCE_INLINE pnanovdb_int32_t pnanovdb_int64_is_zero(pnanovdb_int64_t a) { return a == 0; } +#ifdef PNANOVDB_CMATH +PNANOVDB_FORCE_INLINE float pnanovdb_floor(float v) { return floorf(v); } +#endif +PNANOVDB_FORCE_INLINE pnanovdb_int32_t pnanovdb_float_to_int32(float v) { return (pnanovdb_int32_t)v; } +PNANOVDB_FORCE_INLINE float pnanovdb_int32_to_float(pnanovdb_int32_t v) { return (float)v; } +PNANOVDB_FORCE_INLINE float pnanovdb_uint32_to_float(pnanovdb_uint32_t v) { return (float)v; } +PNANOVDB_FORCE_INLINE float pnanovdb_min(float a, float b) { return a < b ? a : b; } +PNANOVDB_FORCE_INLINE float pnanovdb_max(float a, float b) { return a > b ? 
a : b; } +#elif defined(PNANOVDB_HLSL) +typedef uint pnanovdb_uint32_t; +typedef int pnanovdb_int32_t; +typedef bool pnanovdb_bool_t; +#define PNANOVDB_FALSE false +#define PNANOVDB_TRUE true +typedef int3 pnanovdb_coord_t; +typedef float3 pnanovdb_vec3_t; +pnanovdb_int32_t pnanovdb_uint32_as_int32(pnanovdb_uint32_t v) { return int(v); } +pnanovdb_uint32_t pnanovdb_int32_as_uint32(pnanovdb_int32_t v) { return uint(v); } +float pnanovdb_uint32_as_float(pnanovdb_uint32_t v) { return asfloat(v); } +pnanovdb_uint32_t pnanovdb_float_as_uint32(float v) { return asuint(v); } +float pnanovdb_floor(float v) { return floor(v); } +pnanovdb_int32_t pnanovdb_float_to_int32(float v) { return int(v); } +float pnanovdb_int32_to_float(pnanovdb_int32_t v) { return float(v); } +float pnanovdb_uint32_to_float(pnanovdb_uint32_t v) { return float(v); } +float pnanovdb_min(float a, float b) { return min(a, b); } +float pnanovdb_max(float a, float b) { return max(a, b); } +#if defined(PNANOVDB_ADDRESS_32) +typedef uint2 pnanovdb_uint64_t; +typedef int2 pnanovdb_int64_t; +pnanovdb_int64_t pnanovdb_uint64_as_int64(pnanovdb_uint64_t v) { return int2(v); } +pnanovdb_uint64_t pnanovdb_int64_as_uint64(pnanovdb_int64_t v) { return uint2(v); } +double pnanovdb_uint64_as_double(pnanovdb_uint64_t v) { return asdouble(v.x, v.y); } +pnanovdb_uint64_t pnanovdb_double_as_uint64(double v) { uint2 ret; asuint(v, ret.x, ret.y); return ret; } +pnanovdb_uint32_t pnanovdb_uint64_low(pnanovdb_uint64_t v) { return v.x; } +pnanovdb_uint32_t pnanovdb_uint64_high(pnanovdb_uint64_t v) { return v.y; } +pnanovdb_uint64_t pnanovdb_uint32_as_uint64(pnanovdb_uint32_t x, pnanovdb_uint32_t y) { return uint2(x, y); } +pnanovdb_uint64_t pnanovdb_uint32_as_uint64_low(pnanovdb_uint32_t x) { return uint2(x, 0); } +bool pnanovdb_uint64_is_equal(pnanovdb_uint64_t a, pnanovdb_uint64_t b) { return (a.x == b.x) && (a.y == b.y); } +bool pnanovdb_int64_is_zero(pnanovdb_int64_t a) { return a.x == 0 && a.y == 0; } +#else +typedef uint64_t pnanovdb_uint64_t; +typedef int64_t pnanovdb_int64_t; +pnanovdb_int64_t pnanovdb_uint64_as_int64(pnanovdb_uint64_t v) { return int64_t(v); } +pnanovdb_uint64_t pnanovdb_int64_as_uint64(pnanovdb_int64_t v) { return uint64_t(v); } +double pnanovdb_uint64_as_double(pnanovdb_uint64_t v) { return asdouble(uint(v), uint(v >> 32u)); } +pnanovdb_uint64_t pnanovdb_double_as_uint64(double v) { uint2 ret; asuint(v, ret.x, ret.y); return uint64_t(ret.x) + (uint64_t(ret.y) << 32u); } +pnanovdb_uint32_t pnanovdb_uint64_low(pnanovdb_uint64_t v) { return uint(v); } +pnanovdb_uint32_t pnanovdb_uint64_high(pnanovdb_uint64_t v) { return uint(v >> 32u); } +pnanovdb_uint64_t pnanovdb_uint32_as_uint64(pnanovdb_uint32_t x, pnanovdb_uint32_t y) { return uint64_t(x) + (uint64_t(y) << 32u); } +pnanovdb_uint64_t pnanovdb_uint32_as_uint64_low(pnanovdb_uint32_t x) { return uint64_t(x); } +bool pnanovdb_uint64_is_equal(pnanovdb_uint64_t a, pnanovdb_uint64_t b) { return a == b; } +bool pnanovdb_int64_is_zero(pnanovdb_int64_t a) { return a == 0; } +#endif +#elif defined(PNANOVDB_GLSL) +#define pnanovdb_uint32_t uint +#define pnanovdb_int32_t int +#define pnanovdb_bool_t bool +#define PNANOVDB_FALSE false +#define PNANOVDB_TRUE true +#define pnanovdb_uint64_t uvec2 +#define pnanovdb_int64_t ivec2 +#define pnanovdb_coord_t ivec3 +#define pnanovdb_vec3_t vec3 +pnanovdb_int32_t pnanovdb_uint32_as_int32(pnanovdb_uint32_t v) { return int(v); } +pnanovdb_int64_t pnanovdb_uint64_as_int64(pnanovdb_uint64_t v) { return ivec2(v); } +pnanovdb_uint64_t 
pnanovdb_int64_as_uint64(pnanovdb_int64_t v) { return uvec2(v); } +pnanovdb_uint32_t pnanovdb_int32_as_uint32(pnanovdb_int32_t v) { return uint(v); } +float pnanovdb_uint32_as_float(pnanovdb_uint32_t v) { return uintBitsToFloat(v); } +pnanovdb_uint32_t pnanovdb_float_as_uint32(float v) { return floatBitsToUint(v); } +double pnanovdb_uint64_as_double(pnanovdb_uint64_t v) { return packDouble2x32(uvec2(v.x, v.y)); } +pnanovdb_uint64_t pnanovdb_double_as_uint64(double v) { return unpackDouble2x32(v); } +pnanovdb_uint32_t pnanovdb_uint64_low(pnanovdb_uint64_t v) { return v.x; } +pnanovdb_uint32_t pnanovdb_uint64_high(pnanovdb_uint64_t v) { return v.y; } +pnanovdb_uint64_t pnanovdb_uint32_as_uint64(pnanovdb_uint32_t x, pnanovdb_uint32_t y) { return uvec2(x, y); } +pnanovdb_uint64_t pnanovdb_uint32_as_uint64_low(pnanovdb_uint32_t x) { return uvec2(x, 0); } +bool pnanovdb_uint64_is_equal(pnanovdb_uint64_t a, pnanovdb_uint64_t b) { return (a.x == b.x) && (a.y == b.y); } +bool pnanovdb_int64_is_zero(pnanovdb_int64_t a) { return a.x == 0 && a.y == 0; } +float pnanovdb_floor(float v) { return floor(v); } +pnanovdb_int32_t pnanovdb_float_to_int32(float v) { return int(v); } +float pnanovdb_int32_to_float(pnanovdb_int32_t v) { return float(v); } +float pnanovdb_uint32_to_float(pnanovdb_uint32_t v) { return float(v); } +float pnanovdb_min(float a, float b) { return min(a, b); } +float pnanovdb_max(float a, float b) { return max(a, b); } +#endif + +// ------------------------------------------------ Coord/Vec3 Utilties ----------------------------------------------------------- + +#if defined(PNANOVDB_C) +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_vec3_uniform(float a) +{ + pnanovdb_vec3_t v; + v.x = a; + v.y = a; + v.z = a; + return v; +} +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_vec3_add(const pnanovdb_vec3_t a, const pnanovdb_vec3_t b) +{ + pnanovdb_vec3_t v; + v.x = a.x + b.x; + v.y = a.y + b.y; + v.z = a.z + b.z; + return v; +} +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_vec3_sub(const pnanovdb_vec3_t a, const pnanovdb_vec3_t b) +{ + pnanovdb_vec3_t v; + v.x = a.x - b.x; + v.y = a.y - b.y; + v.z = a.z - b.z; + return v; +} +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_vec3_mul(const pnanovdb_vec3_t a, const pnanovdb_vec3_t b) +{ + pnanovdb_vec3_t v; + v.x = a.x * b.x; + v.y = a.y * b.y; + v.z = a.z * b.z; + return v; +} +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_vec3_div(const pnanovdb_vec3_t a, const pnanovdb_vec3_t b) +{ + pnanovdb_vec3_t v; + v.x = a.x / b.x; + v.y = a.y / b.y; + v.z = a.z / b.z; + return v; +} +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_vec3_min(const pnanovdb_vec3_t a, const pnanovdb_vec3_t b) +{ + pnanovdb_vec3_t v; + v.x = a.x < b.x ? a.x : b.x; + v.y = a.y < b.y ? a.y : b.y; + v.z = a.z < b.z ? a.z : b.z; + return v; +} +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_vec3_max(const pnanovdb_vec3_t a, const pnanovdb_vec3_t b) +{ + pnanovdb_vec3_t v; + v.x = a.x > b.x ? a.x : b.x; + v.y = a.y > b.y ? a.y : b.y; + v.z = a.z > b.z ? 
a.z : b.z; + return v; +} +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_coord_to_vec3(const pnanovdb_coord_t coord) +{ + pnanovdb_vec3_t v; + v.x = pnanovdb_int32_to_float(coord.x); + v.y = pnanovdb_int32_to_float(coord.y); + v.z = pnanovdb_int32_to_float(coord.z); + return v; +} +PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_coord_uniform(const pnanovdb_int32_t a) +{ + pnanovdb_coord_t v; + v.x = a; + v.y = a; + v.z = a; + return v; +} +PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_coord_add(pnanovdb_coord_t a, pnanovdb_coord_t b) +{ + pnanovdb_coord_t v; + v.x = a.x + b.x; + v.y = a.y + b.y; + v.z = a.z + b.z; + return v; +} +#elif defined(PNANOVDB_HLSL) +pnanovdb_vec3_t pnanovdb_vec3_uniform(float a) { return float3(a, a, a); } +pnanovdb_vec3_t pnanovdb_vec3_add(pnanovdb_vec3_t a, pnanovdb_vec3_t b) { return a + b; } +pnanovdb_vec3_t pnanovdb_vec3_sub(pnanovdb_vec3_t a, pnanovdb_vec3_t b) { return a - b; } +pnanovdb_vec3_t pnanovdb_vec3_mul(pnanovdb_vec3_t a, pnanovdb_vec3_t b) { return a * b; } +pnanovdb_vec3_t pnanovdb_vec3_div(pnanovdb_vec3_t a, pnanovdb_vec3_t b) { return a / b; } +pnanovdb_vec3_t pnanovdb_vec3_min(pnanovdb_vec3_t a, pnanovdb_vec3_t b) { return min(a, b); } +pnanovdb_vec3_t pnanovdb_vec3_max(pnanovdb_vec3_t a, pnanovdb_vec3_t b) { return max(a, b); } +pnanovdb_vec3_t pnanovdb_coord_to_vec3(pnanovdb_coord_t coord) { return float3(coord); } +pnanovdb_coord_t pnanovdb_coord_uniform(pnanovdb_int32_t a) { return int3(a, a, a); } +pnanovdb_coord_t pnanovdb_coord_add(pnanovdb_coord_t a, pnanovdb_coord_t b) { return a + b; } +#elif defined(PNANOVDB_GLSL) +pnanovdb_vec3_t pnanovdb_vec3_uniform(float a) { return vec3(a, a, a); } +pnanovdb_vec3_t pnanovdb_vec3_add(pnanovdb_vec3_t a, pnanovdb_vec3_t b) { return a + b; } +pnanovdb_vec3_t pnanovdb_vec3_sub(pnanovdb_vec3_t a, pnanovdb_vec3_t b) { return a - b; } +pnanovdb_vec3_t pnanovdb_vec3_mul(pnanovdb_vec3_t a, pnanovdb_vec3_t b) { return a * b; } +pnanovdb_vec3_t pnanovdb_vec3_div(pnanovdb_vec3_t a, pnanovdb_vec3_t b) { return a / b; } +pnanovdb_vec3_t pnanovdb_vec3_min(pnanovdb_vec3_t a, pnanovdb_vec3_t b) { return min(a, b); } +pnanovdb_vec3_t pnanovdb_vec3_max(pnanovdb_vec3_t a, pnanovdb_vec3_t b) { return max(a, b); } +pnanovdb_vec3_t pnanovdb_coord_to_vec3(const pnanovdb_coord_t coord) { return vec3(coord); } +pnanovdb_coord_t pnanovdb_coord_uniform(pnanovdb_int32_t a) { return ivec3(a, a, a); } +pnanovdb_coord_t pnanovdb_coord_add(pnanovdb_coord_t a, pnanovdb_coord_t b) { return a + b; } +#endif + +// ------------------------------------------------ Uint64 Utils ----------------------------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_uint32_countbits(pnanovdb_uint32_t value) +{ +#if defined(PNANOVDB_C) +#if defined(_MSC_VER) && (_MSC_VER >= 1928) && defined(PNANOVDB_USE_INTRINSICS) + return __popcnt(value); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(PNANOVDB_USE_INTRINSICS) + return __builtin_popcount(value); +#else + value = value - ((value >> 1) & 0x55555555); + value = (value & 0x33333333) + ((value >> 2) & 0x33333333); + value = (value + (value >> 4)) & 0x0F0F0F0F; + return (value * 0x01010101) >> 24; +#endif +#elif defined(PNANOVDB_HLSL) + return countbits(value); +#elif defined(PNANOVDB_GLSL) + return bitCount(value); +#endif +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_uint64_countbits(pnanovdb_uint64_t value) +{ + return pnanovdb_uint32_countbits(pnanovdb_uint64_low(value)) + pnanovdb_uint32_countbits(pnanovdb_uint64_high(value)); +} + +#if 
defined(PNANOVDB_ADDRESS_32) +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_offset(pnanovdb_uint64_t a, pnanovdb_uint32_t b) +{ + pnanovdb_uint32_t low = pnanovdb_uint64_low(a); + pnanovdb_uint32_t high = pnanovdb_uint64_high(a); + low += b; + if (low < b) + { + high += 1u; + } + return pnanovdb_uint32_as_uint64(low, high); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_dec(pnanovdb_uint64_t a) +{ + pnanovdb_uint32_t low = pnanovdb_uint64_low(a); + pnanovdb_uint32_t high = pnanovdb_uint64_high(a); + if (low == 0u) + { + high -= 1u; + } + low -= 1u; + return pnanovdb_uint32_as_uint64(low, high); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_uint64_to_uint32_lsr(pnanovdb_uint64_t a, pnanovdb_uint32_t b) +{ + pnanovdb_uint32_t low = pnanovdb_uint64_low(a); + pnanovdb_uint32_t high = pnanovdb_uint64_high(a); + return (b >= 32u) ? + (high >> (b - 32)) : + ((low >> b) | ((b > 0) ? (high << (32u - b)) : 0u)); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_bit_mask(pnanovdb_uint32_t bit_idx) +{ + pnanovdb_uint32_t mask_low = bit_idx < 32u ? 1u << bit_idx : 0u; + pnanovdb_uint32_t mask_high = bit_idx >= 32u ? 1u << (bit_idx - 32u) : 0u; + return pnanovdb_uint32_as_uint64(mask_low, mask_high); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_and(pnanovdb_uint64_t a, pnanovdb_uint64_t b) +{ + return pnanovdb_uint32_as_uint64( + pnanovdb_uint64_low(a) & pnanovdb_uint64_low(b), + pnanovdb_uint64_high(a) & pnanovdb_uint64_high(b) + ); +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_uint64_any_bit(pnanovdb_uint64_t a) +{ + return pnanovdb_uint64_low(a) != 0u || pnanovdb_uint64_high(a) != 0u; +} + +#else +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_offset(pnanovdb_uint64_t a, pnanovdb_uint32_t b) +{ + return a + b; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_dec(pnanovdb_uint64_t a) +{ + return a - 1u; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_uint64_to_uint32_lsr(pnanovdb_uint64_t a, pnanovdb_uint32_t b) +{ + return pnanovdb_uint64_low(a >> b); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_bit_mask(pnanovdb_uint32_t bit_idx) +{ + return 1llu << bit_idx; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_and(pnanovdb_uint64_t a, pnanovdb_uint64_t b) +{ + return a & b; +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_uint64_any_bit(pnanovdb_uint64_t a) +{ + return a != 0llu; +} +#endif + +// ------------------------------------------------ Address Type ----------------------------------------------------------- + +#if defined(PNANOVDB_ADDRESS_32) +struct pnanovdb_address_t +{ + pnanovdb_uint32_t byte_offset; +}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_address_t) + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset(pnanovdb_address_t address, pnanovdb_uint32_t byte_offset) +{ + pnanovdb_address_t ret = address; + ret.byte_offset += byte_offset; + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset_neg(pnanovdb_address_t address, pnanovdb_uint32_t byte_offset) +{ + pnanovdb_address_t ret = address; + ret.byte_offset -= byte_offset; + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset_product(pnanovdb_address_t address, pnanovdb_uint32_t byte_offset, pnanovdb_uint32_t multiplier) +{ + pnanovdb_address_t ret = address; + ret.byte_offset += byte_offset * multiplier; + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset64(pnanovdb_address_t address, pnanovdb_uint64_t 
byte_offset) +{ + pnanovdb_address_t ret = address; + // lose high bits on 32-bit + ret.byte_offset += pnanovdb_uint64_low(byte_offset); + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset64_product(pnanovdb_address_t address, pnanovdb_uint64_t byte_offset, pnanovdb_uint32_t multiplier) +{ + pnanovdb_address_t ret = address; + ret.byte_offset += pnanovdb_uint64_low(byte_offset) * multiplier; + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_address_mask(pnanovdb_address_t address, pnanovdb_uint32_t mask) +{ + return address.byte_offset & mask; +} +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_mask_inv(pnanovdb_address_t address, pnanovdb_uint32_t mask) +{ + pnanovdb_address_t ret = address; + ret.byte_offset &= (~mask); + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_null() +{ + pnanovdb_address_t ret = { 0 }; + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_address_is_null(pnanovdb_address_t address) +{ + return address.byte_offset == 0u; +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_address_in_interval(pnanovdb_address_t address, pnanovdb_address_t min_address, pnanovdb_address_t max_address) +{ + return address.byte_offset >= min_address.byte_offset && address.byte_offset < max_address.byte_offset; +} +#elif defined(PNANOVDB_ADDRESS_64) +struct pnanovdb_address_t +{ + pnanovdb_uint64_t byte_offset; +}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_address_t) + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset(pnanovdb_address_t address, pnanovdb_uint32_t byte_offset) +{ + pnanovdb_address_t ret = address; + ret.byte_offset += byte_offset; + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset_neg(pnanovdb_address_t address, pnanovdb_uint32_t byte_offset) +{ + pnanovdb_address_t ret = address; + ret.byte_offset -= byte_offset; + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset_product(pnanovdb_address_t address, pnanovdb_uint32_t byte_offset, pnanovdb_uint32_t multiplier) +{ + pnanovdb_address_t ret = address; + ret.byte_offset += pnanovdb_uint32_as_uint64_low(byte_offset) * pnanovdb_uint32_as_uint64_low(multiplier); + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset64(pnanovdb_address_t address, pnanovdb_uint64_t byte_offset) +{ + pnanovdb_address_t ret = address; + ret.byte_offset += byte_offset; + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset64_product(pnanovdb_address_t address, pnanovdb_uint64_t byte_offset, pnanovdb_uint32_t multiplier) +{ + pnanovdb_address_t ret = address; + ret.byte_offset += byte_offset * pnanovdb_uint32_as_uint64_low(multiplier); + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_address_mask(pnanovdb_address_t address, pnanovdb_uint32_t mask) +{ + return pnanovdb_uint64_low(address.byte_offset) & mask; +} +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_mask_inv(pnanovdb_address_t address, pnanovdb_uint32_t mask) +{ + pnanovdb_address_t ret = address; + ret.byte_offset &= (~pnanovdb_uint32_as_uint64_low(mask)); + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_null() +{ + pnanovdb_address_t ret = { 0 }; + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_address_is_null(pnanovdb_address_t address) +{ + return address.byte_offset == 0llu; +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_address_in_interval(pnanovdb_address_t address, pnanovdb_address_t 
min_address, pnanovdb_address_t max_address) +{ + return address.byte_offset >= min_address.byte_offset && address.byte_offset < max_address.byte_offset; +} +#endif + +// ------------------------------------------------ High Level Buffer Read ----------------------------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_read_uint32(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + return pnanovdb_buf_read_uint32(buf, address.byte_offset); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_read_uint64(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + return pnanovdb_buf_read_uint64(buf, address.byte_offset); +} +PNANOVDB_FORCE_INLINE pnanovdb_int32_t pnanovdb_read_int32(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + return pnanovdb_uint32_as_int32(pnanovdb_read_uint32(buf, address)); +} +PNANOVDB_FORCE_INLINE float pnanovdb_read_float(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + return pnanovdb_uint32_as_float(pnanovdb_read_uint32(buf, address)); +} +PNANOVDB_FORCE_INLINE pnanovdb_int64_t pnanovdb_read_int64(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + return pnanovdb_uint64_as_int64(pnanovdb_read_uint64(buf, address)); +} +PNANOVDB_FORCE_INLINE double pnanovdb_read_double(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + return pnanovdb_uint64_as_double(pnanovdb_read_uint64(buf, address)); +} +PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_read_coord(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + pnanovdb_coord_t ret; + ret.x = pnanovdb_uint32_as_int32(pnanovdb_read_uint32(buf, pnanovdb_address_offset(address, 0u))); + ret.y = pnanovdb_uint32_as_int32(pnanovdb_read_uint32(buf, pnanovdb_address_offset(address, 4u))); + ret.z = pnanovdb_uint32_as_int32(pnanovdb_read_uint32(buf, pnanovdb_address_offset(address, 8u))); + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_read_vec3(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + pnanovdb_vec3_t ret; + ret.x = pnanovdb_read_float(buf, pnanovdb_address_offset(address, 0u)); + ret.y = pnanovdb_read_float(buf, pnanovdb_address_offset(address, 4u)); + ret.z = pnanovdb_read_float(buf, pnanovdb_address_offset(address, 8u)); + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_read_uint16(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + pnanovdb_uint32_t raw = pnanovdb_read_uint32(buf, pnanovdb_address_mask_inv(address, 3u)); + return (raw >> (pnanovdb_address_mask(address, 2) << 3)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_read_uint8(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + pnanovdb_uint32_t raw = pnanovdb_read_uint32(buf, pnanovdb_address_mask_inv(address, 3u)); + return (raw >> (pnanovdb_address_mask(address, 3) << 3)) & 255; +} +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_read_vec3u16(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + pnanovdb_vec3_t ret; + const float scale = 1.f / 65535.f; + ret.x = scale * pnanovdb_uint32_to_float(pnanovdb_read_uint16(buf, pnanovdb_address_offset(address, 0u))) - 0.5f; + ret.y = scale * pnanovdb_uint32_to_float(pnanovdb_read_uint16(buf, pnanovdb_address_offset(address, 2u))) - 0.5f; + ret.z = scale * pnanovdb_uint32_to_float(pnanovdb_read_uint16(buf, pnanovdb_address_offset(address, 4u))) - 0.5f; + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_read_vec3u8(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + pnanovdb_vec3_t ret; + const float scale = 1.f / 255.f; + ret.x = scale * pnanovdb_uint32_to_float(pnanovdb_read_uint8(buf, 
pnanovdb_address_offset(address, 0u))) - 0.5f; + ret.y = scale * pnanovdb_uint32_to_float(pnanovdb_read_uint8(buf, pnanovdb_address_offset(address, 1u))) - 0.5f; + ret.z = scale * pnanovdb_uint32_to_float(pnanovdb_read_uint8(buf, pnanovdb_address_offset(address, 2u))) - 0.5f; + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_read_bit(pnanovdb_buf_t buf, pnanovdb_address_t address, pnanovdb_uint32_t bit_offset) +{ + pnanovdb_address_t word_address = pnanovdb_address_mask_inv(address, 3u); + pnanovdb_uint32_t bit_index = (pnanovdb_address_mask(address, 3u) << 3u) + bit_offset; + pnanovdb_uint32_t value_word = pnanovdb_buf_read_uint32(buf, word_address.byte_offset); + return ((value_word >> bit_index) & 1) != 0u; +} + +#if defined(PNANOVDB_C) +PNANOVDB_FORCE_INLINE short pnanovdb_read_half(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + pnanovdb_uint32_t raw = pnanovdb_read_uint32(buf, address); + return (short)(raw >> (pnanovdb_address_mask(address, 2) << 3)); +} +#elif defined(PNANOVDB_HLSL) +PNANOVDB_FORCE_INLINE float pnanovdb_read_half(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + pnanovdb_uint32_t raw = pnanovdb_read_uint32(buf, address); + return f16tof32(raw >> (pnanovdb_address_mask(address, 2) << 3)); +} +#elif defined(PNANOVDB_GLSL) +PNANOVDB_FORCE_INLINE float pnanovdb_read_half(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + pnanovdb_uint32_t raw = pnanovdb_read_uint32(buf, address); + return unpackHalf2x16(raw >> (pnanovdb_address_mask(address, 2) << 3)).x; +} +#endif + +// ------------------------------------------------ High Level Buffer Write ----------------------------------------------------------- + +PNANOVDB_FORCE_INLINE void pnanovdb_write_uint32(pnanovdb_buf_t buf, pnanovdb_address_t address, pnanovdb_uint32_t value) +{ + pnanovdb_buf_write_uint32(buf, address.byte_offset, value); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_uint64(pnanovdb_buf_t buf, pnanovdb_address_t address, pnanovdb_uint64_t value) +{ + pnanovdb_buf_write_uint64(buf, address.byte_offset, value); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_int32(pnanovdb_buf_t buf, pnanovdb_address_t address, pnanovdb_int32_t value) +{ + pnanovdb_write_uint32(buf, address, pnanovdb_int32_as_uint32(value)); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_int64(pnanovdb_buf_t buf, pnanovdb_address_t address, pnanovdb_int64_t value) +{ + pnanovdb_buf_write_uint64(buf, address.byte_offset, pnanovdb_int64_as_uint64(value)); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_float(pnanovdb_buf_t buf, pnanovdb_address_t address, float value) +{ + pnanovdb_write_uint32(buf, address, pnanovdb_float_as_uint32(value)); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_double(pnanovdb_buf_t buf, pnanovdb_address_t address, double value) +{ + pnanovdb_write_uint64(buf, address, pnanovdb_double_as_uint64(value)); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_coord(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) value) +{ + pnanovdb_write_uint32(buf, pnanovdb_address_offset(address, 0u), pnanovdb_int32_as_uint32(PNANOVDB_DEREF(value).x)); + pnanovdb_write_uint32(buf, pnanovdb_address_offset(address, 4u), pnanovdb_int32_as_uint32(PNANOVDB_DEREF(value).y)); + pnanovdb_write_uint32(buf, pnanovdb_address_offset(address, 8u), pnanovdb_int32_as_uint32(PNANOVDB_DEREF(value).z)); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_vec3(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_vec3_t) value) +{ + pnanovdb_write_float(buf, pnanovdb_address_offset(address, 0u), 
PNANOVDB_DEREF(value).x); + pnanovdb_write_float(buf, pnanovdb_address_offset(address, 4u), PNANOVDB_DEREF(value).y); + pnanovdb_write_float(buf, pnanovdb_address_offset(address, 8u), PNANOVDB_DEREF(value).z); +} + +// ------------------------------------------------ Core Structures ----------------------------------------------------------- + +#define PNANOVDB_MAGIC_NUMBER 0x304244566f6e614eUL// "NanoVDB0" in hex - little endian (uint64_t) +#define PNANOVDB_MAGIC_GRID 0x314244566f6e614eUL// "NanoVDB1" in hex - little endian (uint64_t) +#define PNANOVDB_MAGIC_FILE 0x324244566f6e614eUL// "NanoVDB2" in hex - little endian (uint64_t) + +#define PNANOVDB_MAJOR_VERSION_NUMBER 32// reflects changes to the ABI +#define PNANOVDB_MINOR_VERSION_NUMBER 7// reflects changes to the API but not ABI +#define PNANOVDB_PATCH_VERSION_NUMBER 0// reflects bug-fixes with no ABI or API changes + +#define PNANOVDB_GRID_TYPE_UNKNOWN 0 +#define PNANOVDB_GRID_TYPE_FLOAT 1 +#define PNANOVDB_GRID_TYPE_DOUBLE 2 +#define PNANOVDB_GRID_TYPE_INT16 3 +#define PNANOVDB_GRID_TYPE_INT32 4 +#define PNANOVDB_GRID_TYPE_INT64 5 +#define PNANOVDB_GRID_TYPE_VEC3F 6 +#define PNANOVDB_GRID_TYPE_VEC3D 7 +#define PNANOVDB_GRID_TYPE_MASK 8 +#define PNANOVDB_GRID_TYPE_HALF 9 +#define PNANOVDB_GRID_TYPE_UINT32 10 +#define PNANOVDB_GRID_TYPE_BOOLEAN 11 +#define PNANOVDB_GRID_TYPE_RGBA8 12 +#define PNANOVDB_GRID_TYPE_FP4 13 +#define PNANOVDB_GRID_TYPE_FP8 14 +#define PNANOVDB_GRID_TYPE_FP16 15 +#define PNANOVDB_GRID_TYPE_FPN 16 +#define PNANOVDB_GRID_TYPE_VEC4F 17 +#define PNANOVDB_GRID_TYPE_VEC4D 18 +#define PNANOVDB_GRID_TYPE_INDEX 19 +#define PNANOVDB_GRID_TYPE_ONINDEX 20 +#define PNANOVDB_GRID_TYPE_INDEXMASK 21 +#define PNANOVDB_GRID_TYPE_ONINDEXMASK 22 +#define PNANOVDB_GRID_TYPE_POINTINDEX 23 +#define PNANOVDB_GRID_TYPE_VEC3U8 24 +#define PNANOVDB_GRID_TYPE_VEC3U16 25 +#define PNANOVDB_GRID_TYPE_UINT8 26 +#define PNANOVDB_GRID_TYPE_END 27 + +#define PNANOVDB_GRID_CLASS_UNKNOWN 0 +#define PNANOVDB_GRID_CLASS_LEVEL_SET 1 // narrow band level set, e.g. SDF +#define PNANOVDB_GRID_CLASS_FOG_VOLUME 2 // fog volume, e.g. density +#define PNANOVDB_GRID_CLASS_STAGGERED 3 // staggered MAC grid, e.g. velocity +#define PNANOVDB_GRID_CLASS_POINT_INDEX 4 // point index grid +#define PNANOVDB_GRID_CLASS_POINT_DATA 5 // point data grid +#define PNANOVDB_GRID_CLASS_TOPOLOGY 6 // grid with active states only (no values) +#define PNANOVDB_GRID_CLASS_VOXEL_VOLUME 7 // volume of geometric cubes, e.g. minecraft +#define PNANOVDB_GRID_CLASS_INDEX_GRID 8 // grid whose values are offsets, e.g. into an external array +#define PNANOVDB_GRID_CLASS_TENSOR_GRID 9 // grid which can have extra metadata and features +#define PNANOVDB_GRID_CLASS_END 10 + +#define PNANOVDB_GRID_FLAGS_HAS_LONG_GRID_NAME (1 << 0) +#define PNANOVDB_GRID_FLAGS_HAS_BBOX (1 << 1) +#define PNANOVDB_GRID_FLAGS_HAS_MIN_MAX (1 << 2) +#define PNANOVDB_GRID_FLAGS_HAS_AVERAGE (1 << 3) +#define PNANOVDB_GRID_FLAGS_HAS_STD_DEVIATION (1 << 4) +#define PNANOVDB_GRID_FLAGS_IS_BREADTH_FIRST (1 << 5) +#define PNANOVDB_GRID_FLAGS_END (1 << 6) + +#define PNANOVDB_LEAF_TYPE_DEFAULT 0 +#define PNANOVDB_LEAF_TYPE_LITE 1 +#define PNANOVDB_LEAF_TYPE_FP 2 +#define PNANOVDB_LEAF_TYPE_INDEX 3 +#define PNANOVDB_LEAF_TYPE_INDEXMASK 4 +#define PNANOVDB_LEAF_TYPE_POINTINDEX 5 + +// BuildType = Unknown, float, double, int16_t, int32_t, int64_t, Vec3f, Vec3d, Mask, ... +// bit count of values in leaf nodes, i.e. 
8*sizeof(*nanovdb::LeafNode::mValues) or zero if no values are stored +PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_value_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 96, 192, 0, 16, 32, 1, 32, 4, 8, 16, 0, 128, 256, 0, 0, 0, 0, 16, 24, 48, 8 }; +// bit count of the Tile union in InternalNodes, i.e. 8*sizeof(nanovdb::InternalData::Tile) +PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_table_strides_bits[PNANOVDB_GRID_TYPE_END] = { 64, 64, 64, 64, 64, 64, 128, 192, 64, 64, 64, 64, 64, 64, 64, 64, 64, 128, 256, 64, 64, 64, 64, 64, 64, 64, 64 }; +// bit count of min/max values, i.e. 8*sizeof(nanovdb::LeafData::mMinimum) or zero if no min/max exists +PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_minmax_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 96, 192, 8, 16, 32, 8, 32, 32, 32, 32, 32, 128, 256, 64, 64, 64, 64, 64, 24, 48, 8 }; +// bit alignment of the value type, controlled by the smallest native type, which is why it is always 0, 8, 16, 32, or 64, e.g. for Vec3f it is 32 +PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_minmax_aligns_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 32, 64, 8, 16, 32, 8, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 64, 64, 8, 16, 8 }; +// bit alignment of the stats (avg/std-dev) types, e.g. 8*sizeof(nanovdb::LeafData::mAverage) +PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_stat_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 32, 32, 64, 32, 64, 8, 32, 32, 8, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 64, 64, 32, 32, 32 }; +// one of the 4 leaf types defined above, e.g. PNANOVDB_LEAF_TYPE_INDEX = 3 +PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_leaf_type[PNANOVDB_GRID_TYPE_END] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 2, 2, 2, 0, 0, 3, 3, 4, 4, 5, 0, 0, 0 }; + +struct pnanovdb_map_t +{ + float matf[9]; + float invmatf[9]; + float vecf[3]; + float taperf; + double matd[9]; + double invmatd[9]; + double vecd[3]; + double taperd; +}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_map_t) +struct pnanovdb_map_handle_t { pnanovdb_address_t address; }; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_map_handle_t) + +#define PNANOVDB_MAP_SIZE 264 + +#define PNANOVDB_MAP_OFF_MATF 0 +#define PNANOVDB_MAP_OFF_INVMATF 36 +#define PNANOVDB_MAP_OFF_VECF 72 +#define PNANOVDB_MAP_OFF_TAPERF 84 +#define PNANOVDB_MAP_OFF_MATD 88 +#define PNANOVDB_MAP_OFF_INVMATD 160 +#define PNANOVDB_MAP_OFF_VECD 232 +#define PNANOVDB_MAP_OFF_TAPERD 256 + +PNANOVDB_FORCE_INLINE float pnanovdb_map_get_matf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index) { + return pnanovdb_read_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_MATF + 4u * index)); +} +PNANOVDB_FORCE_INLINE float pnanovdb_map_get_invmatf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index) { + return pnanovdb_read_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_INVMATF + 4u * index)); +} +PNANOVDB_FORCE_INLINE float pnanovdb_map_get_vecf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index) { + return pnanovdb_read_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_VECF + 4u * index)); +} +PNANOVDB_FORCE_INLINE float pnanovdb_map_get_taperf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index) { + return pnanovdb_read_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_TAPERF)); +} +PNANOVDB_FORCE_INLINE double pnanovdb_map_get_matd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index) { + return 
pnanovdb_read_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_MATD + 8u * index)); +} +PNANOVDB_FORCE_INLINE double pnanovdb_map_get_invmatd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index) { + return pnanovdb_read_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_INVMATD + 8u * index)); +} +PNANOVDB_FORCE_INLINE double pnanovdb_map_get_vecd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index) { + return pnanovdb_read_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_VECD + 8u * index)); +} +PNANOVDB_FORCE_INLINE double pnanovdb_map_get_taperd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index) { + return pnanovdb_read_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_TAPERD)); +} + +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_matf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, float matf) { + pnanovdb_write_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_MATF + 4u * index), matf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_invmatf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, float invmatf) { + pnanovdb_write_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_INVMATF + 4u * index), invmatf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_vecf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, float vecf) { + pnanovdb_write_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_VECF + 4u * index), vecf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_taperf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, float taperf) { + pnanovdb_write_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_TAPERF), taperf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_matd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, double matd) { + pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_MATD + 8u * index), matd); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_invmatd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, double invmatd) { + pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_INVMATD + 8u * index), invmatd); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_vecd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, double vecd) { + pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_VECD + 8u * index), vecd); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_taperd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, double taperd) { + pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_TAPERD), taperd); +} + +struct pnanovdb_grid_t +{ + pnanovdb_uint64_t magic; // 8 bytes, 0 + pnanovdb_uint64_t checksum; // 8 bytes, 8 + pnanovdb_uint32_t version; // 4 bytes, 16 + pnanovdb_uint32_t flags; // 4 bytes, 20 + pnanovdb_uint32_t grid_index; // 4 bytes, 24 + pnanovdb_uint32_t grid_count; // 4 bytes, 28 + pnanovdb_uint64_t grid_size; // 8 bytes, 32 + pnanovdb_uint32_t grid_name[256 / 4]; // 256 bytes, 40 + pnanovdb_map_t map; // 264 bytes, 296 + double world_bbox[6]; // 48 bytes, 560 + double voxel_size[3]; // 24 bytes, 608 + pnanovdb_uint32_t grid_class; // 4 bytes, 632 + pnanovdb_uint32_t grid_type; // 4 bytes, 636 + pnanovdb_int64_t blind_metadata_offset; // 8 bytes, 640 + pnanovdb_uint32_t blind_metadata_count; // 4 bytes, 648 + pnanovdb_uint32_t pad[5]; // 20 bytes, 652 
+}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_grid_t) +struct pnanovdb_grid_handle_t { pnanovdb_address_t address; }; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_grid_handle_t) + +#define PNANOVDB_GRID_SIZE 672 + +#define PNANOVDB_GRID_OFF_MAGIC 0 +#define PNANOVDB_GRID_OFF_CHECKSUM 8 +#define PNANOVDB_GRID_OFF_VERSION 16 +#define PNANOVDB_GRID_OFF_FLAGS 20 +#define PNANOVDB_GRID_OFF_GRID_INDEX 24 +#define PNANOVDB_GRID_OFF_GRID_COUNT 28 +#define PNANOVDB_GRID_OFF_GRID_SIZE 32 +#define PNANOVDB_GRID_OFF_GRID_NAME 40 +#define PNANOVDB_GRID_OFF_MAP 296 +#define PNANOVDB_GRID_OFF_WORLD_BBOX 560 +#define PNANOVDB_GRID_OFF_VOXEL_SIZE 608 +#define PNANOVDB_GRID_OFF_GRID_CLASS 632 +#define PNANOVDB_GRID_OFF_GRID_TYPE 636 +#define PNANOVDB_GRID_OFF_BLIND_METADATA_OFFSET 640 +#define PNANOVDB_GRID_OFF_BLIND_METADATA_COUNT 648 + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_grid_get_magic(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p) { + return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_MAGIC)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_grid_get_checksum(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p) { + return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_CHECKSUM)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_grid_get_version(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_VERSION)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_grid_get_flags(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_FLAGS)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_grid_get_grid_index(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_INDEX)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_grid_get_grid_count(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_COUNT)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_grid_get_grid_size(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p) { + return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_SIZE)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_grid_get_grid_name(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t index) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_NAME + 4u * index)); +} +PNANOVDB_FORCE_INLINE pnanovdb_map_handle_t pnanovdb_grid_get_map(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p) { + pnanovdb_map_handle_t ret; + ret.address = pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_MAP); + return ret; +} +PNANOVDB_FORCE_INLINE double pnanovdb_grid_get_world_bbox(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t index) { + return pnanovdb_read_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_WORLD_BBOX + 8u * index)); +} +PNANOVDB_FORCE_INLINE double pnanovdb_grid_get_voxel_size(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t index) { + return pnanovdb_read_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_VOXEL_SIZE + 8u * index)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_grid_get_grid_class(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, 
PNANOVDB_GRID_OFF_GRID_CLASS)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_grid_get_grid_type(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_TYPE)); +} +PNANOVDB_FORCE_INLINE pnanovdb_int64_t pnanovdb_grid_get_blind_metadata_offset(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p) { + return pnanovdb_read_int64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_BLIND_METADATA_OFFSET)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_grid_get_blind_metadata_count(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_BLIND_METADATA_COUNT)); +} + +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_magic(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint64_t magic) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_MAGIC), magic); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_checksum(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint64_t checksum) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_CHECKSUM), checksum); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_version(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t version) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_VERSION), version); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_flags(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t flags) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_FLAGS), flags); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_index(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t grid_index) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_INDEX), grid_index); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_count(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t grid_count) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_COUNT), grid_count); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_size(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint64_t grid_size) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_SIZE), grid_size); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_name(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t index, pnanovdb_uint32_t grid_name) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_NAME + 4u * index), grid_name); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_world_bbox(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t index, double world_bbox) { + pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_WORLD_BBOX + 8u * index), world_bbox); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_voxel_size(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t index, double voxel_size) { + pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_VOXEL_SIZE + 8u * index), voxel_size); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_class(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t grid_class) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_CLASS), grid_class); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_type(pnanovdb_buf_t buf, 
pnanovdb_grid_handle_t p, pnanovdb_uint32_t grid_type) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_TYPE), grid_type); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_blind_metadata_offset(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint64_t blind_metadata_offset) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_BLIND_METADATA_OFFSET), blind_metadata_offset); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_blind_metadata_count(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t metadata_count) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_BLIND_METADATA_COUNT), metadata_count); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_make_version(pnanovdb_uint32_t major, pnanovdb_uint32_t minor, pnanovdb_uint32_t patch_num) +{ + return (major << 21u) | (minor << 10u) | patch_num; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_version_get_major(pnanovdb_uint32_t version) +{ + return (version >> 21u) & ((1u << 11u) - 1u); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_version_get_minor(pnanovdb_uint32_t version) +{ + return (version >> 10u) & ((1u << 11u) - 1u); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_version_get_patch(pnanovdb_uint32_t version) +{ + return version & ((1u << 10u) - 1u); +} + +struct pnanovdb_gridblindmetadata_t +{ + pnanovdb_int64_t data_offset; // 8 bytes, 0 + pnanovdb_uint64_t value_count; // 8 bytes, 8 + pnanovdb_uint32_t value_size; // 4 bytes, 16 + pnanovdb_uint32_t semantic; // 4 bytes, 20 + pnanovdb_uint32_t data_class; // 4 bytes, 24 + pnanovdb_uint32_t data_type; // 4 bytes, 28 + pnanovdb_uint32_t name[256 / 4]; // 256 bytes, 32 +}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_gridblindmetadata_t) +struct pnanovdb_gridblindmetadata_handle_t { pnanovdb_address_t address; }; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_gridblindmetadata_handle_t) + +#define PNANOVDB_GRIDBLINDMETADATA_SIZE 288 + +#define PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_OFFSET 0 +#define PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_COUNT 8 +#define PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_SIZE 16 +#define PNANOVDB_GRIDBLINDMETADATA_OFF_SEMANTIC 20 +#define PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_CLASS 24 +#define PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_TYPE 28 +#define PNANOVDB_GRIDBLINDMETADATA_OFF_NAME 32 + +PNANOVDB_FORCE_INLINE pnanovdb_int64_t pnanovdb_gridblindmetadata_get_data_offset(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) { + return pnanovdb_read_int64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_OFFSET)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_gridblindmetadata_get_value_count(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) { + return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_COUNT)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_gridblindmetadata_get_value_size(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_SIZE)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_gridblindmetadata_get_semantic(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_SEMANTIC)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_gridblindmetadata_get_data_class(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) { 
+ return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_CLASS)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_gridblindmetadata_get_data_type(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_TYPE)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_gridblindmetadata_get_name(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p, pnanovdb_uint32_t index) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_NAME + 4u * index)); +} + +struct pnanovdb_tree_t +{ + pnanovdb_uint64_t node_offset_leaf; + pnanovdb_uint64_t node_offset_lower; + pnanovdb_uint64_t node_offset_upper; + pnanovdb_uint64_t node_offset_root; + pnanovdb_uint32_t node_count_leaf; + pnanovdb_uint32_t node_count_lower; + pnanovdb_uint32_t node_count_upper; + pnanovdb_uint32_t tile_count_leaf; + pnanovdb_uint32_t tile_count_lower; + pnanovdb_uint32_t tile_count_upper; + pnanovdb_uint64_t voxel_count; +}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_tree_t) +struct pnanovdb_tree_handle_t { pnanovdb_address_t address; }; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_tree_handle_t) + +#define PNANOVDB_TREE_SIZE 64 + +#define PNANOVDB_TREE_OFF_NODE_OFFSET_LEAF 0 +#define PNANOVDB_TREE_OFF_NODE_OFFSET_LOWER 8 +#define PNANOVDB_TREE_OFF_NODE_OFFSET_UPPER 16 +#define PNANOVDB_TREE_OFF_NODE_OFFSET_ROOT 24 +#define PNANOVDB_TREE_OFF_NODE_COUNT_LEAF 32 +#define PNANOVDB_TREE_OFF_NODE_COUNT_LOWER 36 +#define PNANOVDB_TREE_OFF_NODE_COUNT_UPPER 40 +#define PNANOVDB_TREE_OFF_TILE_COUNT_LEAF 44 +#define PNANOVDB_TREE_OFF_TILE_COUNT_LOWER 48 +#define PNANOVDB_TREE_OFF_TILE_COUNT_UPPER 52 +#define PNANOVDB_TREE_OFF_VOXEL_COUNT 56 + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_tree_get_node_offset_leaf(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p) { + return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_LEAF)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_tree_get_node_offset_lower(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p) { + return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_LOWER)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_tree_get_node_offset_upper(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p) { + return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_UPPER)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_tree_get_node_offset_root(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p) { + return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_ROOT)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_tree_get_node_count_leaf(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_COUNT_LEAF)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_tree_get_node_count_lower(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_COUNT_LOWER)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_tree_get_node_count_upper(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_COUNT_UPPER)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_tree_get_tile_count_leaf(pnanovdb_buf_t 
buf, pnanovdb_tree_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_TILE_COUNT_LEAF)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_tree_get_tile_count_lower(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_TILE_COUNT_LOWER)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_tree_get_tile_count_upper(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_TILE_COUNT_UPPER)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_tree_get_voxel_count(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p) { + return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_VOXEL_COUNT)); +} + +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_offset_leaf(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t node_offset_leaf) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_LEAF), node_offset_leaf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_offset_lower(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t node_offset_lower) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_LOWER), node_offset_lower); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_offset_upper(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t node_offset_upper) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_UPPER), node_offset_upper); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_offset_root(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t node_offset_root) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_ROOT), node_offset_root); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_count_leaf(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t node_count_leaf) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_COUNT_LEAF), node_count_leaf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_count_lower(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t node_count_lower) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_COUNT_LOWER), node_count_lower); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_count_upper(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t node_count_upper) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_COUNT_UPPER), node_count_upper); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_tile_count_leaf(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t tile_count_leaf) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_TILE_COUNT_LEAF), tile_count_leaf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_tile_count_lower(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t tile_count_lower) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_TILE_COUNT_LOWER), tile_count_lower); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_tile_count_upper(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t tile_count_upper) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_TILE_COUNT_UPPER), tile_count_upper); +} 
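+// --- Illustrative usage sketch (editorial; not part of the upstream header) ---
+// The handle/accessor pattern above is plain byte-offset arithmetic over a flat
+// buffer: the grid struct sits at offset 0 and its fixed-size tree struct
+// follows immediately after it. The disabled sketch below assumes the plain C
+// buffer backend from the top of this header (pnanovdb_make_buf() wrapping an
+// in-memory array of 32-bit words that holds a serialized NanoVDB grid); the
+// function name pnanovdb_example_leaf_count is illustrative only.
+#if 0
+PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_example_leaf_count(pnanovdb_uint32_t* words, pnanovdb_uint64_t word_count)
+{
+    pnanovdb_buf_t buf = pnanovdb_make_buf(words, word_count);
+
+    // The grid header occupies the first PNANOVDB_GRID_SIZE bytes of the buffer.
+    pnanovdb_grid_handle_t grid;
+    grid.address = pnanovdb_address_null();
+
+    // Reject buffers that do not start with one of the NanoVDB magic values.
+    pnanovdb_uint64_t magic = pnanovdb_grid_get_magic(buf, grid);
+    if (!pnanovdb_uint64_is_equal(magic, PNANOVDB_MAGIC_NUMBER) &&
+        !pnanovdb_uint64_is_equal(magic, PNANOVDB_MAGIC_GRID))
+        return 0u;
+
+    // The tree struct is laid out directly after the grid struct.
+    pnanovdb_tree_handle_t tree;
+    tree.address = pnanovdb_address_offset(grid.address, PNANOVDB_GRID_SIZE);
+    return pnanovdb_tree_get_node_count_leaf(buf, tree);
+}
+#endif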
+PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_voxel_count(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t voxel_count) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_VOXEL_COUNT), voxel_count); +} + +struct pnanovdb_root_t +{ + pnanovdb_coord_t bbox_min; + pnanovdb_coord_t bbox_max; + pnanovdb_uint32_t table_size; + pnanovdb_uint32_t pad1; // background can start here + // background, min, max +}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_root_t) +struct pnanovdb_root_handle_t { pnanovdb_address_t address; }; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_root_handle_t) + +#define PNANOVDB_ROOT_BASE_SIZE 28 + +#define PNANOVDB_ROOT_OFF_BBOX_MIN 0 +#define PNANOVDB_ROOT_OFF_BBOX_MAX 12 +#define PNANOVDB_ROOT_OFF_TABLE_SIZE 24 + +PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_root_get_bbox_min(pnanovdb_buf_t buf, pnanovdb_root_handle_t p) { + return pnanovdb_read_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_OFF_BBOX_MIN)); +} +PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_root_get_bbox_max(pnanovdb_buf_t buf, pnanovdb_root_handle_t p) { + return pnanovdb_read_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_OFF_BBOX_MAX)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_root_get_tile_count(pnanovdb_buf_t buf, pnanovdb_root_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_OFF_TABLE_SIZE)); +} + +PNANOVDB_FORCE_INLINE void pnanovdb_root_set_bbox_min(pnanovdb_buf_t buf, pnanovdb_root_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_min) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_OFF_BBOX_MIN), bbox_min); +} +PNANOVDB_FORCE_INLINE void pnanovdb_root_set_bbox_max(pnanovdb_buf_t buf, pnanovdb_root_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_max) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_OFF_BBOX_MAX), bbox_max); +} +PNANOVDB_FORCE_INLINE void pnanovdb_root_set_tile_count(pnanovdb_buf_t buf, pnanovdb_root_handle_t p, pnanovdb_uint32_t tile_count) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_OFF_TABLE_SIZE), tile_count); +} + +struct pnanovdb_root_tile_t +{ + pnanovdb_uint64_t key; + pnanovdb_int64_t child; // signed byte offset from root to the child node, 0 means it is a constant tile, so use value + pnanovdb_uint32_t state; + pnanovdb_uint32_t pad1; // value can start here + // value +}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_root_tile_t) +struct pnanovdb_root_tile_handle_t { pnanovdb_address_t address; }; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_root_tile_handle_t) + +#define PNANOVDB_ROOT_TILE_BASE_SIZE 20 + +#define PNANOVDB_ROOT_TILE_OFF_KEY 0 +#define PNANOVDB_ROOT_TILE_OFF_CHILD 8 +#define PNANOVDB_ROOT_TILE_OFF_STATE 16 + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_root_tile_get_key(pnanovdb_buf_t buf, pnanovdb_root_tile_handle_t p) { + return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_TILE_OFF_KEY)); +} +PNANOVDB_FORCE_INLINE pnanovdb_int64_t pnanovdb_root_tile_get_child(pnanovdb_buf_t buf, pnanovdb_root_tile_handle_t p) { + return pnanovdb_read_int64(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_TILE_OFF_CHILD)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_root_tile_get_state(pnanovdb_buf_t buf, pnanovdb_root_tile_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_TILE_OFF_STATE)); +} + +PNANOVDB_FORCE_INLINE void pnanovdb_root_tile_set_key(pnanovdb_buf_t buf, 
pnanovdb_root_tile_handle_t p, pnanovdb_uint64_t key) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_TILE_OFF_KEY), key); +} +PNANOVDB_FORCE_INLINE void pnanovdb_root_tile_set_child(pnanovdb_buf_t buf, pnanovdb_root_tile_handle_t p, pnanovdb_int64_t child) { + pnanovdb_write_int64(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_TILE_OFF_CHILD), child); +} +PNANOVDB_FORCE_INLINE void pnanovdb_root_tile_set_state(pnanovdb_buf_t buf, pnanovdb_root_tile_handle_t p, pnanovdb_uint32_t state) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_TILE_OFF_STATE), state); +} + +struct pnanovdb_upper_t +{ + pnanovdb_coord_t bbox_min; + pnanovdb_coord_t bbox_max; + pnanovdb_uint64_t flags; + pnanovdb_uint32_t value_mask[1024]; + pnanovdb_uint32_t child_mask[1024]; + // min, max + // alignas(32) pnanovdb_uint32_t table[]; +}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_upper_t) +struct pnanovdb_upper_handle_t { pnanovdb_address_t address; }; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_upper_handle_t) + +#define PNANOVDB_UPPER_TABLE_COUNT 32768 +#define PNANOVDB_UPPER_BASE_SIZE 8224 + +#define PNANOVDB_UPPER_OFF_BBOX_MIN 0 +#define PNANOVDB_UPPER_OFF_BBOX_MAX 12 +#define PNANOVDB_UPPER_OFF_FLAGS 24 +#define PNANOVDB_UPPER_OFF_VALUE_MASK 32 +#define PNANOVDB_UPPER_OFF_CHILD_MASK 4128 + +PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_upper_get_bbox_min(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p) { + return pnanovdb_read_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_BBOX_MIN)); +} +PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_upper_get_bbox_max(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p) { + return pnanovdb_read_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_BBOX_MAX)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_upper_get_flags(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p) { + return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_FLAGS)); +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_upper_get_value_mask(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p, pnanovdb_uint32_t bit_index) { + pnanovdb_uint32_t value = pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_VALUE_MASK + 4u * (bit_index >> 5u))); + return ((value >> (bit_index & 31u)) & 1) != 0u; +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_upper_get_child_mask(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p, pnanovdb_uint32_t bit_index) { + pnanovdb_uint32_t value = pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_CHILD_MASK + 4u * (bit_index >> 5u))); + return ((value >> (bit_index & 31u)) & 1) != 0u; +} + +PNANOVDB_FORCE_INLINE void pnanovdb_upper_set_bbox_min(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_min) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_BBOX_MIN), bbox_min); +} +PNANOVDB_FORCE_INLINE void pnanovdb_upper_set_bbox_max(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_max) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_BBOX_MAX), bbox_max); +} +PNANOVDB_FORCE_INLINE void pnanovdb_upper_set_child_mask(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p, pnanovdb_uint32_t bit_index, pnanovdb_bool_t value) { + pnanovdb_address_t addr = pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_CHILD_MASK + 4u * (bit_index >> 5u)); + pnanovdb_uint32_t valueMask = pnanovdb_read_uint32(buf, addr); + if (!value) 
{ valueMask &= ~(1u << (bit_index & 31u)); } + if (value) valueMask |= (1u << (bit_index & 31u)); + pnanovdb_write_uint32(buf, addr, valueMask); +} + +struct pnanovdb_lower_t +{ + pnanovdb_coord_t bbox_min; + pnanovdb_coord_t bbox_max; + pnanovdb_uint64_t flags; + pnanovdb_uint32_t value_mask[128]; + pnanovdb_uint32_t child_mask[128]; + // min, max + // alignas(32) pnanovdb_uint32_t table[]; +}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_lower_t) +struct pnanovdb_lower_handle_t { pnanovdb_address_t address; }; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_lower_handle_t) + +#define PNANOVDB_LOWER_TABLE_COUNT 4096 +#define PNANOVDB_LOWER_BASE_SIZE 1056 + +#define PNANOVDB_LOWER_OFF_BBOX_MIN 0 +#define PNANOVDB_LOWER_OFF_BBOX_MAX 12 +#define PNANOVDB_LOWER_OFF_FLAGS 24 +#define PNANOVDB_LOWER_OFF_VALUE_MASK 32 +#define PNANOVDB_LOWER_OFF_CHILD_MASK 544 + +PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_lower_get_bbox_min(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p) { + return pnanovdb_read_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_BBOX_MIN)); +} +PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_lower_get_bbox_max(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p) { + return pnanovdb_read_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_BBOX_MAX)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_lower_get_flags(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p) { + return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_FLAGS)); +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_lower_get_value_mask(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p, pnanovdb_uint32_t bit_index) { + pnanovdb_uint32_t value = pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_VALUE_MASK + 4u * (bit_index >> 5u))); + return ((value >> (bit_index & 31u)) & 1) != 0u; +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_lower_get_child_mask(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p, pnanovdb_uint32_t bit_index) { + pnanovdb_uint32_t value = pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_CHILD_MASK + 4u * (bit_index >> 5u))); + return ((value >> (bit_index & 31u)) & 1) != 0u; +} + +PNANOVDB_FORCE_INLINE void pnanovdb_lower_set_bbox_min(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_min) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_BBOX_MIN), bbox_min); +} +PNANOVDB_FORCE_INLINE void pnanovdb_lower_set_bbox_max(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_max) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_BBOX_MAX), bbox_max); +} +PNANOVDB_FORCE_INLINE void pnanovdb_lower_set_child_mask(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p, pnanovdb_uint32_t bit_index, pnanovdb_bool_t value) { + pnanovdb_address_t addr = pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_CHILD_MASK + 4u * (bit_index >> 5u)); + pnanovdb_uint32_t valueMask = pnanovdb_read_uint32(buf, addr); + if (!value) { valueMask &= ~(1u << (bit_index & 31u)); } + if (value) valueMask |= (1u << (bit_index & 31u)); + pnanovdb_write_uint32(buf, addr, valueMask); +} + +struct pnanovdb_leaf_t +{ + pnanovdb_coord_t bbox_min; + pnanovdb_uint32_t bbox_dif_and_flags; + pnanovdb_uint32_t value_mask[16]; + // min, max + // alignas(32) pnanovdb_uint32_t values[]; +}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_leaf_t) +struct pnanovdb_leaf_handle_t { pnanovdb_address_t address; }; 
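+// --- Illustrative sketch (editorial; not part of the upstream header) -------
+// How a voxel coordinate maps to the bit index consumed by the mask accessors:
+// the tree uses the fixed NanoVDB 5-4-3 configuration, so upper nodes hold
+// 32^3 children, lower nodes 16^3 children, and leaves 8^3 voxels. The helper
+// below (disabled, name pnanovdb_example_leaf_bit_index is illustrative) packs
+// the low 3 bits of each axis into the 0..511 index that, e.g.,
+// pnanovdb_leaf_get_value_mask() below expects; the full header provides its
+// own coord-to-offset helpers for all node levels further down.
+#if 0
+PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_example_leaf_bit_index(PNANOVDB_IN(pnanovdb_coord_t) ijk)
+{
+    // Voxel position inside an 8x8x8 leaf, packed as (x << 6) | (y << 3) | z.
+    return (((pnanovdb_uint32_t)(PNANOVDB_DEREF(ijk).x & 7)) << 6) |
+           (((pnanovdb_uint32_t)(PNANOVDB_DEREF(ijk).y & 7)) << 3) |
+            ((pnanovdb_uint32_t)(PNANOVDB_DEREF(ijk).z & 7));
+}
+#endif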
+PNANOVDB_STRUCT_TYPEDEF(pnanovdb_leaf_handle_t) + +#define PNANOVDB_LEAF_TABLE_COUNT 512 +#define PNANOVDB_LEAF_BASE_SIZE 80 + +#define PNANOVDB_LEAF_OFF_BBOX_MIN 0 +#define PNANOVDB_LEAF_OFF_BBOX_DIF_AND_FLAGS 12 +#define PNANOVDB_LEAF_OFF_VALUE_MASK 16 + +#define PNANOVDB_LEAF_TABLE_NEG_OFF_BBOX_DIF_AND_FLAGS 84 +#define PNANOVDB_LEAF_TABLE_NEG_OFF_MINIMUM 16 +#define PNANOVDB_LEAF_TABLE_NEG_OFF_QUANTUM 12 + +PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_leaf_get_bbox_min(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t p) { + return pnanovdb_read_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_LEAF_OFF_BBOX_MIN)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_leaf_get_bbox_dif_and_flags(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_LEAF_OFF_BBOX_DIF_AND_FLAGS)); +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_get_value_mask(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t p, pnanovdb_uint32_t bit_index) { + pnanovdb_uint32_t value = pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_LEAF_OFF_VALUE_MASK + 4u * (bit_index >> 5u))); + return ((value >> (bit_index & 31u)) & 1) != 0u; +} + +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_set_bbox_min(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_min) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_LEAF_OFF_BBOX_MIN), bbox_min); +} +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_set_bbox_dif_and_flags(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t p, pnanovdb_uint32_t bbox_dif_and_flags) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_LEAF_OFF_BBOX_DIF_AND_FLAGS), bbox_dif_and_flags); +} + +struct pnanovdb_grid_type_constants_t +{ + pnanovdb_uint32_t root_off_background; + pnanovdb_uint32_t root_off_min; + pnanovdb_uint32_t root_off_max; + pnanovdb_uint32_t root_off_ave; + pnanovdb_uint32_t root_off_stddev; + pnanovdb_uint32_t root_size; + pnanovdb_uint32_t value_stride_bits; + pnanovdb_uint32_t table_stride; + pnanovdb_uint32_t root_tile_off_value; + pnanovdb_uint32_t root_tile_size; + pnanovdb_uint32_t upper_off_min; + pnanovdb_uint32_t upper_off_max; + pnanovdb_uint32_t upper_off_ave; + pnanovdb_uint32_t upper_off_stddev; + pnanovdb_uint32_t upper_off_table; + pnanovdb_uint32_t upper_size; + pnanovdb_uint32_t lower_off_min; + pnanovdb_uint32_t lower_off_max; + pnanovdb_uint32_t lower_off_ave; + pnanovdb_uint32_t lower_off_stddev; + pnanovdb_uint32_t lower_off_table; + pnanovdb_uint32_t lower_size; + pnanovdb_uint32_t leaf_off_min; + pnanovdb_uint32_t leaf_off_max; + pnanovdb_uint32_t leaf_off_ave; + pnanovdb_uint32_t leaf_off_stddev; + pnanovdb_uint32_t leaf_off_table; + pnanovdb_uint32_t leaf_size; +}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_grid_type_constants_t) + +// The following table with offsets will nedd to be updates as new GridTypes are added in NanoVDB.h +PNANOVDB_STATIC_CONST pnanovdb_grid_type_constants_t pnanovdb_grid_type_constants[PNANOVDB_GRID_TYPE_END] = +{ +{28, 28, 28, 28, 28, 32, 0, 8, 20, 32, 8224, 8224, 8224, 8224, 8224, 270368, 1056, 1056, 1056, 1056, 1056, 33824, 80, 80, 80, 80, 96, 96}, +{28, 32, 36, 40, 44, 64, 32, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 80, 84, 88, 92, 96, 2144}, +{32, 40, 48, 56, 64, 96, 64, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 88, 96, 104, 128, 4224}, +{28, 30, 32, 36, 40, 64, 16, 8, 20, 32, 8224, 8226, 8228, 8232, 8256, 
270400, 1056, 1058, 1060, 1064, 1088, 33856, 80, 82, 84, 88, 96, 1120}, +{28, 32, 36, 40, 44, 64, 32, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 80, 84, 88, 92, 96, 2144}, +{32, 40, 48, 56, 64, 96, 64, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 88, 96, 104, 128, 4224}, +{28, 40, 52, 64, 68, 96, 96, 16, 20, 32, 8224, 8236, 8248, 8252, 8256, 532544, 1056, 1068, 1080, 1084, 1088, 66624, 80, 92, 104, 108, 128, 6272}, +{32, 56, 80, 104, 112, 128, 192, 24, 24, 64, 8224, 8248, 8272, 8280, 8288, 794720, 1056, 1080, 1104, 1112, 1120, 99424, 80, 104, 128, 136, 160, 12448}, +{28, 29, 30, 31, 32, 64, 0, 8, 20, 32, 8224, 8225, 8226, 8227, 8256, 270400, 1056, 1057, 1058, 1059, 1088, 33856, 80, 80, 80, 80, 96, 96}, +{28, 30, 32, 36, 40, 64, 16, 8, 20, 32, 8224, 8226, 8228, 8232, 8256, 270400, 1056, 1058, 1060, 1064, 1088, 33856, 80, 82, 84, 88, 96, 1120}, +{28, 32, 36, 40, 44, 64, 32, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 80, 84, 88, 92, 96, 2144}, +{28, 29, 30, 31, 32, 64, 1, 8, 20, 32, 8224, 8225, 8226, 8227, 8256, 270400, 1056, 1057, 1058, 1059, 1088, 33856, 80, 80, 80, 80, 96, 160}, +{28, 32, 36, 40, 44, 64, 32, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 80, 84, 88, 92, 96, 2144}, +{28, 32, 36, 40, 44, 64, 0, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 88, 90, 92, 94, 96, 352}, +{28, 32, 36, 40, 44, 64, 0, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 88, 90, 92, 94, 96, 608}, +{28, 32, 36, 40, 44, 64, 0, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 88, 90, 92, 94, 96, 1120}, +{28, 32, 36, 40, 44, 64, 0, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 88, 90, 92, 94, 96, 96}, +{28, 44, 60, 76, 80, 96, 128, 16, 20, 64, 8224, 8240, 8256, 8260, 8288, 532576, 1056, 1072, 1088, 1092, 1120, 66656, 80, 96, 112, 116, 128, 8320}, +{32, 64, 96, 128, 136, 160, 256, 32, 24, 64, 8224, 8256, 8288, 8296, 8320, 1056896, 1056, 1088, 1120, 1128, 1152, 132224, 80, 112, 144, 152, 160, 16544}, +{32, 40, 48, 56, 64, 96, 0, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 80, 80, 80, 80, 96}, +{32, 40, 48, 56, 64, 96, 0, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 80, 80, 80, 80, 96}, +{32, 40, 48, 56, 64, 96, 0, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 80, 80, 80, 80, 160}, +{32, 40, 48, 56, 64, 96, 0, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 80, 80, 80, 80, 160}, +{32, 40, 48, 56, 64, 96, 16, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 88, 96, 96, 96, 1120}, +{28, 31, 34, 40, 44, 64, 24, 8, 20, 32, 8224, 8227, 8232, 8236, 8256, 270400, 1056, 1059, 1064, 1068, 1088, 33856, 80, 83, 88, 92, 96, 1632}, +{28, 34, 40, 48, 52, 64, 48, 8, 20, 32, 8224, 8230, 8236, 8240, 8256, 270400, 1056, 1062, 1068, 1072, 1088, 33856, 80, 86, 92, 96, 128, 3200}, +{28, 29, 30, 32, 36, 64, 8, 8, 20, 32, 8224, 8225, 8228, 8232, 8256, 270400, 1056, 1057, 1060, 1064, 1088, 33856, 80, 81, 84, 88, 96, 608}, +}; + +// ------------------------------------------------ Basic Lookup ----------------------------------------------------------- + +PNANOVDB_FORCE_INLINE 
pnanovdb_gridblindmetadata_handle_t pnanovdb_grid_get_gridblindmetadata(pnanovdb_buf_t buf, pnanovdb_grid_handle_t grid, pnanovdb_uint32_t index) +{ + pnanovdb_gridblindmetadata_handle_t meta = { grid.address }; + pnanovdb_uint64_t byte_offset = pnanovdb_grid_get_blind_metadata_offset(buf, grid); + meta.address = pnanovdb_address_offset64(meta.address, byte_offset); + meta.address = pnanovdb_address_offset_product(meta.address, PNANOVDB_GRIDBLINDMETADATA_SIZE, index); + return meta; +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_grid_get_gridblindmetadata_value_address(pnanovdb_buf_t buf, pnanovdb_grid_handle_t grid, pnanovdb_uint32_t index) +{ + pnanovdb_gridblindmetadata_handle_t meta = pnanovdb_grid_get_gridblindmetadata(buf, grid, index); + pnanovdb_int64_t byte_offset = pnanovdb_gridblindmetadata_get_data_offset(buf, meta); + pnanovdb_address_t address = pnanovdb_address_offset64(meta.address, pnanovdb_int64_as_uint64(byte_offset)); + return address; +} + +PNANOVDB_FORCE_INLINE pnanovdb_tree_handle_t pnanovdb_grid_get_tree(pnanovdb_buf_t buf, pnanovdb_grid_handle_t grid) +{ + pnanovdb_tree_handle_t tree = { grid.address }; + tree.address = pnanovdb_address_offset(tree.address, PNANOVDB_GRID_SIZE); + return tree; +} + +PNANOVDB_FORCE_INLINE pnanovdb_root_handle_t pnanovdb_tree_get_root(pnanovdb_buf_t buf, pnanovdb_tree_handle_t tree) +{ + pnanovdb_root_handle_t root = { tree.address }; + pnanovdb_uint64_t byte_offset = pnanovdb_tree_get_node_offset_root(buf, tree); + root.address = pnanovdb_address_offset64(root.address, byte_offset); + return root; +} + +PNANOVDB_FORCE_INLINE pnanovdb_root_tile_handle_t pnanovdb_root_get_tile(pnanovdb_grid_type_t grid_type, pnanovdb_root_handle_t root, pnanovdb_uint32_t n) +{ + pnanovdb_root_tile_handle_t tile = { root.address }; + tile.address = pnanovdb_address_offset(tile.address, PNANOVDB_GRID_TYPE_GET(grid_type, root_size)); + tile.address = pnanovdb_address_offset_product(tile.address, PNANOVDB_GRID_TYPE_GET(grid_type, root_tile_size), n); + return tile; +} + +PNANOVDB_FORCE_INLINE pnanovdb_root_tile_handle_t pnanovdb_root_get_tile_zero(pnanovdb_grid_type_t grid_type, pnanovdb_root_handle_t root) +{ + pnanovdb_root_tile_handle_t tile = { root.address }; + tile.address = pnanovdb_address_offset(tile.address, PNANOVDB_GRID_TYPE_GET(grid_type, root_size)); + return tile; +} + +PNANOVDB_FORCE_INLINE pnanovdb_upper_handle_t pnanovdb_root_get_child(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root, pnanovdb_root_tile_handle_t tile) +{ + pnanovdb_upper_handle_t upper = { root.address }; + upper.address = pnanovdb_address_offset64(upper.address, pnanovdb_int64_as_uint64(pnanovdb_root_tile_get_child(buf, tile))); + return upper; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_coord_to_key(PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ +#if defined(PNANOVDB_NATIVE_64) + pnanovdb_uint64_t iu = pnanovdb_int32_as_uint32(PNANOVDB_DEREF(ijk).x) >> 12u; + pnanovdb_uint64_t ju = pnanovdb_int32_as_uint32(PNANOVDB_DEREF(ijk).y) >> 12u; + pnanovdb_uint64_t ku = pnanovdb_int32_as_uint32(PNANOVDB_DEREF(ijk).z) >> 12u; + return (ku) | (ju << 21u) | (iu << 42u); +#else + pnanovdb_uint32_t iu = pnanovdb_int32_as_uint32(PNANOVDB_DEREF(ijk).x) >> 12u; + pnanovdb_uint32_t ju = pnanovdb_int32_as_uint32(PNANOVDB_DEREF(ijk).y) >> 12u; + pnanovdb_uint32_t ku = pnanovdb_int32_as_uint32(PNANOVDB_DEREF(ijk).z) >> 12u; + pnanovdb_uint32_t key_x = ku | (ju << 21); + pnanovdb_uint32_t key_y = (iu << 10) | (ju >> 11); + return 
pnanovdb_uint32_as_uint64(key_x, key_y); +#endif +} + +PNANOVDB_FORCE_INLINE pnanovdb_root_tile_handle_t pnanovdb_root_find_tile(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + pnanovdb_uint32_t tile_count = pnanovdb_uint32_as_int32(pnanovdb_root_get_tile_count(buf, root)); + pnanovdb_root_tile_handle_t tile = pnanovdb_root_get_tile_zero(grid_type, root); + pnanovdb_uint64_t key = pnanovdb_coord_to_key(ijk); + for (pnanovdb_uint32_t i = 0u; i < tile_count; i++) + { + if (pnanovdb_uint64_is_equal(key, pnanovdb_root_tile_get_key(buf, tile))) + { + return tile; + } + tile.address = pnanovdb_address_offset(tile.address, PNANOVDB_GRID_TYPE_GET(grid_type, root_tile_size)); + } + pnanovdb_root_tile_handle_t null_handle = { pnanovdb_address_null() }; + return null_handle; +} + +// ----------------------------- Leaf Node --------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_leaf_coord_to_offset(PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + return (((PNANOVDB_DEREF(ijk).x & 7) >> 0) << (2 * 3)) + + (((PNANOVDB_DEREF(ijk).y & 7) >> 0) << (3)) + + ((PNANOVDB_DEREF(ijk).z & 7) >> 0); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_leaf_get_min_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_leaf_handle_t node) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, leaf_off_min); + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_leaf_get_max_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_leaf_handle_t node) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, leaf_off_max); + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_leaf_get_ave_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_leaf_handle_t node) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, leaf_off_ave); + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_leaf_get_stddev_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_leaf_handle_t node) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, leaf_off_stddev); + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_leaf_get_table_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_leaf_handle_t node, pnanovdb_uint32_t n) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, leaf_off_table) + ((PNANOVDB_GRID_TYPE_GET(grid_type, value_stride_bits) * n) >> 3u); + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_leaf_get_value_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + pnanovdb_uint32_t n = pnanovdb_leaf_coord_to_offset(ijk); + return pnanovdb_leaf_get_table_address(grid_type, buf, leaf, n); +} + +// ----------------------------- Leaf FP Types Specialization --------------------------------------- + +PNANOVDB_FORCE_INLINE float pnanovdb_leaf_fp_read_float(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk, pnanovdb_uint32_t value_log_bits) +{ + // value_log_bits // 2 3 4 + pnanovdb_uint32_t value_bits = 1u << value_log_bits; // 4 8 16 + pnanovdb_uint32_t value_mask 
= (1u << value_bits) - 1u; // 0xF 0xFF 0xFFFF + pnanovdb_uint32_t values_per_word_bits = 5u - value_log_bits; // 3 2 1 + pnanovdb_uint32_t values_per_word_mask = (1u << values_per_word_bits) - 1u; // 7 3 1 + + pnanovdb_uint32_t n = pnanovdb_leaf_coord_to_offset(ijk); + float minimum = pnanovdb_read_float(buf, pnanovdb_address_offset_neg(address, PNANOVDB_LEAF_TABLE_NEG_OFF_MINIMUM)); + float quantum = pnanovdb_read_float(buf, pnanovdb_address_offset_neg(address, PNANOVDB_LEAF_TABLE_NEG_OFF_QUANTUM)); + pnanovdb_uint32_t raw = pnanovdb_read_uint32(buf, pnanovdb_address_offset(address, ((n >> values_per_word_bits) << 2u))); + pnanovdb_uint32_t value_compressed = (raw >> ((n & values_per_word_mask) << value_log_bits)) & value_mask; + return pnanovdb_uint32_to_float(value_compressed) * quantum + minimum; +} + +PNANOVDB_FORCE_INLINE float pnanovdb_leaf_fp4_read_float(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + return pnanovdb_leaf_fp_read_float(buf, address, ijk, 2u); +} + +PNANOVDB_FORCE_INLINE float pnanovdb_leaf_fp8_read_float(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + return pnanovdb_leaf_fp_read_float(buf, address, ijk, 3u); +} + +PNANOVDB_FORCE_INLINE float pnanovdb_leaf_fp16_read_float(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + return pnanovdb_leaf_fp_read_float(buf, address, ijk, 4u); +} + +PNANOVDB_FORCE_INLINE float pnanovdb_leaf_fpn_read_float(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + pnanovdb_uint32_t bbox_dif_and_flags = pnanovdb_read_uint32(buf, pnanovdb_address_offset_neg(address, PNANOVDB_LEAF_TABLE_NEG_OFF_BBOX_DIF_AND_FLAGS)); + pnanovdb_uint32_t flags = bbox_dif_and_flags >> 24u; + pnanovdb_uint32_t value_log_bits = flags >> 5; // b = 0, 1, 2, 3, 4 corresponding to 1, 2, 4, 8, 16 bits + return pnanovdb_leaf_fp_read_float(buf, address, ijk, value_log_bits); +} + +// ----------------------------- Leaf Index Specialization --------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_index_has_stats(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return (pnanovdb_leaf_get_bbox_dif_and_flags(buf, leaf) & (1u << 28u)) != 0u; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_index_get_min_index(pnanovdb_buf_t buf, pnanovdb_address_t min_address) +{ + return pnanovdb_uint64_offset(pnanovdb_read_uint64(buf, min_address), 512u); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_index_get_max_index(pnanovdb_buf_t buf, pnanovdb_address_t max_address) +{ + return pnanovdb_uint64_offset(pnanovdb_read_uint64(buf, max_address), 513u); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_index_get_ave_index(pnanovdb_buf_t buf, pnanovdb_address_t ave_address) +{ + return pnanovdb_uint64_offset(pnanovdb_read_uint64(buf, ave_address), 514u); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_index_get_dev_index(pnanovdb_buf_t buf, pnanovdb_address_t dev_address) +{ + return pnanovdb_uint64_offset(pnanovdb_read_uint64(buf, dev_address), 515u); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_index_get_value_index(pnanovdb_buf_t buf, pnanovdb_address_t value_address, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + pnanovdb_uint32_t n = pnanovdb_leaf_coord_to_offset(ijk); + pnanovdb_uint64_t offset = pnanovdb_read_uint64(buf, value_address); + return pnanovdb_uint64_offset(offset, n); +} + +// ----------------------------- Leaf IndexMask 
Specialization --------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_indexmask_has_stats(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return pnanovdb_leaf_index_has_stats(buf, leaf); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_indexmask_get_min_index(pnanovdb_buf_t buf, pnanovdb_address_t min_address) +{ + return pnanovdb_leaf_index_get_min_index(buf, min_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_indexmask_get_max_index(pnanovdb_buf_t buf, pnanovdb_address_t max_address) +{ + return pnanovdb_leaf_index_get_max_index(buf, max_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_indexmask_get_ave_index(pnanovdb_buf_t buf, pnanovdb_address_t ave_address) +{ + return pnanovdb_leaf_index_get_ave_index(buf, ave_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_indexmask_get_dev_index(pnanovdb_buf_t buf, pnanovdb_address_t dev_address) +{ + return pnanovdb_leaf_index_get_dev_index(buf, dev_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_indexmask_get_value_index(pnanovdb_buf_t buf, pnanovdb_address_t value_address, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + return pnanovdb_leaf_index_get_value_index(buf, value_address, ijk); +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_indexmask_get_mask_bit(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t n) +{ + pnanovdb_uint32_t word_idx = n >> 5; + pnanovdb_uint32_t bit_idx = n & 31; + pnanovdb_uint32_t val_mask = + pnanovdb_read_uint32(buf, pnanovdb_address_offset(leaf.address, 96u + 4u * word_idx)); + return (val_mask & (1u << bit_idx)) != 0u; +} +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_indexmask_set_mask_bit(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t n, pnanovdb_bool_t v) +{ + pnanovdb_uint32_t word_idx = n >> 5; + pnanovdb_uint32_t bit_idx = n & 31; + pnanovdb_uint32_t val_mask = + pnanovdb_read_uint32(buf, pnanovdb_address_offset(leaf.address, 96u + 4u * word_idx)); + if (v) + { + val_mask = val_mask | (1u << bit_idx); + } + else + { + val_mask = val_mask & ~(1u << bit_idx); + } + pnanovdb_write_uint32(buf, pnanovdb_address_offset(leaf.address, 96u + 4u * word_idx), val_mask); +} + +// ----------------------------- Leaf OnIndex Specialization --------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_leaf_onindex_get_value_count(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + pnanovdb_uint64_t val_mask = pnanovdb_read_uint64(buf, pnanovdb_address_offset(leaf.address, PNANOVDB_LEAF_OFF_VALUE_MASK + 8u * 7u)); + pnanovdb_uint64_t prefix_sum = pnanovdb_read_uint64( + buf, pnanovdb_address_offset(leaf.address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table) + 8u)); + return pnanovdb_uint64_countbits(val_mask) + (pnanovdb_uint64_to_uint32_lsr(prefix_sum, 54u) & 511u); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindex_get_last_offset(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return pnanovdb_uint64_offset( + pnanovdb_read_uint64(buf, pnanovdb_address_offset(leaf.address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table))), + pnanovdb_leaf_onindex_get_value_count(buf, leaf) - 1u); +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_onindex_has_stats(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return (pnanovdb_leaf_get_bbox_dif_and_flags(buf, leaf) & (1u << 28u)) != 0u; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t 
pnanovdb_leaf_onindex_get_min_index(pnanovdb_buf_t buf, pnanovdb_address_t min_address) +{ + pnanovdb_leaf_handle_t leaf = { pnanovdb_address_offset_neg(min_address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table)) }; + pnanovdb_uint64_t idx = pnanovdb_uint32_as_uint64_low(0u); + if (pnanovdb_leaf_onindex_has_stats(buf, leaf)) + { + idx = pnanovdb_uint64_offset(pnanovdb_leaf_onindex_get_last_offset(buf, leaf), 1u); + } + return idx; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindex_get_max_index(pnanovdb_buf_t buf, pnanovdb_address_t max_address) +{ + pnanovdb_leaf_handle_t leaf = { pnanovdb_address_offset_neg(max_address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table)) }; + pnanovdb_uint64_t idx = pnanovdb_uint32_as_uint64_low(0u); + if (pnanovdb_leaf_onindex_has_stats(buf, leaf)) + { + idx = pnanovdb_uint64_offset(pnanovdb_leaf_onindex_get_last_offset(buf, leaf), 2u); + } + return idx; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindex_get_ave_index(pnanovdb_buf_t buf, pnanovdb_address_t ave_address) +{ + pnanovdb_leaf_handle_t leaf = { pnanovdb_address_offset_neg(ave_address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table)) }; + pnanovdb_uint64_t idx = pnanovdb_uint32_as_uint64_low(0u); + if (pnanovdb_leaf_onindex_has_stats(buf, leaf)) + { + idx = pnanovdb_uint64_offset(pnanovdb_leaf_onindex_get_last_offset(buf, leaf), 3u); + } + return idx; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindex_get_dev_index(pnanovdb_buf_t buf, pnanovdb_address_t dev_address) +{ + pnanovdb_leaf_handle_t leaf = { pnanovdb_address_offset_neg(dev_address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table)) }; + pnanovdb_uint64_t idx = pnanovdb_uint32_as_uint64_low(0u); + if (pnanovdb_leaf_onindex_has_stats(buf, leaf)) + { + idx = pnanovdb_uint64_offset(pnanovdb_leaf_onindex_get_last_offset(buf, leaf), 4u); + } + return idx; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindex_get_value_index(pnanovdb_buf_t buf, pnanovdb_address_t value_address, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + pnanovdb_uint32_t n = pnanovdb_leaf_coord_to_offset(ijk); + pnanovdb_leaf_handle_t leaf = { pnanovdb_address_offset_neg(value_address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table)) }; + + pnanovdb_uint32_t word_idx = n >> 6u; + pnanovdb_uint32_t bit_idx = n & 63u; + pnanovdb_uint64_t val_mask = pnanovdb_read_uint64(buf, pnanovdb_address_offset(leaf.address, PNANOVDB_LEAF_OFF_VALUE_MASK + 8u * word_idx)); + pnanovdb_uint64_t mask = pnanovdb_uint64_bit_mask(bit_idx); + pnanovdb_uint64_t value_index = pnanovdb_uint32_as_uint64_low(0u); + if (pnanovdb_uint64_any_bit(pnanovdb_uint64_and(val_mask, mask))) + { + pnanovdb_uint32_t sum = 0u; + sum += pnanovdb_uint64_countbits(pnanovdb_uint64_and(val_mask, pnanovdb_uint64_dec(mask))); + if (word_idx > 0u) + { + pnanovdb_uint64_t prefix_sum = pnanovdb_read_uint64(buf, pnanovdb_address_offset(value_address, 8u)); + sum += pnanovdb_uint64_to_uint32_lsr(prefix_sum, 9u * (word_idx - 1u)) & 511u; + } + pnanovdb_uint64_t offset = pnanovdb_read_uint64(buf, value_address); + value_index = pnanovdb_uint64_offset(offset, sum); + } + return value_index; +} + +// ----------------------------- Leaf OnIndexMask Specialization --------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_leaf_onindexmask_get_value_count(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return 
pnanovdb_leaf_onindex_get_value_count(buf, leaf); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindexmask_get_last_offset(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return pnanovdb_leaf_onindex_get_last_offset(buf, leaf); +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_onindexmask_has_stats(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return pnanovdb_leaf_onindex_has_stats(buf, leaf); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindexmask_get_min_index(pnanovdb_buf_t buf, pnanovdb_address_t min_address) +{ + return pnanovdb_leaf_onindex_get_min_index(buf, min_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindexmask_get_max_index(pnanovdb_buf_t buf, pnanovdb_address_t max_address) +{ + return pnanovdb_leaf_onindex_get_max_index(buf, max_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindexmask_get_ave_index(pnanovdb_buf_t buf, pnanovdb_address_t ave_address) +{ + return pnanovdb_leaf_onindex_get_ave_index(buf, ave_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindexmask_get_dev_index(pnanovdb_buf_t buf, pnanovdb_address_t dev_address) +{ + return pnanovdb_leaf_onindex_get_dev_index(buf, dev_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindexmask_get_value_index(pnanovdb_buf_t buf, pnanovdb_address_t value_address, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + return pnanovdb_leaf_onindex_get_value_index(buf, value_address, ijk); +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_onindexmask_get_mask_bit(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t n) +{ + pnanovdb_uint32_t word_idx = n >> 5; + pnanovdb_uint32_t bit_idx = n & 31; + pnanovdb_uint32_t val_mask = + pnanovdb_read_uint32(buf, pnanovdb_address_offset(leaf.address, 96u + 4u * word_idx)); + return (val_mask & (1u << bit_idx)) != 0u; +} +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_onindexmask_set_mask_bit(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t n, pnanovdb_bool_t v) +{ + pnanovdb_uint32_t word_idx = n >> 5; + pnanovdb_uint32_t bit_idx = n & 31; + pnanovdb_uint32_t val_mask = + pnanovdb_read_uint32(buf, pnanovdb_address_offset(leaf.address, 96u + 4u * word_idx)); + if (v) + { + val_mask = val_mask | (1u << bit_idx); + } + else + { + val_mask = val_mask & ~(1u << bit_idx); + } + pnanovdb_write_uint32(buf, pnanovdb_address_offset(leaf.address, 96u + 4u * word_idx), val_mask); +} + +// ----------------------------- Leaf PointIndex Specialization --------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_pointindex_get_offset(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return pnanovdb_read_uint64(buf, pnanovdb_leaf_get_min_address(PNANOVDB_GRID_TYPE_POINTINDEX, buf, leaf)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_pointindex_get_point_count(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return pnanovdb_read_uint64(buf, pnanovdb_leaf_get_max_address(PNANOVDB_GRID_TYPE_POINTINDEX, buf, leaf)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_pointindex_get_first(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t i) +{ + return pnanovdb_uint64_offset(pnanovdb_leaf_pointindex_get_offset(buf, leaf), + (i == 0u ? 
0u : pnanovdb_read_uint16(buf, pnanovdb_leaf_get_table_address(PNANOVDB_GRID_TYPE_POINTINDEX, buf, leaf, i - 1u)))); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_pointindex_get_last(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t i) +{ + return pnanovdb_uint64_offset(pnanovdb_leaf_pointindex_get_offset(buf, leaf), + pnanovdb_read_uint16(buf, pnanovdb_leaf_get_table_address(PNANOVDB_GRID_TYPE_POINTINDEX, buf, leaf, i))); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_pointindex_get_value(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t i) +{ + return pnanovdb_uint32_as_uint64_low(pnanovdb_read_uint16(buf, pnanovdb_leaf_get_table_address(PNANOVDB_GRID_TYPE_POINTINDEX, buf, leaf, i))); +} +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_pointindex_set_value_only(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t i, pnanovdb_uint32_t value) +{ + pnanovdb_address_t addr = pnanovdb_leaf_get_table_address(PNANOVDB_GRID_TYPE_POINTINDEX, buf, leaf, i); + pnanovdb_uint32_t raw32 = pnanovdb_read_uint32(buf, pnanovdb_address_mask_inv(addr, 3u)); + if ((i & 1) == 0u) + { + raw32 = (raw32 & 0xFFFF0000) | (value & 0x0000FFFF); + } + else + { + raw32 = (raw32 & 0x0000FFFF) | (value << 16u); + } + pnanovdb_write_uint32(buf, addr, raw32); +} +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_pointindex_set_on(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t i) +{ + pnanovdb_uint32_t word_idx = i >> 5; + pnanovdb_uint32_t bit_idx = i & 31; + pnanovdb_address_t addr = pnanovdb_address_offset(leaf.address, PNANOVDB_LEAF_OFF_VALUE_MASK + 4u * word_idx); + pnanovdb_uint32_t val_mask = pnanovdb_read_uint32(buf, addr); + val_mask = val_mask | (1u << bit_idx); + pnanovdb_write_uint32(buf, addr, val_mask); +} +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_pointindex_set_value(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t i, pnanovdb_uint32_t value) +{ + pnanovdb_leaf_pointindex_set_on(buf, leaf, i); + pnanovdb_leaf_pointindex_set_value_only(buf, leaf, i, value); +} + +// ------------------------------------------------ Lower Node ----------------------------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_lower_coord_to_offset(PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + return (((PNANOVDB_DEREF(ijk).x & 127) >> 3) << (2 * 4)) + + (((PNANOVDB_DEREF(ijk).y & 127) >> 3) << (4)) + + ((PNANOVDB_DEREF(ijk).z & 127) >> 3); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_lower_get_min_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t node) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, lower_off_min); + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_lower_get_max_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t node) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, lower_off_max); + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_lower_get_ave_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t node) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, lower_off_ave); + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_lower_get_stddev_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t node) +{ + 
pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, lower_off_stddev); + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_lower_get_table_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t node, pnanovdb_uint32_t n) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, lower_off_table) + PNANOVDB_GRID_TYPE_GET(grid_type, table_stride) * n; + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_int64_t pnanovdb_lower_get_table_child(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t node, pnanovdb_uint32_t n) +{ + pnanovdb_address_t table_address = pnanovdb_lower_get_table_address(grid_type, buf, node, n); + return pnanovdb_read_int64(buf, table_address); +} + +PNANOVDB_FORCE_INLINE pnanovdb_leaf_handle_t pnanovdb_lower_get_child(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t lower, pnanovdb_uint32_t n) +{ + pnanovdb_leaf_handle_t leaf = { lower.address }; + leaf.address = pnanovdb_address_offset64(leaf.address, pnanovdb_int64_as_uint64(pnanovdb_lower_get_table_child(grid_type, buf, lower, n))); + return leaf; +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_lower_get_value_address_and_level(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t lower, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_uint32_t) level) +{ + pnanovdb_uint32_t n = pnanovdb_lower_coord_to_offset(ijk); + pnanovdb_address_t value_address; + if (pnanovdb_lower_get_child_mask(buf, lower, n)) + { + pnanovdb_leaf_handle_t child = pnanovdb_lower_get_child(grid_type, buf, lower, n); + value_address = pnanovdb_leaf_get_value_address(grid_type, buf, child, ijk); + PNANOVDB_DEREF(level) = 0u; + } + else + { + value_address = pnanovdb_lower_get_table_address(grid_type, buf, lower, n); + PNANOVDB_DEREF(level) = 1u; + } + return value_address; +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_lower_get_value_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t lower, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + pnanovdb_uint32_t level; + return pnanovdb_lower_get_value_address_and_level(grid_type, buf, lower, ijk, PNANOVDB_REF(level)); +} + +// ------------------------------------------------ Upper Node ----------------------------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_upper_coord_to_offset(PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + return (((PNANOVDB_DEREF(ijk).x & 4095) >> 7) << (2 * 5)) + + (((PNANOVDB_DEREF(ijk).y & 4095) >> 7) << (5)) + + ((PNANOVDB_DEREF(ijk).z & 4095) >> 7); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_upper_get_min_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t node) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, upper_off_min); + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_upper_get_max_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t node) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, upper_off_max); + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_upper_get_ave_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t node) +{ + pnanovdb_uint32_t byte_offset = 
PNANOVDB_GRID_TYPE_GET(grid_type, upper_off_ave); + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_upper_get_stddev_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t node) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, upper_off_stddev); + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_upper_get_table_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t node, pnanovdb_uint32_t n) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, upper_off_table) + PNANOVDB_GRID_TYPE_GET(grid_type, table_stride) * n; + return pnanovdb_address_offset(node.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_int64_t pnanovdb_upper_get_table_child(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t node, pnanovdb_uint32_t n) +{ + pnanovdb_address_t bufAddress = pnanovdb_upper_get_table_address(grid_type, buf, node, n); + return pnanovdb_read_int64(buf, bufAddress); +} + +PNANOVDB_FORCE_INLINE pnanovdb_lower_handle_t pnanovdb_upper_get_child(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t upper, pnanovdb_uint32_t n) +{ + pnanovdb_lower_handle_t lower = { upper.address }; + lower.address = pnanovdb_address_offset64(lower.address, pnanovdb_int64_as_uint64(pnanovdb_upper_get_table_child(grid_type, buf, upper, n))); + return lower; +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_upper_get_value_address_and_level(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t upper, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_uint32_t) level) +{ + pnanovdb_uint32_t n = pnanovdb_upper_coord_to_offset(ijk); + pnanovdb_address_t value_address; + if (pnanovdb_upper_get_child_mask(buf, upper, n)) + { + pnanovdb_lower_handle_t child = pnanovdb_upper_get_child(grid_type, buf, upper, n); + value_address = pnanovdb_lower_get_value_address_and_level(grid_type, buf, child, ijk, level); + } + else + { + value_address = pnanovdb_upper_get_table_address(grid_type, buf, upper, n); + PNANOVDB_DEREF(level) = 2u; + } + return value_address; +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_upper_get_value_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t upper, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + pnanovdb_uint32_t level; + return pnanovdb_upper_get_value_address_and_level(grid_type, buf, upper, ijk, PNANOVDB_REF(level)); +} + +PNANOVDB_FORCE_INLINE void pnanovdb_upper_set_table_child(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t node, pnanovdb_uint32_t n, pnanovdb_int64_t child) +{ + pnanovdb_address_t bufAddress = pnanovdb_upper_get_table_address(grid_type, buf, node, n); + pnanovdb_write_int64(buf, bufAddress, child); +} + +// ------------------------------------------------ Root ----------------------------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_root_get_min_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, root_off_min); + return pnanovdb_address_offset(root.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_root_get_max_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root) +{ + pnanovdb_uint32_t byte_offset = 
PNANOVDB_GRID_TYPE_GET(grid_type, root_off_max); + return pnanovdb_address_offset(root.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_root_get_ave_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, root_off_ave); + return pnanovdb_address_offset(root.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_root_get_stddev_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, root_off_stddev); + return pnanovdb_address_offset(root.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_root_tile_get_value_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_tile_handle_t root_tile) +{ + pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, root_tile_off_value); + return pnanovdb_address_offset(root_tile.address, byte_offset); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_root_get_value_address_and_level(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_uint32_t) level) +{ + pnanovdb_root_tile_handle_t tile = pnanovdb_root_find_tile(grid_type, buf, root, ijk); + pnanovdb_address_t ret; + if (pnanovdb_address_is_null(tile.address)) + { + ret = pnanovdb_address_offset(root.address, PNANOVDB_GRID_TYPE_GET(grid_type, root_off_background)); + PNANOVDB_DEREF(level) = 4u; + } + else if (pnanovdb_int64_is_zero(pnanovdb_root_tile_get_child(buf, tile))) + { + ret = pnanovdb_address_offset(tile.address, PNANOVDB_GRID_TYPE_GET(grid_type, root_tile_off_value)); + PNANOVDB_DEREF(level) = 3u; + } + else + { + pnanovdb_upper_handle_t child = pnanovdb_root_get_child(grid_type, buf, root, tile); + ret = pnanovdb_upper_get_value_address_and_level(grid_type, buf, child, ijk, level); + } + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_root_get_value_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + pnanovdb_uint32_t level; + return pnanovdb_root_get_value_address_and_level(grid_type, buf, root, ijk, PNANOVDB_REF(level)); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_root_get_value_address_bit(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_uint32_t) bit_index) +{ + pnanovdb_uint32_t level; + pnanovdb_address_t address = pnanovdb_root_get_value_address_and_level(grid_type, buf, root, ijk, PNANOVDB_REF(level)); + PNANOVDB_DEREF(bit_index) = level == 0u ? 
pnanovdb_int32_as_uint32(PNANOVDB_DEREF(ijk).x & 7) : 0u; + return address; +} + +PNANOVDB_FORCE_INLINE float pnanovdb_root_fp4_read_float(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk, pnanovdb_uint32_t level) +{ + float ret; + if (level == 0) + { + ret = pnanovdb_leaf_fp4_read_float(buf, address, ijk); + } + else + { + ret = pnanovdb_read_float(buf, address); + } + return ret; +} + +PNANOVDB_FORCE_INLINE float pnanovdb_root_fp8_read_float(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk, pnanovdb_uint32_t level) +{ + float ret; + if (level == 0) + { + ret = pnanovdb_leaf_fp8_read_float(buf, address, ijk); + } + else + { + ret = pnanovdb_read_float(buf, address); + } + return ret; +} + +PNANOVDB_FORCE_INLINE float pnanovdb_root_fp16_read_float(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk, pnanovdb_uint32_t level) +{ + float ret; + if (level == 0) + { + ret = pnanovdb_leaf_fp16_read_float(buf, address, ijk); + } + else + { + ret = pnanovdb_read_float(buf, address); + } + return ret; +} + +PNANOVDB_FORCE_INLINE float pnanovdb_root_fpn_read_float(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk, pnanovdb_uint32_t level) +{ + float ret; + if (level == 0) + { + ret = pnanovdb_leaf_fpn_read_float(buf, address, ijk); + } + else + { + ret = pnanovdb_read_float(buf, address); + } + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_root_index_get_value_index(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk, pnanovdb_uint32_t level) +{ + pnanovdb_uint64_t ret; + if (level == 0) + { + ret = pnanovdb_leaf_index_get_value_index(buf, address, ijk); + } + else + { + ret = pnanovdb_read_uint64(buf, address); + } + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_root_onindex_get_value_index(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk, pnanovdb_uint32_t level) +{ + pnanovdb_uint64_t ret; + if (level == 0) + { + ret = pnanovdb_leaf_onindex_get_value_index(buf, address, ijk); + } + else + { + ret = pnanovdb_read_uint64(buf, address); + } + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_root_pointindex_get_point_range( + pnanovdb_buf_t buf, + pnanovdb_address_t value_address, + PNANOVDB_IN(pnanovdb_coord_t) ijk, + pnanovdb_uint32_t level, + PNANOVDB_INOUT(pnanovdb_uint64_t)range_begin, + PNANOVDB_INOUT(pnanovdb_uint64_t)range_end +) +{ + pnanovdb_uint32_t local_range_begin = 0u; + pnanovdb_uint32_t local_range_end = 0u; + pnanovdb_uint64_t offset = pnanovdb_uint32_as_uint64_low(0u); + if (level == 0) + { + pnanovdb_uint32_t n = pnanovdb_leaf_coord_to_offset(ijk); + // recover leaf address + pnanovdb_leaf_handle_t leaf = { pnanovdb_address_offset_neg(value_address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_POINTINDEX, leaf_off_table) + 2u * n) }; + if (n > 0u) + { + local_range_begin = pnanovdb_read_uint16(buf, pnanovdb_address_offset_neg(value_address, 2u)); + } + local_range_end = pnanovdb_read_uint16(buf, value_address); + offset = pnanovdb_leaf_pointindex_get_offset(buf, leaf); + } + PNANOVDB_DEREF(range_begin) = pnanovdb_uint64_offset(offset, local_range_begin); + PNANOVDB_DEREF(range_end) = pnanovdb_uint64_offset(offset, local_range_end); + return pnanovdb_uint32_as_uint64_low(local_range_end - local_range_begin); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_root_pointindex_get_point_address_range( + pnanovdb_buf_t buf, + 
pnanovdb_grid_type_t value_type, + pnanovdb_address_t value_address, + pnanovdb_address_t blindmetadata_value_address, + PNANOVDB_IN(pnanovdb_coord_t) ijk, + pnanovdb_uint32_t level, + PNANOVDB_INOUT(pnanovdb_address_t)address_begin, + PNANOVDB_INOUT(pnanovdb_address_t)address_end +) +{ + pnanovdb_uint64_t range_begin; + pnanovdb_uint64_t range_end; + pnanovdb_uint64_t range_size = pnanovdb_root_pointindex_get_point_range(buf, value_address, ijk, level, PNANOVDB_REF(range_begin), PNANOVDB_REF(range_end)); + + pnanovdb_uint32_t stride = 12u; // vec3f + if (value_type == PNANOVDB_GRID_TYPE_VEC3U8) + { + stride = 3u; + } + else if (value_type == PNANOVDB_GRID_TYPE_VEC3U16) + { + stride = 6u; + } + PNANOVDB_DEREF(address_begin) = pnanovdb_address_offset64_product(blindmetadata_value_address, range_begin, stride); + PNANOVDB_DEREF(address_end) = pnanovdb_address_offset64_product(blindmetadata_value_address, range_end, stride); + return range_size; +} + +// ------------------------------------------------ ReadAccessor ----------------------------------------------------------- + +struct pnanovdb_readaccessor_t +{ + pnanovdb_coord_t key; + pnanovdb_leaf_handle_t leaf; + pnanovdb_lower_handle_t lower; + pnanovdb_upper_handle_t upper; + pnanovdb_root_handle_t root; +}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_readaccessor_t) + +PNANOVDB_FORCE_INLINE void pnanovdb_readaccessor_init(PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, pnanovdb_root_handle_t root) +{ + PNANOVDB_DEREF(acc).key.x = 0x7FFFFFFF; + PNANOVDB_DEREF(acc).key.y = 0x7FFFFFFF; + PNANOVDB_DEREF(acc).key.z = 0x7FFFFFFF; + PNANOVDB_DEREF(acc).leaf.address = pnanovdb_address_null(); + PNANOVDB_DEREF(acc).lower.address = pnanovdb_address_null(); + PNANOVDB_DEREF(acc).upper.address = pnanovdb_address_null(); + PNANOVDB_DEREF(acc).root = root; +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_readaccessor_iscached0(PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, int dirty) +{ + if (pnanovdb_address_is_null(PNANOVDB_DEREF(acc).leaf.address)) { return PNANOVDB_FALSE; } + if ((dirty & ~((1u << 3) - 1u)) != 0) + { + PNANOVDB_DEREF(acc).leaf.address = pnanovdb_address_null(); + return PNANOVDB_FALSE; + } + return PNANOVDB_TRUE; +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_readaccessor_iscached1(PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, int dirty) +{ + if (pnanovdb_address_is_null(PNANOVDB_DEREF(acc).lower.address)) { return PNANOVDB_FALSE; } + if ((dirty & ~((1u << 7) - 1u)) != 0) + { + PNANOVDB_DEREF(acc).lower.address = pnanovdb_address_null(); + return PNANOVDB_FALSE; + } + return PNANOVDB_TRUE; +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_readaccessor_iscached2(PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, int dirty) +{ + if (pnanovdb_address_is_null(PNANOVDB_DEREF(acc).upper.address)) { return PNANOVDB_FALSE; } + if ((dirty & ~((1u << 12) - 1u)) != 0) + { + PNANOVDB_DEREF(acc).upper.address = pnanovdb_address_null(); + return PNANOVDB_FALSE; + } + return PNANOVDB_TRUE; +} +PNANOVDB_FORCE_INLINE int pnanovdb_readaccessor_computedirty(PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + return (PNANOVDB_DEREF(ijk).x ^ PNANOVDB_DEREF(acc).key.x) | (PNANOVDB_DEREF(ijk).y ^ PNANOVDB_DEREF(acc).key.y) | (PNANOVDB_DEREF(ijk).z ^ PNANOVDB_DEREF(acc).key.z); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_leaf_get_value_address_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc) +{ + 
pnanovdb_uint32_t n = pnanovdb_leaf_coord_to_offset(ijk); + return pnanovdb_leaf_get_table_address(grid_type, buf, leaf, n); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_lower_get_value_address_and_level_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t lower, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, PNANOVDB_INOUT(pnanovdb_uint32_t) level) +{ + pnanovdb_uint32_t n = pnanovdb_lower_coord_to_offset(ijk); + pnanovdb_address_t value_address; + if (pnanovdb_lower_get_child_mask(buf, lower, n)) + { + pnanovdb_leaf_handle_t child = pnanovdb_lower_get_child(grid_type, buf, lower, n); + PNANOVDB_DEREF(acc).leaf = child; + PNANOVDB_DEREF(acc).key = PNANOVDB_DEREF(ijk); + value_address = pnanovdb_leaf_get_value_address_and_cache(grid_type, buf, child, ijk, acc); + PNANOVDB_DEREF(level) = 0u; + } + else + { + value_address = pnanovdb_lower_get_table_address(grid_type, buf, lower, n); + PNANOVDB_DEREF(level) = 1u; + } + return value_address; +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_lower_get_value_address_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t lower, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc) +{ + pnanovdb_uint32_t level; + return pnanovdb_lower_get_value_address_and_level_and_cache(grid_type, buf, lower, ijk, acc, PNANOVDB_REF(level)); +} + +PNANOVDB_FORCE_INLINE void pnanovdb_lower_set_table_child(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t node, pnanovdb_uint32_t n, pnanovdb_int64_t child) +{ + pnanovdb_address_t table_address = pnanovdb_lower_get_table_address(grid_type, buf, node, n); + pnanovdb_write_int64(buf, table_address, child); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_upper_get_value_address_and_level_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t upper, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, PNANOVDB_INOUT(pnanovdb_uint32_t) level) +{ + pnanovdb_uint32_t n = pnanovdb_upper_coord_to_offset(ijk); + pnanovdb_address_t value_address; + if (pnanovdb_upper_get_child_mask(buf, upper, n)) + { + pnanovdb_lower_handle_t child = pnanovdb_upper_get_child(grid_type, buf, upper, n); + PNANOVDB_DEREF(acc).lower = child; + PNANOVDB_DEREF(acc).key = PNANOVDB_DEREF(ijk); + value_address = pnanovdb_lower_get_value_address_and_level_and_cache(grid_type, buf, child, ijk, acc, level); + } + else + { + value_address = pnanovdb_upper_get_table_address(grid_type, buf, upper, n); + PNANOVDB_DEREF(level) = 2u; + } + return value_address; +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_upper_get_value_address_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t upper, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc) +{ + pnanovdb_uint32_t level; + return pnanovdb_upper_get_value_address_and_level_and_cache(grid_type, buf, upper, ijk, acc, PNANOVDB_REF(level)); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_root_get_value_address_and_level_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, PNANOVDB_INOUT(pnanovdb_uint32_t) level) +{ + pnanovdb_root_tile_handle_t tile = pnanovdb_root_find_tile(grid_type, buf, root, ijk); + pnanovdb_address_t ret; + if (pnanovdb_address_is_null(tile.address)) + { + ret = 
pnanovdb_address_offset(root.address, PNANOVDB_GRID_TYPE_GET(grid_type, root_off_background)); + PNANOVDB_DEREF(level) = 4u; + } + else if (pnanovdb_int64_is_zero(pnanovdb_root_tile_get_child(buf, tile))) + { + ret = pnanovdb_address_offset(tile.address, PNANOVDB_GRID_TYPE_GET(grid_type, root_tile_off_value)); + PNANOVDB_DEREF(level) = 3u; + } + else + { + pnanovdb_upper_handle_t child = pnanovdb_root_get_child(grid_type, buf, root, tile); + PNANOVDB_DEREF(acc).upper = child; + PNANOVDB_DEREF(acc).key = PNANOVDB_DEREF(ijk); + ret = pnanovdb_upper_get_value_address_and_level_and_cache(grid_type, buf, child, ijk, acc, level); + } + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_root_get_value_address_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc) +{ + pnanovdb_uint32_t level; + return pnanovdb_root_get_value_address_and_level_and_cache(grid_type, buf, root, ijk, acc, PNANOVDB_REF(level)); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_readaccessor_get_value_address_and_level(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_uint32_t) level) +{ + int dirty = pnanovdb_readaccessor_computedirty(acc, ijk); + + pnanovdb_address_t value_address; + if (pnanovdb_readaccessor_iscached0(acc, dirty)) + { + value_address = pnanovdb_leaf_get_value_address_and_cache(grid_type, buf, PNANOVDB_DEREF(acc).leaf, ijk, acc); + PNANOVDB_DEREF(level) = 0u; + } + else if (pnanovdb_readaccessor_iscached1(acc, dirty)) + { + value_address = pnanovdb_lower_get_value_address_and_level_and_cache(grid_type, buf, PNANOVDB_DEREF(acc).lower, ijk, acc, level); + } + else if (pnanovdb_readaccessor_iscached2(acc, dirty)) + { + value_address = pnanovdb_upper_get_value_address_and_level_and_cache(grid_type, buf, PNANOVDB_DEREF(acc).upper, ijk, acc, level); + } + else + { + value_address = pnanovdb_root_get_value_address_and_level_and_cache(grid_type, buf, PNANOVDB_DEREF(acc).root, ijk, acc, level); + } + return value_address; +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_readaccessor_get_value_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + pnanovdb_uint32_t level; + return pnanovdb_readaccessor_get_value_address_and_level(grid_type, buf, acc, ijk, PNANOVDB_REF(level)); +} + +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_readaccessor_get_value_address_bit(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_uint32_t) bit_index) +{ + pnanovdb_uint32_t level; + pnanovdb_address_t address = pnanovdb_readaccessor_get_value_address_and_level(grid_type, buf, acc, ijk, PNANOVDB_REF(level)); + PNANOVDB_DEREF(bit_index) = level == 0u ? 
pnanovdb_int32_as_uint32(PNANOVDB_DEREF(ijk).x & 7) : 0u; + return address; +} + +// ------------------------------------------------ ReadAccessor GetDim ----------------------------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_leaf_get_dim_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc) +{ + return 1u; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_lower_get_dim_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t lower, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc) +{ + pnanovdb_uint32_t n = pnanovdb_lower_coord_to_offset(ijk); + pnanovdb_uint32_t ret; + if (pnanovdb_lower_get_child_mask(buf, lower, n)) + { + pnanovdb_leaf_handle_t child = pnanovdb_lower_get_child(grid_type, buf, lower, n); + PNANOVDB_DEREF(acc).leaf = child; + PNANOVDB_DEREF(acc).key = PNANOVDB_DEREF(ijk); + ret = pnanovdb_leaf_get_dim_and_cache(grid_type, buf, child, ijk, acc); + } + else + { + ret = (1u << (3u)); // node 0 dim + } + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_upper_get_dim_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t upper, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc) +{ + pnanovdb_uint32_t n = pnanovdb_upper_coord_to_offset(ijk); + pnanovdb_uint32_t ret; + if (pnanovdb_upper_get_child_mask(buf, upper, n)) + { + pnanovdb_lower_handle_t child = pnanovdb_upper_get_child(grid_type, buf, upper, n); + PNANOVDB_DEREF(acc).lower = child; + PNANOVDB_DEREF(acc).key = PNANOVDB_DEREF(ijk); + ret = pnanovdb_lower_get_dim_and_cache(grid_type, buf, child, ijk, acc); + } + else + { + ret = (1u << (4u + 3u)); // node 1 dim + } + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_root_get_dim_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc) +{ + pnanovdb_root_tile_handle_t tile = pnanovdb_root_find_tile(grid_type, buf, root, ijk); + pnanovdb_uint32_t ret; + if (pnanovdb_address_is_null(tile.address)) + { + ret = 1u << (5u + 4u + 3u); // background, node 2 dim + } + else if (pnanovdb_int64_is_zero(pnanovdb_root_tile_get_child(buf, tile))) + { + ret = 1u << (5u + 4u + 3u); // tile value, node 2 dim + } + else + { + pnanovdb_upper_handle_t child = pnanovdb_root_get_child(grid_type, buf, root, tile); + PNANOVDB_DEREF(acc).upper = child; + PNANOVDB_DEREF(acc).key = PNANOVDB_DEREF(ijk); + ret = pnanovdb_upper_get_dim_and_cache(grid_type, buf, child, ijk, acc); + } + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_readaccessor_get_dim(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + int dirty = pnanovdb_readaccessor_computedirty(acc, ijk); + + pnanovdb_uint32_t dim; + if (pnanovdb_readaccessor_iscached0(acc, dirty)) + { + dim = pnanovdb_leaf_get_dim_and_cache(grid_type, buf, PNANOVDB_DEREF(acc).leaf, ijk, acc); + } + else if (pnanovdb_readaccessor_iscached1(acc, dirty)) + { + dim = pnanovdb_lower_get_dim_and_cache(grid_type, buf, PNANOVDB_DEREF(acc).lower, ijk, acc); + } + else if (pnanovdb_readaccessor_iscached2(acc, dirty)) + { + dim = pnanovdb_upper_get_dim_and_cache(grid_type, buf, PNANOVDB_DEREF(acc).upper, ijk, acc); + } + else + { + dim = 
pnanovdb_root_get_dim_and_cache(grid_type, buf, PNANOVDB_DEREF(acc).root, ijk, acc); + } + return dim; +} + +// ------------------------------------------------ ReadAccessor IsActive ----------------------------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_is_active_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc) +{ + pnanovdb_uint32_t n = pnanovdb_leaf_coord_to_offset(ijk); + return pnanovdb_leaf_get_value_mask(buf, leaf, n); +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_lower_is_active_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t lower, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc) +{ + pnanovdb_uint32_t n = pnanovdb_lower_coord_to_offset(ijk); + pnanovdb_bool_t is_active; + if (pnanovdb_lower_get_child_mask(buf, lower, n)) + { + pnanovdb_leaf_handle_t child = pnanovdb_lower_get_child(grid_type, buf, lower, n); + PNANOVDB_DEREF(acc).leaf = child; + PNANOVDB_DEREF(acc).key = PNANOVDB_DEREF(ijk); + is_active = pnanovdb_leaf_is_active_and_cache(grid_type, buf, child, ijk, acc); + } + else + { + is_active = pnanovdb_lower_get_value_mask(buf, lower, n); + } + return is_active; +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_upper_is_active_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t upper, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc) +{ + pnanovdb_uint32_t n = pnanovdb_upper_coord_to_offset(ijk); + pnanovdb_bool_t is_active; + if (pnanovdb_upper_get_child_mask(buf, upper, n)) + { + pnanovdb_lower_handle_t child = pnanovdb_upper_get_child(grid_type, buf, upper, n); + PNANOVDB_DEREF(acc).lower = child; + PNANOVDB_DEREF(acc).key = PNANOVDB_DEREF(ijk); + is_active = pnanovdb_lower_is_active_and_cache(grid_type, buf, child, ijk, acc); + } + else + { + is_active = pnanovdb_upper_get_value_mask(buf, upper, n); + } + return is_active; +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_root_is_active_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc) +{ + pnanovdb_root_tile_handle_t tile = pnanovdb_root_find_tile(grid_type, buf, root, ijk); + pnanovdb_bool_t is_active; + if (pnanovdb_address_is_null(tile.address)) + { + is_active = PNANOVDB_FALSE; // background + } + else if (pnanovdb_int64_is_zero(pnanovdb_root_tile_get_child(buf, tile))) + { + pnanovdb_uint32_t state = pnanovdb_root_tile_get_state(buf, tile); + is_active = state != 0u; // tile value + } + else + { + pnanovdb_upper_handle_t child = pnanovdb_root_get_child(grid_type, buf, root, tile); + PNANOVDB_DEREF(acc).upper = child; + PNANOVDB_DEREF(acc).key = PNANOVDB_DEREF(ijk); + is_active = pnanovdb_upper_is_active_and_cache(grid_type, buf, child, ijk, acc); + } + return is_active; +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_readaccessor_is_active(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + int dirty = pnanovdb_readaccessor_computedirty(acc, ijk); + + pnanovdb_bool_t is_active; + if (pnanovdb_readaccessor_iscached0(acc, dirty)) + { + is_active = pnanovdb_leaf_is_active_and_cache(grid_type, buf, PNANOVDB_DEREF(acc).leaf, ijk, acc); + } + else if (pnanovdb_readaccessor_iscached1(acc, dirty)) + { + 
is_active = pnanovdb_lower_is_active_and_cache(grid_type, buf, PNANOVDB_DEREF(acc).lower, ijk, acc); + } + else if (pnanovdb_readaccessor_iscached2(acc, dirty)) + { + is_active = pnanovdb_upper_is_active_and_cache(grid_type, buf, PNANOVDB_DEREF(acc).upper, ijk, acc); + } + else + { + is_active = pnanovdb_root_is_active_and_cache(grid_type, buf, PNANOVDB_DEREF(acc).root, ijk, acc); + } + return is_active; +} + +// ------------------------------------------------ Map Transforms ----------------------------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_map_apply(pnanovdb_buf_t buf, pnanovdb_map_handle_t map, PNANOVDB_IN(pnanovdb_vec3_t) src) +{ + pnanovdb_vec3_t dst; + float sx = PNANOVDB_DEREF(src).x; + float sy = PNANOVDB_DEREF(src).y; + float sz = PNANOVDB_DEREF(src).z; + dst.x = sx * pnanovdb_map_get_matf(buf, map, 0) + sy * pnanovdb_map_get_matf(buf, map, 1) + sz * pnanovdb_map_get_matf(buf, map, 2) + pnanovdb_map_get_vecf(buf, map, 0); + dst.y = sx * pnanovdb_map_get_matf(buf, map, 3) + sy * pnanovdb_map_get_matf(buf, map, 4) + sz * pnanovdb_map_get_matf(buf, map, 5) + pnanovdb_map_get_vecf(buf, map, 1); + dst.z = sx * pnanovdb_map_get_matf(buf, map, 6) + sy * pnanovdb_map_get_matf(buf, map, 7) + sz * pnanovdb_map_get_matf(buf, map, 8) + pnanovdb_map_get_vecf(buf, map, 2); + return dst; +} + +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_map_apply_inverse(pnanovdb_buf_t buf, pnanovdb_map_handle_t map, PNANOVDB_IN(pnanovdb_vec3_t) src) +{ + pnanovdb_vec3_t dst; + float sx = PNANOVDB_DEREF(src).x - pnanovdb_map_get_vecf(buf, map, 0); + float sy = PNANOVDB_DEREF(src).y - pnanovdb_map_get_vecf(buf, map, 1); + float sz = PNANOVDB_DEREF(src).z - pnanovdb_map_get_vecf(buf, map, 2); + dst.x = sx * pnanovdb_map_get_invmatf(buf, map, 0) + sy * pnanovdb_map_get_invmatf(buf, map, 1) + sz * pnanovdb_map_get_invmatf(buf, map, 2); + dst.y = sx * pnanovdb_map_get_invmatf(buf, map, 3) + sy * pnanovdb_map_get_invmatf(buf, map, 4) + sz * pnanovdb_map_get_invmatf(buf, map, 5); + dst.z = sx * pnanovdb_map_get_invmatf(buf, map, 6) + sy * pnanovdb_map_get_invmatf(buf, map, 7) + sz * pnanovdb_map_get_invmatf(buf, map, 8); + return dst; +} + +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_map_apply_jacobi(pnanovdb_buf_t buf, pnanovdb_map_handle_t map, PNANOVDB_IN(pnanovdb_vec3_t) src) +{ + pnanovdb_vec3_t dst; + float sx = PNANOVDB_DEREF(src).x; + float sy = PNANOVDB_DEREF(src).y; + float sz = PNANOVDB_DEREF(src).z; + dst.x = sx * pnanovdb_map_get_matf(buf, map, 0) + sy * pnanovdb_map_get_matf(buf, map, 1) + sz * pnanovdb_map_get_matf(buf, map, 2); + dst.y = sx * pnanovdb_map_get_matf(buf, map, 3) + sy * pnanovdb_map_get_matf(buf, map, 4) + sz * pnanovdb_map_get_matf(buf, map, 5); + dst.z = sx * pnanovdb_map_get_matf(buf, map, 6) + sy * pnanovdb_map_get_matf(buf, map, 7) + sz * pnanovdb_map_get_matf(buf, map, 8); + return dst; +} + +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_map_apply_inverse_jacobi(pnanovdb_buf_t buf, pnanovdb_map_handle_t map, PNANOVDB_IN(pnanovdb_vec3_t) src) +{ + pnanovdb_vec3_t dst; + float sx = PNANOVDB_DEREF(src).x; + float sy = PNANOVDB_DEREF(src).y; + float sz = PNANOVDB_DEREF(src).z; + dst.x = sx * pnanovdb_map_get_invmatf(buf, map, 0) + sy * pnanovdb_map_get_invmatf(buf, map, 1) + sz * pnanovdb_map_get_invmatf(buf, map, 2); + dst.y = sx * pnanovdb_map_get_invmatf(buf, map, 3) + sy * pnanovdb_map_get_invmatf(buf, map, 4) + sz * pnanovdb_map_get_invmatf(buf, map, 5); + dst.z = sx * pnanovdb_map_get_invmatf(buf, map, 6) + sy * 
pnanovdb_map_get_invmatf(buf, map, 7) + sz * pnanovdb_map_get_invmatf(buf, map, 8); + return dst; +} + +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_grid_world_to_indexf(pnanovdb_buf_t buf, pnanovdb_grid_handle_t grid, PNANOVDB_IN(pnanovdb_vec3_t) src) +{ + pnanovdb_map_handle_t map = pnanovdb_grid_get_map(buf, grid); + return pnanovdb_map_apply_inverse(buf, map, src); +} + +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_grid_index_to_worldf(pnanovdb_buf_t buf, pnanovdb_grid_handle_t grid, PNANOVDB_IN(pnanovdb_vec3_t) src) +{ + pnanovdb_map_handle_t map = pnanovdb_grid_get_map(buf, grid); + return pnanovdb_map_apply(buf, map, src); +} + +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_grid_world_to_index_dirf(pnanovdb_buf_t buf, pnanovdb_grid_handle_t grid, PNANOVDB_IN(pnanovdb_vec3_t) src) +{ + pnanovdb_map_handle_t map = pnanovdb_grid_get_map(buf, grid); + return pnanovdb_map_apply_inverse_jacobi(buf, map, src); +} + +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_grid_index_to_world_dirf(pnanovdb_buf_t buf, pnanovdb_grid_handle_t grid, PNANOVDB_IN(pnanovdb_vec3_t) src) +{ + pnanovdb_map_handle_t map = pnanovdb_grid_get_map(buf, grid); + return pnanovdb_map_apply_jacobi(buf, map, src); +} + +// ------------------------------------------------ DitherLUT ----------------------------------------------------------- + +// This table was generated with +/************** + +static constexpr inline uint32 +SYSwang_inthash(uint32 key) +{ + // From http://www.concentric.net/~Ttwang/tech/inthash.htm + key += ~(key << 16); + key ^= (key >> 5); + key += (key << 3); + key ^= (key >> 13); + key += ~(key << 9); + key ^= (key >> 17); + return key; +} + +static void +ut_initDitherR(float *pattern, float offset, + int x, int y, int z, int res, int goalres) +{ + // These offsets are designed to maximize the difference between + // dither values in nearby voxels within a given 2x2x2 cell, without + // producing axis-aligned artifacts. The are organized in row-major + // order. + static const float theDitherOffset[] = {0,4,6,2,5,1,3,7}; + static const float theScale = 0.125F; + int key = (((z << res) + y) << res) + x; + + if (res == goalres) + { + pattern[key] = offset; + return; + } + + // Randomly flip (on each axis) the dithering patterns used by the + // subcells. This key is xor'd with the subcell index below before + // looking up in the dither offset list. + key = SYSwang_inthash(key) & 7; + + x <<= 1; + y <<= 1; + z <<= 1; + + offset *= theScale; + for (int i = 0; i < 8; i++) + ut_initDitherR(pattern, offset+theDitherOffset[i ^ key]*theScale, + x+(i&1), y+((i&2)>>1), z+((i&4)>>2), res+1, goalres); +} + +// This is a compact algorithm that accomplishes essentially the same thing +// as ut_initDither() above. We should eventually switch to use this and +// clean the dead code. +static fpreal32 * +ut_initDitherRecursive(int goalres) +{ + const int nfloat = 1 << (goalres*3); + float *pattern = new float[nfloat]; + ut_initDitherR(pattern, 1.0F, 0, 0, 0, 0, goalres); + + // This has built an even spacing from 1/nfloat to 1.0. + // however, our dither pattern should be 1/(nfloat+1) to nfloat/(nfloat+1) + // So we do a correction here. Note that the earlier calculations are + // done with powers of 2 so are exact, so it does make sense to delay + // the renormalization to this pass. 
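+    // For example, the call ut_initDitherRecursive(3) below produces nfloat = 512
+    // entries spanning 1/512, 2/512, ..., 512/512; multiplying by the correction
+    // term nfloat/(nfloat+1) rescales them to 1/513, ..., 512/513, so the lookup
+    // table below never contains exactly 0 or 1.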
+ float correctionterm = nfloat / (nfloat+1.0F); + for (int i = 0; i < nfloat; i++) + pattern[i] *= correctionterm; + return pattern; +} + + theDitherMatrix = ut_initDitherRecursive(3); + + for (int i = 0; i < 512/8; i ++) + { + for (int j = 0; j < 8; j ++) + std::cout << theDitherMatrix[i*8+j] << "f, "; + std::cout << std::endl; + } + + **************/ + +PNANOVDB_STATIC_CONST float pnanovdb_dither_lut[512] = +{ + 0.14425f, 0.643275f, 0.830409f, 0.331384f, 0.105263f, 0.604289f, 0.167641f, 0.666667f, + 0.892788f, 0.393762f, 0.0818713f, 0.580897f, 0.853801f, 0.354776f, 0.916179f, 0.417154f, + 0.612086f, 0.11306f, 0.79922f, 0.300195f, 0.510721f, 0.0116959f, 0.947368f, 0.448343f, + 0.362573f, 0.861598f, 0.0506823f, 0.549708f, 0.261209f, 0.760234f, 0.19883f, 0.697856f, + 0.140351f, 0.639376f, 0.576998f, 0.0779727f, 0.522417f, 0.0233918f, 0.460039f, 0.959064f, + 0.888889f, 0.389864f, 0.327485f, 0.826511f, 0.272904f, 0.77193f, 0.709552f, 0.210526f, + 0.483431f, 0.982456f, 0.296296f, 0.795322f, 0.116959f, 0.615984f, 0.0545809f, 0.553606f, + 0.732943f, 0.233918f, 0.545809f, 0.0467836f, 0.865497f, 0.366472f, 0.803119f, 0.304094f, + 0.518519f, 0.0194932f, 0.45614f, 0.955166f, 0.729045f, 0.230019f, 0.54191f, 0.042885f, + 0.269006f, 0.768031f, 0.705653f, 0.206628f, 0.479532f, 0.978558f, 0.292398f, 0.791423f, + 0.237817f, 0.736842f, 0.424951f, 0.923977f, 0.136452f, 0.635478f, 0.323587f, 0.822612f, + 0.986355f, 0.487329f, 0.674464f, 0.175439f, 0.88499f, 0.385965f, 0.573099f, 0.0740741f, + 0.51462f, 0.0155945f, 0.202729f, 0.701754f, 0.148148f, 0.647174f, 0.834308f, 0.335283f, + 0.265107f, 0.764133f, 0.951267f, 0.452242f, 0.896686f, 0.397661f, 0.08577f, 0.584795f, + 0.8577f, 0.358674f, 0.920078f, 0.421053f, 0.740741f, 0.241715f, 0.678363f, 0.179337f, + 0.109162f, 0.608187f, 0.17154f, 0.670565f, 0.491228f, 0.990253f, 0.42885f, 0.927875f, + 0.0662768f, 0.565302f, 0.62768f, 0.128655f, 0.183236f, 0.682261f, 0.744639f, 0.245614f, + 0.814815f, 0.315789f, 0.378168f, 0.877193f, 0.931774f, 0.432749f, 0.495127f, 0.994152f, + 0.0350877f, 0.534113f, 0.97076f, 0.471735f, 0.214425f, 0.71345f, 0.526316f, 0.0272904f, + 0.783626f, 0.2846f, 0.222222f, 0.721248f, 0.962963f, 0.463938f, 0.276803f, 0.775828f, + 0.966862f, 0.467836f, 0.405458f, 0.904483f, 0.0701754f, 0.569201f, 0.881092f, 0.382066f, + 0.218324f, 0.717349f, 0.654971f, 0.155945f, 0.818713f, 0.319688f, 0.132554f, 0.631579f, + 0.0623782f, 0.561404f, 0.748538f, 0.249513f, 0.912281f, 0.413255f, 0.974659f, 0.475634f, + 0.810916f, 0.311891f, 0.499025f, 0.998051f, 0.163743f, 0.662768f, 0.226121f, 0.725146f, + 0.690058f, 0.191033f, 0.00389864f, 0.502924f, 0.557505f, 0.0584795f, 0.120858f, 0.619883f, + 0.440546f, 0.939571f, 0.752437f, 0.253411f, 0.307992f, 0.807018f, 0.869396f, 0.37037f, + 0.658869f, 0.159844f, 0.346979f, 0.846004f, 0.588694f, 0.0896686f, 0.152047f, 0.651072f, + 0.409357f, 0.908382f, 0.596491f, 0.0974659f, 0.339181f, 0.838207f, 0.900585f, 0.401559f, + 0.34308f, 0.842105f, 0.779727f, 0.280702f, 0.693957f, 0.194932f, 0.25731f, 0.756335f, + 0.592593f, 0.0935673f, 0.0311891f, 0.530214f, 0.444444f, 0.94347f, 0.506823f, 0.00779727f, + 0.68616f, 0.187135f, 0.124756f, 0.623782f, 0.288499f, 0.787524f, 0.350877f, 0.849903f, + 0.436647f, 0.935673f, 0.873294f, 0.374269f, 0.538012f, 0.0389864f, 0.60039f, 0.101365f, + 0.57115f, 0.0721248f, 0.758285f, 0.259259f, 0.719298f, 0.220273f, 0.532164f, 0.0331384f, + 0.321637f, 0.820663f, 0.00974659f, 0.508772f, 0.469786f, 0.968811f, 0.282651f, 0.781676f, + 0.539961f, 0.0409357f, 0.727096f, 0.22807f, 0.500975f, 0.00194932f, 
0.563353f, 0.0643275f, + 0.290448f, 0.789474f, 0.477583f, 0.976608f, 0.251462f, 0.750487f, 0.31384f, 0.812865f, + 0.94152f, 0.442495f, 0.879142f, 0.380117f, 0.37232f, 0.871345f, 0.309942f, 0.808967f, + 0.192982f, 0.692008f, 0.130604f, 0.62963f, 0.621832f, 0.122807f, 0.559454f, 0.0604289f, + 0.660819f, 0.161793f, 0.723197f, 0.224172f, 0.403509f, 0.902534f, 0.840156f, 0.341131f, + 0.411306f, 0.910331f, 0.473684f, 0.97271f, 0.653021f, 0.153996f, 0.0916179f, 0.590643f, + 0.196881f, 0.695906f, 0.384016f, 0.883041f, 0.0955166f, 0.594542f, 0.157895f, 0.65692f, + 0.945419f, 0.446394f, 0.633528f, 0.134503f, 0.844055f, 0.345029f, 0.906433f, 0.407407f, + 0.165692f, 0.664717f, 0.103314f, 0.602339f, 0.126706f, 0.625731f, 0.189084f, 0.688109f, + 0.91423f, 0.415205f, 0.851852f, 0.352827f, 0.875244f, 0.376218f, 0.937622f, 0.438596f, + 0.317739f, 0.816764f, 0.255361f, 0.754386f, 0.996101f, 0.497076f, 0.933723f, 0.434698f, + 0.567251f, 0.0682261f, 0.504873f, 0.00584795f, 0.247563f, 0.746589f, 0.185185f, 0.684211f, + 0.037037f, 0.536062f, 0.0994152f, 0.598441f, 0.777778f, 0.278752f, 0.465887f, 0.964912f, + 0.785575f, 0.28655f, 0.847953f, 0.348928f, 0.0292398f, 0.528265f, 0.7154f, 0.216374f, + 0.39961f, 0.898636f, 0.961014f, 0.461988f, 0.0487329f, 0.547758f, 0.111111f, 0.610136f, + 0.649123f, 0.150097f, 0.212476f, 0.711501f, 0.797271f, 0.298246f, 0.859649f, 0.360624f, + 0.118908f, 0.617934f, 0.0565302f, 0.555556f, 0.329435f, 0.82846f, 0.516569f, 0.0175439f, + 0.867446f, 0.368421f, 0.805068f, 0.306043f, 0.578947f, 0.079922f, 0.267057f, 0.766082f, + 0.270955f, 0.76998f, 0.707602f, 0.208577f, 0.668616f, 0.169591f, 0.606238f, 0.107212f, + 0.520468f, 0.0214425f, 0.45809f, 0.957115f, 0.419103f, 0.918129f, 0.356725f, 0.855751f, + 0.988304f, 0.489279f, 0.426901f, 0.925926f, 0.450292f, 0.949318f, 0.512671f, 0.0136452f, + 0.239766f, 0.738791f, 0.676413f, 0.177388f, 0.699805f, 0.20078f, 0.263158f, 0.762183f, + 0.773879f, 0.274854f, 0.337232f, 0.836257f, 0.672515f, 0.173489f, 0.734893f, 0.235867f, + 0.0253411f, 0.524366f, 0.586745f, 0.0877193f, 0.423002f, 0.922027f, 0.48538f, 0.984405f, + 0.74269f, 0.243665f, 0.680312f, 0.181287f, 0.953216f, 0.454191f, 0.1423f, 0.641326f, + 0.493177f, 0.992203f, 0.430799f, 0.929825f, 0.204678f, 0.703704f, 0.890838f, 0.391813f, + 0.894737f, 0.395712f, 0.0838207f, 0.582846f, 0.0448343f, 0.54386f, 0.231969f, 0.730994f, + 0.146199f, 0.645224f, 0.832359f, 0.333333f, 0.793372f, 0.294347f, 0.980507f, 0.481481f, + 0.364522f, 0.863548f, 0.80117f, 0.302144f, 0.824561f, 0.325536f, 0.138402f, 0.637427f, + 0.614035f, 0.11501f, 0.0526316f, 0.551657f, 0.0760234f, 0.575049f, 0.88694f, 0.387914f, +}; + +PNANOVDB_FORCE_INLINE float pnanovdb_dither_lookup(pnanovdb_bool_t enabled, int offset) +{ + return enabled ? 
pnanovdb_dither_lut[offset & 511] : 0.5f; +} + +// ------------------------------------------------ HDDA ----------------------------------------------------------- + +#ifdef PNANOVDB_HDDA + +// Comment out to disable this explicit round-off check +#define PNANOVDB_ENFORCE_FORWARD_STEPPING + +#define PNANOVDB_HDDA_FLOAT_MAX 1e38f + +struct pnanovdb_hdda_t +{ + pnanovdb_int32_t dim; + float tmin; + float tmax; + pnanovdb_coord_t voxel; + pnanovdb_coord_t step; + pnanovdb_vec3_t delta; + pnanovdb_vec3_t next; +}; +PNANOVDB_STRUCT_TYPEDEF(pnanovdb_hdda_t) + +PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_hdda_pos_to_ijk(PNANOVDB_IN(pnanovdb_vec3_t) pos) +{ + pnanovdb_coord_t voxel; + voxel.x = pnanovdb_float_to_int32(pnanovdb_floor(PNANOVDB_DEREF(pos).x)); + voxel.y = pnanovdb_float_to_int32(pnanovdb_floor(PNANOVDB_DEREF(pos).y)); + voxel.z = pnanovdb_float_to_int32(pnanovdb_floor(PNANOVDB_DEREF(pos).z)); + return voxel; +} + +PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_hdda_pos_to_voxel(PNANOVDB_IN(pnanovdb_vec3_t) pos, int dim) +{ + pnanovdb_coord_t voxel; + voxel.x = pnanovdb_float_to_int32(pnanovdb_floor(PNANOVDB_DEREF(pos).x)) & (~(dim - 1)); + voxel.y = pnanovdb_float_to_int32(pnanovdb_floor(PNANOVDB_DEREF(pos).y)) & (~(dim - 1)); + voxel.z = pnanovdb_float_to_int32(pnanovdb_floor(PNANOVDB_DEREF(pos).z)) & (~(dim - 1)); + return voxel; +} + +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_hdda_ray_start(PNANOVDB_IN(pnanovdb_vec3_t) origin, float tmin, PNANOVDB_IN(pnanovdb_vec3_t) direction) +{ + pnanovdb_vec3_t pos = pnanovdb_vec3_add( + pnanovdb_vec3_mul(PNANOVDB_DEREF(direction), pnanovdb_vec3_uniform(tmin)), + PNANOVDB_DEREF(origin) + ); + return pos; +} + +PNANOVDB_FORCE_INLINE void pnanovdb_hdda_init(PNANOVDB_INOUT(pnanovdb_hdda_t) hdda, PNANOVDB_IN(pnanovdb_vec3_t) origin, float tmin, PNANOVDB_IN(pnanovdb_vec3_t) direction, float tmax, int dim) +{ + PNANOVDB_DEREF(hdda).dim = dim; + PNANOVDB_DEREF(hdda).tmin = tmin; + PNANOVDB_DEREF(hdda).tmax = tmax; + + pnanovdb_vec3_t pos = pnanovdb_hdda_ray_start(origin, tmin, direction); + pnanovdb_vec3_t dir_inv = pnanovdb_vec3_div(pnanovdb_vec3_uniform(1.f), PNANOVDB_DEREF(direction)); + + PNANOVDB_DEREF(hdda).voxel = pnanovdb_hdda_pos_to_voxel(PNANOVDB_REF(pos), dim); + + // x + if (PNANOVDB_DEREF(direction).x == 0.f) + { + PNANOVDB_DEREF(hdda).next.x = PNANOVDB_HDDA_FLOAT_MAX; + PNANOVDB_DEREF(hdda).step.x = 0; + PNANOVDB_DEREF(hdda).delta.x = 0.f; + } + else if (dir_inv.x > 0.f) + { + PNANOVDB_DEREF(hdda).step.x = 1; + PNANOVDB_DEREF(hdda).next.x = PNANOVDB_DEREF(hdda).tmin + (PNANOVDB_DEREF(hdda).voxel.x + dim - pos.x) * dir_inv.x; + PNANOVDB_DEREF(hdda).delta.x = dir_inv.x; + } + else + { + PNANOVDB_DEREF(hdda).step.x = -1; + PNANOVDB_DEREF(hdda).next.x = PNANOVDB_DEREF(hdda).tmin + (PNANOVDB_DEREF(hdda).voxel.x - pos.x) * dir_inv.x; + PNANOVDB_DEREF(hdda).delta.x = -dir_inv.x; + } + + // y + if (PNANOVDB_DEREF(direction).y == 0.f) + { + PNANOVDB_DEREF(hdda).next.y = PNANOVDB_HDDA_FLOAT_MAX; + PNANOVDB_DEREF(hdda).step.y = 0; + PNANOVDB_DEREF(hdda).delta.y = 0.f; + } + else if (dir_inv.y > 0.f) + { + PNANOVDB_DEREF(hdda).step.y = 1; + PNANOVDB_DEREF(hdda).next.y = PNANOVDB_DEREF(hdda).tmin + (PNANOVDB_DEREF(hdda).voxel.y + dim - pos.y) * dir_inv.y; + PNANOVDB_DEREF(hdda).delta.y = dir_inv.y; + } + else + { + PNANOVDB_DEREF(hdda).step.y = -1; + PNANOVDB_DEREF(hdda).next.y = PNANOVDB_DEREF(hdda).tmin + (PNANOVDB_DEREF(hdda).voxel.y - pos.y) * dir_inv.y; + PNANOVDB_DEREF(hdda).delta.y = -dir_inv.y; + } + + // z + if 
(PNANOVDB_DEREF(direction).z == 0.f) + { + PNANOVDB_DEREF(hdda).next.z = PNANOVDB_HDDA_FLOAT_MAX; + PNANOVDB_DEREF(hdda).step.z = 0; + PNANOVDB_DEREF(hdda).delta.z = 0.f; + } + else if (dir_inv.z > 0.f) + { + PNANOVDB_DEREF(hdda).step.z = 1; + PNANOVDB_DEREF(hdda).next.z = PNANOVDB_DEREF(hdda).tmin + (PNANOVDB_DEREF(hdda).voxel.z + dim - pos.z) * dir_inv.z; + PNANOVDB_DEREF(hdda).delta.z = dir_inv.z; + } + else + { + PNANOVDB_DEREF(hdda).step.z = -1; + PNANOVDB_DEREF(hdda).next.z = PNANOVDB_DEREF(hdda).tmin + (PNANOVDB_DEREF(hdda).voxel.z - pos.z) * dir_inv.z; + PNANOVDB_DEREF(hdda).delta.z = -dir_inv.z; + } +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_hdda_update(PNANOVDB_INOUT(pnanovdb_hdda_t) hdda, PNANOVDB_IN(pnanovdb_vec3_t) origin, PNANOVDB_IN(pnanovdb_vec3_t) direction, int dim) +{ + if (PNANOVDB_DEREF(hdda).dim == dim) + { + return PNANOVDB_FALSE; + } + PNANOVDB_DEREF(hdda).dim = dim; + + pnanovdb_vec3_t pos = pnanovdb_vec3_add( + pnanovdb_vec3_mul(PNANOVDB_DEREF(direction), pnanovdb_vec3_uniform(PNANOVDB_DEREF(hdda).tmin)), + PNANOVDB_DEREF(origin) + ); + pnanovdb_vec3_t dir_inv = pnanovdb_vec3_div(pnanovdb_vec3_uniform(1.f), PNANOVDB_DEREF(direction)); + + PNANOVDB_DEREF(hdda).voxel = pnanovdb_hdda_pos_to_voxel(PNANOVDB_REF(pos), dim); + + if (PNANOVDB_DEREF(hdda).step.x != 0) + { + PNANOVDB_DEREF(hdda).next.x = PNANOVDB_DEREF(hdda).tmin + (PNANOVDB_DEREF(hdda).voxel.x - pos.x) * dir_inv.x; + if (PNANOVDB_DEREF(hdda).step.x > 0) + { + PNANOVDB_DEREF(hdda).next.x += dim * dir_inv.x; + } + } + if (PNANOVDB_DEREF(hdda).step.y != 0) + { + PNANOVDB_DEREF(hdda).next.y = PNANOVDB_DEREF(hdda).tmin + (PNANOVDB_DEREF(hdda).voxel.y - pos.y) * dir_inv.y; + if (PNANOVDB_DEREF(hdda).step.y > 0) + { + PNANOVDB_DEREF(hdda).next.y += dim * dir_inv.y; + } + } + if (PNANOVDB_DEREF(hdda).step.z != 0) + { + PNANOVDB_DEREF(hdda).next.z = PNANOVDB_DEREF(hdda).tmin + (PNANOVDB_DEREF(hdda).voxel.z - pos.z) * dir_inv.z; + if (PNANOVDB_DEREF(hdda).step.z > 0) + { + PNANOVDB_DEREF(hdda).next.z += dim * dir_inv.z; + } + } + + return PNANOVDB_TRUE; +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_hdda_step(PNANOVDB_INOUT(pnanovdb_hdda_t) hdda) +{ + pnanovdb_bool_t ret; + if (PNANOVDB_DEREF(hdda).next.x < PNANOVDB_DEREF(hdda).next.y && PNANOVDB_DEREF(hdda).next.x < PNANOVDB_DEREF(hdda).next.z) + { +#ifdef PNANOVDB_ENFORCE_FORWARD_STEPPING + if (PNANOVDB_DEREF(hdda).next.x <= PNANOVDB_DEREF(hdda).tmin) + { + PNANOVDB_DEREF(hdda).next.x += PNANOVDB_DEREF(hdda).tmin - 0.999999f * PNANOVDB_DEREF(hdda).next.x + 1.0e-6f; + } +#endif + PNANOVDB_DEREF(hdda).tmin = PNANOVDB_DEREF(hdda).next.x; + PNANOVDB_DEREF(hdda).next.x += PNANOVDB_DEREF(hdda).dim * PNANOVDB_DEREF(hdda).delta.x; + PNANOVDB_DEREF(hdda).voxel.x += PNANOVDB_DEREF(hdda).dim * PNANOVDB_DEREF(hdda).step.x; + ret = PNANOVDB_DEREF(hdda).tmin <= PNANOVDB_DEREF(hdda).tmax; + } + else if (PNANOVDB_DEREF(hdda).next.y < PNANOVDB_DEREF(hdda).next.z) + { +#ifdef PNANOVDB_ENFORCE_FORWARD_STEPPING + if (PNANOVDB_DEREF(hdda).next.y <= PNANOVDB_DEREF(hdda).tmin) + { + PNANOVDB_DEREF(hdda).next.y += PNANOVDB_DEREF(hdda).tmin - 0.999999f * PNANOVDB_DEREF(hdda).next.y + 1.0e-6f; + } +#endif + PNANOVDB_DEREF(hdda).tmin = PNANOVDB_DEREF(hdda).next.y; + PNANOVDB_DEREF(hdda).next.y += PNANOVDB_DEREF(hdda).dim * PNANOVDB_DEREF(hdda).delta.y; + PNANOVDB_DEREF(hdda).voxel.y += PNANOVDB_DEREF(hdda).dim * PNANOVDB_DEREF(hdda).step.y; + ret = PNANOVDB_DEREF(hdda).tmin <= PNANOVDB_DEREF(hdda).tmax; + } + else + { +#ifdef PNANOVDB_ENFORCE_FORWARD_STEPPING + if 
(PNANOVDB_DEREF(hdda).next.z <= PNANOVDB_DEREF(hdda).tmin) + { + PNANOVDB_DEREF(hdda).next.z += PNANOVDB_DEREF(hdda).tmin - 0.999999f * PNANOVDB_DEREF(hdda).next.z + 1.0e-6f; + } +#endif + PNANOVDB_DEREF(hdda).tmin = PNANOVDB_DEREF(hdda).next.z; + PNANOVDB_DEREF(hdda).next.z += PNANOVDB_DEREF(hdda).dim * PNANOVDB_DEREF(hdda).delta.z; + PNANOVDB_DEREF(hdda).voxel.z += PNANOVDB_DEREF(hdda).dim * PNANOVDB_DEREF(hdda).step.z; + ret = PNANOVDB_DEREF(hdda).tmin <= PNANOVDB_DEREF(hdda).tmax; + } + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_hdda_ray_clip( + PNANOVDB_IN(pnanovdb_vec3_t) bbox_min, + PNANOVDB_IN(pnanovdb_vec3_t) bbox_max, + PNANOVDB_IN(pnanovdb_vec3_t) origin, PNANOVDB_INOUT(float) tmin, + PNANOVDB_IN(pnanovdb_vec3_t) direction, PNANOVDB_INOUT(float) tmax +) +{ + pnanovdb_vec3_t dir_inv = pnanovdb_vec3_div(pnanovdb_vec3_uniform(1.f), PNANOVDB_DEREF(direction)); + pnanovdb_vec3_t t0 = pnanovdb_vec3_mul(pnanovdb_vec3_sub(PNANOVDB_DEREF(bbox_min), PNANOVDB_DEREF(origin)), dir_inv); + pnanovdb_vec3_t t1 = pnanovdb_vec3_mul(pnanovdb_vec3_sub(PNANOVDB_DEREF(bbox_max), PNANOVDB_DEREF(origin)), dir_inv); + pnanovdb_vec3_t tmin3 = pnanovdb_vec3_min(t0, t1); + pnanovdb_vec3_t tmax3 = pnanovdb_vec3_max(t0, t1); + float tnear = pnanovdb_max(tmin3.x, pnanovdb_max(tmin3.y, tmin3.z)); + float tfar = pnanovdb_min(tmax3.x, pnanovdb_min(tmax3.y, tmax3.z)); + pnanovdb_bool_t hit = tnear <= tfar; + PNANOVDB_DEREF(tmin) = pnanovdb_max(PNANOVDB_DEREF(tmin), tnear); + PNANOVDB_DEREF(tmax) = pnanovdb_min(PNANOVDB_DEREF(tmax), tfar); + return hit; +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_hdda_zero_crossing( + pnanovdb_grid_type_t grid_type, + pnanovdb_buf_t buf, + PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, + PNANOVDB_IN(pnanovdb_vec3_t) origin, float tmin, + PNANOVDB_IN(pnanovdb_vec3_t) direction, float tmax, + PNANOVDB_INOUT(float) thit, + PNANOVDB_INOUT(float) v +) +{ + pnanovdb_coord_t bbox_min = pnanovdb_root_get_bbox_min(buf, PNANOVDB_DEREF(acc).root); + pnanovdb_coord_t bbox_max = pnanovdb_root_get_bbox_max(buf, PNANOVDB_DEREF(acc).root); + pnanovdb_vec3_t bbox_minf = pnanovdb_coord_to_vec3(bbox_min); + pnanovdb_vec3_t bbox_maxf = pnanovdb_coord_to_vec3(pnanovdb_coord_add(bbox_max, pnanovdb_coord_uniform(1))); + + pnanovdb_bool_t hit = pnanovdb_hdda_ray_clip(PNANOVDB_REF(bbox_minf), PNANOVDB_REF(bbox_maxf), origin, PNANOVDB_REF(tmin), direction, PNANOVDB_REF(tmax)); + if (!hit || tmax > 1.0e20f) + { + return PNANOVDB_FALSE; + } + + pnanovdb_vec3_t pos = pnanovdb_hdda_ray_start(origin, tmin, direction); + pnanovdb_coord_t ijk = pnanovdb_hdda_pos_to_ijk(PNANOVDB_REF(pos)); + + pnanovdb_address_t address = pnanovdb_readaccessor_get_value_address(PNANOVDB_GRID_TYPE_FLOAT, buf, acc, PNANOVDB_REF(ijk)); + float v0 = pnanovdb_read_float(buf, address); + + pnanovdb_int32_t dim = pnanovdb_uint32_as_int32(pnanovdb_readaccessor_get_dim(PNANOVDB_GRID_TYPE_FLOAT, buf, acc, PNANOVDB_REF(ijk))); + pnanovdb_hdda_t hdda; + pnanovdb_hdda_init(PNANOVDB_REF(hdda), origin, tmin, direction, tmax, dim); + while (pnanovdb_hdda_step(PNANOVDB_REF(hdda))) + { + pnanovdb_vec3_t pos_start = pnanovdb_hdda_ray_start(origin, hdda.tmin + 1.0001f, direction); + ijk = pnanovdb_hdda_pos_to_ijk(PNANOVDB_REF(pos_start)); + dim = pnanovdb_uint32_as_int32(pnanovdb_readaccessor_get_dim(PNANOVDB_GRID_TYPE_FLOAT, buf, acc, PNANOVDB_REF(ijk))); + pnanovdb_hdda_update(PNANOVDB_REF(hdda), origin, direction, dim); + if (hdda.dim > 1 || !pnanovdb_readaccessor_is_active(grid_type, buf, acc, PNANOVDB_REF(ijk))) + { 
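+            // Still traversing at node granularity (dim > 1) or inside an inactive
+            // voxel: keep marching with the outer HDDA and skip the sign-change test.
+            // Only once an active leaf voxel is reached does the inner loop below step
+            // voxel-by-voxel and compare each value against v0.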
+ continue; + } + while (pnanovdb_hdda_step(PNANOVDB_REF(hdda)) && pnanovdb_readaccessor_is_active(grid_type, buf, acc, PNANOVDB_REF(hdda.voxel))) + { + ijk = hdda.voxel; + pnanovdb_address_t address = pnanovdb_readaccessor_get_value_address(PNANOVDB_GRID_TYPE_FLOAT, buf, acc, PNANOVDB_REF(ijk)); + PNANOVDB_DEREF(v) = pnanovdb_read_float(buf, address); + if (PNANOVDB_DEREF(v) * v0 < 0.f) + { + PNANOVDB_DEREF(thit) = hdda.tmin; + return PNANOVDB_TRUE; + } + } + } + return PNANOVDB_FALSE; +} + +#endif + +#endif // end of NANOVDB_PNANOVDB_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/cuda/DeviceBuffer.h b/external/nanovdb/cuda/DeviceBuffer.h new file mode 100644 index 00000000..465bd9dc --- /dev/null +++ b/external/nanovdb/cuda/DeviceBuffer.h @@ -0,0 +1,231 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file DeviceBuffer.h + + \author Ken Museth + + \date January 8, 2020 + + \brief Implements a simple dual (host/device) CUDA buffer. + + \note This file has no device-only kernel functions, + which explains why it's a .h and not .cuh file. +*/ + +#ifndef NANOVDB_CUDA_DEVICEBUFFER_H_HAS_BEEN_INCLUDED +#define NANOVDB_CUDA_DEVICEBUFFER_H_HAS_BEEN_INCLUDED + +#include // for BufferTraits +#include // for cudaMalloc/cudaMallocManaged/cudaFree + +namespace nanovdb {// ================================================================ + +namespace cuda {// =================================================================== + +// ----------------------------> DeviceBuffer <-------------------------------------- + +/// @brief Simple memory buffer using un-managed pinned host memory when compiled with NVCC. +/// Obviously this class is making explicit used of CUDA so replace it with your own memory +/// allocator if you are not using CUDA. +/// @note While CUDA's pinned host memory allows for asynchronous memory copy between host and device +/// it is significantly slower then cached (un-pinned) memory on the host. 
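+//
+// A minimal usage sketch (illustration only, based on the member functions
+// declared below; numBytes is a placeholder for the caller's buffer size):
+//
+//     auto buf = nanovdb::cuda::DeviceBuffer::create(numBytes); // pinned host memory only
+//     /* ... fill buf.data() on the CPU ... */
+//     buf.deviceUpload();   // allocates the device copy on first use, then copies CPU -> GPU
+//     /* ... pass buf.deviceData() to CUDA kernels ... */
+//     buf.deviceDownload(); // copies GPU -> CPU
+//     buf.clear();          // frees both copies (also done by the destructor)
+//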
+class DeviceBuffer +{ + uint64_t mSize; // total number of bytes managed by this buffer (assumed to be identical for host and device) + void *mCpuData, *mGpuData; // raw pointers to the host and device buffers + bool mManaged; + +public: + /// @brief Static factory method that return an instance of this buffer + /// @param size byte size of buffer to be initialized + /// @param dummy this argument is currently ignored but required to match the API of the HostBuffer + /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU + /// @param stream optional stream argument (defaults to stream NULL) + /// @return An instance of this class using move semantics + static DeviceBuffer create(uint64_t size, const DeviceBuffer* dummy = nullptr, bool host = true, void* stream = nullptr); + + /// @brief Static factory method that return an instance of this buffer that wraps externally managed memory + /// @param size byte size of buffer specified by external memory + /// @param cpuData pointer to externally managed host memory + /// @param gpuData pointer to externally managed device memory + /// @return An instance of this class using move semantics + static DeviceBuffer create(uint64_t size, void* cpuData, void* gpuData); + + /// @brief Constructor + /// @param size byte size of buffer to be initialized + /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU + /// @param stream optional stream argument (defaults to stream NULL) + DeviceBuffer(uint64_t size = 0, bool host = true, void* stream = nullptr) + : mSize(0) + , mCpuData(nullptr) + , mGpuData(nullptr) + , mManaged(false) + { + if (size > 0) this->init(size, host, stream); + } + + DeviceBuffer(uint64_t size, void* cpuData, void* gpuData) + : mSize(size) + , mCpuData(cpuData) + , mGpuData(gpuData) + , mManaged(false) + { + } + + /// @brief Disallow copy-construction + DeviceBuffer(const DeviceBuffer&) = delete; + + /// @brief Move copy-constructor + DeviceBuffer(DeviceBuffer&& other) noexcept + : mSize(other.mSize) + , mCpuData(other.mCpuData) + , mGpuData(other.mGpuData) + , mManaged(other.mManaged) + { + other.mSize = 0; + other.mCpuData = nullptr; + other.mGpuData = nullptr; + other.mManaged = false; + } + + /// @brief Disallow copy assignment operation + DeviceBuffer& operator=(const DeviceBuffer&) = delete; + + /// @brief Move copy assignment operation + DeviceBuffer& operator=(DeviceBuffer&& other) noexcept + { + this->clear(); + mSize = other.mSize; + mCpuData = other.mCpuData; + mGpuData = other.mGpuData; + mManaged = other.mManaged; + other.mSize = 0; + other.mCpuData = nullptr; + other.mGpuData = nullptr; + other.mManaged = false; + return *this; + } + + /// @brief Destructor frees memory on both the host and device + ~DeviceBuffer() { this->clear(); }; + + /// @brief Initialize buffer + /// @param size byte size of buffer to be initialized + /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU + /// @note All existing buffers are first cleared + /// @warning size is expected to be non-zero. Use clear() clear buffer! + void init(uint64_t size, bool host = true, void* stream = nullptr); + + /// @brief Retuns a raw pointer to the host/CPU buffer managed by this allocator. + /// @warning Note that the pointer can be NULL! + void* data() const { return mCpuData; } + + /// @brief Retuns a raw pointer to the device/GPU buffer managed by this allocator. + /// @warning Note that the pointer can be NULL! 
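+    /// @note The device pointer is only non-NULL after the buffer was created with
+    /// host=false, after a call to deviceUpload(), or when the buffer wraps
+    /// externally managed device memory.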
+ void* deviceData() const { return mGpuData; } + + /// @brief Upload this buffer from the host to the device, i.e. CPU -> GPU. + /// @param stream optional CUDA stream (defaults to CUDA stream 0) + /// @param sync if false the memory copy is asynchronous + /// @note If the device/GPU buffer does not exist it is first allocated + /// @warning Assumes that the host/CPU buffer already exists + void deviceUpload(void* stream = nullptr, bool sync = true) const; + + /// @brief Upload this buffer from the device to the host, i.e. GPU -> CPU. + /// @param stream optional CUDA stream (defaults to CUDA stream 0) + /// @param sync if false the memory copy is asynchronous + /// @note If the host/CPU buffer does not exist it is first allocated + /// @warning Assumes that the device/GPU buffer already exists + void deviceDownload(void* stream = nullptr, bool sync = true) const; + + /// @brief Returns the size in bytes of the raw memory buffer managed by this allocator. + uint64_t size() const { return mSize; } + + //@{ + /// @brief Returns true if this allocator is empty, i.e. has no allocated memory + bool empty() const { return mSize == 0; } + bool isEmpty() const { return mSize == 0; } + //@} + + /// @brief De-allocate all memory managed by this allocator and set all pointers to NULL + void clear(void* stream = nullptr); + +}; // DeviceBuffer class + +// --------------------------> Implementations below <------------------------------------ + +inline DeviceBuffer DeviceBuffer::create(uint64_t size, const DeviceBuffer*, bool host, void* stream) +{ + return DeviceBuffer(size, host, stream); +} + +inline DeviceBuffer DeviceBuffer::create(uint64_t size, void* cpuData, void* gpuData) +{ + return DeviceBuffer(size, cpuData, gpuData); +} + +inline void DeviceBuffer::init(uint64_t size, bool host, void* stream) +{ + if (mSize>0) this->clear(stream); + NANOVDB_ASSERT(size > 0); + if (host) { + cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned + checkPtr(mCpuData, "cuda::DeviceBuffer::init: failed to allocate host buffer"); + } else { + cudaCheck(util::cuda::mallocAsync((void**)&mGpuData, size, reinterpret_cast(stream))); // un-managed memory on the device, always 32B aligned! + checkPtr(mGpuData, "cuda::DeviceBuffer::init: failed to allocate device buffer"); + } + mSize = size; + mManaged = true; +} // DeviceBuffer::init + +inline void DeviceBuffer::deviceUpload(void* stream, bool sync) const +{ + if (!mManaged) throw std::runtime_error("DeviceBuffer::deviceUpload called on externally managed memory. Replace deviceUpload call with the appropriate external copy operation."); + + checkPtr(mCpuData, "uninitialized cpu data"); + if (mGpuData == nullptr) { + cudaCheck(util::cuda::mallocAsync((void**)&mGpuData, mSize, reinterpret_cast(stream))); // un-managed memory on the device, always 32B aligned! + } + checkPtr(mGpuData, "uninitialized gpu data"); + cudaCheck(cudaMemcpyAsync(mGpuData, mCpuData, mSize, cudaMemcpyHostToDevice, reinterpret_cast(stream))); + if (sync) cudaCheck(cudaStreamSynchronize(reinterpret_cast(stream))); +} // DeviceBuffer::gpuUpload + +inline void DeviceBuffer::deviceDownload(void* stream, bool sync) const +{ + if (!mManaged) throw std::runtime_error("DeviceBuffer::deviceDownload called on externally managed memory. 
Replace deviceDownload call with the appropriate external copy operation."); + + checkPtr(mGpuData, "uninitialized gpu data"); + if (mCpuData == nullptr) { + cudaCheck(cudaMallocHost((void**)&mCpuData, mSize)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned + } + checkPtr(mCpuData, "uninitialized cpu data"); + cudaCheck(cudaMemcpyAsync(mCpuData, mGpuData, mSize, cudaMemcpyDeviceToHost, reinterpret_cast(stream))); + if (sync) cudaCheck(cudaStreamSynchronize(reinterpret_cast(stream))); +} // DeviceBuffer::gpuDownload + +inline void DeviceBuffer::clear(void *stream) +{ + if (mManaged && mGpuData) cudaCheck(util::cuda::freeAsync(mGpuData, reinterpret_cast(stream))); + if (mManaged && mCpuData) cudaCheck(cudaFreeHost(mCpuData)); + mCpuData = mGpuData = nullptr; + mSize = 0; + mManaged = false; +} // DeviceBuffer::clear + +}// namespace cuda + +using CudaDeviceBuffer [[deprecated("Use nanovdb::cuda::DeviceBuffer instead")]] = cuda::DeviceBuffer; + +template<> +struct BufferTraits +{ + static constexpr bool hasDeviceDual = true; +}; + +}// namespace nanovdb + +#endif // end of NANOVDB_CUDA_DEVICEBUFFER_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/cuda/GridHandle.cuh b/external/nanovdb/cuda/GridHandle.cuh new file mode 100644 index 00000000..a0fc96cb --- /dev/null +++ b/external/nanovdb/cuda/GridHandle.cuh @@ -0,0 +1,145 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/cuda/GridHandle.cuh + + \author Ken Museth, Doyub Kim + + \date August 3, 2023 + + \brief Contains cuda kernels for GridHandle + + \warning The header file contains cuda device code so be sure + to only include it in .cu files (or other .cuh files) +*/ + +#ifndef NANOVDB_CUDA_GRIDHANDLE_CUH_HAS_BEEN_INCLUDED +#define NANOVDB_CUDA_GRIDHANDLE_CUH_HAS_BEEN_INCLUDED + +#include // required for instantiation of move c-tor of GridHandle +#include // for cuda::updateChecksum +#include + +namespace nanovdb { + +namespace cuda { + +namespace {// anonymous namespace +__global__ void cpyGridHandleMeta(const GridData *d_data, GridHandleMetaData *d_meta) +{ + nanovdb::cpyGridHandleMeta(d_data, d_meta); +} + +__global__ void updateGridCount(GridData *d_data, uint32_t gridIndex, uint32_t gridCount, bool *d_dirty) +{ + NANOVDB_ASSERT(gridIndex < gridCount); + if (*d_dirty = d_data->mGridIndex != gridIndex || d_data->mGridCount != gridCount) { + d_data->mGridIndex = gridIndex; + d_data->mGridCount = gridCount; + if (d_data->mChecksum.isEmpty()) *d_dirty = false;// no need to update checksum if it didn't already exist + } +} +}// anonymous namespace + +template class VectorT = std::vector> +inline typename util::enable_if::hasDeviceDual, VectorT>>::type +splitGridHandles(const GridHandle &handle, const BufferT* other = nullptr, cudaStream_t stream = 0) +{ + const void *ptr = handle.deviceData(); + if (ptr == nullptr) return VectorT>(); + VectorT> handles(handle.gridCount()); + bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed + cudaCheck(util::cuda::mallocAsync((void**)&d_dirty, sizeof(bool), stream)); + for (uint32_t n=0; n(buffer.deviceData()); + const GridData *src = reinterpret_cast(ptr); + cudaCheck(cudaMemcpyAsync(dst, src, handle.gridSize(n), cudaMemcpyDeviceToDevice, stream)); + updateGridCount<<<1, 1, 0, stream>>>(dst, 0u, 1u, d_dirty); + cudaCheckError(); + cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream)); + if (dirty) tools::cuda::updateChecksum(dst, 
CheckMode::Partial, stream); + handles[n] = nanovdb::GridHandle(std::move(buffer)); + ptr = util::PtrAdd(ptr, handle.gridSize(n)); + } + cudaCheck(util::cuda::freeAsync(d_dirty, stream)); + return std::move(handles); +}// cuda::splitGridHandles + +template class VectorT> +inline typename util::enable_if::hasDeviceDual, GridHandle>::type +mergeGridHandles(const VectorT> &handles, const BufferT* other = nullptr, cudaStream_t stream = 0) +{ + uint64_t size = 0u; + uint32_t counter = 0u, gridCount = 0u; + for (auto &h : handles) { + gridCount += h.gridCount(); + for (uint32_t n=0; n(dst); + updateGridCount<<<1, 1, 0, stream>>>(data, counter++, gridCount, d_dirty); + cudaCheckError(); + cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream)); + if (dirty) tools::cuda::updateChecksum(data, CheckMode::Partial, stream); + dst = util::PtrAdd(dst, h.gridSize(n)); + src = util::PtrAdd(src, h.gridSize(n)); + } + } + cudaCheck(util::cuda::freeAsync(d_dirty, stream)); + return GridHandle(std::move(buffer)); +}// cuda::mergeGridHandles + +}// namespace cuda + +template class VectorT = std::vector> +[[deprecated("Use nanovdb::cuda::splitGridHandles instead")]] +inline typename util::enable_if::hasDeviceDual, VectorT>>::type +splitDeviceGrids(const GridHandle &handle, const BufferT* other = nullptr, cudaStream_t stream = 0) +{ return cuda::splitGridHandles(handle, other, stream); } + +template class VectorT> +[[deprecated("Use nanovdb::cuda::mergeGridHandles instead")]] +inline typename util::enable_if::hasDeviceDual, GridHandle>::type +mergeDeviceGrids(const VectorT> &handles, const BufferT* other = nullptr, cudaStream_t stream = 0) +{ return cuda::mergeGridHandles(handles, other, stream); } + +template +template::hasDeviceDual, int>::type> +GridHandle::GridHandle(T&& buffer) +{ + static_assert(util::is_same::value, "Expected U==BufferT"); + mBuffer = std::move(buffer); + if (auto *data = reinterpret_cast(mBuffer.data())) { + if (!data->isValid()) throw std::runtime_error("GridHandle was constructed with an invalid host buffer"); + mMetaData.resize(data->mGridCount); + cpyGridHandleMeta(data, mMetaData.data()); + } else { + if (auto *d_data = reinterpret_cast(mBuffer.deviceData())) { + GridData tmp; + cudaCheck(cudaMemcpy(&tmp, d_data, sizeof(GridData), cudaMemcpyDeviceToHost)); + if (!tmp.isValid()) throw std::runtime_error("GridHandle was constructed with an invalid device buffer"); + GridHandleMetaData *d_metaData; + cudaMalloc((void**)&d_metaData, tmp.mGridCount*sizeof(GridHandleMetaData)); + cuda::cpyGridHandleMeta<<<1,1>>>(d_data, d_metaData); + mMetaData.resize(tmp.mGridCount); + cudaCheck(cudaMemcpy(mMetaData.data(), d_metaData,tmp.mGridCount*sizeof(GridHandleMetaData), cudaMemcpyDeviceToHost)); + cudaCheck(cudaFree(d_metaData)); + } + } +}// GridHandle(T&& buffer) + +// Dummy function that ensures instantiation of the move-constructor above when BufferT=cuda::DeviceBuffer +namespace {auto __dummy(){return GridHandle(std::move(cuda::DeviceBuffer()));}} + +} // namespace nanovdb + +#endif // NANOVDB_CUDA_GRIDHANDLE_CUH_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/cuda/NodeManager.cuh b/external/nanovdb/cuda/NodeManager.cuh new file mode 100644 index 00000000..639155ce --- /dev/null +++ b/external/nanovdb/cuda/NodeManager.cuh @@ -0,0 +1,104 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! 
+ \file nanovdb/cuda/NodeManager.cuh + + \author Ken Museth + + \date October 3, 2023 + + \brief Contains cuda kernels for NodeManager + + \warning The header file contains cuda device code so be sure + to only include it in .cu files (or other .cuh files) +*/ + +#ifndef NANOVDB_CUDA_NODE_MANAGER_CUH_HAS_BEEN_INCLUDED +#define NANOVDB_CUDA_NODE_MANAGER_CUH_HAS_BEEN_INCLUDED + +#include // for cuda::lambdaKernel +#include +#include + +namespace nanovdb { + +namespace cuda { + +/// @brief Construct a NodeManager from a device grid pointer +/// +/// @param d_grid device grid pointer whose nodes will be accessed sequentially +/// @param buffer buffer from which to allocate the output handle +/// @param stream cuda stream +/// @return Handle that contains a device NodeManager +template +inline typename util::enable_if::hasDeviceDual, NodeManagerHandle>::type +createNodeManager(const NanoGrid *d_grid, + const BufferT& pool = BufferT(), + cudaStream_t stream = 0) +{ + auto buffer = BufferT::create(sizeof(NodeManagerData), &pool, false, stream); + auto *d_data = (NodeManagerData*)buffer.deviceData(); + size_t size = 0u, *d_size; + cudaCheck(util::cuda::mallocAsync((void**)&d_size, sizeof(size_t), stream)); + util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { +#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS + *d_data = NodeManagerData{NANOVDB_MAGIC_NODE, 0u, (void*)d_grid, {0u,0u,0u}}; +#else + *d_data = NodeManagerData{NANOVDB_MAGIC_NUMB, 0u, (void*)d_grid, {0u,0u,0u}}; +#endif + *d_size = sizeof(NodeManagerData); + auto &tree = d_grid->tree(); + if (NodeManager::FIXED_SIZE && d_grid->isBreadthFirst()) { + d_data->mLinear = uint8_t(1u); + d_data->mOff[0] = util::PtrDiff(tree.template getFirstNode<0>(), d_grid); + d_data->mOff[1] = util::PtrDiff(tree.template getFirstNode<1>(), d_grid); + d_data->mOff[2] = util::PtrDiff(tree.template getFirstNode<2>(), d_grid); + } else { + *d_size += sizeof(uint64_t)*tree.totalNodeCount(); + } + }); + cudaCheckError(); + cudaCheck(cudaMemcpyAsync(&size, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream)); + cudaCheck(util::cuda::freeAsync(d_size, stream)); + if (size > sizeof(NodeManagerData)) { + auto tmp = BufferT::create(size, &pool, false, stream);// only allocate buffer on the device + cudaCheck(cudaMemcpyAsync(tmp.deviceData(), buffer.deviceData(), sizeof(NodeManagerData), cudaMemcpyDeviceToDevice, stream)); + buffer = std::move(tmp); + d_data = reinterpret_cast(buffer.deviceData()); + util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__ (size_t) { + auto &tree = d_grid->tree(); + int64_t *ptr0 = d_data->mPtr[0] = reinterpret_cast(d_data + 1); + int64_t *ptr1 = d_data->mPtr[1] = d_data->mPtr[0] + tree.nodeCount(0); + int64_t *ptr2 = d_data->mPtr[2] = d_data->mPtr[1] + tree.nodeCount(1); + // Performs depth first traversal but breadth first insertion + for (auto it2 = tree.root().cbeginChild(); it2; ++it2) { + *ptr2++ = util::PtrDiff(&*it2, d_grid); + for (auto it1 = it2->cbeginChild(); it1; ++it1) { + *ptr1++ = util::PtrDiff(&*it1, d_grid); + for (auto it0 = it1->cbeginChild(); it0; ++it0) { + *ptr0++ = util::PtrDiff(&*it0, d_grid); + }// loop over child nodes of the lower internal node + }// loop over child nodes of the upper internal node + }// loop over child nodes of the root node + }); + } + + return NodeManagerHandle(toGridType(), std::move(buffer)); +}// cuda::createNodeManager + +}// namespace cuda + +template +[[deprecated("Use cuda::createNodeManager instead")]] +inline typename util::enable_if::hasDeviceDual, 
NodeManagerHandle>::type +cudaCreateNodeManager(const NanoGrid *d_grid, + const BufferT& pool = BufferT(), + cudaStream_t stream = 0) +{ + return cuda::createNodeManager(d_grid, pool, stream); +} + +} // namespace nanovdb + +#endif // NANOVDB_CUDA_NODE_MANAGER_CUH_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/io/IO.h b/external/nanovdb/io/IO.h new file mode 100644 index 00000000..a7110846 --- /dev/null +++ b/external/nanovdb/io/IO.h @@ -0,0 +1,767 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file IO.h + + \author Ken Museth + + \date May 1, 2020 + + \brief Implements I/O for NanoVDB grids. Features optional BLOSC and ZIP + file compression, support for multiple grids per file as well as + multiple grid types. + + \note This file does NOT depend on OpenVDB, but optionally on ZIP and BLOSC + + \details NanoVDB files take on of two formats: + 1) multiple segments each with multiple grids (segments have easy to access metadata about its grids) + 2) starting with verion 32.6.0 nanovdb files also support a raw buffer with one or more grids (just a + dump of a raw grid buffer, so no new metadata in headers as when using segments mentioned above). + + // 1: Segment: FileHeader, MetaData0, gridName0...MetaDataN, gridNameN, compressed Grid0, ... compressed GridN + // 2: Raw: Grid0, ... GridN +*/ + +#ifndef NANOVDB_IO_H_HAS_BEEN_INCLUDED +#define NANOVDB_IO_H_HAS_BEEN_INCLUDED + +#include +#include +#include // for updateGridCount + +#include // for std::ifstream +#include // for std::cerr/cout +#include // for std::string +#include // for std::stringstream +#include // for std::strcmp +#include // for std::unique_ptr +#include // for std::vector +#ifdef NANOVDB_USE_ZIP +#include // for ZIP compression +#endif +#ifdef NANOVDB_USE_BLOSC +#include // for BLOSC compression +#endif + +// Due to a bug in older versions of gcc, including fstream might +// define "major" and "minor" which are used as member data below. +// See https://bugzilla.redhat.com/show_bug.cgi?id=130601 +#if defined(major) || defined(minor) +#undef major +#undef minor +#endif + +namespace nanovdb {// ========================================================== + +namespace io {// =============================================================== + +// --------------------------> writeGrid(s) <------------------------------------ + +/// @brief Write a single grid to file (over-writing existing content of the file) +/// +/// @note The single grid is written into a single segment, i.e. header with metadata about its type and size. +template +void writeGrid(const std::string& fileName, const GridHandle& handle, io::Codec codec = io::Codec::NONE, int verbose = 0); + +/// @brief Write multiple grids to file (over-writing existing content of the file) +/// +/// @note The multiple grids are written into the same segment, i.e. header with metadata about all grids +template class VecT = std::vector> +void writeGrids(const std::string& fileName, const VecT>& handles, Codec codec = Codec::NONE, int verbose = 0); + +// --------------------------> readGrid(s) <------------------------------------ + +/// @brief Read and return one or all grids from a file into a single GridHandle +/// @tparam BufferT Type of buffer used memory allocation +/// @param fileName string name of file to be read from +/// @param n zero-based signed index of the grid to be read. +/// The default value of 0 means read only first grid. +/// A negative value of n means read all grids in the file. 
+/// @param verbose specify verbosity level. Default value of zero means quiet. +/// @param buffer optional buffer used for memory allocation +/// @return return a single GridHandle with one or all grids found in the file +/// @throw will throw a std::runtime_error if the file does not contain a grid with index n +template +GridHandle readGrid(const std::string& fileName, int n = 0, int verbose = 0, const BufferT& buffer = BufferT()); + +/// @brief Read and return the first grid with a specific name from a file +/// @tparam BufferT Type of buffer used memory allocation +/// @param fileName string name of file to be read from +/// @param gridName string name of the grid to be read +/// @param verbose specify verbosity level. Default value of zero means quiet. +/// @param buffer optional buffer used for memory allocation +/// @return return a single GridHandle containing the grid with the specific name +/// @throw will throw a std::runtime_error if the file does not contain a grid with the specific name +template +GridHandle readGrid(const std::string& fileName, const std::string& gridName, int verbose = 0, const BufferT& buffer = BufferT()); + +/// @brief Read all the grids in the file and return them as a vector of multiple GridHandles, each containing +/// all grids encoded in the same segment of the file (i.e. they where written together). This method also +/// works if the file contains a raw grid buffer in which case a single GridHandle is returned. +/// @tparam BufferT Type of buffer used memory allocation +/// @param fileName string name of file to be read from +/// @param verbose specify verbosity level. Default value of zero means quiet. +/// @param buffer optional buffer used for memory allocation +/// @return Return a vector of GridHandles each containing all grids encoded +/// in the same segment of the file (i.e. they where written together). +template class VecT = std::vector> +VecT> readGrids(const std::string& fileName, int verbose = 0, const BufferT& buffer = BufferT()); + +// ----------------------------------------------------------------------- + +/// We fix a specific size for counting bytes in files so that they +/// are saved the same regardless of machine precision. (Note there are +/// still little/bigendian issues, however) +using fileSize_t = uint64_t; + +/// @brief Internal functions for compressed read/write of a NanoVDB GridHandle into a stream +/// +/// @warning These functions should never be called directly by client code +namespace Internal { +static constexpr fileSize_t MAX_SIZE = 1UL << 30; // size is 1 GB + +template +static fileSize_t write(std::ostream& os, const GridHandle& handle, Codec codec, uint32_t n); + +template +static void read(std::istream& is, BufferT& buffer, Codec codec); + +static void read(std::istream& is, char* data, fileSize_t size, Codec codec); +} // namespace Internal + +/// @brief Standard hash function to use on strings; std::hash may vary by +/// platform/implementation and is know to produce frequent collisions. 
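+/// @note The resulting key is stored on disk as FileMetaData::nameKey (see the
+/// FileGridMetaData constructor below), which is why a stable,
+/// implementation-independent hash is needed here.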
+uint64_t stringHash(const char* cstr); + +/// @brief Return a uint64_t hash key of a std::string +inline uint64_t stringHash(const std::string& str){return stringHash(str.c_str());} + +/// @brief Return a uint64_t with its bytes reversed so we can check for endianness +inline uint64_t reverseEndianness(uint64_t val) +{ + return (((val) >> 56) & 0x00000000000000FF) | (((val) >> 40) & 0x000000000000FF00) | + (((val) >> 24) & 0x0000000000FF0000) | (((val) >> 8) & 0x00000000FF000000) | + (((val) << 8) & 0x000000FF00000000) | (((val) << 24) & 0x0000FF0000000000) | + (((val) << 40) & 0x00FF000000000000) | (((val) << 56) & 0xFF00000000000000); +} + +/// @brief This class defines the meta data stored for each grid in a segment +/// +/// @details A segment consists of a FileHeader followed by a list of FileGridMetaData +/// each followed by grid names and then finally the grids themselves. +/// +/// @note This class should not be confused with nanovdb::GridMetaData defined in NanoVDB.h +/// Also, io::FileMetaData is defined in NanoVDB.h. +struct FileGridMetaData : public FileMetaData +{ + static_assert(sizeof(FileMetaData) == 176, "Unexpected sizeof(FileMetaData)"); + std::string gridName; + void read(std::istream& is); + void write(std::ostream& os) const; + FileGridMetaData() {} + FileGridMetaData(uint64_t size, Codec c, const GridData &gridData); + uint64_t memUsage() const { return sizeof(FileMetaData) + nameSize; } +}; // FileGridMetaData + +/// @brief This class defines all the data stored in segment of a file +/// +/// @details A segment consists of a FileHeader followed by a list of FileGridMetaData +/// each followed by grid names and then finally the grids themselves. +struct Segment +{ + // Check assumptions made during read and write of FileHeader and FileMetaData + static_assert(sizeof(FileHeader) == 16u, "Unexpected sizeof(FileHeader)"); + FileHeader header;// defined in NanoVDB.h + std::vector meta;// defined in NanoVDB.h + Segment(Codec c = Codec::NONE) +#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS + : header{NANOVDB_MAGIC_FILE, Version(), 0u, c} +#else + : header{NANOVDB_MAGIC_NUMB, Version(), 0u, c} +#endif + , meta() + { + } + template + void add(const GridHandle& h); + bool read(std::istream& is); + void write(std::ostream& os) const; + uint64_t memUsage() const; +}; // Segment + +/// @brief Return true if the file contains a grid with the specified name +bool hasGrid(const std::string& fileName, const std::string& gridName); + +/// @brief Return true if the stream contains a grid with the specified name +bool hasGrid(std::istream& is, const std::string& gridName); + +/// @brief Reads and returns a vector of meta data for all the grids found in the specified file +std::vector readGridMetaData(const std::string& fileName); + +/// @brief Reads and returns a vector of meta data for all the grids found in the specified stream +std::vector readGridMetaData(std::istream& is); + +// --------------------------> Implementations for Internal <------------------------------------ + +template +fileSize_t Internal::write(std::ostream& os, const GridHandle& handle, Codec codec, unsigned int n) +{ + const char* data = reinterpret_cast(handle.gridData(n)); + fileSize_t total = 0, residual = handle.gridSize(n); + + switch (codec) { + case Codec::ZIP: { +#ifdef NANOVDB_USE_ZIP + uLongf size = compressBound(static_cast(residual)); // Get an upper bound on the size of the compressed data. 
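+            // On-disk framing used by this codec: an 8-byte fileSize_t holding the
+            // compressed byte count, immediately followed by the compressed payload.
+            // Internal::read() below reverses this, reading the count first and then
+            // calling uncompress() on the payload.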
+ std::unique_ptr tmp(new Bytef[size]); + const int status = compress(tmp.get(), &size, reinterpret_cast(data), static_cast(residual)); + if (status != Z_OK) std::runtime_error("Internal write error in ZIP"); + if (size > residual) std::cerr << "\nWarning: Unexpected ZIP compression from " << residual << " to " << size << " bytes\n"; + const fileSize_t outBytes = size; + os.write(reinterpret_cast(&outBytes), sizeof(fileSize_t)); + os.write(reinterpret_cast(tmp.get()), outBytes); + total += sizeof(fileSize_t) + outBytes; +#else + throw std::runtime_error("ZIP compression codec was disabled during build"); +#endif + break; + } + case Codec::BLOSC: { +#ifdef NANOVDB_USE_BLOSC + do { + fileSize_t chunk = residual < MAX_SIZE ? residual : MAX_SIZE, size = chunk + BLOSC_MAX_OVERHEAD; + std::unique_ptr tmp(new char[size]); + const int count = blosc_compress_ctx(9, 1, sizeof(float), chunk, data, tmp.get(), size, BLOSC_LZ4_COMPNAME, 1 << 18, 1); + if (count <= 0) std::runtime_error("Internal write error in BLOSC"); + const fileSize_t outBytes = count; + os.write(reinterpret_cast(&outBytes), sizeof(fileSize_t)); + os.write(reinterpret_cast(tmp.get()), outBytes); + total += sizeof(fileSize_t) + outBytes; + data += chunk; + residual -= chunk; + } while (residual > 0); +#else + throw std::runtime_error("BLOSC compression codec was disabled during build"); +#endif + break; + } + default: + os.write(data, residual); + total += residual; + } + if (!os) throw std::runtime_error("Failed to write Tree to file"); + return total; +} // Internal::write + +template +void Internal::read(std::istream& is, BufferT& buffer, Codec codec) +{ + Internal::read(is, reinterpret_cast(buffer.data()), buffer.size(), codec); +} // Internal::read + +/// @brief read compressed grid from stream +/// @param is input stream to read from +/// @param data data buffer to write into. Must be of size @c residual or larger. +/// @param residual expected byte size of uncompressed data. +/// @param codec mode of compression +void Internal::read(std::istream& is, char* data, fileSize_t residual, Codec codec) +{ + // read tree using optional compression + switch (codec) { + case Codec::ZIP: { +#ifdef NANOVDB_USE_ZIP + fileSize_t size; + is.read(reinterpret_cast(&size), sizeof(fileSize_t)); + std::unique_ptr tmp(new Bytef[size]);// temp buffer for compressed data + is.read(reinterpret_cast(tmp.get()), size); + uLongf numBytes = static_cast(residual); + int status = uncompress(reinterpret_cast(data), &numBytes, tmp.get(), static_cast(size)); + if (status != Z_OK) std::runtime_error("Internal read error in ZIP"); + if (fileSize_t(numBytes) != residual) throw std::runtime_error("UNZIP failed on byte size"); +#else + throw std::runtime_error("ZIP compression codec was disabled during build"); +#endif + break; + } + case Codec::BLOSC: { +#ifdef NANOVDB_USE_BLOSC + do { + fileSize_t size; + is.read(reinterpret_cast(&size), sizeof(fileSize_t)); + std::unique_ptr tmp(new char[size]);// temp buffer for compressed data + is.read(reinterpret_cast(tmp.get()), size); + const fileSize_t chunk = residual < MAX_SIZE ? 
residual : MAX_SIZE; + const int count = blosc_decompress_ctx(tmp.get(), data, size_t(chunk), 1); //fails with more threads :( + if (count < 1) std::runtime_error("Internal read error in BLOSC"); + if (count != int(chunk)) throw std::runtime_error("BLOSC failed on byte size"); + data += size_t(chunk); + residual -= chunk; + } while (residual > 0); +#else + throw std::runtime_error("BLOSC compression codec was disabled during build"); +#endif + break; + } + default: + is.read(data, residual);// read uncompressed data + } + if (!is) throw std::runtime_error("Failed to read Tree from file"); +} // Internal::read + +// --------------------------> Implementations for FileGridMetaData <------------------------------------ + +inline FileGridMetaData::FileGridMetaData(uint64_t size, Codec c, const GridData &gridData) + : FileMetaData{size, // gridSize + size, // fileSize (will typically be redefined) + 0u, // nameKey + 0u, // voxelCount + gridData.mGridType, // gridType + gridData.mGridClass, // gridClass + gridData.mWorldBBox, // worldBBox + gridData.indexBBox(), // indexBBox + gridData.mVoxelSize, // voxelSize + 0, // nameSize + {0, 0, 0, 1}, // nodeCount[4] + {0, 0, 0}, // tileCount[3] + c, // codec + 0, // padding + Version()}// version + , gridName(gridData.gridName()) +{ + auto &treeData = *reinterpret_cast(gridData.treePtr()); + nameKey = stringHash(gridName); + voxelCount = treeData.mVoxelCount; + nameSize = static_cast(gridName.size() + 1); // include '\0' + for (int i = 0; i < 3; ++i) { + FileMetaData::nodeCount[i] = treeData.mNodeCount[i]; + FileMetaData::tileCount[i] = treeData.mTileCount[i]; + } +}// FileGridMetaData::FileGridMetaData + +inline void FileGridMetaData::write(std::ostream& os) const +{ + os.write(reinterpret_cast(this), sizeof(FileMetaData)); + os.write(gridName.c_str(), nameSize); + if (!os) throw std::runtime_error("Failed writing FileGridMetaData"); +}// FileGridMetaData::write + +inline void FileGridMetaData::read(std::istream& is) +{ + is.read(reinterpret_cast(this), sizeof(FileMetaData)); + std::unique_ptr tmp(new char[nameSize]); + is.read(reinterpret_cast(tmp.get()), nameSize); + gridName.assign(tmp.get()); + if (!is) throw std::runtime_error("Failed reading FileGridMetaData"); +}// FileGridMetaData::read + +// --------------------------> Implementations for Segment <------------------------------------ + +inline uint64_t Segment::memUsage() const +{ + uint64_t sum = sizeof(FileHeader); + for (auto& m : meta) sum += m.memUsage();// includes FileMetaData + grid name + return sum; +}// Segment::memUsage + +template +inline void Segment::add(const GridHandle& h) +{ + for (uint32_t i = 0; i < h.gridCount(); ++i) { + const GridData *gridData = h.gridData(i); + if (!gridData) throw std::runtime_error("Segment::add: GridHandle does not contain grid #" + std::to_string(i)); + meta.emplace_back(h.gridSize(i), header.codec, *gridData); + } + header.gridCount += h.gridCount(); +}// Segment::add + +inline void Segment::write(std::ostream& os) const +{ + if (header.gridCount == 0) { + throw std::runtime_error("Segment contains no grids"); + } else if (!os.write(reinterpret_cast(&header), sizeof(FileHeader))) { + throw std::runtime_error("Failed to write FileHeader of Segment"); + } + for (auto& m : meta) m.write(os); +}// Segment::write + +inline bool Segment::read(std::istream& is) +{ + is.read(reinterpret_cast(&header), sizeof(FileHeader)); + if (is.eof()) {// The EOF flag is only set once a read tries to read past the end of the file + is.clear(std::ios_base::eofbit);// 
clear eof flag so we can rewind and read again + return false; + } + const MagicType magic = toMagic(header.magic); + if (magic != MagicType::NanoVDB && magic != MagicType::NanoFile) { + // first check for byte-swapped header magic. + if (header.magic == reverseEndianness(NANOVDB_MAGIC_NUMB) || + header.magic == reverseEndianness(NANOVDB_MAGIC_FILE)) { + throw std::runtime_error("This nvdb file has reversed endianness"); + } else { + if (magic == MagicType::OpenVDB) { + throw std::runtime_error("Expected a NanoVDB file, but read an OpenVDB file!"); + } else if (magic == MagicType::NanoGrid) { + throw std::runtime_error("Expected a NanoVDB file, but read a raw NanoVDB grid!"); + } else { + throw std::runtime_error("Expected a NanoVDB file, but read a file of unknown type!"); + } + } + } else if ( !header.version.isCompatible()) { + std::stringstream ss; + Version v; + is.read(reinterpret_cast(&v), sizeof(Version));// read GridData::mVersion located at byte 16=sizeof(FileHeader) is stream + if ( v.getMajor() == NANOVDB_MAJOR_VERSION_NUMBER) { + ss << "This file looks like it contains a raw grid buffer and not a standard file with meta data"; + } else if ( header.version.getMajor() < NANOVDB_MAJOR_VERSION_NUMBER) { + char str[30]; + ss << "The file contains an older version of NanoVDB: " << std::string(toStr(str, header.version)) << "!\n\t" + << "Recommendation: Re-generate this NanoVDB file with this version: " << NANOVDB_MAJOR_VERSION_NUMBER << ".X of NanoVDB"; + } else { + ss << "This tool was compiled against an older version of NanoVDB: " << NANOVDB_MAJOR_VERSION_NUMBER << ".X!\n\t" + << "Recommendation: Re-compile this tool against the newer version: " << header.version.getMajor() << ".X of NanoVDB"; + } + throw std::runtime_error("An unrecoverable error in nanovdb::Segment::read:\n\tIncompatible file format: " + ss.str()); + } + meta.resize(header.gridCount); + for (auto& m : meta) { + m.read(is); + m.version = header.version; + } + return true; +}// Segment::read + +// --------------------------> writeGrid <------------------------------------ + +template +void writeGrid(std::ostream& os, const GridHandle& handle, Codec codec) +{ + Segment seg(codec); + seg.add(handle); + const auto start = os.tellp(); + seg.write(os); // write header without the correct fileSize (so it's allocated) + for (uint32_t i = 0; i < handle.gridCount(); ++i) { + seg.meta[i].fileSize = Internal::write(os, handle, codec, i); + } + os.seekp(start); + seg.write(os);// re-write header with the correct fileSize + os.seekp(0, std::ios_base::end);// skip to end +}// writeGrid + +template +void writeGrid(const std::string& fileName, const GridHandle& handle, Codec codec, int verbose) +{ + std::ofstream os(fileName, std::ios::out | std::ios::binary | std::ios::trunc); + if (!os.is_open()) { + throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for output"); + } + writeGrid(os, handle, codec); + if (verbose) { + std::cout << "Wrote nanovdb::Grid to file named \"" << fileName << "\"" << std::endl; + } +}// writeGrid + +// --------------------------> writeGrids <------------------------------------ + +template class VecT = std::vector> +void writeGrids(std::ostream& os, const VecT>& handles, Codec codec = Codec::NONE) +{ + for (auto& h : handles) writeGrid(os, h, codec); +}// writeGrids + +template class VecT> +void writeGrids(const std::string& fileName, const VecT>& handles, Codec codec, int verbose) +{ + std::ofstream os(fileName, std::ios::out | std::ios::binary | std::ios::trunc); + if 
(!os.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for output"); + writeGrids(os, handles, codec); + if (verbose) std::cout << "Wrote " << handles.size() << " nanovdb::Grid(s) to file named \"" << fileName << "\"" << std::endl; +}// writeGrids + +// --------------------------> readGrid <------------------------------------ + +template +GridHandle readGrid(std::istream& is, int n, const BufferT& pool) +{ + GridHandle handle; + if (n<0) {// read all grids into the same buffer + try {//first try to read a raw grid buffer + handle.read(is, pool); + } catch(const std::logic_error&) { + Segment seg; + uint64_t bufferSize = 0u; + uint32_t gridCount = 0u, gridIndex = 0u; + const auto start = is.tellg(); + while (seg.read(is)) { + std::streamoff skipSize = 0; + for (auto& m : seg.meta) { + ++gridCount; + bufferSize += m.gridSize; + skipSize += m.fileSize; + }// loop over grids in segment + is.seekg(skipSize, std::ios_base::cur); // skip forward from the current position + }// loop over segments + auto buffer = BufferT::create(bufferSize, &pool); + char *ptr = (char*)buffer.data(); + is.seekg(start);// rewind + while (seg.read(is)) { + for (auto& m : seg.meta) { + Internal::read(is, ptr, m.gridSize, seg.header.codec); + tools::updateGridCount((GridData*)ptr, gridIndex++, gridCount); + ptr += m.gridSize; + }// loop over grids in segment + }// loop over segments + return GridHandle(std::move(buffer)); + } + } else {// read a specific grid + try {//first try to read a raw grid buffer + handle.read(is, uint32_t(n), pool); + tools::updateGridCount((GridData*)handle.data(), 0u, 1u); + } catch(const std::logic_error&) { + Segment seg; + int counter = -1; + while (seg.read(is)) { + std::streamoff seek = 0; + for (auto& m : seg.meta) { + if (++counter == n) { + auto buffer = BufferT::create(m.gridSize, &pool); + Internal::read(is, buffer, seg.header.codec); + tools::updateGridCount((GridData*)buffer.data(), 0u, 1u); + return GridHandle(std::move(buffer)); + } else { + seek += m.fileSize; + } + }// loop over grids in segment + is.seekg(seek, std::ios_base::cur); // skip forward from the current position + }// loop over segments + if (n != counter) throw std::runtime_error("stream does not contain a #" + std::to_string(n) + " grid"); + } + } + return handle; +}// readGrid + +/// @brief Read the n'th grid +template +GridHandle readGrid(const std::string& fileName, int n, int verbose, const BufferT& buffer) +{ + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + auto handle = readGrid(is, n, buffer); + if (verbose) { + if (n<0) { + std::cout << "Read all NanoGrids from the file named \"" << fileName << "\"" << std::endl; + } else { + std::cout << "Read NanoGrid # " << n << " from the file named \"" << fileName << "\"" << std::endl; + } + } + return handle; // is converted to r-value and return value is move constructed. 
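+    // Minimal usage sketch for the readGrid overloads above (illustrative only; the file
+    // name and buffer type are assumptions, not part of this header):
+    //   nanovdb::HostBuffer pool;
+    //   auto handle = nanovdb::io::readGrid<nanovdb::HostBuffer>("grids.nvdb", 0, 1, pool);
+    //   const auto* grid = handle.grid<float>();// nullptr if grid #0 is not a float grid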
+}// readGrid + +/// @brief Read a specific grid from an input stream given the name of the grid +/// @tparam BufferT Buffer type used for allocation +/// @param is input stream from which to read the grid +/// @param gridName string name of the (first) grid to be returned +/// @param pool optional memory pool from which to allocate the grid buffer +/// @return Return the first grid in the input stream with a specific name +/// @throw std::runtime_error with no grid exists with the specified name +template +GridHandle readGrid(std::istream& is, const std::string& gridName, const BufferT& pool) +{ + try { + GridHandle handle; + handle.read(is, gridName, pool); + return handle; + } catch(const std::logic_error&) { + const auto key = stringHash(gridName); + Segment seg; + while (seg.read(is)) {// loop over all segments in stream + std::streamoff seek = 0; + for (auto& m : seg.meta) {// loop over all grids in segment + if ((m.nameKey == 0u || m.nameKey == key) && m.gridName == gridName) { // check for hash key collision + auto buffer = BufferT::create(m.gridSize, &pool); + is.seekg(seek, std::ios_base::cur); // rewind + Internal::read(is, buffer, seg.header.codec); + tools::updateGridCount((GridData*)buffer.data(), 0u, 1u); + return GridHandle(std::move(buffer)); + } else { + seek += m.fileSize; + } + } + is.seekg(seek, std::ios_base::cur); // skip forward from the current position + } + } + throw std::runtime_error("Grid name '" + gridName + "' not found in file"); +}// readGrid + +/// @brief Read the first grid with a specific name +template +GridHandle readGrid(const std::string& fileName, const std::string& gridName, int verbose, const BufferT& buffer) +{ + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + auto handle = readGrid(is, gridName, buffer); + if (verbose) { + if (handle) { + std::cout << "Read NanoGrid named \"" << gridName << "\" from the file named \"" << fileName << "\"" << std::endl; + } else { + std::cout << "File named \"" << fileName << "\" does not contain a grid named \"" + gridName + "\"" << std::endl; + } + } + return handle; // is converted to r-value and return value is move constructed. +}// readGrid + +// --------------------------> readGrids <------------------------------------ + +template class VecT = std::vector> +VecT> readGrids(std::istream& is, const BufferT& pool = BufferT()) +{ + VecT> handles; + try {//first try to read a raw grid buffer + GridHandle handle; + handle.read(is, pool);// will throw if stream does not contain a raw grid buffer + handles.push_back(std::move(handle)); // force move copy assignment + } catch(const std::logic_error&) { + Segment seg; + while (seg.read(is)) { + uint64_t bufferSize = 0; + for (auto& m : seg.meta) bufferSize += m.gridSize; + auto buffer = BufferT::create(bufferSize, &pool); + uint64_t bufferOffset = 0; + for (uint16_t i = 0; i < seg.header.gridCount; ++i) { + auto *data = util::PtrAdd(buffer.data(), bufferOffset); + Internal::read(is, (char*)data, seg.meta[i].gridSize, seg.header.codec); + tools::updateGridCount(data, uint32_t(i), uint32_t(seg.header.gridCount)); + bufferOffset += seg.meta[i].gridSize; + }// loop over grids in segment + handles.emplace_back(std::move(buffer)); // force move copy assignment + }// loop over segments + } + return handles; // is converted to r-value and return value is move constructed. 
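+    // Note: each segment above was unpacked into a single buffer holding all of its grids
+    // back to back, and updateGridCount() rewrote every GridData header so the per-buffer
+    // grid index and total grid count remain consistent.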
+}// readGrids + +/// @brief Read all the grids +template class VecT> +VecT> readGrids(const std::string& fileName, int verbose, const BufferT& buffer) +{ + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + auto handles = readGrids(is, buffer); + if (verbose) std::cout << "Read " << handles.size() << " NanoGrid(s) from the file named \"" << fileName << "\"" << std::endl; + return handles; // is converted to r-value and return value is move constructed. +}// readGrids + +// --------------------------> readGridMetaData <------------------------------------ + +inline std::vector readGridMetaData(const std::string& fileName) +{ + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + return readGridMetaData(is); // is converted to r-value and return value is move constructed. +}// readGridMetaData + +inline std::vector readGridMetaData(std::istream& is) +{ + Segment seg; + std::vector meta; + try { + GridHandle<> handle;// if stream contains a raw grid buffer we unfortunately have to load everything + handle.read(is); + seg.add(handle); + meta = std::move(seg.meta); + } catch(const std::logic_error&) { + while (seg.read(is)) { + std::streamoff skip = 0; + for (auto& m : seg.meta) { + meta.push_back(m); + skip += m.fileSize; + }// loop over grid meta data in segment + is.seekg(skip, std::ios_base::cur); + }// loop over segments + } + return meta; // is converted to r-value and return value is move constructed. +}// readGridMetaData + +// --------------------------> hasGrid <------------------------------------ + +inline bool hasGrid(const std::string& fileName, const std::string& gridName) +{ + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + return hasGrid(is, gridName); +}// hasGrid + +inline bool hasGrid(std::istream& is, const std::string& gridName) +{ + const auto key = stringHash(gridName); + Segment seg; + while (seg.read(is)) { + std::streamoff seek = 0; + for (auto& m : seg.meta) { + if (m.nameKey == key && m.gridName == gridName) return true; // check for hash key collision + seek += m.fileSize; + }// loop over grid meta data in segment + is.seekg(seek, std::ios_base::cur); + }// loop over segments + return false; +}// hasGrid + +// --------------------------> stringHash <------------------------------------ + +inline uint64_t stringHash(const char* c_str) +{ + uint64_t hash = 0;// zero is returned when cstr = nullptr or "\0" + if (c_str) { + for (auto* str = reinterpret_cast(c_str); *str; ++str) { + uint64_t overflow = hash >> (64 - 8); + hash *= 67; // Next-ish prime after 26 + 26 + 10 + hash += *str + overflow; + } + } + return hash; +}// stringHash + +} // namespace io ====================================================================== + +template +inline std::ostream& +operator<<(std::ostream& os, const math::BBox>& b) +{ + os << "(" << b[0][0] << "," << b[0][1] << "," << b[0][2] << ") -> " + << "(" << b[1][0] << "," << b[1][1] << "," << b[1][2] << ")"; + return os; +} + +inline std::ostream& +operator<<(std::ostream& os, const CoordBBox& b) +{ + os << "(" << b[0][0] << "," << b[0][1] << "," << b[0][2] << ") -> " + << "(" << b[1][0] << "," << b[1][1] << "," << b[1][2] << ")"; + return os; +} + +inline std::ostream& 
+operator<<(std::ostream& os, const Coord& ijk) +{ + os << "(" << ijk[0] << "," << ijk[1] << "," << ijk[2] << ")"; + return os; +} + +template +inline std::ostream& +operator<<(std::ostream& os, const math::Vec3& v) +{ + os << "(" << v[0] << "," << v[1] << "," << v[2] << ")"; + return os; +} + +template +inline std::ostream& +operator<<(std::ostream& os, const math::Vec4& v) +{ + os << "(" << v[0] << "," << v[1] << "," << v[2] << "," << v[3] << ")"; + return os; +} + +} // namespace nanovdb =================================================================== + +#endif // NANOVDB_IO_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/math/CSampleFromVoxels.h b/external/nanovdb/math/CSampleFromVoxels.h new file mode 100644 index 00000000..c7820a70 --- /dev/null +++ b/external/nanovdb/math/CSampleFromVoxels.h @@ -0,0 +1,327 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +// +// Simple C-wrapper for voxel interpolation functions +// + +#ifndef __CSAMPLEFROMVOXELS__ +#define __CSAMPLEFROMVOXELS__ + +#include "../CNanoVDB.h" + +#ifdef __OPENCL_VERSION__ +#else +#include +#endif + +void +cnanovdb_coord_round(cnanovdb_coord *RESTRICT coord, const cnanovdb_Vec3F *RESTRICT xyz) +{ +#ifdef __OPENCL_VERSION__ + coord->mVec[0] = floor(xyz->mVec[0]+0.5); + coord->mVec[1] = floor(xyz->mVec[1]+0.5); + coord->mVec[2] = floor(xyz->mVec[2]+0.5); +#else + coord->mVec[0] = floorf(xyz->mVec[0]+0.5); + coord->mVec[1] = floorf(xyz->mVec[1]+0.5); + coord->mVec[2] = floorf(xyz->mVec[2]+0.5); +#endif +} + +void +cnanovdb_coord_fract(cnanovdb_coord *RESTRICT coord, cnanovdb_Vec3F *RESTRICT fraction, const cnanovdb_Vec3F *RESTRICT xyz) +{ +#ifdef __OPENCL_VERSION__ + float i0, i1, i2; + fraction->mVec[0] = fract(xyz->mVec[0], &i0); + coord->mVec[0] = i0; + fraction->mVec[1] = fract(xyz->mVec[1], &i1); + coord->mVec[1] = i1; + fraction->mVec[2] = fract(xyz->mVec[2], &i2); + coord->mVec[2] = i2; +#else + float i0, i1, i2; + i0 = floorf(xyz->mVec[0]); + fraction->mVec[0] = xyz->mVec[0] - i0; + coord->mVec[0] = i0; + i1 = floorf(xyz->mVec[1]); + fraction->mVec[1] = xyz->mVec[1] - i1; + coord->mVec[1] = i1; + i2 = floorf(xyz->mVec[2]); + fraction->mVec[2] = xyz->mVec[2] - i2; + coord->mVec[2] = i2; +#endif +} + +#define CREATE_STENCIL(VALUETYPE, SUFFIX) \ +typedef struct \ +{ \ + VALUETYPE mStencil[2][2][2]; \ + cnanovdb_coord mCoord; \ +} cnanovdb_stencil1##SUFFIX; \ + \ +void \ +cnanovdb_stencil1##SUFFIX##_clear(cnanovdb_stencil1##SUFFIX *RESTRICT stencil) \ +{ \ + /* Invalid coords. 
*/ \ + stencil->mCoord.mVec[0] = 0x80000000; \ + stencil->mCoord.mVec[1] = 0x80000000; \ + stencil->mCoord.mVec[2] = 0x80000000; \ +} \ + \ +void \ +cnanovdb_stencil1##SUFFIX##_fill(cnanovdb_stencil1##SUFFIX *RESTRICT stencil, cnanovdb_readaccessor *RESTRICT acc, cnanovdb_coord *RESTRICT coord) \ +{ \ + stencil->mStencil[0][0][0] = cnanovdb_readaccessor_getValue##SUFFIX(acc, coord); \ + coord->mVec[2] += 1; \ + stencil->mStencil[0][0][1] = cnanovdb_readaccessor_getValue##SUFFIX(acc, coord); \ + coord->mVec[1] += 1; \ + stencil->mStencil[0][1][1] = cnanovdb_readaccessor_getValue##SUFFIX(acc, coord); \ + coord->mVec[2] -= 1; \ + stencil->mStencil[0][1][0] = cnanovdb_readaccessor_getValue##SUFFIX(acc, coord); \ + \ + coord->mVec[0] += 1; \ + stencil->mStencil[1][1][0] = cnanovdb_readaccessor_getValue##SUFFIX(acc, coord); \ + coord->mVec[2] += 1; \ + stencil->mStencil[1][1][1] = cnanovdb_readaccessor_getValue##SUFFIX(acc, coord); \ + coord->mVec[1] -= 1; \ + stencil->mStencil[1][0][1] = cnanovdb_readaccessor_getValue##SUFFIX(acc, coord); \ + coord->mVec[2] -= 1; \ + stencil->mStencil[1][0][0] = cnanovdb_readaccessor_getValue##SUFFIX(acc, coord); \ + coord->mVec[0] -= 1; \ + \ + stencil->mCoord.mVec[0] = coord->mVec[0]; \ + stencil->mCoord.mVec[1] = coord->mVec[1]; \ + stencil->mCoord.mVec[2] = coord->mVec[2]; \ +} \ + \ +void \ +cnanovdb_stencil1##SUFFIX##_update(cnanovdb_stencil1##SUFFIX *RESTRICT stencil, cnanovdb_readaccessor *RESTRICT acc, cnanovdb_coord *RESTRICT coord) \ +{ \ + uint32_t change = (coord->mVec[0] ^ stencil->mCoord.mVec[0]) | \ + (coord->mVec[1] ^ stencil->mCoord.mVec[1]) | \ + (coord->mVec[2] ^ stencil->mCoord.mVec[2]); \ + if (!change) \ + return; \ + \ + cnanovdb_stencil1##SUFFIX##_fill(stencil, acc, coord); \ +} \ +/**/ +CREATE_STENCIL(float, F) +CREATE_STENCIL(cnanovdb_Vec3F, F3) + + +#define CREATE_LERPSIMPLE(VALUETYPE, SUFFIX) \ +VALUETYPE \ +cnanovdb_lerp##SUFFIX(VALUETYPE a, VALUETYPE b, float w) \ +{ \ + return a + w * (b - a); \ +} \ +/**/ + +CREATE_LERPSIMPLE(float, F) +CREATE_LERPSIMPLE(double, D) + +cnanovdb_Vec3F +cnanovdb_lerpF3(cnanovdb_Vec3F a, cnanovdb_Vec3F b, float w) +{ + a.mVec[0] = cnanovdb_lerpF(a.mVec[0], b.mVec[0], w); + a.mVec[1] = cnanovdb_lerpF(a.mVec[1], b.mVec[1], w); + a.mVec[2] = cnanovdb_lerpF(a.mVec[2], b.mVec[2], w); + return a; +} + +#define CREATE_SAMPLE(VALUETYPE, SUFFIX) \ +VALUETYPE \ +cnanovdb_sample##SUFFIX##_nearest(cnanovdb_readaccessor *RESTRICT acc, const cnanovdb_Vec3F *RESTRICT xyz) \ +{ \ + cnanovdb_coord coord; \ + cnanovdb_coord_round(&coord, xyz); \ + return cnanovdb_readaccessor_getValue##SUFFIX(acc, &coord); \ +} \ + \ +VALUETYPE \ +cnanovdb_sample##SUFFIX##_trilinear(cnanovdb_readaccessor *RESTRICT acc, const cnanovdb_Vec3F *RESTRICT xyz) \ +{ \ + cnanovdb_coord coord; \ + cnanovdb_Vec3F fraction; \ + cnanovdb_coord_fract(&coord, &fraction, xyz); \ + \ + VALUETYPE vx, vx1, vy, vy1, vz, vz1; \ + \ + vz = cnanovdb_readaccessor_getValue##SUFFIX(acc, &coord); \ + coord.mVec[2] += 1; \ + vz1 = cnanovdb_readaccessor_getValue##SUFFIX(acc, &coord); \ + vy = cnanovdb_lerp##SUFFIX(vz, vz1, fraction.mVec[2]); \ + \ + coord.mVec[1] += 1; \ + \ + vz1 = cnanovdb_readaccessor_getValue##SUFFIX(acc, &coord); \ + coord.mVec[2] -= 1; \ + vz = cnanovdb_readaccessor_getValue##SUFFIX(acc, &coord); \ + vy1 = cnanovdb_lerp##SUFFIX(vz, vz1, fraction.mVec[2]); \ + \ + vx = cnanovdb_lerp##SUFFIX(vy, vy1, fraction.mVec[1]); \ + \ + coord.mVec[0] += 1; \ + \ + vz = cnanovdb_readaccessor_getValue##SUFFIX(acc, &coord); \ + coord.mVec[2] += 1; \ + 
vz1 = cnanovdb_readaccessor_getValue##SUFFIX(acc, &coord); \ + vy1 = cnanovdb_lerp##SUFFIX(vz, vz1, fraction.mVec[2]); \ + \ + coord.mVec[1] -= 1; \ + \ + vz1 = cnanovdb_readaccessor_getValue##SUFFIX(acc, &coord); \ + coord.mVec[2] -= 1; \ + vz = cnanovdb_readaccessor_getValue##SUFFIX(acc, &coord); \ + vy = cnanovdb_lerp##SUFFIX(vz, vz1, fraction.mVec[2]); \ + \ + vx1 = cnanovdb_lerp##SUFFIX(vy, vy1, fraction.mVec[1]); \ + \ + return cnanovdb_lerp##SUFFIX(vx, vx1, fraction.mVec[0]); \ +} \ + \ +VALUETYPE \ +cnanovdb_sample##SUFFIX##_trilinear_stencil(cnanovdb_stencil1##SUFFIX *RESTRICT stencil, cnanovdb_readaccessor *RESTRICT acc, const cnanovdb_Vec3F *RESTRICT xyz) \ +{ \ + cnanovdb_coord coord; \ + cnanovdb_Vec3F fraction; \ + cnanovdb_coord_fract(&coord, &fraction, xyz); \ + \ + cnanovdb_stencil1##SUFFIX##_update(stencil, acc, &coord); \ + \ + VALUETYPE vx, vx1, vy, vy1, vz, vz1; \ + \ + vz = stencil->mStencil[0][0][0]; \ + vz1 = stencil->mStencil[0][0][1]; \ + vy = cnanovdb_lerp##SUFFIX(vz, vz1, fraction.mVec[2]); \ + \ + vz = stencil->mStencil[0][1][0]; \ + vz1 = stencil->mStencil[0][1][1]; \ + vy1 = cnanovdb_lerp##SUFFIX(vz, vz1, fraction.mVec[2]); \ + \ + vx = cnanovdb_lerp##SUFFIX(vy, vy1, fraction.mVec[1]); \ + \ + vz = stencil->mStencil[1][1][0]; \ + vz1 = stencil->mStencil[1][1][1]; \ + vy1 = cnanovdb_lerp##SUFFIX(vz, vz1, fraction.mVec[2]); \ + \ + vz = stencil->mStencil[1][0][0]; \ + vz1 = stencil->mStencil[1][0][1]; \ + vy = cnanovdb_lerp##SUFFIX(vz, vz1, fraction.mVec[2]); \ + \ + vx1 = cnanovdb_lerp##SUFFIX(vy, vy1, fraction.mVec[1]); \ + \ + return cnanovdb_lerp##SUFFIX(vx, vx1, fraction.mVec[0]); \ +} \ +/**/ +CREATE_SAMPLE(float, F) +CREATE_SAMPLE(cnanovdb_Vec3F, F3) + +void +cnanovdb_sampleF_gradient(cnanovdb_Vec3F *RESTRICT ret, cnanovdb_readaccessor *RESTRICT acc, const cnanovdb_Vec3F *RESTRICT xyz) +{ + cnanovdb_Vec3F qxyz; + qxyz.mVec[0] = xyz->mVec[0]; + qxyz.mVec[1] = xyz->mVec[1]; + qxyz.mVec[2] = xyz->mVec[2]; + for (int i = 0; i < 3; i++) + { + float sp, sm; + + qxyz.mVec[i] -= 0.5; + sm = cnanovdb_sampleF_trilinear(acc, &qxyz); + qxyz.mVec[i] += 1.0; + sp = cnanovdb_sampleF_trilinear(acc, &qxyz); + qxyz.mVec[i] -= 0.5; + ret->mVec[i] = sp - sm; + } +} + +void +cnanovdb_sampleF_gradient0(cnanovdb_Vec3F *RESTRICT ret, cnanovdb_readaccessor *RESTRICT acc, const cnanovdb_Vec3F *RESTRICT xyz) +{ + cnanovdb_coord coord; + cnanovdb_Vec3F fraction; + cnanovdb_coord_fract(&coord, &fraction, xyz); + + float stencil[2][2][2]; + + stencil[0][0][0] = cnanovdb_readaccessor_getValueF(acc, &coord); + coord.mVec[2] += 1; + stencil[0][0][1] = cnanovdb_readaccessor_getValueF(acc, &coord); + coord.mVec[1] += 1; + stencil[0][1][1] = cnanovdb_readaccessor_getValueF(acc, &coord); + coord.mVec[2] -= 1; + stencil[0][1][0] = cnanovdb_readaccessor_getValueF(acc, &coord); + + coord.mVec[0] += 1; + stencil[1][1][0] = cnanovdb_readaccessor_getValueF(acc, &coord); + coord.mVec[2] += 1; + stencil[1][1][1] = cnanovdb_readaccessor_getValueF(acc, &coord); + coord.mVec[1] -= 1; + stencil[1][0][1] = cnanovdb_readaccessor_getValueF(acc, &coord); + coord.mVec[2] -= 1; + stencil[1][0][0] = cnanovdb_readaccessor_getValueF(acc, &coord); + + float D[4]; + + D[0] = stencil[0][0][1] - stencil[0][0][0]; + D[1] = stencil[0][1][1] - stencil[0][1][0]; + D[2] = stencil[1][0][1] - stencil[1][0][0]; + D[3] = stencil[1][1][1] - stencil[1][1][0]; + + ret->mVec[2] = cnanovdb_lerpF( + cnanovdb_lerpF(D[0], D[1], fraction.mVec[1]), + cnanovdb_lerpF(D[2], D[3], fraction.mVec[1]), + fraction.mVec[0] ); + + float w = 
fraction.mVec[2]; + D[0] = stencil[0][0][0] + D[0] * w; + D[1] = stencil[0][1][0] + D[1] * w; + D[2] = stencil[1][0][0] + D[2] * w; + D[3] = stencil[1][1][0] + D[3] * w; + + ret->mVec[0] = cnanovdb_lerpF(D[2], D[3], fraction.mVec[1]) + - cnanovdb_lerpF(D[0], D[1], fraction.mVec[1]); + + ret->mVec[1] = cnanovdb_lerpF(D[1] - D[0], D[3] - D[2], fraction.mVec[0]); +} + +void +cnanovdb_sampleF_gradient0_stencil(cnanovdb_Vec3F *RESTRICT ret, cnanovdb_stencil1F *RESTRICT stencil, cnanovdb_readaccessor *RESTRICT acc, const cnanovdb_Vec3F *RESTRICT xyz) +{ + cnanovdb_coord coord; + cnanovdb_Vec3F fraction; + cnanovdb_coord_fract(&coord, &fraction, xyz); + + cnanovdb_stencil1F_update(stencil, acc, &coord); + + float D[4]; + + D[0] = stencil->mStencil[0][0][1] - stencil->mStencil[0][0][0]; + D[1] = stencil->mStencil[0][1][1] - stencil->mStencil[0][1][0]; + D[2] = stencil->mStencil[1][0][1] - stencil->mStencil[1][0][0]; + D[3] = stencil->mStencil[1][1][1] - stencil->mStencil[1][1][0]; + + ret->mVec[2] = cnanovdb_lerpF( + cnanovdb_lerpF(D[0], D[1], fraction.mVec[1]), + cnanovdb_lerpF(D[2], D[3], fraction.mVec[1]), + fraction.mVec[0] ); + + float w = fraction.mVec[2]; + D[0] = stencil->mStencil[0][0][0] + D[0] * w; + D[1] = stencil->mStencil[0][1][0] + D[1] * w; + D[2] = stencil->mStencil[1][0][0] + D[2] * w; + D[3] = stencil->mStencil[1][1][0] + D[3] * w; + + ret->mVec[0] = cnanovdb_lerpF(D[2], D[3], fraction.mVec[1]) + - cnanovdb_lerpF(D[0], D[1], fraction.mVec[1]); + + ret->mVec[1] = cnanovdb_lerpF(D[1] - D[0], D[3] - D[2], fraction.mVec[0]); +} + + +#endif diff --git a/external/nanovdb/math/DitherLUT.h b/external/nanovdb/math/DitherLUT.h new file mode 100644 index 00000000..7add4a6f --- /dev/null +++ b/external/nanovdb/math/DitherLUT.h @@ -0,0 +1,189 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +/// @author Jeff Lait +/// +/// @date May 13, 2021 +/// +/// @file DitherLUT.h +/// +/// @brief Defines look up table to do dithering of 8^3 leaf nodes. + +#ifndef NANOVDB_DITHERLUT_HAS_BEEN_INCLUDED +#define NANOVDB_DITHERLUT_HAS_BEEN_INCLUDED + +#include // for __hostdev__, Vec3, Min, Max, Pow2, Pow3, Pow4 + +namespace nanovdb { + +namespace math { + +class DitherLUT +{ + const bool mEnable; +public: + /// @brief Constructor with an optional scaling factor for the dithering + __hostdev__ DitherLUT(bool enable = true) : mEnable(enable) {} + + /// @brief Retrieves dither threshold for an offset within an 8^3 leaf nodes. + /// + /// @param offset into the lookup table of size 512 + __hostdev__ float operator()(const int offset) + { + +// This table was generated with +/************** + +static constexpr inline uint32 +SYSwang_inthash(uint32 key) +{ + // From http://www.concentric.net/~Ttwang/tech/inthash.htm + key += ~(key << 16); + key ^= (key >> 5); + key += (key << 3); + key ^= (key >> 13); + key += ~(key << 9); + key ^= (key >> 17); + return key; +} + +static void +ut_initDitherR(float *pattern, float offset, + int x, int y, int z, int res, int goalres) +{ + // These offsets are designed to maximize the difference between + // dither values in nearby voxels within a given 2x2x2 cell, without + // producing axis-aligned artifacts. The are organized in row-major + // order. 
+ static const float theDitherOffset[] = {0,4,6,2,5,1,3,7}; + static const float theScale = 0.125F; + int key = (((z << res) + y) << res) + x; + + if (res == goalres) + { + pattern[key] = offset; + return; + } + + // Randomly flip (on each axis) the dithering patterns used by the + // subcells. This key is xor'd with the subcell index below before + // looking up in the dither offset list. + key = SYSwang_inthash(key) & 7; + + x <<= 1; + y <<= 1; + z <<= 1; + + offset *= theScale; + for (int i = 0; i < 8; i++) + ut_initDitherR(pattern, offset+theDitherOffset[i ^ key]*theScale, + x+(i&1), y+((i&2)>>1), z+((i&4)>>2), res+1, goalres); +} + +// This is a compact algorithm that accomplishes essentially the same thing +// as ut_initDither() above. We should eventually switch to use this and +// clean the dead code. +static fpreal32 * +ut_initDitherRecursive(int goalres) +{ + const int nfloat = 1 << (goalres*3); + float *pattern = new float[nfloat]; + ut_initDitherR(pattern, 1.0F, 0, 0, 0, 0, goalres); + + // This has built an even spacing from 1/nfloat to 1.0. + // however, our dither pattern should be 1/(nfloat+1) to nfloat/(nfloat+1) + // So we do a correction here. Note that the earlier calculations are + // done with powers of 2 so are exact, so it does make sense to delay + // the renormalization to this pass. + float correctionterm = nfloat / (nfloat+1.0F); + for (int i = 0; i < nfloat; i++) + pattern[i] *= correctionterm; + return pattern; +} + + theDitherMatrix = ut_initDitherRecursive(3); + + for (int i = 0; i < 512/8; i ++) + { + for (int j = 0; j < 8; j ++) + std::cout << theDitherMatrix[i*8+j] << "f, "; + std::cout << std::endl; + } + + **************/ + static const float LUT[512] = + { + 0.14425f, 0.643275f, 0.830409f, 0.331384f, 0.105263f, 0.604289f, 0.167641f, 0.666667f, + 0.892788f, 0.393762f, 0.0818713f, 0.580897f, 0.853801f, 0.354776f, 0.916179f, 0.417154f, + 0.612086f, 0.11306f, 0.79922f, 0.300195f, 0.510721f, 0.0116959f, 0.947368f, 0.448343f, + 0.362573f, 0.861598f, 0.0506823f, 0.549708f, 0.261209f, 0.760234f, 0.19883f, 0.697856f, + 0.140351f, 0.639376f, 0.576998f, 0.0779727f, 0.522417f, 0.0233918f, 0.460039f, 0.959064f, + 0.888889f, 0.389864f, 0.327485f, 0.826511f, 0.272904f, 0.77193f, 0.709552f, 0.210526f, + 0.483431f, 0.982456f, 0.296296f, 0.795322f, 0.116959f, 0.615984f, 0.0545809f, 0.553606f, + 0.732943f, 0.233918f, 0.545809f, 0.0467836f, 0.865497f, 0.366472f, 0.803119f, 0.304094f, + 0.518519f, 0.0194932f, 0.45614f, 0.955166f, 0.729045f, 0.230019f, 0.54191f, 0.042885f, + 0.269006f, 0.768031f, 0.705653f, 0.206628f, 0.479532f, 0.978558f, 0.292398f, 0.791423f, + 0.237817f, 0.736842f, 0.424951f, 0.923977f, 0.136452f, 0.635478f, 0.323587f, 0.822612f, + 0.986355f, 0.487329f, 0.674464f, 0.175439f, 0.88499f, 0.385965f, 0.573099f, 0.0740741f, + 0.51462f, 0.0155945f, 0.202729f, 0.701754f, 0.148148f, 0.647174f, 0.834308f, 0.335283f, + 0.265107f, 0.764133f, 0.951267f, 0.452242f, 0.896686f, 0.397661f, 0.08577f, 0.584795f, + 0.8577f, 0.358674f, 0.920078f, 0.421053f, 0.740741f, 0.241715f, 0.678363f, 0.179337f, + 0.109162f, 0.608187f, 0.17154f, 0.670565f, 0.491228f, 0.990253f, 0.42885f, 0.927875f, + 0.0662768f, 0.565302f, 0.62768f, 0.128655f, 0.183236f, 0.682261f, 0.744639f, 0.245614f, + 0.814815f, 0.315789f, 0.378168f, 0.877193f, 0.931774f, 0.432749f, 0.495127f, 0.994152f, + 0.0350877f, 0.534113f, 0.97076f, 0.471735f, 0.214425f, 0.71345f, 0.526316f, 0.0272904f, + 0.783626f, 0.2846f, 0.222222f, 0.721248f, 0.962963f, 0.463938f, 0.276803f, 0.775828f, + 0.966862f, 0.467836f, 0.405458f, 
0.904483f, 0.0701754f, 0.569201f, 0.881092f, 0.382066f, + 0.218324f, 0.717349f, 0.654971f, 0.155945f, 0.818713f, 0.319688f, 0.132554f, 0.631579f, + 0.0623782f, 0.561404f, 0.748538f, 0.249513f, 0.912281f, 0.413255f, 0.974659f, 0.475634f, + 0.810916f, 0.311891f, 0.499025f, 0.998051f, 0.163743f, 0.662768f, 0.226121f, 0.725146f, + 0.690058f, 0.191033f, 0.00389864f, 0.502924f, 0.557505f, 0.0584795f, 0.120858f, 0.619883f, + 0.440546f, 0.939571f, 0.752437f, 0.253411f, 0.307992f, 0.807018f, 0.869396f, 0.37037f, + 0.658869f, 0.159844f, 0.346979f, 0.846004f, 0.588694f, 0.0896686f, 0.152047f, 0.651072f, + 0.409357f, 0.908382f, 0.596491f, 0.0974659f, 0.339181f, 0.838207f, 0.900585f, 0.401559f, + 0.34308f, 0.842105f, 0.779727f, 0.280702f, 0.693957f, 0.194932f, 0.25731f, 0.756335f, + 0.592593f, 0.0935673f, 0.0311891f, 0.530214f, 0.444444f, 0.94347f, 0.506823f, 0.00779727f, + 0.68616f, 0.187135f, 0.124756f, 0.623782f, 0.288499f, 0.787524f, 0.350877f, 0.849903f, + 0.436647f, 0.935673f, 0.873294f, 0.374269f, 0.538012f, 0.0389864f, 0.60039f, 0.101365f, + 0.57115f, 0.0721248f, 0.758285f, 0.259259f, 0.719298f, 0.220273f, 0.532164f, 0.0331384f, + 0.321637f, 0.820663f, 0.00974659f, 0.508772f, 0.469786f, 0.968811f, 0.282651f, 0.781676f, + 0.539961f, 0.0409357f, 0.727096f, 0.22807f, 0.500975f, 0.00194932f, 0.563353f, 0.0643275f, + 0.290448f, 0.789474f, 0.477583f, 0.976608f, 0.251462f, 0.750487f, 0.31384f, 0.812865f, + 0.94152f, 0.442495f, 0.879142f, 0.380117f, 0.37232f, 0.871345f, 0.309942f, 0.808967f, + 0.192982f, 0.692008f, 0.130604f, 0.62963f, 0.621832f, 0.122807f, 0.559454f, 0.0604289f, + 0.660819f, 0.161793f, 0.723197f, 0.224172f, 0.403509f, 0.902534f, 0.840156f, 0.341131f, + 0.411306f, 0.910331f, 0.473684f, 0.97271f, 0.653021f, 0.153996f, 0.0916179f, 0.590643f, + 0.196881f, 0.695906f, 0.384016f, 0.883041f, 0.0955166f, 0.594542f, 0.157895f, 0.65692f, + 0.945419f, 0.446394f, 0.633528f, 0.134503f, 0.844055f, 0.345029f, 0.906433f, 0.407407f, + 0.165692f, 0.664717f, 0.103314f, 0.602339f, 0.126706f, 0.625731f, 0.189084f, 0.688109f, + 0.91423f, 0.415205f, 0.851852f, 0.352827f, 0.875244f, 0.376218f, 0.937622f, 0.438596f, + 0.317739f, 0.816764f, 0.255361f, 0.754386f, 0.996101f, 0.497076f, 0.933723f, 0.434698f, + 0.567251f, 0.0682261f, 0.504873f, 0.00584795f, 0.247563f, 0.746589f, 0.185185f, 0.684211f, + 0.037037f, 0.536062f, 0.0994152f, 0.598441f, 0.777778f, 0.278752f, 0.465887f, 0.964912f, + 0.785575f, 0.28655f, 0.847953f, 0.348928f, 0.0292398f, 0.528265f, 0.7154f, 0.216374f, + 0.39961f, 0.898636f, 0.961014f, 0.461988f, 0.0487329f, 0.547758f, 0.111111f, 0.610136f, + 0.649123f, 0.150097f, 0.212476f, 0.711501f, 0.797271f, 0.298246f, 0.859649f, 0.360624f, + 0.118908f, 0.617934f, 0.0565302f, 0.555556f, 0.329435f, 0.82846f, 0.516569f, 0.0175439f, + 0.867446f, 0.368421f, 0.805068f, 0.306043f, 0.578947f, 0.079922f, 0.267057f, 0.766082f, + 0.270955f, 0.76998f, 0.707602f, 0.208577f, 0.668616f, 0.169591f, 0.606238f, 0.107212f, + 0.520468f, 0.0214425f, 0.45809f, 0.957115f, 0.419103f, 0.918129f, 0.356725f, 0.855751f, + 0.988304f, 0.489279f, 0.426901f, 0.925926f, 0.450292f, 0.949318f, 0.512671f, 0.0136452f, + 0.239766f, 0.738791f, 0.676413f, 0.177388f, 0.699805f, 0.20078f, 0.263158f, 0.762183f, + 0.773879f, 0.274854f, 0.337232f, 0.836257f, 0.672515f, 0.173489f, 0.734893f, 0.235867f, + 0.0253411f, 0.524366f, 0.586745f, 0.0877193f, 0.423002f, 0.922027f, 0.48538f, 0.984405f, + 0.74269f, 0.243665f, 0.680312f, 0.181287f, 0.953216f, 0.454191f, 0.1423f, 0.641326f, + 0.493177f, 0.992203f, 0.430799f, 0.929825f, 0.204678f, 0.703704f, 
0.890838f, 0.391813f, + 0.894737f, 0.395712f, 0.0838207f, 0.582846f, 0.0448343f, 0.54386f, 0.231969f, 0.730994f, + 0.146199f, 0.645224f, 0.832359f, 0.333333f, 0.793372f, 0.294347f, 0.980507f, 0.481481f, + 0.364522f, 0.863548f, 0.80117f, 0.302144f, 0.824561f, 0.325536f, 0.138402f, 0.637427f, + 0.614035f, 0.11501f, 0.0526316f, 0.551657f, 0.0760234f, 0.575049f, 0.88694f, 0.387914f, + }; + return mEnable ? LUT[offset & 511] : 0.5f;// branch prediction should optimize this! + } +}; // DitherLUT class + +}// namspace math + +}// namespace nanovdb + +#endif // NANOVDB_DITHERLUT_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/math/HDDA.h b/external/nanovdb/math/HDDA.h new file mode 100644 index 00000000..c72a58a7 --- /dev/null +++ b/external/nanovdb/math/HDDA.h @@ -0,0 +1,510 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/// @file HDDA.h +/// +/// @author Ken Museth +/// +/// @brief Hierarchical Digital Differential Analyzers specialized for VDB. + +#ifndef NANOVDB_HDDA_H_HAS_BEEN_INCLUDED +#define NANOVDB_HDDA_H_HAS_BEEN_INCLUDED + +// Comment out to disable this explicit round-off check +#define ENFORCE_FORWARD_STEPPING + +#include // only dependency + +namespace nanovdb::math { + +/// @brief A Digital Differential Analyzer specialized for OpenVDB grids +/// @note Conceptually similar to Bresenham's line algorithm applied +/// to a 3D Ray intersecting OpenVDB nodes or voxels. Log2Dim = 0 +/// corresponds to a voxel and Log2Dim a tree node of size 2^Log2Dim. +/// +/// @note The Ray template class is expected to have the following +/// methods: test(time), t0(), t1(), invDir(), and operator()(time). +/// See the example Ray class above for their definition. +template +class HDDA +{ +public: + using RealType = typename RayT::RealType; + using RealT = RealType; + using Vec3Type = typename RayT::Vec3Type; + using Vec3T = Vec3Type; + using CoordType = CoordT; + + /// @brief Default ctor + HDDA() = default; + + /// @brief ctor from ray and dimension at which the DDA marches + __hostdev__ HDDA(const RayT& ray, int dim) { this->init(ray, dim); } + + /// @brief Re-initializes the HDDA + __hostdev__ void init(const RayT& ray, RealT startTime, RealT maxTime, int dim) + { + assert(startTime <= maxTime); + mDim = dim; + mT0 = startTime; + mT1 = maxTime; + const Vec3T &pos = ray(mT0), &dir = ray.dir(), &inv = ray.invDir(); + mVoxel = RoundDown(pos) & (~(dim - 1)); + for (int axis = 0; axis < 3; ++axis) { + if (dir[axis] == RealT(0)) { //handles dir = +/- 0 + mNext[axis] = Maximum::value(); //i.e. disabled! 
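+                // A zero direction component never crosses a grid plane on this axis, so its
+                // crossing time stays "infinite" and the step below is set to zero.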
+ mStep[axis] = 0; + } else if (inv[axis] > 0) { + mStep[axis] = 1; + mNext[axis] = mT0 + (mVoxel[axis] + dim - pos[axis]) * inv[axis]; + mDelta[axis] = inv[axis]; + } else { + mStep[axis] = -1; + mNext[axis] = mT0 + (mVoxel[axis] - pos[axis]) * inv[axis]; + mDelta[axis] = -inv[axis]; + } + } + } + + /// @brief Simular to init above except it uses the bounds of the input ray + __hostdev__ void init(const RayT& ray, int dim) { this->init(ray, ray.t0(), ray.t1(), dim); } + + /// @brief Updates the HDDA to march with the specified dimension + __hostdev__ bool update(const RayT& ray, int dim) + { + if (mDim == dim) + return false; + mDim = dim; + const Vec3T &pos = ray(mT0), &inv = ray.invDir(); + mVoxel = RoundDown(pos) & (~(dim - 1)); + for (int axis = 0; axis < 3; ++axis) { + if (mStep[axis] == 0) + continue; + mNext[axis] = mT0 + (mVoxel[axis] - pos[axis]) * inv[axis]; + if (mStep[axis] > 0) + mNext[axis] += dim * inv[axis]; + } + + return true; + } + + __hostdev__ int dim() const { return mDim; } + + /// @brief Increment the voxel index to next intersected voxel or node + /// and returns true if the step in time does not exceed maxTime. + __hostdev__ bool step() + { + const int axis = MinIndex(mNext); +#if 1 + switch (axis) { + case 0: + return step<0>(); + case 1: + return step<1>(); + default: + return step<2>(); + } +#else + mT0 = mNext[axis]; + mNext[axis] += mDim * mDelta[axis]; + mVoxel[axis] += mDim * mStep[axis]; + return mT0 <= mT1; +#endif + } + + /// @brief Return the index coordinates of the next node or voxel + /// intersected by the ray. If Log2Dim = 0 the return value is the + /// actual signed coordinate of the voxel, else it is the origin + /// of the corresponding VDB tree node or tile. + /// @note Incurs no computational overhead. + __hostdev__ const CoordT& voxel() const { return mVoxel; } + + /// @brief Return the time (parameterized along the Ray) of the + /// first hit of a tree node of size 2^Log2Dim. + /// @details This value is initialized to startTime or ray.t0() + /// depending on the constructor used. + /// @note Incurs no computational overhead. + __hostdev__ RealType time() const { return mT0; } + + /// @brief Return the maximum time (parameterized along the Ray). + __hostdev__ RealType maxTime() const { return mT1; } + + /// @brief Return the time (parameterized along the Ray) of the + /// second (i.e. next) hit of a tree node of size 2^Log2Dim. + /// @note Incurs a (small) computational overhead. 
+ __hostdev__ RealType next() const + { +#if 1 //def __CUDA_ARCH__ + return fminf(mT1, fminf(mNext[0], fminf(mNext[1], mNext[2]))); +#else + return std::min(mT1, std::min(mNext[0], std::min(mNext[1], mNext[2]))); +#endif + } + +private: + // helper to implement the general form + template + __hostdev__ bool step() + { +#ifdef ENFORCE_FORWARD_STEPPING + //if (mNext[axis] <= mT0) mNext[axis] += mT0 - mNext[axis] + fmaxf(mNext[axis]*1.0e-6f, 1.0e-6f); + //if (mNext[axis] <= mT0) mNext[axis] += mT0 - mNext[axis] + (mNext[axis] + 1.0f)*1.0e-6f; + if (mNext[axis] <= mT0) { + mNext[axis] += mT0 - 0.999999f * mNext[axis] + 1.0e-6f; + } +#endif + mT0 = mNext[axis]; + mNext[ axis] += mDim * mDelta[axis]; + mVoxel[axis] += mDim * mStep[ axis]; + return mT0 <= mT1; + } + + int32_t mDim; + RealT mT0, mT1; // min and max allowed times + CoordT mVoxel, mStep; // current voxel location and step to next voxel location + Vec3T mDelta, mNext; // delta time and next time +}; // class HDDA + +/////////////////////////////////////////// ZeroCrossing //////////////////////////////////////////// + +/// @brief returns true if the ray intersects a zero-crossing at the voxel level of the grid in the accessor +/// The empty-space ray-marching is performed at all levels of the tree using an +/// HDDA. If an intersection is detected, then ijk is updated with the index coordinate of the closest +/// voxel after the intersection point, v contains the grid values at ijk, and t is set to the time of +/// the intersection along the ray. +template +inline __hostdev__ bool ZeroCrossing(RayT& ray, AccT& acc, Coord& ijk, typename AccT::ValueType& v, float& t) +{ + if (!ray.clip(acc.root().bbox()) || ray.t1() > 1e20) + return false; // clip ray to bbox + static const float Delta = 1.0001f; + ijk = RoundDown(ray.start()); // first hit of bbox + HDDA hdda(ray, acc.getDim(ijk, ray)); + const auto v0 = acc.getValue(ijk); + while (hdda.step()) { + ijk = RoundDown(ray(hdda.time() + Delta)); + hdda.update(ray, acc.getDim(ijk, ray)); + if (hdda.dim() > 1 || !acc.isActive(ijk)) + continue; // either a tile value or an inactive voxel + while (hdda.step() && acc.isActive(hdda.voxel())) { // in the narrow band + v = acc.getValue(hdda.voxel()); + if (v * v0 < 0) { // zero crossing + ijk = hdda.voxel(); + t = hdda.time(); + return true; + } + } + } + return false; +} + +/////////////////////////////////////////// DDA //////////////////////////////////////////// + +/// @brief A Digital Differential Analyzer. Unlike HDDA (defined above) this DDA +/// uses a fixed step-size defined by the template parameter Dim! +/// +/// @note The Ray template class is expected to have the following +/// methods: test(time), t0(), t1(), invDir(), and operator()(time). +/// See the example Ray class above for their definition. 
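+// A minimal level-set marching sketch using ZeroCrossing above (illustrative only; the
+// grid, accessor and ray variable names are assumptions, not part of this header):
+//
+//   auto acc  = grid->getAccessor();        // grid: const nanovdb::NanoGrid<float>*
+//   auto iRay = wRay.worldToIndexF(*grid);  // wRay: nanovdb::math::Ray<float> in world space
+//   nanovdb::Coord ijk; float v = 0.0f, t = 0.0f;
+//   if (nanovdb::math::ZeroCrossing(iRay, acc, ijk, v, t)) {
+//       // first sign change found at index-space time t, in voxel ijk, with value v
+//   }
+//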
+template +class DDA +{ + static_assert(Dim >= 1, "Dim must be >= 1"); + +public: + using RealType = typename RayT::RealType; + using RealT = RealType; + using Vec3Type = typename RayT::Vec3Type; + using Vec3T = Vec3Type; + using CoordType = CoordT; + + /// @brief Default ctor + DDA() = default; + + /// @brief ctor from ray and dimension at which the DDA marches + __hostdev__ DDA(const RayT& ray) { this->init(ray); } + + /// @brief Re-initializes the DDA + __hostdev__ void init(const RayT& ray, RealT startTime, RealT maxTime) + { + assert(startTime <= maxTime); + mT0 = startTime; + mT1 = maxTime; + const Vec3T &pos = ray(mT0), &dir = ray.dir(), &inv = ray.invDir(); + mVoxel = RoundDown(pos) & (~(Dim - 1)); + for (int axis = 0; axis < 3; ++axis) { + if (dir[axis] == RealT(0)) { //handles dir = +/- 0 + mNext[axis] = Maximum::value(); //i.e. disabled! + mStep[axis] = 0; + } else if (inv[axis] > 0) { + mStep[axis] = Dim; + mNext[axis] = (mT0 + (mVoxel[axis] + Dim - pos[axis]) * inv[axis]); + mDelta[axis] = inv[axis]; + } else { + mStep[axis] = -Dim; + mNext[axis] = mT0 + (mVoxel[axis] - pos[axis]) * inv[axis]; + mDelta[axis] = -inv[axis]; + } + } + } + + /// @brief Simular to init above except it uses the bounds of the input ray + __hostdev__ void init(const RayT& ray) { this->init(ray, ray.t0(), ray.t1()); } + + /// @brief Increment the voxel index to next intersected voxel or node + /// and returns true if the step in time does not exceed maxTime. + __hostdev__ bool step() + { + const int axis = MinIndex(mNext); +#if 1 + switch (axis) { + case 0: + return step<0>(); + case 1: + return step<1>(); + default: + return step<2>(); + } +#else +#ifdef ENFORCE_FORWARD_STEPPING + if (mNext[axis] <= mT0) { + mNext[axis] += mT0 - 0.999999f * mNext[axis] + 1.0e-6f; + } +#endif + mT0 = mNext[axis]; + mNext[axis] += mDelta[axis]; + mVoxel[axis] += mStep[axis]; + return mT0 <= mT1; +#endif + } + + /// @brief Return the index coordinates of the next node or voxel + /// intersected by the ray. If Log2Dim = 0 the return value is the + /// actual signed coordinate of the voxel, else it is the origin + /// of the corresponding VDB tree node or tile. + /// @note Incurs no computational overhead. + __hostdev__ const CoordT& voxel() const { return mVoxel; } + + /// @brief Return the time (parameterized along the Ray) of the + /// first hit of a tree node of size 2^Log2Dim. + /// @details This value is initialized to startTime or ray.t0() + /// depending on the constructor used. + /// @note Incurs no computational overhead. + __hostdev__ RealType time() const { return mT0; } + + /// @brief Return the maximum time (parameterized along the Ray). + __hostdev__ RealType maxTime() const { return mT1; } + + /// @brief Return the time (parameterized along the Ray) of the + /// second (i.e. next) hit of a tree node of size 2^Log2Dim. + /// @note Incurs a (small) computational overhead. 
+ __hostdev__ RealType next() const + { + return Min(mT1, Min(mNext[0], Min(mNext[1], mNext[2]))); + } + + __hostdev__ int nextAxis() const + { + return nanovdb::math::MinIndex(mNext); + } + +private: + // helper to implement the general form + template + __hostdev__ bool step() + { +#ifdef ENFORCE_FORWARD_STEPPING + if (mNext[axis] <= mT0) { + mNext[axis] += mT0 - 0.999999f * mNext[axis] + 1.0e-6f; + } +#endif + mT0 = mNext[axis]; + mNext[axis] += mDelta[axis]; + mVoxel[axis] += mStep[axis]; + return mT0 <= mT1; + } + + RealT mT0, mT1; // min and max allowed times + CoordT mVoxel, mStep; // current voxel location and step to next voxel location + Vec3T mDelta, mNext; // delta time and next time +}; // class DDA + +/////////////////////////////////////////// ZeroCrossingNode //////////////////////////////////////////// + +template +inline __hostdev__ bool ZeroCrossingNode(RayT& ray, const NodeT& node, float v0, nanovdb::math::Coord& ijk, float& v, float& t) +{ + math::BBox bbox(node.origin(), node.origin() + Coord(node.dim() - 1)); + + if (!ray.clip(node.bbox())) { + return false; + } + + const float t0 = ray.t0(); + + static const float Delta = 1.0001f; + ijk = Coord::Floor(ray(ray.t0() + Delta)); + + t = t0; + v = 0; + + DDA dda(ray); + while (dda.step()) { + ijk = dda.voxel(); + + if (bbox.isInside(ijk) == false) + return false; + + v = node.getValue(ijk); + if (v * v0 < 0) { + t = dda.time(); + return true; + } + } + return false; +} + +/////////////////////////////////////////// TreeMarcher //////////////////////////////////////////// + +/// @brief returns true if the ray intersects an active value at any level of the grid in the accessor. +/// The empty-space ray-marching is performed at all levels of the tree using an +/// HDDA. If an intersection is detected, then ijk is updated with the index coordinate of the first +/// active voxel or tile, and t is set to the time of its intersection along the ray. +template +inline __hostdev__ bool firstActive(RayT& ray, AccT& acc, Coord &ijk, float& t) +{ + if (!ray.clip(acc.root().bbox()) || ray.t1() > 1e20) {// clip ray to bbox + return false;// missed or undefined bbox + } + static const float Delta = 1.0001f;// forward step-size along the ray to avoid getting stuck + t = ray.t0();// initiate time + ijk = RoundDown(ray.start()); // first voxel inside bbox + for (HDDA hdda(ray, acc.getDim(ijk, ray)); !acc.isActive(ijk); hdda.update(ray, acc.getDim(ijk, ray))) { + if (!hdda.step()) return false;// leap-frog HDDA and exit if ray bound is exceeded + t = hdda.time() + Delta;// update time + ijk = RoundDown( ray(t) );// update ijk + } + return true; +} + +/////////////////////////////////////////// TreeMarcher //////////////////////////////////////////// + +/// @brief A Tree Marcher for Generic Grids + +template +class TreeMarcher +{ +public: + using ChildT = typename NodeT::ChildNodeType; + using RealType = typename RayT::RealType; + using RealT = RealType; + using CoordType = CoordT; + + inline __hostdev__ TreeMarcher(AccT& acc) + : mAcc(acc) + { + } + + /// @brief Initialize the TreeMarcher with an index-space ray. + inline __hostdev__ bool init(const RayT& indexRay) + { + mRay = indexRay; + if (!mRay.clip(mAcc.root().bbox())) + return false; // clip ray to bbox + + // tweak the intersection span into the bbox. + // CAVEAT: this will potentially clip some tiny corner intersections. 
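+        // The Eps shrink below keeps t0/t1 strictly inside the clipped ray span, so the
+        // first lookup does not sample exactly on a bounding-box face.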
+ static const float Eps = 0.000001f; + const float t0 = mRay.t0() + Eps; + const float t1 = mRay.t1() - Eps; + if (t0 > t1) + return false; + + const CoordT ijk = RoundDown(mRay(t0)); + const uint32_t dim = mAcc.getDim(ijk, mRay); + mHdda.init(mRay, t0, t1, nanovdb::math::Max(dim, NodeT::dim())); + + mT0 = (dim <= ChildT::dim()) ? mHdda.time() : -1; // potentially begin a span. + mTmax = t1; + return true; + } + + /// @brief step the ray through the tree. If the ray hits a node then + /// populate t0 & t1, and the node. + /// @return true when a node of type NodeT is intersected, false otherwise. + inline __hostdev__ bool step(const NodeT** node, float& t0, float& t1) + { + // CAVEAT: if Delta is too large then it will clip corners of nodes in a visible way. + // but it has to be quite large when very far from the grid (due to fp32 rounding) + static const float Delta = 0.01f; + bool hddaIsValid; + + do { + t0 = mT0; + + auto currentNode = mAcc.template getNode(); + + // get next node intersection... + hddaIsValid = mHdda.step(); + const CoordT nextIjk = RoundDown(mRay(mHdda.time() + Delta)); + const auto nextDim = mAcc.getDim(nextIjk, mRay); + mHdda.update(mRay, (int)Max(nextDim, NodeT::dim())); + mT0 = (nextDim <= ChildT::dim()) ? mHdda.time() : -1; // potentially begin a span. + + if (t0 >= 0) { // we are in a span. + t1 = Min(mTmax, mHdda.time()); + + // TODO: clean this up! + if (t0 >= t1 || currentNode == nullptr) + continue; + + *node = currentNode; + return true; + } + + } while (hddaIsValid); + + return false; + } + + inline __hostdev__ const RayT& ray() const { return mRay; } + + inline __hostdev__ RayT& ray() { return mRay; } + +private: + AccT& mAcc; + RayT mRay; + HDDA mHdda; + float mT0; + float mTmax; +};// TreeMarcher + +/////////////////////////////////////////// PointTreeMarcher //////////////////////////////////////////// + +/// @brief A Tree Marcher for Point Grids +/// +/// @note This class will handle correctly offseting the ray by 0.5 to ensure that +/// the underlying HDDA will intersect with the grid-cells. See details below. + +template +class PointTreeMarcher : public TreeMarcher, RayT, AccT, CoordT> +{ + using BaseT = TreeMarcher, RayT, AccT, CoordT>; +public: + __hostdev__ PointTreeMarcher(AccT& acc) : BaseT(acc) {} + + /// @brief Initiates this instance with a ray in index space. + /// + /// @details An offset by 0.5 is applied to the ray to account for the fact that points in vdb + /// grids are bucketed into so-called grid cell, which are centered round grid voxels, + /// whereas the DDA is based on so-called grid nodes, which are coincident with grid + /// voxels. So, rather than offsettting the points by 0.5 to bring them into a grid + /// node representation this method offsets the eye of the ray by 0.5, which effectively + /// ensures that the DDA operates on grid cells as oppose to grid nodes. This subtle + /// but important offset by 0.5 is explined in more details in our online documentation. + __hostdev__ bool init(RayT ray) { return BaseT::init(ray.offsetEye(0.5)); } +};// PointTreeMarcher + +} // namespace nanovdb::math + +#endif // NANOVDB_HDDA_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/math/Math.h b/external/nanovdb/math/Math.h new file mode 100644 index 00000000..da3a6162 --- /dev/null +++ b/external/nanovdb/math/Math.h @@ -0,0 +1,1448 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! 
+ \file Math.h + + \author Ken Museth + + \date January 8, 2020 + + \brief Math functions and classes + +*/ + +#ifndef NANOVDB_MATH_MATH_H_HAS_BEEN_INCLUDED +#define NANOVDB_MATH_MATH_H_HAS_BEEN_INCLUDED + +#include // for __hostdev__ and lots of other utility functions + +namespace nanovdb {// ================================================================= + +namespace math {// ============================================================= + +// ----------------------------> Various math functions <------------------------------------- + +//@{ +/// @brief Pi constant taken from Boost to match old behaviour +template +inline __hostdev__ constexpr T pi() +{ + return 3.141592653589793238462643383279502884e+00; +} +template<> +inline __hostdev__ constexpr float pi() +{ + return 3.141592653589793238462643383279502884e+00F; +} +template<> +inline __hostdev__ constexpr double pi() +{ + return 3.141592653589793238462643383279502884e+00; +} +template<> +inline __hostdev__ constexpr long double pi() +{ + return 3.141592653589793238462643383279502884e+00L; +} +//@} + +//@{ +/// Tolerance for floating-point comparison +template +struct Tolerance; +template<> +struct Tolerance +{ + __hostdev__ static float value() { return 1e-8f; } +}; +template<> +struct Tolerance +{ + __hostdev__ static double value() { return 1e-15; } +}; +//@} + +//@{ +/// Delta for small floating-point offsets +template +struct Delta; +template<> +struct Delta +{ + __hostdev__ static float value() { return 1e-5f; } +}; +template<> +struct Delta +{ + __hostdev__ static double value() { return 1e-9; } +}; +//@} + +//@{ +/// Maximum floating-point values +template +struct Maximum; +#if defined(__CUDA_ARCH__) || defined(__HIP__) +template<> +struct Maximum +{ + __hostdev__ static int value() { return 2147483647; } +}; +template<> +struct Maximum +{ + __hostdev__ static uint32_t value() { return 4294967295u; } +}; +template<> +struct Maximum +{ + __hostdev__ static float value() { return 1e+38f; } +}; +template<> +struct Maximum +{ + __hostdev__ static double value() { return 1e+308; } +}; +#else +template +struct Maximum +{ + static T value() { return std::numeric_limits::max(); } +}; +#endif +//@} + +template +__hostdev__ inline bool isApproxZero(const Type& x) +{ + return !(x > Tolerance::value()) && !(x < -Tolerance::value()); +} + +template +__hostdev__ inline Type Min(Type a, Type b) +{ + return (a < b) ? a : b; +} +__hostdev__ inline int32_t Min(int32_t a, int32_t b) +{ + return int32_t(fminf(float(a), float(b))); +} +__hostdev__ inline uint32_t Min(uint32_t a, uint32_t b) +{ + return uint32_t(fminf(float(a), float(b))); +} +__hostdev__ inline float Min(float a, float b) +{ + return fminf(a, b); +} +__hostdev__ inline double Min(double a, double b) +{ + return fmin(a, b); +} +template +__hostdev__ inline Type Max(Type a, Type b) +{ + return (a > b) ? 
a : b; +} + +__hostdev__ inline int32_t Max(int32_t a, int32_t b) +{ + return int32_t(fmaxf(float(a), float(b))); +} +__hostdev__ inline uint32_t Max(uint32_t a, uint32_t b) +{ + return uint32_t(fmaxf(float(a), float(b))); +} +__hostdev__ inline float Max(float a, float b) +{ + return fmaxf(a, b); +} +__hostdev__ inline double Max(double a, double b) +{ + return fmax(a, b); +} +__hostdev__ inline float Clamp(float x, float a, float b) +{ + return Max(Min(x, b), a); +} +__hostdev__ inline double Clamp(double x, double a, double b) +{ + return Max(Min(x, b), a); +} + +__hostdev__ inline float Fract(float x) +{ + return x - floorf(x); +} +__hostdev__ inline double Fract(double x) +{ + return x - floor(x); +} + +__hostdev__ inline int32_t Floor(float x) +{ + return int32_t(floorf(x)); +} +__hostdev__ inline int32_t Floor(double x) +{ + return int32_t(floor(x)); +} + +__hostdev__ inline int32_t Ceil(float x) +{ + return int32_t(ceilf(x)); +} +__hostdev__ inline int32_t Ceil(double x) +{ + return int32_t(ceil(x)); +} + +template +__hostdev__ inline T Pow2(T x) +{ + return x * x; +} + +template +__hostdev__ inline T Pow3(T x) +{ + return x * x * x; +} + +template +__hostdev__ inline T Pow4(T x) +{ + return Pow2(x * x); +} +template +__hostdev__ inline T Abs(T x) +{ + return x < 0 ? -x : x; +} + +template<> +__hostdev__ inline float Abs(float x) +{ + return fabsf(x); +} + +template<> +__hostdev__ inline double Abs(double x) +{ + return fabs(x); +} + +template<> +__hostdev__ inline int Abs(int x) +{ + return abs(x); +} + +template class Vec3T> +__hostdev__ inline CoordT Round(const Vec3T& xyz); + +template class Vec3T> +__hostdev__ inline CoordT Round(const Vec3T& xyz) +{ + return CoordT(int32_t(rintf(xyz[0])), int32_t(rintf(xyz[1])), int32_t(rintf(xyz[2]))); + //return CoordT(int32_t(roundf(xyz[0])), int32_t(roundf(xyz[1])), int32_t(roundf(xyz[2])) ); + //return CoordT(int32_t(floorf(xyz[0] + 0.5f)), int32_t(floorf(xyz[1] + 0.5f)), int32_t(floorf(xyz[2] + 0.5f))); +} + +template class Vec3T> +__hostdev__ inline CoordT Round(const Vec3T& xyz) +{ + return CoordT(int32_t(floor(xyz[0] + 0.5)), int32_t(floor(xyz[1] + 0.5)), int32_t(floor(xyz[2] + 0.5))); +} + +template class Vec3T> +__hostdev__ inline CoordT RoundDown(const Vec3T& xyz) +{ + return CoordT(Floor(xyz[0]), Floor(xyz[1]), Floor(xyz[2])); +} + +//@{ +/// Return the square root of a floating-point value. +__hostdev__ inline float Sqrt(float x) +{ + return sqrtf(x); +} +__hostdev__ inline double Sqrt(double x) +{ + return sqrt(x); +} +//@} + +/// Return the sign of the given value as an integer (either -1, 0 or 1). +template +__hostdev__ inline T Sign(const T& x) +{ + return ((T(0) < x) ? T(1) : T(0)) - ((x < T(0)) ? 
T(1) : T(0)); +} + +template +__hostdev__ inline int MinIndex(const Vec3T& v) +{ +#if 0 + static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values + const int hashKey = ((v[0] < v[1]) << 2) + ((v[0] < v[2]) << 1) + (v[1] < v[2]); // ?*4+?*2+?*1 + return hashTable[hashKey]; +#else + if (v[0] < v[1] && v[0] < v[2]) + return 0; + if (v[1] < v[2]) + return 1; + else + return 2; +#endif +} + +template +__hostdev__ inline int MaxIndex(const Vec3T& v) +{ +#if 0 + static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values + const int hashKey = ((v[0] > v[1]) << 2) + ((v[0] > v[2]) << 1) + (v[1] > v[2]); // ?*4+?*2+?*1 + return hashTable[hashKey]; +#else + if (v[0] > v[1] && v[0] > v[2]) + return 0; + if (v[1] > v[2]) + return 1; + else + return 2; +#endif +} + +/// @brief round up byteSize to the nearest wordSize, e.g. to align to machine word: AlignUp +__hostdev__ inline uint64_t AlignUp(uint64_t byteCount) +{ + const uint64_t r = byteCount % wordSize; + return r ? byteCount - r + wordSize : byteCount; +} + +// ------------------------------> Coord <-------------------------------------- + +// forward declaration so we can define Coord::asVec3s and Coord::asVec3d +template +class Vec3; + +/// @brief Signed (i, j, k) 32-bit integer coordinate class, similar to openvdb::math::Coord +class Coord +{ + int32_t mVec[3]; // private member data - three signed index coordinates +public: + using ValueType = int32_t; + using IndexType = uint32_t; + + /// @brief Initialize all coordinates to zero. + __hostdev__ Coord() + : mVec{0, 0, 0} + { + } + + /// @brief Initializes all coordinates to the given signed integer. + __hostdev__ explicit Coord(ValueType n) + : mVec{n, n, n} + { + } + + /// @brief Initializes coordinate to the given signed integers. + __hostdev__ Coord(ValueType i, ValueType j, ValueType k) + : mVec{i, j, k} + { + } + + __hostdev__ Coord(ValueType* ptr) + : mVec{ptr[0], ptr[1], ptr[2]} + { + } + + __hostdev__ int32_t x() const { return mVec[0]; } + __hostdev__ int32_t y() const { return mVec[1]; } + __hostdev__ int32_t z() const { return mVec[2]; } + + __hostdev__ int32_t& x() { return mVec[0]; } + __hostdev__ int32_t& y() { return mVec[1]; } + __hostdev__ int32_t& z() { return mVec[2]; } + + __hostdev__ static Coord max() { return Coord(int32_t((1u << 31) - 1)); } + + __hostdev__ static Coord min() { return Coord(-int32_t((1u << 31) - 1) - 1); } + + __hostdev__ static size_t memUsage() { return sizeof(Coord); } + + /// @brief Return a const reference to the given Coord component. + /// @warning The argument is assumed to be 0, 1, or 2. + __hostdev__ const ValueType& operator[](IndexType i) const { return mVec[i]; } + + /// @brief Return a non-const reference to the given Coord component. + /// @warning The argument is assumed to be 0, 1, or 2. + __hostdev__ ValueType& operator[](IndexType i) { return mVec[i]; } + + /// @brief Assignment operator that works with openvdb::Coord + template + __hostdev__ Coord& operator=(const CoordT& other) + { + static_assert(sizeof(Coord) == sizeof(CoordT), "Mis-matched sizeof"); + mVec[0] = other[0]; + mVec[1] = other[1]; + mVec[2] = other[2]; + return *this; + } + + /// @brief Return a new instance with coordinates masked by the given unsigned integer. + __hostdev__ Coord operator&(IndexType n) const { return Coord(mVec[0] & n, mVec[1] & n, mVec[2] & n); } + + // @brief Return a new instance with coordinates left-shifted by the given unsigned integer. 
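+    // @details Illustrative sketch added here (not part of the upstream header): shifting a
+    // voxel coordinate left by a node's log2 dimension scales all three components, e.g.
+    // @code
+    // nanovdb::math::Coord ijk(1, 2, 3);
+    // nanovdb::math::Coord upper = ijk << 3u; // == Coord(8, 16, 24)
+    // @endcode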
+ __hostdev__ Coord operator<<(IndexType n) const { return Coord(mVec[0] << n, mVec[1] << n, mVec[2] << n); } + + // @brief Return a new instance with coordinates right-shifted by the given unsigned integer. + __hostdev__ Coord operator>>(IndexType n) const { return Coord(mVec[0] >> n, mVec[1] >> n, mVec[2] >> n); } + + /// @brief Return true if this Coord is lexicographically less than the given Coord. + __hostdev__ bool operator<(const Coord& rhs) const + { + return mVec[0] < rhs[0] ? true + : mVec[0] > rhs[0] ? false + : mVec[1] < rhs[1] ? true + : mVec[1] > rhs[1] ? false + : mVec[2] < rhs[2] ? true : false; + } + + /// @brief Return true if this Coord is lexicographically less or equal to the given Coord. + __hostdev__ bool operator<=(const Coord& rhs) const + { + return mVec[0] < rhs[0] ? true + : mVec[0] > rhs[0] ? false + : mVec[1] < rhs[1] ? true + : mVec[1] > rhs[1] ? false + : mVec[2] <=rhs[2] ? true : false; + } + + // @brief Return true if this Coord is lexicographically greater than the given Coord. + __hostdev__ bool operator>(const Coord& rhs) const + { + return mVec[0] > rhs[0] ? true + : mVec[0] < rhs[0] ? false + : mVec[1] > rhs[1] ? true + : mVec[1] < rhs[1] ? false + : mVec[2] > rhs[2] ? true : false; + } + + // @brief Return true if this Coord is lexicographically greater or equal to the given Coord. + __hostdev__ bool operator>=(const Coord& rhs) const + { + return mVec[0] > rhs[0] ? true + : mVec[0] < rhs[0] ? false + : mVec[1] > rhs[1] ? true + : mVec[1] < rhs[1] ? false + : mVec[2] >=rhs[2] ? true : false; + } + + // @brief Return true if the Coord components are identical. + __hostdev__ bool operator==(const Coord& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } + __hostdev__ bool operator!=(const Coord& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } + __hostdev__ Coord& operator&=(int n) + { + mVec[0] &= n; + mVec[1] &= n; + mVec[2] &= n; + return *this; + } + __hostdev__ Coord& operator<<=(uint32_t n) + { + mVec[0] <<= n; + mVec[1] <<= n; + mVec[2] <<= n; + return *this; + } + __hostdev__ Coord& operator>>=(uint32_t n) + { + mVec[0] >>= n; + mVec[1] >>= n; + mVec[2] >>= n; + return *this; + } + __hostdev__ Coord& operator+=(int n) + { + mVec[0] += n; + mVec[1] += n; + mVec[2] += n; + return *this; + } + __hostdev__ Coord operator+(const Coord& rhs) const { return Coord(mVec[0] + rhs[0], mVec[1] + rhs[1], mVec[2] + rhs[2]); } + __hostdev__ Coord operator-(const Coord& rhs) const { return Coord(mVec[0] - rhs[0], mVec[1] - rhs[1], mVec[2] - rhs[2]); } + __hostdev__ Coord operator-() const { return Coord(-mVec[0], -mVec[1], -mVec[2]); } + __hostdev__ Coord& operator+=(const Coord& rhs) + { + mVec[0] += rhs[0]; + mVec[1] += rhs[1]; + mVec[2] += rhs[2]; + return *this; + } + __hostdev__ Coord& operator-=(const Coord& rhs) + { + mVec[0] -= rhs[0]; + mVec[1] -= rhs[1]; + mVec[2] -= rhs[2]; + return *this; + } + + /// @brief Perform a component-wise minimum with the other Coord. + __hostdev__ Coord& minComponent(const Coord& other) + { + if (other[0] < mVec[0]) + mVec[0] = other[0]; + if (other[1] < mVec[1]) + mVec[1] = other[1]; + if (other[2] < mVec[2]) + mVec[2] = other[2]; + return *this; + } + + /// @brief Perform a component-wise maximum with the other Coord. 
+ __hostdev__ Coord& maxComponent(const Coord& other) + { + if (other[0] > mVec[0]) + mVec[0] = other[0]; + if (other[1] > mVec[1]) + mVec[1] = other[1]; + if (other[2] > mVec[2]) + mVec[2] = other[2]; + return *this; + } +#if defined(__CUDACC__) // the following functions only run on the GPU! + __device__ inline Coord& minComponentAtomic(const Coord& other) + { + atomicMin(&mVec[0], other[0]); + atomicMin(&mVec[1], other[1]); + atomicMin(&mVec[2], other[2]); + return *this; + } + __device__ inline Coord& maxComponentAtomic(const Coord& other) + { + atomicMax(&mVec[0], other[0]); + atomicMax(&mVec[1], other[1]); + atomicMax(&mVec[2], other[2]); + return *this; + } +#endif + + __hostdev__ Coord offsetBy(ValueType dx, ValueType dy, ValueType dz) const + { + return Coord(mVec[0] + dx, mVec[1] + dy, mVec[2] + dz); + } + + __hostdev__ Coord offsetBy(ValueType n) const { return this->offsetBy(n, n, n); } + + /// Return true if any of the components of @a a are smaller than the + /// corresponding components of @a b. + __hostdev__ static inline bool lessThan(const Coord& a, const Coord& b) + { + return (a[0] < b[0] || a[1] < b[1] || a[2] < b[2]); + } + + /// @brief Return the largest integer coordinates that are not greater + /// than @a xyz (node centered conversion). + template + __hostdev__ static Coord Floor(const Vec3T& xyz) { return Coord(math::Floor(xyz[0]), math::Floor(xyz[1]), math::Floor(xyz[2])); } + + /// @brief Return a hash key derived from the existing coordinates. + /// @details The hash function is originally taken from the SIGGRAPH paper: + /// "VDB: High-resolution sparse volumes with dynamic topology" + /// and the prime numbers are modified based on the ACM Transactions on Graphics paper: + /// "Real-time 3D reconstruction at scale using voxel hashing" (the second number had a typo!) 
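+    /// @details Illustrative sketch (added, not from the upstream header): the template
+    /// parameter selects the number of hash-table bits, so the key always lies in [0, 2^Log2N).
+    /// @code
+    /// nanovdb::math::Coord ijk(10, 20, 30);
+    /// uint32_t key = ijk.hash<12>(); // key < 4096
+    /// @endcode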
+ template + __hostdev__ uint32_t hash() const { return ((1 << Log2N) - 1) & (mVec[0] * 73856093 ^ mVec[1] * 19349669 ^ mVec[2] * 83492791); } + + /// @brief Return the octant of this Coord + //__hostdev__ size_t octant() const { return (uint32_t(mVec[0])>>31) | ((uint32_t(mVec[1])>>31)<<1) | ((uint32_t(mVec[2])>>31)<<2); } + __hostdev__ uint8_t octant() const { return (uint8_t(bool(mVec[0] & (1u << 31)))) | + (uint8_t(bool(mVec[1] & (1u << 31))) << 1) | + (uint8_t(bool(mVec[2] & (1u << 31))) << 2); } + + /// @brief Return a single precision floating-point vector of this coordinate + __hostdev__ inline Vec3 asVec3s() const; + + /// @brief Return a double precision floating-point vector of this coordinate + __hostdev__ inline Vec3 asVec3d() const; + + // returns a copy of itself, so it mimics the behaviour of Vec3::round() + __hostdev__ inline Coord round() const { return *this; } +}; // Coord class + +// ----------------------------> Vec3 <-------------------------------------- + +/// @brief A simple vector class with three components, similar to openvdb::math::Vec3 +template +class Vec3 +{ + T mVec[3]; + +public: + static const int SIZE = 3; + static const int size = 3; // in openvdb::math::Tuple + using ValueType = T; + Vec3() = default; + __hostdev__ explicit Vec3(T x) + : mVec{x, x, x} + { + } + __hostdev__ Vec3(T x, T y, T z) + : mVec{x, y, z} + { + } + template class Vec3T, class T2> + __hostdev__ Vec3(const Vec3T& v) + : mVec{T(v[0]), T(v[1]), T(v[2])} + { + static_assert(Vec3T::size == size, "expected Vec3T::size==3!"); + } + template + __hostdev__ explicit Vec3(const Vec3& v) + : mVec{T(v[0]), T(v[1]), T(v[2])} + { + } + __hostdev__ explicit Vec3(const Coord& ijk) + : mVec{T(ijk[0]), T(ijk[1]), T(ijk[2])} + { + } + __hostdev__ bool operator==(const Vec3& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } + __hostdev__ bool operator!=(const Vec3& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } + template class Vec3T, class T2> + __hostdev__ Vec3& operator=(const Vec3T& rhs) + { + static_assert(Vec3T::size == size, "expected Vec3T::size==3!"); + mVec[0] = rhs[0]; + mVec[1] = rhs[1]; + mVec[2] = rhs[2]; + return *this; + } + __hostdev__ const T& operator[](int i) const { return mVec[i]; } + __hostdev__ T& operator[](int i) { return mVec[i]; } + template + __hostdev__ T dot(const Vec3T& v) const { return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2]; } + template + __hostdev__ Vec3 cross(const Vec3T& v) const + { + return Vec3(mVec[1] * v[2] - mVec[2] * v[1], + mVec[2] * v[0] - mVec[0] * v[2], + mVec[0] * v[1] - mVec[1] * v[0]); + } + __hostdev__ T lengthSqr() const + { + return mVec[0] * mVec[0] + mVec[1] * mVec[1] + mVec[2] * mVec[2]; // 5 flops + } + __hostdev__ T length() const { return Sqrt(this->lengthSqr()); } + __hostdev__ Vec3 operator-() const { return Vec3(-mVec[0], -mVec[1], -mVec[2]); } + __hostdev__ Vec3 operator*(const Vec3& v) const { return Vec3(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2]); } + __hostdev__ Vec3 operator/(const Vec3& v) const { return Vec3(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2]); } + __hostdev__ Vec3 operator+(const Vec3& v) const { return Vec3(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2]); } + __hostdev__ Vec3 operator-(const Vec3& v) const { return Vec3(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2]); } + __hostdev__ Vec3 operator+(const Coord& ijk) const { return Vec3(mVec[0] + ijk[0], mVec[1] + ijk[1], mVec[2] + ijk[2]); } + __hostdev__ Vec3 operator-(const Coord& 
ijk) const { return Vec3(mVec[0] - ijk[0], mVec[1] - ijk[1], mVec[2] - ijk[2]); } + __hostdev__ Vec3 operator*(const T& s) const { return Vec3(s * mVec[0], s * mVec[1], s * mVec[2]); } + __hostdev__ Vec3 operator/(const T& s) const { return (T(1) / s) * (*this); } + __hostdev__ Vec3& operator+=(const Vec3& v) + { + mVec[0] += v[0]; + mVec[1] += v[1]; + mVec[2] += v[2]; + return *this; + } + __hostdev__ Vec3& operator+=(const Coord& ijk) + { + mVec[0] += T(ijk[0]); + mVec[1] += T(ijk[1]); + mVec[2] += T(ijk[2]); + return *this; + } + __hostdev__ Vec3& operator-=(const Vec3& v) + { + mVec[0] -= v[0]; + mVec[1] -= v[1]; + mVec[2] -= v[2]; + return *this; + } + __hostdev__ Vec3& operator-=(const Coord& ijk) + { + mVec[0] -= T(ijk[0]); + mVec[1] -= T(ijk[1]); + mVec[2] -= T(ijk[2]); + return *this; + } + __hostdev__ Vec3& operator*=(const T& s) + { + mVec[0] *= s; + mVec[1] *= s; + mVec[2] *= s; + return *this; + } + __hostdev__ Vec3& operator/=(const T& s) { return (*this) *= T(1) / s; } + __hostdev__ Vec3& normalize() { return (*this) /= this->length(); } + /// @brief Perform a component-wise minimum with the other Coord. + __hostdev__ Vec3& minComponent(const Vec3& other) + { + if (other[0] < mVec[0]) + mVec[0] = other[0]; + if (other[1] < mVec[1]) + mVec[1] = other[1]; + if (other[2] < mVec[2]) + mVec[2] = other[2]; + return *this; + } + + /// @brief Perform a component-wise maximum with the other Coord. + __hostdev__ Vec3& maxComponent(const Vec3& other) + { + if (other[0] > mVec[0]) + mVec[0] = other[0]; + if (other[1] > mVec[1]) + mVec[1] = other[1]; + if (other[2] > mVec[2]) + mVec[2] = other[2]; + return *this; + } + /// @brief Return the smallest vector component + __hostdev__ ValueType min() const + { + return mVec[0] < mVec[1] ? (mVec[0] < mVec[2] ? mVec[0] : mVec[2]) : (mVec[1] < mVec[2] ? mVec[1] : mVec[2]); + } + /// @brief Return the largest vector component + __hostdev__ ValueType max() const + { + return mVec[0] > mVec[1] ? (mVec[0] > mVec[2] ? mVec[0] : mVec[2]) : (mVec[1] > mVec[2] ? 
mVec[1] : mVec[2]); + } + /// @brief Round each component if this Vec up to its integer value + /// @return Return an integer Coord + __hostdev__ Coord floor() const { return Coord(Floor(mVec[0]), Floor(mVec[1]), Floor(mVec[2])); } + /// @brief Round each component if this Vec down to its integer value + /// @return Return an integer Coord + __hostdev__ Coord ceil() const { return Coord(Ceil(mVec[0]), Ceil(mVec[1]), Ceil(mVec[2])); } + /// @brief Round each component if this Vec to its closest integer value + /// @return Return an integer Coord + __hostdev__ Coord round() const + { + if constexpr(util::is_same::value) { + return Coord(Floor(mVec[0] + 0.5f), Floor(mVec[1] + 0.5f), Floor(mVec[2] + 0.5f)); + } else if constexpr(util::is_same::value) { + return Coord(mVec[0], mVec[1], mVec[2]); + } else { + return Coord(Floor(mVec[0] + 0.5), Floor(mVec[1] + 0.5), Floor(mVec[2] + 0.5)); + } + } + + /// @brief return a non-const raw constant pointer to array of three vector components + __hostdev__ T* asPointer() { return mVec; } + /// @brief return a const raw constant pointer to array of three vector components + __hostdev__ const T* asPointer() const { return mVec; } +}; // Vec3 + +template +__hostdev__ inline Vec3 operator*(T1 scalar, const Vec3& vec) +{ + return Vec3(scalar * vec[0], scalar * vec[1], scalar * vec[2]); +} +template +__hostdev__ inline Vec3 operator/(T1 scalar, const Vec3& vec) +{ + return Vec3(scalar / vec[0], scalar / vec[1], scalar / vec[2]); +} + +/// @brief Return a single precision floating-point vector of this coordinate +__hostdev__ inline Vec3 Coord::asVec3s() const +{ + return Vec3(float(mVec[0]), float(mVec[1]), float(mVec[2])); +} + +/// @brief Return a double precision floating-point vector of this coordinate +__hostdev__ inline Vec3 Coord::asVec3d() const +{ + return Vec3(double(mVec[0]), double(mVec[1]), double(mVec[2])); +} + +// ----------------------------> Vec4 <-------------------------------------- + +/// @brief A simple vector class with four components, similar to openvdb::math::Vec4 +template +class Vec4 +{ + T mVec[4]; + +public: + static const int SIZE = 4; + static const int size = 4; + using ValueType = T; + Vec4() = default; + __hostdev__ explicit Vec4(T x) + : mVec{x, x, x, x} + { + } + __hostdev__ Vec4(T x, T y, T z, T w) + : mVec{x, y, z, w} + { + } + template + __hostdev__ explicit Vec4(const Vec4& v) + : mVec{T(v[0]), T(v[1]), T(v[2]), T(v[3])} + { + } + template class Vec4T, class T2> + __hostdev__ Vec4(const Vec4T& v) + : mVec{T(v[0]), T(v[1]), T(v[2]), T(v[3])} + { + static_assert(Vec4T::size == size, "expected Vec4T::size==4!"); + } + __hostdev__ bool operator==(const Vec4& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2] && mVec[3] == rhs[3]; } + __hostdev__ bool operator!=(const Vec4& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2] || mVec[3] != rhs[3]; } + template class Vec4T, class T2> + __hostdev__ Vec4& operator=(const Vec4T& rhs) + { + static_assert(Vec4T::size == size, "expected Vec4T::size==4!"); + mVec[0] = rhs[0]; + mVec[1] = rhs[1]; + mVec[2] = rhs[2]; + mVec[3] = rhs[3]; + return *this; + } + + __hostdev__ const T& operator[](int i) const { return mVec[i]; } + __hostdev__ T& operator[](int i) { return mVec[i]; } + template + __hostdev__ T dot(const Vec4T& v) const { return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2] + mVec[3] * v[3]; } + __hostdev__ T lengthSqr() const + { + return mVec[0] * mVec[0] + mVec[1] * mVec[1] + mVec[2] * mVec[2] + mVec[3] * 
mVec[3]; // 7 flops + } + __hostdev__ T length() const { return Sqrt(this->lengthSqr()); } + __hostdev__ Vec4 operator-() const { return Vec4(-mVec[0], -mVec[1], -mVec[2], -mVec[3]); } + __hostdev__ Vec4 operator*(const Vec4& v) const { return Vec4(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2], mVec[3] * v[3]); } + __hostdev__ Vec4 operator/(const Vec4& v) const { return Vec4(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2], mVec[3] / v[3]); } + __hostdev__ Vec4 operator+(const Vec4& v) const { return Vec4(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2], mVec[3] + v[3]); } + __hostdev__ Vec4 operator-(const Vec4& v) const { return Vec4(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2], mVec[3] - v[3]); } + __hostdev__ Vec4 operator*(const T& s) const { return Vec4(s * mVec[0], s * mVec[1], s * mVec[2], s * mVec[3]); } + __hostdev__ Vec4 operator/(const T& s) const { return (T(1) / s) * (*this); } + __hostdev__ Vec4& operator+=(const Vec4& v) + { + mVec[0] += v[0]; + mVec[1] += v[1]; + mVec[2] += v[2]; + mVec[3] += v[3]; + return *this; + } + __hostdev__ Vec4& operator-=(const Vec4& v) + { + mVec[0] -= v[0]; + mVec[1] -= v[1]; + mVec[2] -= v[2]; + mVec[3] -= v[3]; + return *this; + } + __hostdev__ Vec4& operator*=(const T& s) + { + mVec[0] *= s; + mVec[1] *= s; + mVec[2] *= s; + mVec[3] *= s; + return *this; + } + __hostdev__ Vec4& operator/=(const T& s) { return (*this) *= T(1) / s; } + __hostdev__ Vec4& normalize() { return (*this) /= this->length(); } + /// @brief Perform a component-wise minimum with the other Coord. + __hostdev__ Vec4& minComponent(const Vec4& other) + { + if (other[0] < mVec[0]) + mVec[0] = other[0]; + if (other[1] < mVec[1]) + mVec[1] = other[1]; + if (other[2] < mVec[2]) + mVec[2] = other[2]; + if (other[3] < mVec[3]) + mVec[3] = other[3]; + return *this; + } + + /// @brief Perform a component-wise maximum with the other Coord. + __hostdev__ Vec4& maxComponent(const Vec4& other) + { + if (other[0] > mVec[0]) + mVec[0] = other[0]; + if (other[1] > mVec[1]) + mVec[1] = other[1]; + if (other[2] > mVec[2]) + mVec[2] = other[2]; + if (other[3] > mVec[3]) + mVec[3] = other[3]; + return *this; + } +}; // Vec4 + +template +__hostdev__ inline Vec4 operator*(T1 scalar, const Vec4& vec) +{ + return Vec4(scalar * vec[0], scalar * vec[1], scalar * vec[2], scalar * vec[3]); +} +template +__hostdev__ inline Vec4 operator/(T1 scalar, const Vec4& vec) +{ + return Vec4(scalar / vec[0], scalar / vec[1], scalar / vec[2], scalar / vec[3]); +} + +// ----------------------------> matMult <-------------------------------------- + +/// @brief Multiply a 3x3 matrix and a 3d vector using 32bit floating point arithmetics +/// @note This corresponds to a linear mapping, e.g. scaling, rotation etc. +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param xyz input vector to be multiplied by the matrix +/// @return result of matrix-vector multiplication, i.e. 
mat x xyz +template +__hostdev__ inline Vec3T matMult(const float* mat, const Vec3T& xyz) +{ + return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[1], static_cast(xyz[2]) * mat[2])), + fmaf(static_cast(xyz[0]), mat[3], fmaf(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[5])), + fmaf(static_cast(xyz[0]), mat[6], fmaf(static_cast(xyz[1]), mat[7], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops +} + +/// @brief Multiply a 3x3 matrix and a 3d vector using 64bit floating point arithmetics +/// @note This corresponds to a linear mapping, e.g. scaling, rotation etc. +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param xyz input vector to be multiplied by the matrix +/// @return result of matrix-vector multiplication, i.e. mat x xyz +template +__hostdev__ inline Vec3T matMult(const double* mat, const Vec3T& xyz) +{ + return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[1], static_cast(xyz[2]) * mat[2])), + fma(static_cast(xyz[0]), mat[3], fma(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[5])), + fma(static_cast(xyz[0]), mat[6], fma(static_cast(xyz[1]), mat[7], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops +} + +/// @brief Multiply a 3x3 matrix to a 3d vector and add another 3d vector using 32bit floating point arithmetics +/// @note This corresponds to an affine transformation, i.e a linear mapping followed by a translation. e.g. scale/rotation and translation +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param vec 3d vector to be added AFTER the matrix multiplication +/// @param xyz input vector to be multiplied by the matrix and a translated by @c vec +/// @return result of affine transformation, i.e. (mat x xyz) + vec +template +__hostdev__ inline Vec3T matMult(const float* mat, const float* vec, const Vec3T& xyz) +{ + return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[1], fmaf(static_cast(xyz[2]), mat[2], vec[0]))), + fmaf(static_cast(xyz[0]), mat[3], fmaf(static_cast(xyz[1]), mat[4], fmaf(static_cast(xyz[2]), mat[5], vec[1]))), + fmaf(static_cast(xyz[0]), mat[6], fmaf(static_cast(xyz[1]), mat[7], fmaf(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fmaf = 9 flops +} + +/// @brief Multiply a 3x3 matrix to a 3d vector and add another 3d vector using 64bit floating point arithmetics +/// @note This corresponds to an affine transformation, i.e a linear mapping followed by a translation. e.g. scale/rotation and translation +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param vec 3d vector to be added AFTER the matrix multiplication +/// @param xyz input vector to be multiplied by the matrix and a translated by @c vec +/// @return result of affine transformation, i.e. 
(mat x xyz) + vec +template +__hostdev__ inline Vec3T matMult(const double* mat, const double* vec, const Vec3T& xyz) +{ + return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[1], fma(static_cast(xyz[2]), mat[2], vec[0]))), + fma(static_cast(xyz[0]), mat[3], fma(static_cast(xyz[1]), mat[4], fma(static_cast(xyz[2]), mat[5], vec[1]))), + fma(static_cast(xyz[0]), mat[6], fma(static_cast(xyz[1]), mat[7], fma(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fma = 9 flops +} + +/// @brief Multiply the transposed of a 3x3 matrix and a 3d vector using 32bit floating point arithmetics +/// @note This corresponds to an inverse linear mapping, e.g. inverse scaling, inverse rotation etc. +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param xyz input vector to be multiplied by the transposed matrix +/// @return result of matrix-vector multiplication, i.e. mat^T x xyz +template +__hostdev__ inline Vec3T matMultT(const float* mat, const Vec3T& xyz) +{ + return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[3], static_cast(xyz[2]) * mat[6])), + fmaf(static_cast(xyz[0]), mat[1], fmaf(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[7])), + fmaf(static_cast(xyz[0]), mat[2], fmaf(static_cast(xyz[1]), mat[5], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops +} + +/// @brief Multiply the transposed of a 3x3 matrix and a 3d vector using 64bit floating point arithmetics +/// @note This corresponds to an inverse linear mapping, e.g. inverse scaling, inverse rotation etc. +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param xyz input vector to be multiplied by the transposed matrix +/// @return result of matrix-vector multiplication, i.e. 
mat^T x xyz +template +__hostdev__ inline Vec3T matMultT(const double* mat, const Vec3T& xyz) +{ + return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[3], static_cast(xyz[2]) * mat[6])), + fma(static_cast(xyz[0]), mat[1], fma(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[7])), + fma(static_cast(xyz[0]), mat[2], fma(static_cast(xyz[1]), mat[5], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops +} + +template +__hostdev__ inline Vec3T matMultT(const float* mat, const float* vec, const Vec3T& xyz) +{ + return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[3], fmaf(static_cast(xyz[2]), mat[6], vec[0]))), + fmaf(static_cast(xyz[0]), mat[1], fmaf(static_cast(xyz[1]), mat[4], fmaf(static_cast(xyz[2]), mat[7], vec[1]))), + fmaf(static_cast(xyz[0]), mat[2], fmaf(static_cast(xyz[1]), mat[5], fmaf(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fmaf = 9 flops +} + +template +__hostdev__ inline Vec3T matMultT(const double* mat, const double* vec, const Vec3T& xyz) +{ + return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[3], fma(static_cast(xyz[2]), mat[6], vec[0]))), + fma(static_cast(xyz[0]), mat[1], fma(static_cast(xyz[1]), mat[4], fma(static_cast(xyz[2]), mat[7], vec[1]))), + fma(static_cast(xyz[0]), mat[2], fma(static_cast(xyz[1]), mat[5], fma(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fma = 9 flops +} + +// ----------------------------> BBox <------------------------------------- + +// Base-class for static polymorphism (cannot be constructed directly) +template +struct BaseBBox +{ + Vec3T mCoord[2]; + __hostdev__ bool operator==(const BaseBBox& rhs) const { return mCoord[0] == rhs.mCoord[0] && mCoord[1] == rhs.mCoord[1]; }; + __hostdev__ bool operator!=(const BaseBBox& rhs) const { return mCoord[0] != rhs.mCoord[0] || mCoord[1] != rhs.mCoord[1]; }; + __hostdev__ const Vec3T& operator[](int i) const { return mCoord[i]; } + __hostdev__ Vec3T& operator[](int i) { return mCoord[i]; } + __hostdev__ Vec3T& min() { return mCoord[0]; } + __hostdev__ Vec3T& max() { return mCoord[1]; } + __hostdev__ const Vec3T& min() const { return mCoord[0]; } + __hostdev__ const Vec3T& max() const { return mCoord[1]; } + __hostdev__ BaseBBox& translate(const Vec3T& xyz) + { + mCoord[0] += xyz; + mCoord[1] += xyz; + return *this; + } + /// @brief Expand this bounding box to enclose point @c xyz. + __hostdev__ BaseBBox& expand(const Vec3T& xyz) + { + mCoord[0].minComponent(xyz); + mCoord[1].maxComponent(xyz); + return *this; + } + + /// @brief Expand this bounding box to enclose the given bounding box. + __hostdev__ BaseBBox& expand(const BaseBBox& bbox) + { + mCoord[0].minComponent(bbox[0]); + mCoord[1].maxComponent(bbox[1]); + return *this; + } + + /// @brief Intersect this bounding box with the given bounding box. 
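+    /// @details Minimal usage sketch (added for illustration); the resulting box is the
+    /// component-wise overlap of the two inputs:
+    /// @code
+    /// nanovdb::CoordBBox a(nanovdb::math::Coord(0), nanovdb::math::Coord(10));
+    /// nanovdb::CoordBBox b(nanovdb::math::Coord(5), nanovdb::math::Coord(20));
+    /// a.intersect(b); // a now spans Coord(5,5,5) .. Coord(10,10,10)
+    /// @endcode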
+ __hostdev__ BaseBBox& intersect(const BaseBBox& bbox) + { + mCoord[0].maxComponent(bbox[0]); + mCoord[1].minComponent(bbox[1]); + return *this; + } + + //__hostdev__ BaseBBox expandBy(typename Vec3T::ValueType padding) const + //{ + // return BaseBBox(mCoord[0].offsetBy(-padding),mCoord[1].offsetBy(padding)); + //} + __hostdev__ bool isInside(const Vec3T& xyz) + { + if (xyz[0] < mCoord[0][0] || xyz[1] < mCoord[0][1] || xyz[2] < mCoord[0][2]) + return false; + if (xyz[0] > mCoord[1][0] || xyz[1] > mCoord[1][1] || xyz[2] > mCoord[1][2]) + return false; + return true; + } + +protected: + __hostdev__ BaseBBox() {} + __hostdev__ BaseBBox(const Vec3T& min, const Vec3T& max) + : mCoord{min, max} + { + } +}; // BaseBBox + +template::value> +struct BBox; + +/// @brief Partial template specialization for floating point coordinate types. +/// +/// @note Min is inclusive and max is exclusive. If min = max the dimension of +/// the bounding box is zero and therefore it is also empty. +template +struct BBox : public BaseBBox +{ + using Vec3Type = Vec3T; + using ValueType = typename Vec3T::ValueType; + static_assert(util::is_floating_point::value, "Expected a floating point coordinate type"); + using BaseT = BaseBBox; + using BaseT::mCoord; + /// @brief Default construction sets BBox to an empty bbox + __hostdev__ BBox() + : BaseT(Vec3T( Maximum::value()), + Vec3T(-Maximum::value())) + { + } + __hostdev__ BBox(const Vec3T& min, const Vec3T& max) + : BaseT(min, max) + { + } + __hostdev__ BBox(const Coord& min, const Coord& max) + : BaseT(Vec3T(ValueType(min[0]), ValueType(min[1]), ValueType(min[2])), + Vec3T(ValueType(max[0] + 1), ValueType(max[1] + 1), ValueType(max[2] + 1))) + { + } + __hostdev__ static BBox createCube(const Coord& min, typename Coord::ValueType dim) + { + return BBox(min, min.offsetBy(dim)); + } + + __hostdev__ BBox(const BaseBBox& bbox) + : BBox(bbox[0], bbox[1]) + { + } + __hostdev__ bool empty() const { return mCoord[0][0] >= mCoord[1][0] || + mCoord[0][1] >= mCoord[1][1] || + mCoord[0][2] >= mCoord[1][2]; } + __hostdev__ operator bool() const { return mCoord[0][0] < mCoord[1][0] && + mCoord[0][1] < mCoord[1][1] && + mCoord[0][2] < mCoord[1][2]; } + __hostdev__ Vec3T dim() const { return *this ? this->max() - this->min() : Vec3T(0); } + __hostdev__ bool isInside(const Vec3T& p) const + { + return p[0] > mCoord[0][0] && p[1] > mCoord[0][1] && p[2] > mCoord[0][2] && + p[0] < mCoord[1][0] && p[1] < mCoord[1][1] && p[2] < mCoord[1][2]; + } + +}; // BBox + +/// @brief Partial template specialization for integer coordinate types +/// +/// @note Both min and max are INCLUDED in the bbox so dim = max - min + 1. So, +/// if min = max the bounding box contains exactly one point and dim = 1! +template +struct BBox : public BaseBBox +{ + static_assert(util::is_same::value, "Expected \"int\" coordinate type"); + using BaseT = BaseBBox; + using BaseT::mCoord; + /// @brief Iterator over the domain covered by a BBox + /// @details z is the fastest-moving coordinate. 
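+    /// @details Hedged usage sketch (added for illustration): visiting every coordinate in a
+    /// box, with z varying fastest:
+    /// @code
+    /// nanovdb::CoordBBox bbox(nanovdb::math::Coord(0), nanovdb::math::Coord(7));
+    /// for (auto it = bbox.begin(); it; ++it) {
+    ///     const nanovdb::math::Coord& ijk = *it; // 8*8*8 = 512 coordinates in total
+    /// }
+    /// @endcode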
+ class Iterator + { + const BBox& mBBox; + CoordT mPos; + + public: + __hostdev__ Iterator(const BBox& b) + : mBBox(b) + , mPos(b.min()) + { + } + __hostdev__ Iterator(const BBox& b, const Coord& p) + : mBBox(b) + , mPos(p) + { + } + __hostdev__ Iterator& operator++() + { + if (mPos[2] < mBBox[1][2]) { // this is the most common case + ++mPos[2];// increment z + } else if (mPos[1] < mBBox[1][1]) { + mPos[2] = mBBox[0][2];// reset z + ++mPos[1];// increment y + } else if (mPos[0] <= mBBox[1][0]) { + mPos[2] = mBBox[0][2];// reset z + mPos[1] = mBBox[0][1];// reset y + ++mPos[0];// increment x + } + return *this; + } + __hostdev__ Iterator operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + __hostdev__ bool operator==(const Iterator& rhs) const + { + NANOVDB_ASSERT(mBBox == rhs.mBBox); + return mPos == rhs.mPos; + } + __hostdev__ bool operator!=(const Iterator& rhs) const + { + NANOVDB_ASSERT(mBBox == rhs.mBBox); + return mPos != rhs.mPos; + } + __hostdev__ bool operator<(const Iterator& rhs) const + { + NANOVDB_ASSERT(mBBox == rhs.mBBox); + return mPos < rhs.mPos; + } + __hostdev__ bool operator<=(const Iterator& rhs) const + { + NANOVDB_ASSERT(mBBox == rhs.mBBox); + return mPos <= rhs.mPos; + } + /// @brief Return @c true if the iterator still points to a valid coordinate. + __hostdev__ operator bool() const { return mPos <= mBBox[1]; } + __hostdev__ const CoordT& operator*() const { return mPos; } + }; // Iterator + __hostdev__ Iterator begin() const { return Iterator{*this}; } + __hostdev__ Iterator end() const { return Iterator{*this, CoordT(mCoord[1][0]+1, mCoord[0][1], mCoord[0][2])}; } + __hostdev__ BBox() + : BaseT(CoordT::max(), CoordT::min()) + { + } + __hostdev__ BBox(const CoordT& min, const CoordT& max) + : BaseT(min, max) + { + } + + template + __hostdev__ BBox(BBox& other, const SplitT&) + : BaseT(other.mCoord[0], other.mCoord[1]) + { + NANOVDB_ASSERT(this->is_divisible()); + const int n = MaxIndex(this->dim()); + mCoord[1][n] = (mCoord[0][n] + mCoord[1][n]) >> 1; + other.mCoord[0][n] = mCoord[1][n] + 1; + } + + __hostdev__ static BBox createCube(const CoordT& min, typename CoordT::ValueType dim) + { + return BBox(min, min.offsetBy(dim - 1)); + } + + __hostdev__ static BBox createCube(typename CoordT::ValueType min, typename CoordT::ValueType max) + { + return BBox(CoordT(min), CoordT(max)); + } + + __hostdev__ bool is_divisible() const { return mCoord[0][0] < mCoord[1][0] && + mCoord[0][1] < mCoord[1][1] && + mCoord[0][2] < mCoord[1][2]; } + /// @brief Return true if this bounding box is empty, e.g. uninitialized + __hostdev__ bool empty() const { return mCoord[0][0] > mCoord[1][0] || + mCoord[0][1] > mCoord[1][1] || + mCoord[0][2] > mCoord[1][2]; } + /// @brief Convert this BBox to boolean true if it is not empty + __hostdev__ operator bool() const { return mCoord[0][0] <= mCoord[1][0] && + mCoord[0][1] <= mCoord[1][1] && + mCoord[0][2] <= mCoord[1][2]; } + __hostdev__ CoordT dim() const { return *this ? this->max() - this->min() + Coord(1) : Coord(0); } + __hostdev__ uint64_t volume() const + { + auto d = this->dim(); + return uint64_t(d[0]) * uint64_t(d[1]) * uint64_t(d[2]); + } + __hostdev__ bool isInside(const CoordT& p) const { return !(CoordT::lessThan(p, this->min()) || CoordT::lessThan(this->max(), p)); } + /// @brief Return @c true if the given bounding box is inside this bounding box. 
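+    /// @details Illustrative sketch (added): both corners of @c b must lie inside this box,
+    /// with min and max treated as inclusive:
+    /// @code
+    /// nanovdb::CoordBBox outer(nanovdb::math::Coord(0),  nanovdb::math::Coord(100));
+    /// nanovdb::CoordBBox inner(nanovdb::math::Coord(10), nanovdb::math::Coord(20));
+    /// bool contained = outer.isInside(inner); // true
+    /// @endcode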
+ __hostdev__ bool isInside(const BBox& b) const + { + return !(CoordT::lessThan(b.min(), this->min()) || CoordT::lessThan(this->max(), b.max())); + } + + /// @brief Return @c true if the given bounding box overlaps with this bounding box. + __hostdev__ bool hasOverlap(const BBox& b) const + { + return !(CoordT::lessThan(this->max(), b.min()) || CoordT::lessThan(b.max(), this->min())); + } + + /// @warning This converts a CoordBBox into a floating-point bounding box which implies that max += 1 ! + template + __hostdev__ BBox> asReal() const + { + static_assert(util::is_floating_point::value, "CoordBBox::asReal: Expected a floating point coordinate"); + return BBox>(Vec3(RealT(mCoord[0][0]), RealT(mCoord[0][1]), RealT(mCoord[0][2])), + Vec3(RealT(mCoord[1][0] + 1), RealT(mCoord[1][1] + 1), RealT(mCoord[1][2] + 1))); + } + /// @brief Return a new instance that is expanded by the specified padding. + __hostdev__ BBox expandBy(typename CoordT::ValueType padding) const + { + return BBox(mCoord[0].offsetBy(-padding), mCoord[1].offsetBy(padding)); + } + + /// @brief @brief transform this coordinate bounding box by the specified map + /// @param map mapping of index to world coordinates + /// @return world bounding box + template + __hostdev__ auto transform(const Map& map) const + { + using Vec3T = Vec3; + const Vec3T tmp = map.applyMap(Vec3T(mCoord[0][0], mCoord[0][1], mCoord[0][2])); + BBox bbox(tmp, tmp);// return value + bbox.expand(map.applyMap(Vec3T(mCoord[0][0], mCoord[0][1], mCoord[1][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[0][0], mCoord[1][1], mCoord[0][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[1][0], mCoord[0][1], mCoord[0][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[1][0], mCoord[1][1], mCoord[0][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[1][0], mCoord[0][1], mCoord[1][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[0][0], mCoord[1][1], mCoord[1][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[1][0], mCoord[1][1], mCoord[1][2]))); + return bbox; + } + +#if defined(__CUDACC__) // the following functions only run on the GPU! + __device__ inline BBox& expandAtomic(const CoordT& ijk) + { + mCoord[0].minComponentAtomic(ijk); + mCoord[1].maxComponentAtomic(ijk); + return *this; + } + __device__ inline BBox& expandAtomic(const BBox& bbox) + { + mCoord[0].minComponentAtomic(bbox[0]); + mCoord[1].maxComponentAtomic(bbox[1]); + return *this; + } + __device__ inline BBox& intersectAtomic(const BBox& bbox) + { + mCoord[0].maxComponentAtomic(bbox[0]); + mCoord[1].minComponentAtomic(bbox[1]); + return *this; + } +#endif +}; // BBox + +// --------------------------> Rgba8 <------------------------------------ + +/// @brief 8-bit red, green, blue, alpha packed into 32 bit unsigned int +class Rgba8 +{ + union + { + uint8_t c[4]; // 4 integer color channels of red, green, blue and alpha components. 
+ uint32_t packed; // 32 bit packed representation + } mData; + +public: + static const int SIZE = 4; + using ValueType = uint8_t; + + /// @brief Default copy constructor + Rgba8(const Rgba8&) = default; + + /// @brief Default move constructor + Rgba8(Rgba8&&) = default; + + /// @brief Default move assignment operator + /// @return non-const reference to this instance + Rgba8& operator=(Rgba8&&) = default; + + /// @brief Default copy assignment operator + /// @return non-const reference to this instance + Rgba8& operator=(const Rgba8&) = default; + + /// @brief Default ctor initializes all channels to zero + __hostdev__ Rgba8() + : mData{{0, 0, 0, 0}} + { + static_assert(sizeof(uint32_t) == sizeof(Rgba8), "Unexpected sizeof"); + } + + /// @brief integer r,g,b,a ctor where alpha channel defaults to opaque + /// @note all values should be in the range 0u to 255u + __hostdev__ Rgba8(uint8_t r, uint8_t g, uint8_t b, uint8_t a = 255u) + : mData{{r, g, b, a}} + { + } + + /// @brief @brief ctor where all channels are initialized to the same value + /// @note value should be in the range 0u to 255u + explicit __hostdev__ Rgba8(uint8_t v) + : mData{{v, v, v, v}} + { + } + + /// @brief floating-point r,g,b,a ctor where alpha channel defaults to opaque + /// @note all values should be in the range 0.0f to 1.0f + __hostdev__ Rgba8(float r, float g, float b, float a = 1.0f) + : mData{{static_cast(0.5f + r * 255.0f), // round floats to nearest integers + static_cast(0.5f + g * 255.0f), // double {{}} is needed due to union + static_cast(0.5f + b * 255.0f), + static_cast(0.5f + a * 255.0f)}} + { + } + + /// @brief Vec3f r,g,b ctor (alpha channel it set to 1) + /// @note all values should be in the range 0.0f to 1.0f + __hostdev__ Rgba8(const Vec3& rgb) + : Rgba8(rgb[0], rgb[1], rgb[2]) + { + } + + /// @brief Vec4f r,g,b,a ctor + /// @note all values should be in the range 0.0f to 1.0f + __hostdev__ Rgba8(const Vec4& rgba) + : Rgba8(rgba[0], rgba[1], rgba[2], rgba[3]) + { + } + + __hostdev__ bool operator< (const Rgba8& rhs) const { return mData.packed < rhs.mData.packed; } + __hostdev__ bool operator==(const Rgba8& rhs) const { return mData.packed == rhs.mData.packed; } + __hostdev__ float lengthSqr() const + { + return 0.0000153787005f * (float(mData.c[0]) * mData.c[0] + + float(mData.c[1]) * mData.c[1] + + float(mData.c[2]) * mData.c[2]); //1/255^2 + } + __hostdev__ float length() const { return sqrtf(this->lengthSqr()); } + /// @brief return n'th color channel as a float in the range 0 to 1 + __hostdev__ float asFloat(int n) const { return 0.003921569f*float(mData.c[n]); }// divide by 255 + __hostdev__ const uint8_t& operator[](int n) const { return mData.c[n]; } + __hostdev__ uint8_t& operator[](int n) { return mData.c[n]; } + __hostdev__ const uint32_t& packed() const { return mData.packed; } + __hostdev__ uint32_t& packed() { return mData.packed; } + __hostdev__ const uint8_t& r() const { return mData.c[0]; } + __hostdev__ const uint8_t& g() const { return mData.c[1]; } + __hostdev__ const uint8_t& b() const { return mData.c[2]; } + __hostdev__ const uint8_t& a() const { return mData.c[3]; } + __hostdev__ uint8_t& r() { return mData.c[0]; } + __hostdev__ uint8_t& g() { return mData.c[1]; } + __hostdev__ uint8_t& b() { return mData.c[2]; } + __hostdev__ uint8_t& a() { return mData.c[3]; } + __hostdev__ operator Vec3() const { + return Vec3(this->asFloat(0), this->asFloat(1), this->asFloat(2)); + } + __hostdev__ operator Vec4() const { + return Vec4(this->asFloat(0), this->asFloat(1), 
this->asFloat(2), this->asFloat(3)); + } +}; // Rgba8 + +using Vec3d = Vec3; +using Vec3f = Vec3; +using Vec3i = Vec3; +using Vec3u = Vec3; +using Vec3u8 = Vec3; +using Vec3u16 = Vec3; + +using Vec4R = Vec4; +using Vec4d = Vec4; +using Vec4f = Vec4; +using Vec4i = Vec4; + +}// namespace math =============================================================== + +using Rgba8 [[deprecated("Use math::Rgba8 instead.")]] = math::Rgba8; +using math::Coord; + +using Vec3d = math::Vec3; +using Vec3f = math::Vec3; +using Vec3i = math::Vec3; +using Vec3u = math::Vec3; +using Vec3u8 = math::Vec3; +using Vec3u16 = math::Vec3; + +using Vec4R = math::Vec4; +using Vec4d = math::Vec4; +using Vec4f = math::Vec4; +using Vec4i = math::Vec4; + +using CoordBBox = math::BBox; +using Vec3dBBox = math::BBox; +using BBoxR [[deprecated("Use Vec3dBBox instead.")]] = math::BBox; + +} // namespace nanovdb =================================================================== + +#endif // end of NANOVDB_MATH_MATH_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/math/Ray.h b/external/nanovdb/math/Ray.h new file mode 100644 index 00000000..236982f3 --- /dev/null +++ b/external/nanovdb/math/Ray.h @@ -0,0 +1,557 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/// @file Ray.h +/// +/// @author Ken Museth +/// +/// @brief A Ray class. + +#ifndef NANOVDB_MATH_RAY_H_HAS_BEEN_INCLUDED +#define NANOVDB_MATH_RAY_H_HAS_BEEN_INCLUDED + +#include // for Vec3 +namespace nanovdb {// =================================================== + +namespace math {// ====================================================== + +template +class Ray +{ +public: + using RealType = RealT; + using Vec3Type = Vec3; + using Vec3T = Vec3Type; + + struct TimeSpan + { + RealT t0, t1; + /// @brief Default constructor + __hostdev__ TimeSpan() {} + /// @brief Constructor + __hostdev__ TimeSpan(RealT _t0, RealT _t1) + : t0(_t0) + , t1(_t1) + { + } + /// @brief Set both times + __hostdev__ void set(RealT _t0, RealT _t1) + { + t0 = _t0; + t1 = _t1; + } + /// @brief Get both times + __hostdev__ void get(RealT& _t0, RealT& _t1) const + { + _t0 = t0; + _t1 = t1; + } + /// @brief Return @c true if t1 is larger than t0 by at least eps. + __hostdev__ bool valid(RealT eps = Delta::value()) const { return (t1 - t0) > eps; } + /// @brief Return the midpoint of the ray. 
+ __hostdev__ RealT mid() const { return 0.5 * (t0 + t1); } + /// @brief Multiplies both times + __hostdev__ void scale(RealT s) + { + assert(s > 0); + t0 *= s; + t1 *= s; + } + /// @brief Return @c true if time is inclusive + __hostdev__ bool test(RealT t) const { return (t >= t0 && t <= t1); } + }; + + __hostdev__ Ray(const Vec3Type& eye = Vec3Type(0, 0, 0), + const Vec3Type& direction = Vec3Type(1, 0, 0), + RealT t0 = Delta::value(), + RealT t1 = Maximum::value()) + : mEye(eye) + , mDir(direction) + , mInvDir(1 / mDir[0], 1 / mDir[1], 1 / mDir[2]) + , mTimeSpan(t0, t1) + , mSign{mInvDir[0] < 0, mInvDir[1] < 0, mInvDir[2] < 0} + { + } + + __hostdev__ Ray& offsetEye(RealT offset) + { + mEye[0] += offset; + mEye[1] += offset; + mEye[2] += offset; + return *this; + } + + __hostdev__ Ray& setEye(const Vec3Type& eye) + { + mEye = eye; + return *this; + } + + __hostdev__ Ray& setDir(const Vec3Type& dir) + { + mDir = dir; + mInvDir[0] = 1.0 / mDir[0]; + mInvDir[1] = 1.0 / mDir[1]; + mInvDir[2] = 1.0 / mDir[2]; + mSign[0] = mInvDir[0] < 0; + mSign[1] = mInvDir[1] < 0; + mSign[2] = mInvDir[2] < 0; + return *this; + } + + __hostdev__ Ray& setMinTime(RealT t0) + { + mTimeSpan.t0 = t0; + return *this; + } + + __hostdev__ Ray& setMaxTime(RealT t1) + { + mTimeSpan.t1 = t1; + return *this; + } + + __hostdev__ Ray& setTimes( + RealT t0 = Delta::value(), + RealT t1 = Maximum::value()) + { + assert(t0 > 0 && t1 > 0); + mTimeSpan.set(t0, t1); + return *this; + } + + __hostdev__ Ray& scaleTimes(RealT scale) + { + mTimeSpan.scale(scale); + return *this; + } + + __hostdev__ Ray& reset( + const Vec3Type& eye, + const Vec3Type& direction, + RealT t0 = Delta::value(), + RealT t1 = Maximum::value()) + { + this->setEye(eye); + this->setDir(direction); + this->setTimes(t0, t1); + return *this; + } + + __hostdev__ const Vec3T& eye() const { return mEye; } + + __hostdev__ const Vec3T& dir() const { return mDir; } + + __hostdev__ const Vec3T& invDir() const { return mInvDir; } + + __hostdev__ RealT t0() const { return mTimeSpan.t0; } + + __hostdev__ RealT t1() const { return mTimeSpan.t1; } + + __hostdev__ int sign(int i) const { return mSign[i]; } + + /// @brief Return the position along the ray at the specified time. + __hostdev__ Vec3T operator()(RealT time) const + { +#if 1 + return Vec3T(fmaf(time, mDir[0], mEye[0]), + fmaf(time, mDir[1], mEye[1]), + fmaf(time, mDir[2], mEye[2])); +#else + return mEye + mDir * time; +#endif + } + + /// @brief Return the starting point of the ray. + __hostdev__ Vec3T start() const { return (*this)(mTimeSpan.t0); } + + /// @brief Return the endpoint of the ray. + __hostdev__ Vec3T end() const { return (*this)(mTimeSpan.t1); } + + /// @brief Return the midpoint of the ray. + __hostdev__ Vec3T mid() const { return (*this)(mTimeSpan.mid()); } + + /// @brief Return @c true if t1 is larger than t0 by at least eps. + __hostdev__ bool valid(RealT eps = Delta::value()) const { return mTimeSpan.valid(eps); } + + /// @brief Return @c true if @a time is within t0 and t1, both inclusive. + __hostdev__ bool test(RealT time) const { return mTimeSpan.test(time); } + + /// @brief Return a new Ray that is transformed with the specified map. + /// + /// @param map the map from which to construct the new Ray. + /// + /// @warning Assumes a linear map and a normalized direction. + /// + /// @details The requirement that the direction is normalized + /// follows from the transformation of t0 and t1 - and that fact that + /// we want applyMap and applyInverseMap to be inverse operations. 
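+    /// @details Illustrative sketch (added, not upstream documentation); @c map is assumed to
+    /// be a nanovdb map object exposing applyMap()/applyJacobian(), and @c eye / @c dir are
+    /// assumed to be index-space Vec3 values:
+    /// @code
+    /// nanovdb::math::Ray<double> indexRay(eye, dir);
+    /// auto worldRay = indexRay.applyMap(map); // the same ray expressed in world space
+    /// @endcode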
+ template + __hostdev__ Ray applyMap(const MapType& map) const + { + const Vec3T eye = map.applyMap(mEye); + const Vec3T dir = map.applyJacobian(mDir); + const RealT length = dir.length(), invLength = RealT(1) / length; + RealT t1 = mTimeSpan.t1; + if (mTimeSpan.t1 < Maximum::value()) { + t1 *= length; + } + return Ray(eye, dir * invLength, length * mTimeSpan.t0, t1); + } + template + __hostdev__ Ray applyMapF(const MapType& map) const + { + const Vec3T eye = map.applyMapF(mEye); + const Vec3T dir = map.applyJacobianF(mDir); + const RealT length = dir.length(), invLength = RealT(1) / length; + RealT t1 = mTimeSpan.t1; + if (mTimeSpan.t1 < Maximum::value()) { + t1 *= length; + } + return Ray(eye, dir * invLength, length * mTimeSpan.t0, t1); + } + + /// @brief Return a new Ray that is transformed with the inverse of the specified map. + /// + /// @param map the map from which to construct the new Ray by inverse mapping. + /// + /// @warning Assumes a linear map and a normalized direction. + /// + /// @details The requirement that the direction is normalized + /// follows from the transformation of t0 and t1 - and that fact that + /// we want applyMap and applyInverseMap to be inverse operations. + template + __hostdev__ Ray applyInverseMap(const MapType& map) const + { + const Vec3T eye = map.applyInverseMap(mEye); + const Vec3T dir = map.applyInverseJacobian(mDir); + const RealT length = dir.length(), invLength = RealT(1) / length; + return Ray(eye, dir * invLength, length * mTimeSpan.t0, length * mTimeSpan.t1); + } + template + __hostdev__ Ray applyInverseMapF(const MapType& map) const + { + const Vec3T eye = map.applyInverseMapF(mEye); + const Vec3T dir = map.applyInverseJacobianF(mDir); + const RealT length = dir.length(), invLength = RealT(1) / length; + return Ray(eye, dir * invLength, length * mTimeSpan.t0, length * mTimeSpan.t1); + } + + /// @brief Return a new ray in world space, assuming the existing + /// ray is represented in the index space of the specified grid. + template + __hostdev__ Ray indexToWorldF(const GridType& grid) const + { + const Vec3T eye = grid.indexToWorldF(mEye); + const Vec3T dir = grid.indexToWorldDirF(mDir); + const RealT length = dir.length(), invLength = RealT(1) / length; + RealT t1 = mTimeSpan.t1; + if (mTimeSpan.t1 < Maximum::value()) { + t1 *= length; + } + return Ray(eye, dir * invLength, length * mTimeSpan.t0, t1); + } + + /// @brief Return a new ray in index space, assuming the existing + /// ray is represented in the world space of the specified grid. + template + __hostdev__ Ray worldToIndexF(const GridType& grid) const + { + const Vec3T eye = grid.worldToIndexF(mEye); + const Vec3T dir = grid.worldToIndexDirF(mDir); + const RealT length = dir.length(), invLength = RealT(1) / length; + RealT t1 = mTimeSpan.t1; + if (mTimeSpan.t1 < Maximum::value()) { + t1 *= length; + } + return Ray(eye, dir * invLength, length * mTimeSpan.t0, t1); + } + + /// @brief Return true if this ray intersects the specified sphere. + /// + /// @param center The center of the sphere in the same space as this ray. + /// @param radius The radius of the sphere in the same units as this ray. + /// @param t0 The first intersection point if an intersection exists. + /// @param t1 The second intersection point if an intersection exists. + /// + /// @note If the return value is true, i.e. a hit, and t0 = + /// this->t0() or t1 == this->t1() only one true intersection exist. 
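+    /// @details Minimal usage sketch (added for illustration); @c ray is assumed to be a
+    /// nanovdb::math::Ray<float>:
+    /// @code
+    /// float t0, t1;
+    /// if (ray.intersects(nanovdb::math::Vec3f(0.0f), 10.0f, t0, t1)) {
+    ///     // the ray enters the sphere at ray(t0) and leaves it at ray(t1)
+    /// }
+    /// @endcode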
+ __hostdev__ bool intersects(const Vec3T& center, RealT radius, RealT& t0, RealT& t1) const + { + const Vec3T origin = mEye - center; + const RealT A = mDir.lengthSqr(); + const RealT B = 2 * mDir.dot(origin); + const RealT C = origin.lengthSqr() - radius * radius; + const RealT D = B * B - 4 * A * C; + + if (D < 0) { + return false; + } + const RealT Q = RealT(-0.5) * (B < 0 ? (B + Sqrt(D)) : (B - Sqrt(D))); + + t0 = Q / A; + t1 = C / Q; + + if (t0 > t1) { + RealT tmp = t0; + t0 = t1; + t1 = tmp; + } + if (t0 < mTimeSpan.t0) { + t0 = mTimeSpan.t0; + } + if (t1 > mTimeSpan.t1) { + t1 = mTimeSpan.t1; + } + return t0 <= t1; + } + + /// @brief Return true if this ray intersects the specified sphere. + /// + /// @param center The center of the sphere in the same space as this ray. + /// @param radius The radius of the sphere in the same units as this ray. + __hostdev__ bool intersects(const Vec3T& center, RealT radius) const + { + RealT t0, t1; + return this->intersects(center, radius, t0, t1) > 0; + } + + /// @brief Return true if this ray intersects the specified sphere. + /// + /// @note For intersection this ray is clipped to the two intersection points. + /// + /// @param center The center of the sphere in the same space as this ray. + /// @param radius The radius of the sphere in the same units as this ray. + __hostdev__ bool clip(const Vec3T& center, RealT radius) + { + RealT t0, t1; + const bool hit = this->intersects(center, radius, t0, t1); + if (hit) { + mTimeSpan.set(t0, t1); + } + return hit; + } +#if 0 + /// @brief Return true if the Ray intersects the specified + /// axisaligned bounding box. + /// + /// @param bbox Axis-aligned bounding box in the same space as the Ray. + /// @param t0 If an intersection is detected this is assigned + /// the time for the first intersection point. + /// @param t1 If an intersection is detected this is assigned + /// the time for the second intersection point. + template + __hostdev__ bool intersects(const BBoxT& bbox, RealT& t0, RealT& t1) const + { + t0 = (bbox[ mSign[0]][0] - mEye[0]) * mInvDir[0]; + RealT t2 = (bbox[1-mSign[1]][1] - mEye[1]) * mInvDir[1]; + if (t0 > t2) return false; + t1 = (bbox[1-mSign[0]][0] - mEye[0]) * mInvDir[0]; + RealT t3 = (bbox[ mSign[1]][1] - mEye[1]) * mInvDir[1]; + if (t3 > t1) return false; + if (t3 > t0) t0 = t3; + if (t2 < t1) t1 = t2; + t3 = (bbox[ mSign[2]][2] - mEye[2]) * mInvDir[2]; + if (t3 > t1) return false; + t2 = (bbox[1-mSign[2]][2] - mEye[2]) * mInvDir[2]; + if (t0 > t2) return false; + if (t3 > t0) t0 = t3; + if (mTimeSpan.t1 < t0) return false; + if (t2 < t1) t1 = t2; + if (mTimeSpan.t0 > t1) return false; + if (mTimeSpan.t0 > t0) t0 = mTimeSpan.t0; + if (mTimeSpan.t1 < t1) t1 = mTimeSpan.t1; + return true; + /* + mTimeSpan.get(_t0, _t1); + double t0 = _t0, t1 = _t1; + for (int i = 0; i < 3; ++i) { + //if (abs(mDir[i])<1e-3) continue; + double a = (double(bbox.min()[i]) - mEye[i]) * mInvDir[i]; + double b = (double(bbox.max()[i]) - mEye[i]) * mInvDir[i]; + if (a > b) { + double tmp = a; + a = b; + b = tmp; + } + if (a > t0) t0 = a; + if (b < t1) t1 = b; + if (t0 > t1) { + //if (gVerbose) printf("Missed BBOX: (%i,%i,%i) -> (%i,%i,%i) t0=%f t1=%f\n", + // bbox.min()[0], bbox.min()[1], bbox.min()[2], + // bbox.max()[0], bbox.max()[1], bbox.max()[2], t0, t1); + return false; + } + } + _t0 = t0; _t1 = t1; + return true; + */ + } +#else + /// @brief Returns true if this ray intersects an index bounding box. 
+ /// If the return value is true t0 and t1 are set to the intersection + /// times along the ray. + /// + /// @warning Intersection with a CoordBBox internally converts to a floating-point bbox + /// which imples that the max is padded with one voxel, i.e. bbox.max += 1! This + /// avoids gaps between neighboring CoordBBox'es, say from neighboring tree nodes. + __hostdev__ bool intersects(const CoordBBox& bbox, RealT& t0, RealT& t1) const + { + mTimeSpan.get(t0, t1); + for (int i = 0; i < 3; ++i) { + RealT a = RealT(bbox.min()[i]), b = RealT(bbox.max()[i] + 1); + if (a >= b) { // empty bounding box + return false; + } + a = (a - mEye[i]) * mInvDir[i]; + b = (b - mEye[i]) * mInvDir[i]; + if (a > b) { + RealT tmp = a; + a = b; + b = tmp; + } + if (a > t0) { + t0 = a; + } + if (b < t1) { + t1 = b; + } + if (t0 > t1) { + return false; + } + } + return true; + } + /// @brief Returns true if this ray intersects a floating-point bounding box. + /// If the return value is true t0 and t1 are set to the intersection + /// times along the ray. + template + __hostdev__ bool intersects(const BBox& bbox, RealT& t0, RealT& t1) const + { + static_assert(util::is_floating_point::value, "Ray::intersects: Expected a floating point coordinate"); + mTimeSpan.get(t0, t1); + for (int i = 0; i < 3; ++i) { + RealT a = RealT(bbox.min()[i]), b = RealT(bbox.max()[i]); + if (a >= b) { // empty bounding box + return false; + } + a = (a - mEye[i]) * mInvDir[i]; + b = (b - mEye[i]) * mInvDir[i]; + if (a > b) { + RealT tmp = a; + a = b; + b = tmp; + } + if (a > t0) { + t0 = a; + } + if (b < t1) { + t1 = b; + } + if (t0 > t1) { + return false; + } + } + return true; + } +#endif + + /// @brief Return true if this ray intersects the specified bounding box. + /// + /// @param bbox Axis-aligned bounding box in the same space as this ray. + /// + /// @warning If @a bbox is of the type CoordBBox it is converted to a floating-point + /// bounding box, which imples that the max is padded with one voxel, i.e. + /// bbox.max += 1! This avoids gaps between neighboring CoordBBox'es, say + /// from neighboring tree nodes. + template + __hostdev__ bool intersects(const BBoxT& bbox) const + { +#if 1 + RealT t0, t1; + return this->intersects(bbox, t0, t1); +#else + //BBox bbox(Vec3T(_bbox[0][0]-1e-4,_bbox[0][1]-1e-4,_bbox[0][2]-1e-4), + // Vec3T(_bbox[1][0]+1e-4,_bbox[1][1]+1e-4,_bbox[1][2]+1e-4)); + RealT t0 = (bbox[mSign[0]][0] - mEye[0]) * mInvDir[0]; + RealT t2 = (bbox[1 - mSign[1]][1] - mEye[1]) * mInvDir[1]; + if (t0 > t2) return false; + RealT t1 = (bbox[1 - mSign[0]][0] - mEye[0]) * mInvDir[0]; + RealT t3 = (bbox[mSign[1]][1] - mEye[1]) * mInvDir[1]; + if (t3 > t1) return false; + if (t3 > t0) t0 = t3; + if (t2 < t1) t1 = t2; + t3 = (bbox[mSign[2]][2] - mEye[2]) * mInvDir[2]; + if (t3 > t1) return false; + t2 = (bbox[1 - mSign[2]][2] - mEye[2]) * mInvDir[2]; + if (t0 > t2) return false; + //if (t3 > t0) t0 = t3; + //if (mTimeSpan.t1 < t0) return false; + //if (t2 < t1) t1 = t2; + //return mTimeSpan.t0 < t1; + return true; +#endif + } + + /// @brief Return true if this ray intersects the specified bounding box. + /// + /// @param bbox Axis-aligned bounding box in the same space as this ray. + /// + /// @warning If @a bbox is of the type CoordBBox it is converted to a floating-point + /// bounding box, which imples that the max is padded with one voxel, i.e. + /// bbox.max += 1! This avoids gaps between neighboring CoordBBox'es, say + /// from neighboring tree nodes. 
+ /// + /// @note For intersection this ray is clipped to the two intersection points. + template + __hostdev__ bool clip(const BBoxT& bbox) + { + RealT t0, t1; + const bool hit = this->intersects(bbox, t0, t1); + if (hit) { + mTimeSpan.set(t0, t1); + } + return hit; + } + + /// @brief Return true if the Ray intersects the plane specified + /// by a normal and distance from the origin. + /// + /// @param normal Normal of the plane. + /// @param distance Distance of the plane to the origin. + /// @param t Time of intersection, if one exists. + __hostdev__ bool intersects(const Vec3T& normal, RealT distance, RealT& t) const + { + const RealT cosAngle = mDir.dot(normal); + if (isApproxZero(cosAngle)) { + return false; // ray is parallel to plane + } + t = (distance - mEye.dot(normal)) / cosAngle; + return this->test(t); + } + + /// @brief Return true if the Ray intersects the plane specified + /// by a normal and point. + /// + /// @param normal Normal of the plane. + /// @param point Point in the plane. + /// @param t Time of intersection, if one exists. + __hostdev__ bool intersects(const Vec3T& normal, const Vec3T& point, RealT& t) const + { + return this->intersects(normal, point.dot(normal), t); + } + +private: + Vec3T mEye, mDir, mInvDir; + TimeSpan mTimeSpan; + int mSign[3]; +}; // end of Ray class + +} // namespace math ========================================================= + +template +using Ray [[deprecated("Use nanovdb::math::Ray instead")]] = math::Ray; + +} // namespace nanovdb ======================================================= + +#endif // NANOVDB_MATH_RAY_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/math/SampleFromVoxels.h b/external/nanovdb/math/SampleFromVoxels.h new file mode 100644 index 00000000..d183f74a --- /dev/null +++ b/external/nanovdb/math/SampleFromVoxels.h @@ -0,0 +1,996 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +////////////////////////////////////////////////////////////////////////// +/// +/// @file SampleFromVoxels.h +/// +/// @brief NearestNeighborSampler, TrilinearSampler, TriquadraticSampler and TricubicSampler +/// +/// @note These interpolators employ internal caching for better performance when used repeatedly +/// in the same voxel location, so try to reuse an instance of these classes more than once. +/// +/// @warning While all the interpolators defined below work with both scalars and vectors +/// values (e.g. float and Vec3) TrilinarSampler::zeroCrossing and +/// Trilinear::gradient will only compile with floating point value types. 
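+///
+/// @note The samplers below expect index-space coordinates. A common pattern
+///       (sketch; the grid and the world-space point wxyz are placeholders) is:
+/// @code
+/// auto acc = grid.getAccessor();
+/// auto smp = nanovdb::math::createSampler<1>(acc);
+/// float v = smp(grid.worldToIndexF(wxyz)); // convert to index space before sampling
+/// @endcode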
+/// +/// @author Ken Museth +/// +/////////////////////////////////////////////////////////////////////////// + +#ifndef NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED +#define NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED + +// Only define __hostdev__ when compiling as NVIDIA CUDA +#if defined(__CUDACC__) || defined(__HIP__) +#define __hostdev__ __host__ __device__ +#else +#include // for floor +#define __hostdev__ +#endif + +#include + +namespace nanovdb { + +namespace math { + +// Forward declaration of sampler with specific polynomial orders +template +class SampleFromVoxels; + +/// @brief Factory free-function for a sampler of specific polynomial orders +/// +/// @details This allows for the compact syntax: +/// @code +/// auto acc = grid.getAccessor(); +/// auto smp = nanovdb::math::createSampler<1>( acc ); +/// @endcode +template +__hostdev__ SampleFromVoxels createSampler(const TreeOrAccT& acc) +{ + return SampleFromVoxels(acc); +} + +/// @brief Utility function that returns the Coord of the round-down of @a xyz +/// and redefined @xyz as the fractional part, ie xyz-in = return-value + xyz-out +template class Vec3T> +__hostdev__ inline CoordT Floor(Vec3T& xyz); + +/// @brief Template specialization of Floor for Vec3 +template class Vec3T> +__hostdev__ inline CoordT Floor(Vec3T& xyz) +{ + const float ijk[3] = {floorf(xyz[0]), floorf(xyz[1]), floorf(xyz[2])}; + xyz[0] -= ijk[0]; + xyz[1] -= ijk[1]; + xyz[2] -= ijk[2]; + return CoordT(int32_t(ijk[0]), int32_t(ijk[1]), int32_t(ijk[2])); +} + +/// @brief Template specialization of Floor for Vec3 +template class Vec3T> +__hostdev__ inline CoordT Floor(Vec3T& xyz) +{ + const double ijk[3] = {floor(xyz[0]), floor(xyz[1]), floor(xyz[2])}; + xyz[0] -= ijk[0]; + xyz[1] -= ijk[1]; + xyz[2] -= ijk[2]; + return CoordT(int32_t(ijk[0]), int32_t(ijk[1]), int32_t(ijk[2])); +} + +// ------------------------------> NearestNeighborSampler <-------------------------------------- + +/// @brief Nearest neighbor, i.e. zero order, interpolator with caching +template +class SampleFromVoxels +{ +public: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + + static const int ORDER = 0; + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) + : mAcc(acc) + , mPos(CoordT::max()) + { + } + + __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @note xyz is in index space space + template + inline __hostdev__ ValueT operator()(const Vec3T& xyz) const; + + inline __hostdev__ ValueT operator()(const CoordT& ijk) const; + +private: + const TreeOrAccT& mAcc; + mutable CoordT mPos; + mutable ValueT mVal; // private cache +}; // SampleFromVoxels + +/// @brief Nearest neighbor, i.e. 
zero order, interpolator without caching +template +class SampleFromVoxels +{ +public: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + static const int ORDER = 0; + + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) + : mAcc(acc) + { + } + + __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @note xyz is in index space space + template + inline __hostdev__ ValueT operator()(const Vec3T& xyz) const; + + inline __hostdev__ ValueT operator()(const CoordT& ijk) const { return mAcc.getValue(ijk);} + +private: + const TreeOrAccT& mAcc; +}; // SampleFromVoxels + +template +template +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const Vec3T& xyz) const +{ + const CoordT ijk = math::Round(xyz); + if (ijk != mPos) { + mPos = ijk; + mVal = mAcc.getValue(mPos); + } + return mVal; +} + +template +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT& ijk) const +{ + if (ijk != mPos) { + mPos = ijk; + mVal = mAcc.getValue(mPos); + } + return mVal; +} + +template +template +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const Vec3T& xyz) const +{ + return mAcc.getValue(math::Round(xyz)); +} + +// ------------------------------> TrilinearSampler <-------------------------------------- + +/// @brief Tri-linear sampler, i.e. first order, interpolator +template +class TrilinearSampler +{ +protected: + const TreeOrAccT& mAcc; + +public: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + static const int ORDER = 1; + + /// @brief Protected constructor from a Tree or ReadAccessor + __hostdev__ TrilinearSampler(const TreeOrAccT& acc) : mAcc(acc) {} + + __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @brief Extract the stencil of 8 values + inline __hostdev__ void stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const; + + template class Vec3T> + static inline __hostdev__ ValueT sample(const Vec3T &uvw, const ValueT (&v)[2][2][2]); + + template class Vec3T> + static inline __hostdev__ Vec3T gradient(const Vec3T &uvw, const ValueT (&v)[2][2][2]); + + static inline __hostdev__ bool zeroCrossing(const ValueT (&v)[2][2][2]); +}; // TrilinearSamplerBase + +template +__hostdev__ void TrilinearSampler::stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const +{ + v[0][0][0] = mAcc.getValue(ijk); // i, j, k + + ijk[2] += 1; + v[0][0][1] = mAcc.getValue(ijk); // i, j, k + 1 + + ijk[1] += 1; + v[0][1][1] = mAcc.getValue(ijk); // i, j+1, k + 1 + + ijk[2] -= 1; + v[0][1][0] = mAcc.getValue(ijk); // i, j+1, k + + ijk[0] += 1; + ijk[1] -= 1; + v[1][0][0] = mAcc.getValue(ijk); // i+1, j, k + + ijk[2] += 1; + v[1][0][1] = mAcc.getValue(ijk); // i+1, j, k + 1 + + ijk[1] += 1; + v[1][1][1] = mAcc.getValue(ijk); // i+1, j+1, k + 1 + + ijk[2] -= 1; + v[1][1][0] = mAcc.getValue(ijk); // i+1, j+1, k +} + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType TrilinearSampler::sample(const Vec3T &uvw, const ValueT (&v)[2][2][2]) +{ +#if 0 + auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b-a, a); };// = w*(b-a) + a + //auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b, fma(-w, a, a));};// = (1-w)*a + w*b +#else + auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); }; +#endif + return lerp(lerp(lerp(v[0][0][0], v[0][0][1], uvw[2]), lerp(v[0][1][0], v[0][1][1], uvw[2]), uvw[1]), + 
lerp(lerp(v[1][0][0], v[1][0][1], uvw[2]), lerp(v[1][1][0], v[1][1][1], uvw[2]), uvw[1]), + uvw[0]); +} + +template +template class Vec3T> +__hostdev__ Vec3T TrilinearSampler::gradient(const Vec3T &uvw, const ValueT (&v)[2][2][2]) +{ + static_assert(util::is_floating_point::value, "TrilinearSampler::gradient requires a floating-point type"); +#if 0 + auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b-a, a); };// = w*(b-a) + a + //auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b, fma(-w, a, a));};// = (1-w)*a + w*b +#else + auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); }; +#endif + + ValueT D[4] = {v[0][0][1] - v[0][0][0], v[0][1][1] - v[0][1][0], v[1][0][1] - v[1][0][0], v[1][1][1] - v[1][1][0]}; + + // Z component + Vec3T grad(0, 0, lerp(lerp(D[0], D[1], uvw[1]), lerp(D[2], D[3], uvw[1]), uvw[0])); + + const ValueT w = ValueT(uvw[2]); + D[0] = v[0][0][0] + D[0] * w; + D[1] = v[0][1][0] + D[1] * w; + D[2] = v[1][0][0] + D[2] * w; + D[3] = v[1][1][0] + D[3] * w; + + // X component + grad[0] = lerp(D[2], D[3], uvw[1]) - lerp(D[0], D[1], uvw[1]); + + // Y component + grad[1] = lerp(D[1] - D[0], D[3] - D[2], uvw[0]); + + return grad; +} + +template +__hostdev__ bool TrilinearSampler::zeroCrossing(const ValueT (&v)[2][2][2]) +{ + static_assert(util::is_floating_point::value, "TrilinearSampler::zeroCrossing requires a floating-point type"); + const bool less = v[0][0][0] < ValueT(0); + return (less ^ (v[0][0][1] < ValueT(0))) || + (less ^ (v[0][1][1] < ValueT(0))) || + (less ^ (v[0][1][0] < ValueT(0))) || + (less ^ (v[1][0][0] < ValueT(0))) || + (less ^ (v[1][0][1] < ValueT(0))) || + (less ^ (v[1][1][1] < ValueT(0))) || + (less ^ (v[1][1][0] < ValueT(0))); +} + +/// @brief Template specialization that does not use caching of stencil points +template +class SampleFromVoxels : public TrilinearSampler +{ + using BaseT = TrilinearSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + +public: + + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc) {} + + /// @note xyz is in index space space + template class Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + /// @note ijk is in index space space + __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);} + + /// @brief Return the gradient in index space. + /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ Vec3T gradient(Vec3T xyz) const; + + /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. 
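+    ///
+    /// Sketch (acc is a read-accessor of a float grid and xyz an index-space
+    /// position; both are placeholders):
+    /// @code
+    /// nanovdb::math::SampleFromVoxels<decltype(acc), 1, false> smp(acc); // non-caching variant
+    /// bool hit = smp.zeroCrossing(xyz);
+    /// @endcode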
+ /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; + +}; // SampleFromVoxels + +/// @brief Template specialization with caching of stencil values +template +class SampleFromVoxels : public TrilinearSampler +{ + using BaseT = TrilinearSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + + mutable CoordT mPos; + mutable ValueT mVal[2][2][2]; + + template class Vec3T> + __hostdev__ void cache(Vec3T& xyz) const; +public: + + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc), mPos(CoordT::max()){} + + /// @note xyz is in index space space + template class Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + // @note ijk is in index space space + __hostdev__ ValueT operator()(const CoordT &ijk) const; + + /// @brief Return the gradient in index space. + /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ Vec3T gradient(Vec3T xyz) const; + + /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. + /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; + + /// @brief Return true if the cached tri-linear stencil has a zero crossing. + /// + /// @warning Will only compile with floating point value types + __hostdev__ bool zeroCrossing() const { return BaseT::zeroCrossing(mVal); } + +}; // SampleFromVoxels + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::sample(xyz, mVal); +} + +template +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT &ijk) const +{ + return ijk == mPos ? 
mVal[0][0][0] : BaseT::mAcc.getValue(ijk); +} + +template +template class Vec3T> +__hostdev__ Vec3T SampleFromVoxels::gradient(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::gradient(xyz, mVal); +} + +template +template class Vec3T> +__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::zeroCrossing(mVal); +} + +template +template class Vec3T> +__hostdev__ void SampleFromVoxels::cache(Vec3T& xyz) const +{ + CoordT ijk = Floor(xyz); + if (ijk != mPos) { + mPos = ijk; + BaseT::stencil(ijk, mVal); + } +} + +#if 0 + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + ValueT val[2][2][2]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::sample(xyz, val); +} + +#else + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); }; + + CoordT coord = Floor(xyz); + + ValueT vx, vx1, vy, vy1, vz, vz1; + + vz = BaseT::mAcc.getValue(coord); + coord[2] += 1; + vz1 = BaseT::mAcc.getValue(coord); + vy = lerp(vz, vz1, xyz[2]); + + coord[1] += 1; + + vz1 = BaseT::mAcc.getValue(coord); + coord[2] -= 1; + vz = BaseT::mAcc.getValue(coord); + vy1 = lerp(vz, vz1, xyz[2]); + + vx = lerp(vy, vy1, xyz[1]); + + coord[0] += 1; + + vz = BaseT::mAcc.getValue(coord); + coord[2] += 1; + vz1 = BaseT::mAcc.getValue(coord); + vy1 = lerp(vz, vz1, xyz[2]); + + coord[1] -= 1; + + vz1 = BaseT::mAcc.getValue(coord); + coord[2] -= 1; + vz = BaseT::mAcc.getValue(coord); + vy = lerp(vz, vz1, xyz[2]); + + vx1 = lerp(vy, vy1, xyz[1]); + + return lerp(vx, vx1, xyz[0]); +} +#endif + + +template +template class Vec3T> +__hostdev__ inline Vec3T SampleFromVoxels::gradient(Vec3T xyz) const +{ + ValueT val[2][2][2]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::gradient(xyz, val); +} + +template +template class Vec3T> +__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const +{ + ValueT val[2][2][2]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::zeroCrossing(val); +} + +// ------------------------------> TriquadraticSampler <-------------------------------------- + +/// @brief Tri-quadratic sampler, i.e. 
second order, interpolator +template +class TriquadraticSampler +{ +protected: + const TreeOrAccT& mAcc; + +public: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + static const int ORDER = 1; + + /// @brief Protected constructor from a Tree or ReadAccessor + __hostdev__ TriquadraticSampler(const TreeOrAccT& acc) : mAcc(acc) {} + + __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @brief Extract the stencil of 27 values + inline __hostdev__ void stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const; + + template class Vec3T> + static inline __hostdev__ ValueT sample(const Vec3T &uvw, const ValueT (&v)[3][3][3]); + + static inline __hostdev__ bool zeroCrossing(const ValueT (&v)[3][3][3]); +}; // TriquadraticSamplerBase + +template +__hostdev__ void TriquadraticSampler::stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const +{ + CoordT p(ijk[0] - 1, 0, 0); + for (int dx = 0; dx < 3; ++dx, ++p[0]) { + p[1] = ijk[1] - 1; + for (int dy = 0; dy < 3; ++dy, ++p[1]) { + p[2] = ijk[2] - 1; + for (int dz = 0; dz < 3; ++dz, ++p[2]) { + v[dx][dy][dz] = mAcc.getValue(p);// extract the stencil of 27 values + } + } + } +} + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType TriquadraticSampler::sample(const Vec3T &uvw, const ValueT (&v)[3][3][3]) +{ + auto kernel = [](const ValueT* value, double weight)->ValueT { + return weight * (weight * (0.5f * (value[0] + value[2]) - value[1]) + + 0.5f * (value[2] - value[0])) + value[1]; + }; + + ValueT vx[3]; + for (int dx = 0; dx < 3; ++dx) { + ValueT vy[3]; + for (int dy = 0; dy < 3; ++dy) { + vy[dy] = kernel(&v[dx][dy][0], uvw[2]); + }//loop over y + vx[dx] = kernel(vy, uvw[1]); + }//loop over x + return kernel(vx, uvw[0]); +} + +template +__hostdev__ bool TriquadraticSampler::zeroCrossing(const ValueT (&v)[3][3][3]) +{ + static_assert(util::is_floating_point::value, "TrilinearSampler::zeroCrossing requires a floating-point type"); + const bool less = v[0][0][0] < ValueT(0); + for (int dx = 0; dx < 3; ++dx) { + for (int dy = 0; dy < 3; ++dy) { + for (int dz = 0; dz < 3; ++dz) { + if (less ^ (v[dx][dy][dz] < ValueT(0))) return true; + } + } + } + return false; +} + +/// @brief Template specialization that does not use caching of stencil points +template +class SampleFromVoxels : public TriquadraticSampler +{ + using BaseT = TriquadraticSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; +public: + + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc) {} + + /// @note xyz is in index space space + template class Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);} + + /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. 
+ /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; + +}; // SampleFromVoxels + +/// @brief Template specialization with caching of stencil values +template +class SampleFromVoxels : public TriquadraticSampler +{ + using BaseT = TriquadraticSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + + mutable CoordT mPos; + mutable ValueT mVal[3][3][3]; + + template class Vec3T> + __hostdev__ void cache(Vec3T& xyz) const; +public: + + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc), mPos(CoordT::max()){} + + /// @note xyz is in index space space + template class Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + inline __hostdev__ ValueT operator()(const CoordT &ijk) const; + + /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. + /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; + + /// @brief Return true if the cached tri-linear stencil has a zero crossing. + /// + /// @warning Will only compile with floating point value types + __hostdev__ bool zeroCrossing() const { return BaseT::zeroCrossing(mVal); } + +}; // SampleFromVoxels + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::sample(xyz, mVal); +} + +template +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT &ijk) const +{ + return ijk == mPos ? mVal[1][1][1] : BaseT::mAcc.getValue(ijk); +} + +template +template class Vec3T> +__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::zeroCrossing(mVal); +} + +template +template class Vec3T> +__hostdev__ void SampleFromVoxels::cache(Vec3T& xyz) const +{ + CoordT ijk = Floor(xyz); + if (ijk != mPos) { + mPos = ijk; + BaseT::stencil(ijk, mVal); + } +} + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + ValueT val[3][3][3]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::sample(xyz, val); +} + +template +template class Vec3T> +__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const +{ + ValueT val[3][3][3]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::zeroCrossing(val); +} + +// ------------------------------> TricubicSampler <-------------------------------------- + +/// @brief Tri-cubic sampler, i.e. third order, interpolator. +/// +/// @details See the following paper for implementation details: +/// Lekien, F. and Marsden, J.: Tricubic interpolation in three dimensions. +/// In: International Journal for Numerical Methods +/// in Engineering (2005), No. 63, p. 
455-471 + +template +class TricubicSampler +{ +protected: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + + const TreeOrAccT& mAcc; + +public: + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ TricubicSampler(const TreeOrAccT& acc) + : mAcc(acc) + { + } + + __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @brief Extract the stencil of 8 values + inline __hostdev__ void stencil(const CoordT& ijk, ValueT (&c)[64]) const; + + template class Vec3T> + static inline __hostdev__ ValueT sample(const Vec3T &uvw, const ValueT (&c)[64]); +}; // TricubicSampler + +template +__hostdev__ void TricubicSampler::stencil(const CoordT& ijk, ValueT (&C)[64]) const +{ + auto fetch = [&](int i, int j, int k) -> ValueT& { return C[((i + 1) << 4) + ((j + 1) << 2) + k + 1]; }; + + // fetch 64 point stencil values + for (int i = -1; i < 3; ++i) { + for (int j = -1; j < 3; ++j) { + fetch(i, j, -1) = mAcc.getValue(ijk + CoordT(i, j, -1)); + fetch(i, j, 0) = mAcc.getValue(ijk + CoordT(i, j, 0)); + fetch(i, j, 1) = mAcc.getValue(ijk + CoordT(i, j, 1)); + fetch(i, j, 2) = mAcc.getValue(ijk + CoordT(i, j, 2)); + } + } + const ValueT half(0.5), quarter(0.25), eighth(0.125); + const ValueT X[64] = {// values of f(x,y,z) at the 8 corners (each from 1 stencil value). + fetch(0, 0, 0), + fetch(1, 0, 0), + fetch(0, 1, 0), + fetch(1, 1, 0), + fetch(0, 0, 1), + fetch(1, 0, 1), + fetch(0, 1, 1), + fetch(1, 1, 1), + // values of df/dx at the 8 corners (each from 2 stencil values). + half * (fetch(1, 0, 0) - fetch(-1, 0, 0)), + half * (fetch(2, 0, 0) - fetch(0, 0, 0)), + half * (fetch(1, 1, 0) - fetch(-1, 1, 0)), + half * (fetch(2, 1, 0) - fetch(0, 1, 0)), + half * (fetch(1, 0, 1) - fetch(-1, 0, 1)), + half * (fetch(2, 0, 1) - fetch(0, 0, 1)), + half * (fetch(1, 1, 1) - fetch(-1, 1, 1)), + half * (fetch(2, 1, 1) - fetch(0, 1, 1)), + // values of df/dy at the 8 corners (each from 2 stencil values). + half * (fetch(0, 1, 0) - fetch(0, -1, 0)), + half * (fetch(1, 1, 0) - fetch(1, -1, 0)), + half * (fetch(0, 2, 0) - fetch(0, 0, 0)), + half * (fetch(1, 2, 0) - fetch(1, 0, 0)), + half * (fetch(0, 1, 1) - fetch(0, -1, 1)), + half * (fetch(1, 1, 1) - fetch(1, -1, 1)), + half * (fetch(0, 2, 1) - fetch(0, 0, 1)), + half * (fetch(1, 2, 1) - fetch(1, 0, 1)), + // values of df/dz at the 8 corners (each from 2 stencil values). + half * (fetch(0, 0, 1) - fetch(0, 0, -1)), + half * (fetch(1, 0, 1) - fetch(1, 0, -1)), + half * (fetch(0, 1, 1) - fetch(0, 1, -1)), + half * (fetch(1, 1, 1) - fetch(1, 1, -1)), + half * (fetch(0, 0, 2) - fetch(0, 0, 0)), + half * (fetch(1, 0, 2) - fetch(1, 0, 0)), + half * (fetch(0, 1, 2) - fetch(0, 1, 0)), + half * (fetch(1, 1, 2) - fetch(1, 1, 0)), + // values of d2f/dxdy at the 8 corners (each from 4 stencil values). 
+ quarter * (fetch(1, 1, 0) - fetch(-1, 1, 0) - fetch(1, -1, 0) + fetch(-1, -1, 0)), + quarter * (fetch(2, 1, 0) - fetch(0, 1, 0) - fetch(2, -1, 0) + fetch(0, -1, 0)), + quarter * (fetch(1, 2, 0) - fetch(-1, 2, 0) - fetch(1, 0, 0) + fetch(-1, 0, 0)), + quarter * (fetch(2, 2, 0) - fetch(0, 2, 0) - fetch(2, 0, 0) + fetch(0, 0, 0)), + quarter * (fetch(1, 1, 1) - fetch(-1, 1, 1) - fetch(1, -1, 1) + fetch(-1, -1, 1)), + quarter * (fetch(2, 1, 1) - fetch(0, 1, 1) - fetch(2, -1, 1) + fetch(0, -1, 1)), + quarter * (fetch(1, 2, 1) - fetch(-1, 2, 1) - fetch(1, 0, 1) + fetch(-1, 0, 1)), + quarter * (fetch(2, 2, 1) - fetch(0, 2, 1) - fetch(2, 0, 1) + fetch(0, 0, 1)), + // values of d2f/dxdz at the 8 corners (each from 4 stencil values). + quarter * (fetch(1, 0, 1) - fetch(-1, 0, 1) - fetch(1, 0, -1) + fetch(-1, 0, -1)), + quarter * (fetch(2, 0, 1) - fetch(0, 0, 1) - fetch(2, 0, -1) + fetch(0, 0, -1)), + quarter * (fetch(1, 1, 1) - fetch(-1, 1, 1) - fetch(1, 1, -1) + fetch(-1, 1, -1)), + quarter * (fetch(2, 1, 1) - fetch(0, 1, 1) - fetch(2, 1, -1) + fetch(0, 1, -1)), + quarter * (fetch(1, 0, 2) - fetch(-1, 0, 2) - fetch(1, 0, 0) + fetch(-1, 0, 0)), + quarter * (fetch(2, 0, 2) - fetch(0, 0, 2) - fetch(2, 0, 0) + fetch(0, 0, 0)), + quarter * (fetch(1, 1, 2) - fetch(-1, 1, 2) - fetch(1, 1, 0) + fetch(-1, 1, 0)), + quarter * (fetch(2, 1, 2) - fetch(0, 1, 2) - fetch(2, 1, 0) + fetch(0, 1, 0)), + // values of d2f/dydz at the 8 corners (each from 4 stencil values). + quarter * (fetch(0, 1, 1) - fetch(0, -1, 1) - fetch(0, 1, -1) + fetch(0, -1, -1)), + quarter * (fetch(1, 1, 1) - fetch(1, -1, 1) - fetch(1, 1, -1) + fetch(1, -1, -1)), + quarter * (fetch(0, 2, 1) - fetch(0, 0, 1) - fetch(0, 2, -1) + fetch(0, 0, -1)), + quarter * (fetch(1, 2, 1) - fetch(1, 0, 1) - fetch(1, 2, -1) + fetch(1, 0, -1)), + quarter * (fetch(0, 1, 2) - fetch(0, -1, 2) - fetch(0, 1, 0) + fetch(0, -1, 0)), + quarter * (fetch(1, 1, 2) - fetch(1, -1, 2) - fetch(1, 1, 0) + fetch(1, -1, 0)), + quarter * (fetch(0, 2, 2) - fetch(0, 0, 2) - fetch(0, 2, 0) + fetch(0, 0, 0)), + quarter * (fetch(1, 2, 2) - fetch(1, 0, 2) - fetch(1, 2, 0) + fetch(1, 0, 0)), + // values of d3f/dxdydz at the 8 corners (each from 8 stencil values). 
+ eighth * (fetch(1, 1, 1) - fetch(-1, 1, 1) - fetch(1, -1, 1) + fetch(-1, -1, 1) - fetch(1, 1, -1) + fetch(-1, 1, -1) + fetch(1, -1, -1) - fetch(-1, -1, -1)), + eighth * (fetch(2, 1, 1) - fetch(0, 1, 1) - fetch(2, -1, 1) + fetch(0, -1, 1) - fetch(2, 1, -1) + fetch(0, 1, -1) + fetch(2, -1, -1) - fetch(0, -1, -1)), + eighth * (fetch(1, 2, 1) - fetch(-1, 2, 1) - fetch(1, 0, 1) + fetch(-1, 0, 1) - fetch(1, 2, -1) + fetch(-1, 2, -1) + fetch(1, 0, -1) - fetch(-1, 0, -1)), + eighth * (fetch(2, 2, 1) - fetch(0, 2, 1) - fetch(2, 0, 1) + fetch(0, 0, 1) - fetch(2, 2, -1) + fetch(0, 2, -1) + fetch(2, 0, -1) - fetch(0, 0, -1)), + eighth * (fetch(1, 1, 2) - fetch(-1, 1, 2) - fetch(1, -1, 2) + fetch(-1, -1, 2) - fetch(1, 1, 0) + fetch(-1, 1, 0) + fetch(1, -1, 0) - fetch(-1, -1, 0)), + eighth * (fetch(2, 1, 2) - fetch(0, 1, 2) - fetch(2, -1, 2) + fetch(0, -1, 2) - fetch(2, 1, 0) + fetch(0, 1, 0) + fetch(2, -1, 0) - fetch(0, -1, 0)), + eighth * (fetch(1, 2, 2) - fetch(-1, 2, 2) - fetch(1, 0, 2) + fetch(-1, 0, 2) - fetch(1, 2, 0) + fetch(-1, 2, 0) + fetch(1, 0, 0) - fetch(-1, 0, 0)), + eighth * (fetch(2, 2, 2) - fetch(0, 2, 2) - fetch(2, 0, 2) + fetch(0, 0, 2) - fetch(2, 2, 0) + fetch(0, 2, 0) + fetch(2, 0, 0) - fetch(0, 0, 0))}; + + // 4Kb of static table (int8_t has a range of -127 -> 127 which suffices) + static const int8_t A[64][64] = { + {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-3, 3, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {2, -2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {9, -9, -9, 9, 0, 0, 0, 0, 6, 3, -6, -3, 0, 0, 0, 0, 6, -6, 3, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-6, 6, 6, -6, 0, 0, 0, 0, 
-3, -3, 3, 3, 0, 0, 0, 0, -4, 4, -2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-6, 6, 6, -6, 0, 0, 0, 0, -4, -2, 4, 2, 0, 0, 0, 0, -3, 3, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {4, -4, -4, 4, 0, 0, 0, 0, 2, 2, -2, -2, 0, 0, 0, 0, 2, -2, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, -9, -9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, -6, -3, 0, 0, 0, 0, 6, -6, 3, -3, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, 3, 3, 0, 0, 0, 0, -4, 4, -2, 2, 0, 0, 0, 0, -2, -2, -1, -1, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -2, 4, 2, 0, 0, 0, 0, -3, 3, -3, 3, 0, 0, 0, 0, -2, -1, -2, -1, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4, -4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, -2, -2, 0, 0, 0, 0, 2, -2, 2, -2, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0}, + {-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {9, -9, 0, 0, -9, 9, 0, 0, 6, 3, 0, 0, -6, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, -6, 0, 0, 3, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-6, 6, 0, 0, 6, -6, 0, 0, -3, -3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 4, 0, 0, -2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, -9, 0, 0, -9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 0, 0, -6, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, -6, 0, 0, 3, -3, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 0, 0, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 4, 0, 0, -2, 2, 0, 0, -2, -2, 0, 0, -1, -1, 0, 0}, + {9, 0, -9, 0, -9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0, -6, 0, -3, 0, 6, 0, -6, 0, 3, 0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 9, 0, -9, 0, -9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0, -6, 0, -3, 0, 6, 0, -6, 0, 3, 0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0}, + {-27, 27, 27, -27, 27, -27, -27, 27, -18, -9, 18, 9, 18, 9, -18, -9, -18, 18, -9, 9, 18, -18, 9, -9, -18, 18, 18, -18, -9, 9, 9, -9, -12, -6, -6, -3, 12, 6, 6, 3, -12, -6, 12, 6, -6, -3, 6, 3, -12, 12, -6, 6, -6, 6, -3, 3, -8, -4, -4, -2, -4, -2, -2, -1}, + {18, -18, -18, 18, -18, 18, 18, -18, 9, 9, -9, -9, -9, -9, 9, 9, 12, -12, 6, -6, -12, 12, -6, 6, 12, -12, -12, 12, 6, -6, -6, 6, 6, 6, 3, 3, -6, -6, -3, -3, 6, 6, -6, -6, 3, 3, -3, -3, 8, -8, 4, -4, 4, -4, 2, -2, 4, 4, 2, 2, 2, 2, 1, 1}, + {-6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, -3, 0, 3, 0, 3, 0, -4, 0, 4, 0, -2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -2, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, -6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, -3, 0, 3, 0, 3, 0, -4, 0, 4, 0, -2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -2, 0, -1, 0, -1, 0}, + {18, -18, -18, 
18, -18, 18, 18, -18, 12, 6, -12, -6, -12, -6, 12, 6, 9, -9, 9, -9, -9, 9, -9, 9, 12, -12, -12, 12, 6, -6, -6, 6, 6, 3, 6, 3, -6, -3, -6, -3, 8, 4, -8, -4, 4, 2, -4, -2, 6, -6, 6, -6, 3, -3, 3, -3, 4, 2, 4, 2, 2, 1, 2, 1}, + {-12, 12, 12, -12, 12, -12, -12, 12, -6, -6, 6, 6, 6, 6, -6, -6, -6, 6, -6, 6, 6, -6, 6, -6, -8, 8, 8, -8, -4, 4, 4, -4, -3, -3, -3, -3, 3, 3, 3, 3, -4, -4, 4, 4, -2, -2, 2, 2, -4, 4, -4, 4, -2, 2, -2, 2, -2, -2, -2, -2, -1, -1, -1, -1}, + {2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-6, 6, 0, 0, 6, -6, 0, 0, -4, -2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {4, -4, 0, 0, -4, 4, 0, 0, 2, 2, 0, 0, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 0, 0, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -3, 3, 0, 0, -2, -1, 0, 0, -2, -1, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4, 0, 0, -4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 2, -2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}, + {-6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 0, -2, 0, 4, 0, 2, 0, -3, 0, 3, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, -6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 0, -2, 0, 4, 0, 2, 0, -3, 0, 3, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, -2, 0, -1, 0}, + {18, -18, -18, 18, -18, 18, 18, -18, 12, 6, -12, -6, -12, -6, 12, 6, 12, -12, 6, -6, -12, 12, -6, 6, 9, -9, -9, 9, 9, -9, -9, 9, 8, 4, 4, 2, -8, -4, -4, -2, 6, 3, -6, -3, 6, 3, -6, -3, 6, -6, 3, -3, 6, -6, 3, -3, 4, 2, 2, 1, 4, 2, 2, 1}, + {-12, 12, 12, -12, 12, -12, -12, 12, -6, -6, 6, 6, 6, 6, -6, -6, -8, 8, -4, 4, 8, -8, 4, -4, -6, 6, 6, -6, -6, 6, 6, -6, -4, -4, -2, -2, 4, 4, 2, 2, -3, -3, 3, 3, -3, -3, 3, 3, -4, 4, -2, 2, -4, 4, -2, 2, -2, -2, -1, -1, -2, -2, -1, -1}, + {4, 0, -4, 0, -4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, -2, 0, -2, 0, 2, 0, -2, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 4, 0, -4, 0, -4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, -2, 0, -2, 0, 2, 0, -2, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0}, + {-12, 12, 12, -12, 12, -12, -12, 12, -8, -4, 8, 4, 8, 4, -8, -4, -6, 6, -6, 6, 6, -6, 6, -6, -6, 6, 6, -6, -6, 6, 6, -6, -4, -2, -4, -2, 4, 2, 4, 2, -4, -2, 4, 2, -4, -2, 4, 2, -3, 3, -3, 
3, -3, 3, -3, 3, -2, -1, -2, -1, -2, -1, -2, -1}, + {8, -8, -8, 8, -8, 8, 8, -8, 4, 4, -4, -4, -4, -4, 4, 4, 4, -4, 4, -4, -4, 4, -4, 4, 4, -4, -4, 4, 4, -4, -4, 4, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, -2, -2, 2, 2, -2, -2, 2, -2, 2, -2, 2, -2, 2, -2, 1, 1, 1, 1, 1, 1, 1, 1}}; + + for (int i = 0; i < 64; ++i) { // C = A * X + C[i] = ValueT(0); +#if 0 + for (int j = 0; j < 64; j += 4) { + C[i] = fma(A[i][j], X[j], fma(A[i][j+1], X[j+1], fma(A[i][j+2], X[j+2], fma(A[i][j+3], X[j+3], C[i])))); + } +#else + for (int j = 0; j < 64; j += 4) { + C[i] += A[i][j] * X[j] + A[i][j + 1] * X[j + 1] + A[i][j + 2] * X[j + 2] + A[i][j + 3] * X[j + 3]; + } +#endif + } +} + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType TricubicSampler::sample(const Vec3T &xyz, const ValueT (&C)[64]) +{ + ValueT zPow(1), sum(0); + for (int k = 0, n = 0; k < 4; ++k) { + ValueT yPow(1); + for (int j = 0; j < 4; ++j, n += 4) { +#if 0 + sum = fma( yPow, zPow * fma(xyz[0], fma(xyz[0], fma(xyz[0], C[n + 3], C[n + 2]), C[n + 1]), C[n]), sum); +#else + sum += yPow * zPow * (C[n] + xyz[0] * (C[n + 1] + xyz[0] * (C[n + 2] + xyz[0] * C[n + 3]))); +#endif + yPow *= xyz[1]; + } + zPow *= xyz[2]; + } + return sum; +} + +template +class SampleFromVoxels : public TricubicSampler +{ + using BaseT = TricubicSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + + mutable CoordT mPos; + mutable ValueT mC[64]; + + template class Vec3T> + __hostdev__ void cache(Vec3T& xyz) const; + +public: + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) + : BaseT(acc) + { + } + + /// @note xyz is in index space space + template class Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + // @brief Return value at the coordinate @a ijk in index space space + __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);} + +}; // SampleFromVoxels + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::sample(xyz, mC); +} + +template +template class Vec3T> +__hostdev__ void SampleFromVoxels::cache(Vec3T& xyz) const +{ + CoordT ijk = Floor(xyz); + if (ijk != mPos) { + mPos = ijk; + BaseT::stencil(ijk, mC); + } +} + +template +class SampleFromVoxels : public TricubicSampler +{ + using BaseT = TricubicSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + +public: + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) + : BaseT(acc) + { + } + + /// @note xyz is in index space space + template class Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);} + +}; // SampleFromVoxels + +template +template class Vec3T> +__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + ValueT C[64]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, C); + return BaseT::sample(xyz, C); +} + +}// namespace math + +template +[[deprecated("Use nanovdb::math::createSampler instead")]] +__hostdev__ math::SampleFromVoxels createSampler(const TreeOrAccT& acc) +{ + return math::SampleFromVoxels(acc); +} + +} // namespace nanovdb + +#endif // NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/math/Stencils.h 
b/external/nanovdb/math/Stencils.h new file mode 100644 index 00000000..e4663810 --- /dev/null +++ b/external/nanovdb/math/Stencils.h @@ -0,0 +1,1032 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +/// @author Ken Museth +/// +/// @date April 9, 2021 +/// +/// @file Stencils.h +/// +/// @brief Defines various finite-difference stencils that allow for the +/// computation of gradients of order 1 to 5, mean curvatures, +/// gaussian curvatures, principal curvatures, tri-linear interpolation, +/// zero-crossing, laplacian, and closest point transform. + +#ifndef NANOVDB_MATH_STENCILS_HAS_BEEN_INCLUDED +#define NANOVDB_MATH_STENCILS_HAS_BEEN_INCLUDED + +#include // for __hostdev__, Vec3, Min, Max, Pow2, Pow3, Pow4 + +namespace nanovdb { + +namespace math { + +// ---------------------------- WENO5 ---------------------------- + +/// @brief Implementation of nominally fifth-order finite-difference WENO +/// @details This function returns the numerical flux. See "High Order Finite Difference and +/// Finite Volume WENO Schemes and Discontinuous Galerkin Methods for CFD" - Chi-Wang Shu +/// ICASE Report No 2001-11 (page 6). Also see ICASE No 97-65 for a more complete reference +/// (Shu, 1997). +/// Given v1 = f(x-2dx), v2 = f(x-dx), v3 = f(x), v4 = f(x+dx) and v5 = f(x+2dx), +/// return an interpolated value f(x+dx/2) with the special property that +/// ( f(x+dx/2) - f(x-dx/2) ) / dx = df/dx (x) + error, +/// where the error is fifth-order in smooth regions: O(dx) <= error <=O(dx^5) +template +__hostdev__ inline ValueType +WENO5(const ValueType& v1, + const ValueType& v2, + const ValueType& v3, + const ValueType& v4, + const ValueType& v5, + RealT scale2 = 1.0)// openvdb uses scale2 = 0.01 +{ + static const RealT C = 13.0 / 12.0; + // WENO is formulated for non-dimensional equations, here the optional scale2 + // is a reference value (squared) for the function being interpolated. For + // example if 'v' is of order 1000, then scale2 = 10^6 is ok. But in practice + // leave scale2 = 1. 
+ const RealT eps = RealT(1.0e-6) * scale2; + // {\tilde \omega_k} = \gamma_k / ( \beta_k + \epsilon)^2 in Shu's ICASE report) + const RealT A1 = RealT(0.1)/Pow2(C*Pow2(v1-2*v2+v3)+RealT(0.25)*Pow2(v1-4*v2+3*v3)+eps), + A2 = RealT(0.6)/Pow2(C*Pow2(v2-2*v3+v4)+RealT(0.25)*Pow2(v2-v4)+eps), + A3 = RealT(0.3)/Pow2(C*Pow2(v3-2*v4+v5)+RealT(0.25)*Pow2(3*v3-4*v4+v5)+eps); + + return static_cast((A1*(2*v1 - 7*v2 + 11*v3) + + A2*(5*v3 - v2 + 2*v4) + + A3*(2*v3 + 5*v4 - v5))/(6*(A1+A2+A3))); +} + +// ---------------------------- GodunovsNormSqrd ---------------------------- + +template +__hostdev__ inline RealT +GodunovsNormSqrd(bool isOutside, + RealT dP_xm, RealT dP_xp, + RealT dP_ym, RealT dP_yp, + RealT dP_zm, RealT dP_zp) +{ + RealT dPLen2; + if (isOutside) { // outside + dPLen2 = Max(Pow2(Max(dP_xm, RealT(0))), Pow2(Min(dP_xp, RealT(0)))); // (dP/dx)2 + dPLen2 += Max(Pow2(Max(dP_ym, RealT(0))), Pow2(Min(dP_yp, RealT(0)))); // (dP/dy)2 + dPLen2 += Max(Pow2(Max(dP_zm, RealT(0))), Pow2(Min(dP_zp, RealT(0)))); // (dP/dz)2 + } else { // inside + dPLen2 = Max(Pow2(Min(dP_xm, RealT(0))), Pow2(Max(dP_xp, RealT(0)))); // (dP/dx)2 + dPLen2 += Max(Pow2(Min(dP_ym, RealT(0))), Pow2(Max(dP_yp, RealT(0)))); // (dP/dy)2 + dPLen2 += Max(Pow2(Min(dP_zm, RealT(0))), Pow2(Max(dP_zp, RealT(0)))); // (dP/dz)2 + } + return dPLen2; // |\nabla\phi|^2 +} + +template +__hostdev__ inline RealT +GodunovsNormSqrd(bool isOutside, + const Vec3& gradient_m, + const Vec3& gradient_p) +{ + return GodunovsNormSqrd(isOutside, + gradient_m[0], gradient_p[0], + gradient_m[1], gradient_p[1], + gradient_m[2], gradient_p[2]); +} + +// ---------------------------- BaseStencil ---------------------------- + +// BaseStencil uses curiously recurring template pattern (CRTP) +template +class BaseStencil +{ +public: + using ValueType = typename GridT::ValueType; + using GridType = GridT; + using TreeType = typename GridT::TreeType; + using AccessorType = typename GridT::AccessorType;// ReadAccessor; + + /// @brief Initialize the stencil buffer with the values of voxel (i, j, k) + /// and its neighbors. + /// @param ijk Index coordinates of stencil center + __hostdev__ inline void moveTo(const Coord& ijk) + { + mCenter = ijk; + mValues[0] = mAcc.getValue(ijk); + static_cast(*this).init(mCenter); + } + + /// @brief Initialize the stencil buffer with the values of voxel (i, j, k) + /// and its neighbors. The method also takes a value of the center + /// element of the stencil, assuming it is already known. + /// @param ijk Index coordinates of stencil center + /// @param centerValue Value of the center element of the stencil + __hostdev__ inline void moveTo(const Coord& ijk, const ValueType& centerValue) + { + mCenter = ijk; + mValues[0] = centerValue; + static_cast(*this).init(mCenter); + } + + /// @brief Initialize the stencil buffer with the values of voxel + /// (x, y, z) and its neighbors. + /// + /// @note This version is slightly faster than the one above, since + /// the center voxel's value is read directly from the iterator. + template + __hostdev__ inline void moveTo(const IterType& iter) + { + mCenter = iter.getCoord(); + mValues[0] = *iter; + static_cast(*this).init(mCenter); + } + + /// @brief Initialize the stencil buffer with the values of voxel (x, y, z) + /// and its neighbors. + /// @param xyz Floating point voxel coordinates of stencil center + /// @details This method will check to see if it is necessary to + /// update the stencil based on the cached index coordinates of + /// the center point. 
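+    ///
+    /// Sketch (grid is a nanovdb::FloatGrid and xyz an index-space position;
+    /// both are placeholders), using the 7-point GradStencil defined later in this file:
+    /// @code
+    /// nanovdb::math::GradStencil<nanovdb::FloatGrid> stencil(grid);
+    /// stencil.moveTo(xyz);            // (re)populates the stencil buffer if needed
+    /// auto grad = stencil.gradient(); // second-order central-difference gradient
+    /// @endcode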
+ template + __hostdev__ inline void moveTo(const Vec3& xyz) + { + Coord ijk = RoundDown(xyz); + if (ijk != mCenter) this->moveTo(ijk); + } + + /// @brief Return the value from the stencil buffer with linear + /// offset pos. + /// + /// @note The default (@a pos = 0) corresponds to the first element + /// which is typically the center point of the stencil. + __hostdev__ inline const ValueType& getValue(unsigned int pos = 0) const + { + NANOVDB_ASSERT(pos < SIZE); + return mValues[pos]; + } + + /// @brief Return the value at the specified location relative to the center of the stencil + template + __hostdev__ inline const ValueType& getValue() const + { + return mValues[static_cast(*this).template pos()]; + } + + /// @brief Set the value at the specified location relative to the center of the stencil + template + __hostdev__ inline void setValue(const ValueType& value) + { + mValues[static_cast(*this).template pos()] = value; + } + + /// @brief Return the size of the stencil buffer. + __hostdev__ static int size() { return SIZE; } + + /// @brief Return the mean value of the current stencil. + __hostdev__ inline ValueType mean() const + { + ValueType sum = 0.0; + for (int i = 0; i < SIZE; ++i) sum += mValues[i]; + return sum / ValueType(SIZE); + } + + /// @brief Return the smallest value in the stencil buffer. + __hostdev__ inline ValueType min() const + { + ValueType v = mValues[0]; + for (int i=1; i v) v = mValues[i]; + } + return v; + } + + /// @brief Return the coordinates of the center point of the stencil. + __hostdev__ inline const Coord& getCenterCoord() const { return mCenter; } + + /// @brief Return the value at the center of the stencil + __hostdev__ inline const ValueType& getCenterValue() const { return mValues[0]; } + + /// @brief Return true if the center of the stencil intersects the + /// iso-contour specified by the isoValue + __hostdev__ inline bool intersects(const ValueType &isoValue = ValueType(0) ) const + { + const bool less = this->getValue< 0, 0, 0>() < isoValue; + return (less ^ (this->getValue<-1, 0, 0>() < isoValue)) || + (less ^ (this->getValue< 1, 0, 0>() < isoValue)) || + (less ^ (this->getValue< 0,-1, 0>() < isoValue)) || + (less ^ (this->getValue< 0, 1, 0>() < isoValue)) || + (less ^ (this->getValue< 0, 0,-1>() < isoValue)) || + (less ^ (this->getValue< 0, 0, 1>() < isoValue)) ; + } + struct Mask { + uint8_t bits; + __hostdev__ Mask() : bits(0u) {} + __hostdev__ void set(int i) { bits |= (1 << i); } + __hostdev__ bool test(int i) const { return bits & (1 << i); } + __hostdev__ bool any() const { return bits > 0u; } + __hostdev__ bool all() const { return bits == 255u; } + __hostdev__ bool none() const { return bits == 0u; } + __hostdev__ int count() const { return util::countOn(bits); } + };// Mask + + /// @brief Return true a bit-mask where the 6 lower bits indicates if the + /// center of the stencil intersects the iso-contour specified by the isoValue. + /// + /// @note There are 2^6 = 64 different possible cases, including no intersections! + /// + /// @details The ordering of bit mask is ( -x, +x, -y, +y, -z, +z ), so to + /// check if there is an intersection in -y use (mask & (1u<<2)) where mask is + /// ther return value from this function. To check if there are any + /// intersections use mask!=0u, and for no intersections use mask==0u. + /// To count the number of intersections use __builtin_popcount(mask). 
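+    ///
+    /// Sketch (the stencil has already been moved to a voxel of a signed-distance grid):
+    /// @code
+    /// auto mask = stencil.intersectionMask(0.0f);
+    /// if (mask.any()) {
+    ///     int n = mask.count(); // number of the six face neighbors that cross the iso-value
+    /// }
+    /// @endcode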
+ __hostdev__ inline Mask intersectionMask(ValueType isoValue = ValueType(0)) const + { + Mask mask; + const bool less = this->getValue< 0, 0, 0>() < isoValue; + if (less ^ (this->getValue<-1, 0, 0>() < isoValue)) mask.set(0);// |= 1u; + if (less ^ (this->getValue< 1, 0, 0>() < isoValue)) mask.set(1);// |= 2u; + if (less ^ (this->getValue< 0,-1, 0>() < isoValue)) mask.set(2);// |= 4u; + if (less ^ (this->getValue< 0, 1, 0>() < isoValue)) mask.set(3);// |= 8u; + if (less ^ (this->getValue< 0, 0,-1>() < isoValue)) mask.set(4);// |= 16u; + if (less ^ (this->getValue< 0, 0, 1>() < isoValue)) mask.set(5);// |= 32u; + return mask; + } + + /// @brief Return a const reference to the grid from which this + /// stencil was constructed. + __hostdev__ inline const GridType& grid() const { return *mGrid; } + + /// @brief Return a const reference to the ValueAccessor + /// associated with this Stencil. + __hostdev__ inline const AccessorType& accessor() const { return mAcc; } + +protected: + // Constructor is protected to prevent direct instantiation. + __hostdev__ BaseStencil(const GridType& grid) + : mGrid(&grid) + , mAcc(grid) + , mCenter(Coord::max()) + { + } + + const GridType* mGrid; + AccessorType mAcc; + ValueType mValues[SIZE]; + Coord mCenter; + +}; // BaseStencil class + + +// ---------------------------- BoxStencil ---------------------------- + + +namespace { // anonymous namespace for stencil-layout map + + // the eight point box stencil + template struct BoxPt {}; + template<> struct BoxPt< 0, 0, 0> { enum { idx = 0 }; }; + template<> struct BoxPt< 0, 0, 1> { enum { idx = 1 }; }; + template<> struct BoxPt< 0, 1, 1> { enum { idx = 2 }; }; + template<> struct BoxPt< 0, 1, 0> { enum { idx = 3 }; }; + template<> struct BoxPt< 1, 0, 0> { enum { idx = 4 }; }; + template<> struct BoxPt< 1, 0, 1> { enum { idx = 5 }; }; + template<> struct BoxPt< 1, 1, 1> { enum { idx = 6 }; }; + template<> struct BoxPt< 1, 1, 0> { enum { idx = 7 }; }; + +} + +template +class BoxStencil: public BaseStencil, 8, GridT> +{ + using SelfT = BoxStencil; + using BaseType = BaseStencil; +public: + using GridType = GridT; + using TreeType = typename GridT::TreeType; + using ValueType = typename GridT::ValueType; + + static constexpr int SIZE = 8; + + __hostdev__ BoxStencil(const GridType& grid) : BaseType(grid) {} + + /// Return linear offset for the specified stencil point relative to its center + template + __hostdev__ unsigned int pos() const { return BoxPt::idx; } + + /// @brief Return true if the center of the stencil intersects the + /// iso-contour specified by the isoValue + __hostdev__ inline bool intersects(ValueType isoValue = ValueType(0)) const + { + const bool less = mValues[0] < isoValue; + return (less ^ (mValues[1] < isoValue)) || + (less ^ (mValues[2] < isoValue)) || + (less ^ (mValues[3] < isoValue)) || + (less ^ (mValues[4] < isoValue)) || + (less ^ (mValues[5] < isoValue)) || + (less ^ (mValues[6] < isoValue)) || + (less ^ (mValues[7] < isoValue)) ; + } + + /// @brief Return the trilinear interpolation at the normalized position. + /// @param xyz Floating point coordinate position. Index space and NOT world space. + /// @warning It is assumed that the stencil has already been moved + /// to the relevant voxel position, e.g. using moveTo(xyz). 
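+    ///
+    /// Sketch (grid is a nanovdb::FloatGrid and xyz an index-space position inside
+    /// the current cell; both are placeholders):
+    /// @code
+    /// nanovdb::math::BoxStencil<nanovdb::FloatGrid> stencil(grid);
+    /// stencil.moveTo(xyz);                      // populate the 8-point stencil buffer
+    /// float v = stencil.interpolation(xyz);     // trilinear interpolation at xyz
+    /// @endcode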
+ /// @note Trilinear interpolation kernal reads as: + /// v000 (1-u)(1-v)(1-w) + v001 (1-u)(1-v)w + v010 (1-u)v(1-w) + v011 (1-u)vw + /// + v100 u(1-v)(1-w) + v101 u(1-v)w + v110 uv(1-w) + v111 uvw + __hostdev__ inline ValueType interpolation(const Vec3& xyz) const + { + const ValueType u = xyz[0] - mCenter[0]; + const ValueType v = xyz[1] - mCenter[1]; + const ValueType w = xyz[2] - mCenter[2]; + + NANOVDB_ASSERT(u>=0 && u<=1); + NANOVDB_ASSERT(v>=0 && v<=1); + NANOVDB_ASSERT(w>=0 && w<=1); + + ValueType V = BaseType::template getValue<0,0,0>(); + ValueType A = V + (BaseType::template getValue<0,0,1>() - V) * w; + V = BaseType::template getValue< 0, 1, 0>(); + ValueType B = V + (BaseType::template getValue<0,1,1>() - V) * w; + ValueType C = A + (B - A) * v; + + V = BaseType::template getValue<1,0,0>(); + A = V + (BaseType::template getValue<1,0,1>() - V) * w; + V = BaseType::template getValue<1,1,0>(); + B = V + (BaseType::template getValue<1,1,1>() - V) * w; + ValueType D = A + (B - A) * v; + + return C + (D - C) * u; + } + + /// @brief Return the gradient in world space of the trilinear interpolation kernel. + /// @param xyz Floating point coordinate position. + /// @warning It is assumed that the stencil has already been moved + /// to the relevant voxel position, e.g. using moveTo(xyz). + /// @note Computed as partial derivatives of the trilinear interpolation kernel: + /// v000 (1-u)(1-v)(1-w) + v001 (1-u)(1-v)w + v010 (1-u)v(1-w) + v011 (1-u)vw + /// + v100 u(1-v)(1-w) + v101 u(1-v)w + v110 uv(1-w) + v111 uvw + __hostdev__ inline Vec3 gradient(const Vec3& xyz) const + { + const ValueType u = xyz[0] - mCenter[0]; + const ValueType v = xyz[1] - mCenter[1]; + const ValueType w = xyz[2] - mCenter[2]; + + NANOVDB_ASSERT(u>=0 && u<=1); + NANOVDB_ASSERT(v>=0 && v<=1); + NANOVDB_ASSERT(w>=0 && w<=1); + + ValueType D[4]={BaseType::template getValue<0,0,1>()-BaseType::template getValue<0,0,0>(), + BaseType::template getValue<0,1,1>()-BaseType::template getValue<0,1,0>(), + BaseType::template getValue<1,0,1>()-BaseType::template getValue<1,0,0>(), + BaseType::template getValue<1,1,1>()-BaseType::template getValue<1,1,0>()}; + + // Z component + ValueType A = D[0] + (D[1]- D[0]) * v; + ValueType B = D[2] + (D[3]- D[2]) * v; + Vec3 grad(0, 0, A + (B - A) * u); + + D[0] = BaseType::template getValue<0,0,0>() + D[0] * w; + D[1] = BaseType::template getValue<0,1,0>() + D[1] * w; + D[2] = BaseType::template getValue<1,0,0>() + D[2] * w; + D[3] = BaseType::template getValue<1,1,0>() + D[3] * w; + + // X component + A = D[0] + (D[1] - D[0]) * v; + B = D[2] + (D[3] - D[2]) * v; + + grad[0] = B - A; + + // Y component + A = D[1] - D[0]; + B = D[3] - D[2]; + + grad[1] = A + (B - A) * u; + + return BaseType::mGrid->map().applyIJT(grad); + } + +private: + __hostdev__ inline void init(const Coord& ijk) + { + mValues[ 1] = mAcc.getValue(ijk.offsetBy( 0, 0, 1)); + mValues[ 2] = mAcc.getValue(ijk.offsetBy( 0, 1, 1)); + mValues[ 3] = mAcc.getValue(ijk.offsetBy( 0, 1, 0)); + mValues[ 4] = mAcc.getValue(ijk.offsetBy( 1, 0, 0)); + mValues[ 5] = mAcc.getValue(ijk.offsetBy( 1, 0, 1)); + mValues[ 6] = mAcc.getValue(ijk.offsetBy( 1, 1, 1)); + mValues[ 7] = mAcc.getValue(ijk.offsetBy( 1, 1, 0)); + } + + template friend class BaseStencil; // allow base class to call init() + using BaseType::mAcc; + using BaseType::mValues; + using BaseType::mCenter; +};// BoxStencil class + + +// ---------------------------- GradStencil ---------------------------- + +namespace { // anonymous namespace for stencil-layout map + + 
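As with BoxPt above, the anonymous namespace just opened holds a compile-time layout map (GradPt follows): each specialization ties an (i,j,k) offset to the linear slot that init() fills, and BaseStencil::getValue<i,j,k>() resolves it through the derived class's pos<i,j,k>() via the CRTP cast. The idiom in isolation, using a hypothetical two-point map that is not part of the header:

\code
// Hypothetical layout map, illustrating the BoxPt/GradPt/WenoPt/CurvPt idiom.
template<int i, int j, int k> struct DemoPt {};            // primary template: unmapped offsets do not compile
template<> struct DemoPt<0, 0, 0> { enum { idx = 0 }; };   // center       -> mValues[0]
template<> struct DemoPt<0, 0, 1> { enum { idx = 1 }; };   // +z neighbour -> mValues[1]
// A stencil then exposes the map through its pos() member, so every lookup
// folds to a compile-time constant index:
//   template<int i, int j, int k> unsigned int pos() const { return DemoPt<i, j, k>::idx; }
\endcode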
template struct GradPt {}; + template<> struct GradPt< 0, 0, 0> { enum { idx = 0 }; }; + template<> struct GradPt< 1, 0, 0> { enum { idx = 2 }; }; + template<> struct GradPt< 0, 1, 0> { enum { idx = 4 }; }; + template<> struct GradPt< 0, 0, 1> { enum { idx = 6 }; }; + template<> struct GradPt<-1, 0, 0> { enum { idx = 1 }; }; + template<> struct GradPt< 0,-1, 0> { enum { idx = 3 }; }; + template<> struct GradPt< 0, 0,-1> { enum { idx = 5 }; }; +} + +/// This is a simple 7-point nearest neighbor stencil that supports +/// gradient by second-order central differencing, first-order upwinding, +/// Laplacian, closest-point transform and zero-crossing test. +/// +/// @note For optimal random access performance this class +/// includes its own grid accessor. +template +class GradStencil : public BaseStencil, 7, GridT> +{ + using SelfT = GradStencil; + using BaseType = BaseStencil; +public: + using GridType = GridT; + using TreeType = typename GridT::TreeType; + using ValueType = typename GridT::ValueType; + + static constexpr int SIZE = 7; + + __hostdev__ GradStencil(const GridType& grid) + : BaseType(grid) + , mInv2Dx(ValueType(0.5 / grid.voxelSize()[0])) + , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx)) + { + } + + __hostdev__ GradStencil(const GridType& grid, double dx) + : BaseType(grid) + , mInv2Dx(ValueType(0.5 / dx)) + , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx)) + { + } + + /// @brief Return the norm square of the single-sided upwind gradient + /// (computed via Godunov's scheme) at the previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType normSqGrad() const + { + return mInvDx2 * GodunovsNormSqrd(mValues[0] > ValueType(0), + mValues[0] - mValues[1], + mValues[2] - mValues[0], + mValues[0] - mValues[3], + mValues[4] - mValues[0], + mValues[0] - mValues[5], + mValues[6] - mValues[0]); + } + + /// @brief Return the gradient computed at the previously buffered + /// location by second order central differencing. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline Vec3 gradient() const + { + return Vec3(mValues[2] - mValues[1], + mValues[4] - mValues[3], + mValues[6] - mValues[5])*mInv2Dx; + } + /// @brief Return the first-order upwind gradient corresponding to the direction V. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline Vec3 gradient(const Vec3& V) const + { + return Vec3( + V[0]>0 ? mValues[0] - mValues[1] : mValues[2] - mValues[0], + V[1]>0 ? mValues[0] - mValues[3] : mValues[4] - mValues[0], + V[2]>0 ? mValues[0] - mValues[5] : mValues[6] - mValues[0])*2*mInv2Dx; + } + + /// Return the Laplacian computed at the previously buffered + /// location by second-order central differencing. + __hostdev__ inline ValueType laplacian() const + { + return mInvDx2 * (mValues[1] + mValues[2] + + mValues[3] + mValues[4] + + mValues[5] + mValues[6] - 6*mValues[0]); + } + + /// Return @c true if the sign of the value at the center point of the stencil + /// is different from the signs of any of its six nearest neighbors. + __hostdev__ inline bool zeroCrossing() const + { + return (mValues[0]>0 ? 
(mValues[1]<0 || mValues[2]<0 || mValues[3]<0 || mValues[4]<0 || mValues[5]<0 || mValues[6]<0) + : (mValues[1]>0 || mValues[2]>0 || mValues[3]>0 || mValues[4]>0 || mValues[5]>0 || mValues[6]>0)); + } + + /// @brief Compute the closest-point transform to a level set. + /// @return the closest point in index space to the surface + /// from which the level set was derived. + /// + /// @note This method assumes that the grid represents a level set + /// with distances in world units and a simple affine transfrom + /// with uniform scaling. + __hostdev__ inline Vec3 cpt() + { + const Coord& ijk = BaseType::getCenterCoord(); + const ValueType d = ValueType(mValues[0] * 0.5 * mInvDx2); // distance in voxels / (2dx^2) + const auto value = Vec3(ijk[0] - d*(mValues[2] - mValues[1]), + ijk[1] - d*(mValues[4] - mValues[3]), + ijk[2] - d*(mValues[6] - mValues[5])); + return value; + } + + /// Return linear offset for the specified stencil point relative to its center + template + __hostdev__ unsigned int pos() const { return GradPt::idx; } + +private: + + __hostdev__ inline void init(const Coord& ijk) + { + mValues[ 1] = mAcc.getValue(ijk.offsetBy(-1, 0, 0)); + mValues[ 2] = mAcc.getValue(ijk.offsetBy( 1, 0, 0)); + + mValues[ 3] = mAcc.getValue(ijk.offsetBy( 0,-1, 0)); + mValues[ 4] = mAcc.getValue(ijk.offsetBy( 0, 1, 0)); + + mValues[ 5] = mAcc.getValue(ijk.offsetBy( 0, 0,-1)); + mValues[ 6] = mAcc.getValue(ijk.offsetBy( 0, 0, 1)); + } + + template friend class BaseStencil; // allow base class to call init() + using BaseType::mAcc; + using BaseType::mValues; + const ValueType mInv2Dx, mInvDx2; +}; // GradStencil class + + +// ---------------------------- WenoStencil ---------------------------- + +namespace { // anonymous namespace for stencil-layout map + + template struct WenoPt {}; + template<> struct WenoPt< 0, 0, 0> { enum { idx = 0 }; }; + + template<> struct WenoPt<-3, 0, 0> { enum { idx = 1 }; }; + template<> struct WenoPt<-2, 0, 0> { enum { idx = 2 }; }; + template<> struct WenoPt<-1, 0, 0> { enum { idx = 3 }; }; + template<> struct WenoPt< 1, 0, 0> { enum { idx = 4 }; }; + template<> struct WenoPt< 2, 0, 0> { enum { idx = 5 }; }; + template<> struct WenoPt< 3, 0, 0> { enum { idx = 6 }; }; + + template<> struct WenoPt< 0,-3, 0> { enum { idx = 7 }; }; + template<> struct WenoPt< 0,-2, 0> { enum { idx = 8 }; }; + template<> struct WenoPt< 0,-1, 0> { enum { idx = 9 }; }; + template<> struct WenoPt< 0, 1, 0> { enum { idx =10 }; }; + template<> struct WenoPt< 0, 2, 0> { enum { idx =11 }; }; + template<> struct WenoPt< 0, 3, 0> { enum { idx =12 }; }; + + template<> struct WenoPt< 0, 0,-3> { enum { idx =13 }; }; + template<> struct WenoPt< 0, 0,-2> { enum { idx =14 }; }; + template<> struct WenoPt< 0, 0,-1> { enum { idx =15 }; }; + template<> struct WenoPt< 0, 0, 1> { enum { idx =16 }; }; + template<> struct WenoPt< 0, 0, 2> { enum { idx =17 }; }; + template<> struct WenoPt< 0, 0, 3> { enum { idx =18 }; }; + +} + +/// @brief This is a special 19-point stencil that supports optimal fifth-order WENO +/// upwinding, second-order central differencing, Laplacian, and zero-crossing test. +/// +/// @note For optimal random access performance this class +/// includes its own grid accessor. 
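Before the 19-point WenoStencil below, a usage sketch for the 7-point GradStencil just defined: one moveTo() per voxel, after which the cached values feed the central-difference gradient, the Godunov norm, the Laplacian and the closest-point transform. Grid and coordinates are the same illustrative assumptions as in the earlier sketches:

\code
// Sketch: typical level-set queries with GradStencil (not part of the header).
nanovdb::math::GradStencil<nanovdb::FloatGrid> grad(*grid);
grad.moveTo(nanovdb::Coord(10, 20, 30));
const nanovdb::Vec3f n   = grad.gradient();   // second-order central-difference gradient
const float          nsq = grad.normSqGrad(); // |grad(phi)|^2 via Godunov upwinding
const float          lap = grad.laplacian();  // 7-point Laplacian
if (grad.zeroCrossing()) {                    // surface passes between the center and a neighbour
    const nanovdb::Vec3f p = grad.cpt();      // closest point on the zero level set, in index space
}
\endcode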
+template +class WenoStencil: public BaseStencil, 19, GridT> +{ + using SelfT = WenoStencil; + using BaseType = BaseStencil; +public: + using GridType = GridT; + using TreeType = typename GridT::TreeType; + using ValueType = typename GridT::ValueType; + + static constexpr int SIZE = 19; + + __hostdev__ WenoStencil(const GridType& grid) + : BaseType(grid) + , mDx2(ValueType(Pow2(grid.voxelSize()[0]))) + , mInv2Dx(ValueType(0.5 / grid.voxelSize()[0])) + , mInvDx2(ValueType(1.0 / mDx2)) + { + } + + __hostdev__ WenoStencil(const GridType& grid, double dx) + : BaseType(grid) + , mDx2(ValueType(dx * dx)) + , mInv2Dx(ValueType(0.5 / dx)) + , mInvDx2(ValueType(1.0 / mDx2)) + { + } + + /// @brief Return the norm-square of the WENO upwind gradient (computed via + /// WENO upwinding and Godunov's scheme) at the previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType normSqGrad(ValueType isoValue = ValueType(0)) const + { + const ValueType* v = mValues; + const RealT + dP_xm = WENO5(v[ 2]-v[ 1],v[ 3]-v[ 2],v[ 0]-v[ 3],v[ 4]-v[ 0],v[ 5]-v[ 4],mDx2), + dP_xp = WENO5(v[ 6]-v[ 5],v[ 5]-v[ 4],v[ 4]-v[ 0],v[ 0]-v[ 3],v[ 3]-v[ 2],mDx2), + dP_ym = WENO5(v[ 8]-v[ 7],v[ 9]-v[ 8],v[ 0]-v[ 9],v[10]-v[ 0],v[11]-v[10],mDx2), + dP_yp = WENO5(v[12]-v[11],v[11]-v[10],v[10]-v[ 0],v[ 0]-v[ 9],v[ 9]-v[ 8],mDx2), + dP_zm = WENO5(v[14]-v[13],v[15]-v[14],v[ 0]-v[15],v[16]-v[ 0],v[17]-v[16],mDx2), + dP_zp = WENO5(v[18]-v[17],v[17]-v[16],v[16]-v[ 0],v[ 0]-v[15],v[15]-v[14],mDx2); + return mInvDx2*static_cast( + GodunovsNormSqrd(v[0]>isoValue, dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp)); + } + + /// Return the optimal fifth-order upwind gradient corresponding to the + /// direction V. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline Vec3 gradient(const Vec3& V) const + { + const ValueType* v = mValues; + return 2*mInv2Dx * Vec3( + V[0]>0 ? WENO5(v[ 2]-v[ 1],v[ 3]-v[ 2],v[ 0]-v[ 3], v[ 4]-v[ 0],v[ 5]-v[ 4],mDx2) + : WENO5(v[ 6]-v[ 5],v[ 5]-v[ 4],v[ 4]-v[ 0], v[ 0]-v[ 3],v[ 3]-v[ 2],mDx2), + V[1]>0 ? WENO5(v[ 8]-v[ 7],v[ 9]-v[ 8],v[ 0]-v[ 9], v[10]-v[ 0],v[11]-v[10],mDx2) + : WENO5(v[12]-v[11],v[11]-v[10],v[10]-v[ 0], v[ 0]-v[ 9],v[ 9]-v[ 8],mDx2), + V[2]>0 ? WENO5(v[14]-v[13],v[15]-v[14],v[ 0]-v[15], v[16]-v[ 0],v[17]-v[16],mDx2) + : WENO5(v[18]-v[17],v[17]-v[16],v[16]-v[ 0], v[ 0]-v[15],v[15]-v[14],mDx2)); + } + /// Return the gradient computed at the previously buffered + /// location by second-order central differencing. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline Vec3 gradient() const + { + return mInv2Dx * Vec3(mValues[ 4] - mValues[ 3], + mValues[10] - mValues[ 9], + mValues[16] - mValues[15]); + } + + /// Return the Laplacian computed at the previously buffered + /// location by second-order central differencing. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). 
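The payoff of caching 19 values is the one-sided fifth-order WENO reconstruction used by normSqGrad() and gradient(V): per axis, the sign of the supplied direction selects which one-sided derivative is used. A hedged sketch, with the same grid assumptions as above and an illustrative velocity:

\code
// Sketch: fifth-order WENO upwinding with WenoStencil (not part of the header).
nanovdb::math::WenoStencil<nanovdb::FloatGrid> weno(*grid);
weno.moveTo(nanovdb::Coord(10, 20, 30));
const nanovdb::Vec3f vel(1.0f, 0.0f, -2.0f);     // advection velocity, illustrative only
const nanovdb::Vec3f dphi = weno.gradient(vel);  // per axis, the sign of vel picks the upwind WENO derivative
const float          nsq  = weno.normSqGrad();   // Godunov norm built from the six one-sided WENO derivatives
\endcode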
+ __hostdev__ inline ValueType laplacian() const + { + return mInvDx2 * ( + mValues[ 3] + mValues[ 4] + + mValues[ 9] + mValues[10] + + mValues[15] + mValues[16] - 6*mValues[0]); + } + + /// Return @c true if the sign of the value at the center point of the stencil + /// differs from the sign of any of its six nearest neighbors + __hostdev__ inline bool zeroCrossing() const + { + const ValueType* v = mValues; + return (v[ 0]>0 ? (v[ 3]<0 || v[ 4]<0 || v[ 9]<0 || v[10]<0 || v[15]<0 || v[16]<0) + : (v[ 3]>0 || v[ 4]>0 || v[ 9]>0 || v[10]>0 || v[15]>0 || v[16]>0)); + } + + /// Return linear offset for the specified stencil point relative to its center + template + __hostdev__ unsigned int pos() const { return WenoPt::idx; } + +private: + __hostdev__ inline void init(const Coord& ijk) + { + mValues[ 1] = mAcc.getValue(ijk.offsetBy(-3, 0, 0)); + mValues[ 2] = mAcc.getValue(ijk.offsetBy(-2, 0, 0)); + mValues[ 3] = mAcc.getValue(ijk.offsetBy(-1, 0, 0)); + mValues[ 4] = mAcc.getValue(ijk.offsetBy( 1, 0, 0)); + mValues[ 5] = mAcc.getValue(ijk.offsetBy( 2, 0, 0)); + mValues[ 6] = mAcc.getValue(ijk.offsetBy( 3, 0, 0)); + + mValues[ 7] = mAcc.getValue(ijk.offsetBy( 0, -3, 0)); + mValues[ 8] = mAcc.getValue(ijk.offsetBy( 0, -2, 0)); + mValues[ 9] = mAcc.getValue(ijk.offsetBy( 0, -1, 0)); + mValues[10] = mAcc.getValue(ijk.offsetBy( 0, 1, 0)); + mValues[11] = mAcc.getValue(ijk.offsetBy( 0, 2, 0)); + mValues[12] = mAcc.getValue(ijk.offsetBy( 0, 3, 0)); + + mValues[13] = mAcc.getValue(ijk.offsetBy( 0, 0, -3)); + mValues[14] = mAcc.getValue(ijk.offsetBy( 0, 0, -2)); + mValues[15] = mAcc.getValue(ijk.offsetBy( 0, 0, -1)); + mValues[16] = mAcc.getValue(ijk.offsetBy( 0, 0, 1)); + mValues[17] = mAcc.getValue(ijk.offsetBy( 0, 0, 2)); + mValues[18] = mAcc.getValue(ijk.offsetBy( 0, 0, 3)); + } + + template friend class BaseStencil; // allow base class to call init() + using BaseType::mAcc; + using BaseType::mValues; + const ValueType mDx2, mInv2Dx, mInvDx2; +}; // WenoStencil class + + +// ---------------------------- CurvatureStencil ---------------------------- + +namespace { // anonymous namespace for stencil-layout map + + template struct CurvPt {}; + template<> struct CurvPt< 0, 0, 0> { enum { idx = 0 }; }; + + template<> struct CurvPt<-1, 0, 0> { enum { idx = 1 }; }; + template<> struct CurvPt< 1, 0, 0> { enum { idx = 2 }; }; + + template<> struct CurvPt< 0,-1, 0> { enum { idx = 3 }; }; + template<> struct CurvPt< 0, 1, 0> { enum { idx = 4 }; }; + + template<> struct CurvPt< 0, 0,-1> { enum { idx = 5 }; }; + template<> struct CurvPt< 0, 0, 1> { enum { idx = 6 }; }; + + template<> struct CurvPt<-1,-1, 0> { enum { idx = 7 }; }; + template<> struct CurvPt< 1,-1, 0> { enum { idx = 8 }; }; + template<> struct CurvPt<-1, 1, 0> { enum { idx = 9 }; }; + template<> struct CurvPt< 1, 1, 0> { enum { idx =10 }; }; + + template<> struct CurvPt<-1, 0,-1> { enum { idx =11 }; }; + template<> struct CurvPt< 1, 0,-1> { enum { idx =12 }; }; + template<> struct CurvPt<-1, 0, 1> { enum { idx =13 }; }; + template<> struct CurvPt< 1, 0, 1> { enum { idx =14 }; }; + + template<> struct CurvPt< 0,-1,-1> { enum { idx =15 }; }; + template<> struct CurvPt< 0, 1,-1> { enum { idx =16 }; }; + template<> struct CurvPt< 0,-1, 1> { enum { idx =17 }; }; + template<> struct CurvPt< 0, 1, 1> { enum { idx =18 }; }; + +} + +template +class CurvatureStencil: public BaseStencil, 19, GridT> +{ + using SelfT = CurvatureStencil; + using BaseType = BaseStencil; +public: + using GridType = GridT; + using TreeType = typename GridT::TreeType; + using 
ValueType = typename GridT::ValueType; + + static constexpr int SIZE = 19; + + __hostdev__ CurvatureStencil(const GridType& grid) + : BaseType(grid) + , mInv2Dx(ValueType(0.5 / grid.voxelSize()[0])) + , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx)) + { + } + + __hostdev__ CurvatureStencil(const GridType& grid, double dx) + : BaseType(grid) + , mInv2Dx(ValueType(0.5 / dx)) + , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx)) + { + } + + /// @brief Return the mean curvature at the previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType meanCurvature() const + { + RealT alpha, normGrad; + return this->meanCurvature(alpha, normGrad) ? + ValueType(alpha*mInv2Dx/Pow3(normGrad)) : 0; + } + + /// @brief Return the Gaussian curvature at the previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType gaussianCurvature() const + { + RealT alpha, normGrad; + return this->gaussianCurvature(alpha, normGrad) ? + ValueType(alpha*mInvDx2/Pow4(normGrad)) : 0; + } + + /// @brief Return both the mean and the Gaussian curvature at the + /// previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline void curvatures(ValueType &mean, ValueType& gauss) const + { + RealT alphaM, alphaG, normGrad; + if (this->curvatures(alphaM, alphaG, normGrad)) { + mean = ValueType(alphaM*mInv2Dx/Pow3(normGrad)); + gauss = ValueType(alphaG*mInvDx2/Pow4(normGrad)); + } else { + mean = gauss = 0; + } + } + + /// Return the mean curvature multiplied by the norm of the + /// central-difference gradient. This method is very useful for + /// mean-curvature flow of level sets! + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType meanCurvatureNormGrad() const + { + RealT alpha, normGrad; + return this->meanCurvature(alpha, normGrad) ? + ValueType(alpha*mInvDx2/(2*Pow2(normGrad))) : 0; + } + + /// Return the mean Gaussian multiplied by the norm of the + /// central-difference gradient. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType gaussianCurvatureNormGrad() const + { + RealT alpha, normGrad; + return this->gaussianCurvature(alpha, normGrad) ? + ValueType(2*alpha*mInv2Dx*mInvDx2/Pow3(normGrad)) : 0; + } + + /// @brief Return both the mean and the Gaussian curvature at the + /// previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline void curvaturesNormGrad(ValueType &mean, ValueType& gauss) const + { + RealT alphaM, alphaG, normGrad; + if (this->curvatures(alphaM, alphaG, normGrad)) { + mean = ValueType(alphaM*mInvDx2/(2*Pow2(normGrad))); + gauss = ValueType(2*alphaG*mInv2Dx*mInvDx2/Pow3(normGrad)); + } else { + mean = gauss = 0; + } + } + + /// @brief Computes the minimum and maximum principal curvature at the + /// previously buffered location. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). 
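The curvature accessors above (and principalCurvatures(), whose implementation follows) are typically used together on a signed-distance field; note that the principal curvatures relate to the mean and Gaussian curvature as kappa = H -/+ sqrt(H*H - K), which is exactly what principalCurvatures() evaluates. A hedged sketch with the same grid assumptions as the earlier examples:

\code
// Sketch: curvature queries on a level set with CurvatureStencil (not part of the header).
nanovdb::math::CurvatureStencil<nanovdb::FloatGrid> curv(*grid);
curv.moveTo(nanovdb::Coord(10, 20, 30));
const float H = curv.meanCurvature();              // mean curvature
const float K = curv.gaussianCurvature();          // Gaussian curvature
float kMin, kMax;
curv.principalCurvatures(kMin, kMax);              // kMin/kMax = H -/+ sqrt(H*H - K)
const float speed = curv.meanCurvatureNormGrad();  // H * |grad(phi)|, the usual mean-curvature-flow speed
\endcode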
+ __hostdev__ inline void principalCurvatures(ValueType &min, ValueType &max) const + { + min = max = 0; + RealT alphaM, alphaG, normGrad; + if (this->curvatures(alphaM, alphaG, normGrad)) { + const RealT mean = alphaM*mInv2Dx/Pow3(normGrad); + const RealT tmp = Sqrt(mean*mean - alphaG*mInvDx2/Pow4(normGrad)); + min = ValueType(mean - tmp); + max = ValueType(mean + tmp); + } + } + + /// Return the Laplacian computed at the previously buffered + /// location by second-order central differencing. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline ValueType laplacian() const + { + return mInvDx2 * ( + mValues[1] + mValues[2] + + mValues[3] + mValues[4] + + mValues[5] + mValues[6] - 6*mValues[0]); + } + + /// Return the gradient computed at the previously buffered + /// location by second-order central differencing. + /// + /// @note This method should not be called until the stencil + /// buffer has been populated via a call to moveTo(ijk). + __hostdev__ inline Vec3 gradient() const + { + return Vec3( + mValues[2] - mValues[1], + mValues[4] - mValues[3], + mValues[6] - mValues[5])*mInv2Dx; + } + + /// Return linear offset for the specified stencil point relative to its center + template + __hostdev__ unsigned int pos() const { return CurvPt::idx; } + +private: + __hostdev__ inline void init(const Coord &ijk) + { + mValues[ 1] = mAcc.getValue(ijk.offsetBy(-1, 0, 0)); + mValues[ 2] = mAcc.getValue(ijk.offsetBy( 1, 0, 0)); + + mValues[ 3] = mAcc.getValue(ijk.offsetBy( 0, -1, 0)); + mValues[ 4] = mAcc.getValue(ijk.offsetBy( 0, 1, 0)); + + mValues[ 5] = mAcc.getValue(ijk.offsetBy( 0, 0, -1)); + mValues[ 6] = mAcc.getValue(ijk.offsetBy( 0, 0, 1)); + + mValues[ 7] = mAcc.getValue(ijk.offsetBy(-1, -1, 0)); + mValues[ 8] = mAcc.getValue(ijk.offsetBy( 1, -1, 0)); + mValues[ 9] = mAcc.getValue(ijk.offsetBy(-1, 1, 0)); + mValues[10] = mAcc.getValue(ijk.offsetBy( 1, 1, 0)); + + mValues[11] = mAcc.getValue(ijk.offsetBy(-1, 0, -1)); + mValues[12] = mAcc.getValue(ijk.offsetBy( 1, 0, -1)); + mValues[13] = mAcc.getValue(ijk.offsetBy(-1, 0, 1)); + mValues[14] = mAcc.getValue(ijk.offsetBy( 1, 0, 1)); + + mValues[15] = mAcc.getValue(ijk.offsetBy( 0, -1, -1)); + mValues[16] = mAcc.getValue(ijk.offsetBy( 0, 1, -1)); + mValues[17] = mAcc.getValue(ijk.offsetBy( 0, -1, 1)); + mValues[18] = mAcc.getValue(ijk.offsetBy( 0, 1, 1)); + } + + __hostdev__ inline RealT Dx() const { return 0.5*(mValues[2] - mValues[1]); }// * 1/dx + __hostdev__ inline RealT Dy() const { return 0.5*(mValues[4] - mValues[3]); }// * 1/dx + __hostdev__ inline RealT Dz() const { return 0.5*(mValues[6] - mValues[5]); }// * 1/dx + __hostdev__ inline RealT Dxx() const { return mValues[2] - 2 * mValues[0] + mValues[1]; }// * 1/dx2 + __hostdev__ inline RealT Dyy() const { return mValues[4] - 2 * mValues[0] + mValues[3]; }// * 1/dx2} + __hostdev__ inline RealT Dzz() const { return mValues[6] - 2 * mValues[0] + mValues[5]; }// * 1/dx2 + __hostdev__ inline RealT Dxy() const { return 0.25 * (mValues[10] - mValues[ 8] + mValues[ 7] - mValues[ 9]); }// * 1/dx2 + __hostdev__ inline RealT Dxz() const { return 0.25 * (mValues[14] - mValues[12] + mValues[11] - mValues[13]); }// * 1/dx2 + __hostdev__ inline RealT Dyz() const { return 0.25 * (mValues[18] - mValues[16] + mValues[15] - mValues[17]); }// * 1/dx2 + + __hostdev__ inline bool meanCurvature(RealT& alpha, RealT& normGrad) const + { + // For performance all finite differences are unscaled wrt dx + const RealT Dx 
= this->Dx(), Dy = this->Dy(), Dz = this->Dz(), + Dx2 = Dx*Dx, Dy2 = Dy*Dy, Dz2 = Dz*Dz, normGrad2 = Dx2 + Dy2 + Dz2; + if (normGrad2 <= Tolerance::value()) { + alpha = normGrad = 0; + return false; + } + const RealT Dxx = this->Dxx(), Dyy = this->Dyy(), Dzz = this->Dzz(); + alpha = Dx2*(Dyy + Dzz) + Dy2*(Dxx + Dzz) + Dz2*(Dxx + Dyy) - + 2*(Dx*(Dy*this->Dxy() + Dz*this->Dxz()) + Dy*Dz*this->Dyz());// * 1/dx^4 + normGrad = Sqrt(normGrad2); // * 1/dx + return true; + } + + __hostdev__ inline bool gaussianCurvature(RealT& alpha, RealT& normGrad) const + { + // For performance all finite differences are unscaled wrt dx + const RealT Dx = this->Dx(), Dy = this->Dy(), Dz = this->Dz(), + Dx2 = Dx*Dx, Dy2 = Dy*Dy, Dz2 = Dz*Dz, normGrad2 = Dx2 + Dy2 + Dz2; + if (normGrad2 <= Tolerance::value()) { + alpha = normGrad = 0; + return false; + } + const RealT Dxx = this->Dxx(), Dyy = this->Dyy(), Dzz = this->Dzz(), + Dxy = this->Dxy(), Dxz = this->Dxz(), Dyz = this->Dyz(); + alpha = Dx2*(Dyy*Dzz - Dyz*Dyz) + Dy2*(Dxx*Dzz - Dxz*Dxz) + Dz2*(Dxx*Dyy - Dxy*Dxy) + + 2*( Dy*Dz*(Dxy*Dxz - Dyz*Dxx) + Dx*Dz*(Dxy*Dyz - Dxz*Dyy) + Dx*Dy*(Dxz*Dyz - Dxy*Dzz) );// * 1/dx^6 + normGrad = Sqrt(normGrad2); // * 1/dx + return true; + } + + __hostdev__ inline bool curvatures(RealT& alphaM, RealT& alphaG, RealT& normGrad) const + { + // For performance all finite differences are unscaled wrt dx + const RealT Dx = this->Dx(), Dy = this->Dy(), Dz = this->Dz(), + Dx2 = Dx*Dx, Dy2 = Dy*Dy, Dz2 = Dz*Dz, normGrad2 = Dx2 + Dy2 + Dz2; + if (normGrad2 <= Tolerance::value()) { + alphaM = alphaG =normGrad = 0; + return false; + } + const RealT Dxx = this->Dxx(), Dyy = this->Dyy(), Dzz = this->Dzz(), + Dxy = this->Dxy(), Dxz = this->Dxz(), Dyz = this->Dyz(); + alphaM = Dx2*(Dyy + Dzz) + Dy2*(Dxx + Dzz) + Dz2*(Dxx + Dyy) - + 2*(Dx*(Dy*Dxy + Dz*Dxz) + Dy*Dz*Dyz);// *1/dx^4 + alphaG = Dx2*(Dyy*Dzz - Dyz*Dyz) + Dy2*(Dxx*Dzz - Dxz*Dxz) + Dz2*(Dxx*Dyy - Dxy*Dxy) + + 2*( Dy*Dz*(Dxy*Dxz - Dyz*Dxx) + Dx*Dz*(Dxy*Dyz - Dxz*Dyy) + Dx*Dy*(Dxz*Dyz - Dxy*Dzz) );// *1/dx^6 + normGrad = Sqrt(normGrad2); // * 1/dx + return true; + } + + template friend class BaseStencil; // allow base class to call init() + using BaseType::mAcc; + using BaseType::mValues; + const ValueType mInv2Dx, mInvDx2; +}; // CurvatureStencil class + +}// namespace math + +} // end nanovdb namespace + +#endif // NANOVDB_MATH_STENCILS_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/tools/CreateNanoGrid.h b/external/nanovdb/tools/CreateNanoGrid.h new file mode 100644 index 00000000..6f1ce040 --- /dev/null +++ b/external/nanovdb/tools/CreateNanoGrid.h @@ -0,0 +1,2073 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/CreateNanoGrid.h + + \author Ken Museth + + \date June 26, 2020 + + \note In the examples below we assume that @c srcGrid is a exiting grid of type + SrcGridT = @c openvdb::FloatGrid, @c openvdb::FloatGrid or @c nanovdb::tools::build::FloatGrid. + + \brief Convert any grid to a nanovdb grid of the same type, e.g. float->float + \code + auto handle = nanovdb::tools::createNanoGrid(srcGrid); + auto *dstGrid = handle.grid(); + \endcode + + \brief Convert a grid to a nanovdb grid of a different type, e.g. 
float->half + \code + auto handle = nanovdb::tools::createNanoGrid(srcGrid); + auto *dstGrid = handle.grid(); + \endcode + + \brief Convert a grid to a nanovdb grid of the same type but using a CUDA buffer + \code + auto handle = nanovdb::tools::createNanoGrid(srcGrid); + auto *dstGrid = handle.grid(); + \endcode + + \brief Create a nanovdb grid that indices values in an existing source grid of any type. + If DstBuildT = nanovdb::ValueIndex both active and in-active values are indexed + and if DstBuildT = nanovdb::ValueOnIndex only active values are indexed. + \code + using DstBuildT = nanovdb::ValueIndex;// index both active an inactive values + auto handle = nanovdb::tools::createNanoGridSrcGridT,DstBuildT>(srcGrid,0,false,false);//no blind data, tile values or stats + auto *dstGrid = handle.grid(); + \endcode + + \brief Create a NanoVDB grid from scratch + \code +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) + using SrcGridT = openvdb::FloatGrid; +#else + using SrcGridT = nanovdb::tools::build::FloatGrid; +#endif + SrcGridT srcGrid(0.0f);// create an empty source grid + auto srcAcc = srcGrid.getAccessor();// create an accessor + srcAcc.setValue(nanovdb::Coord(1,2,3), 1.0f);// set a voxel value + + auto handle = nanovdb::tools::createNanoGrid(srcGrid);// convert source grid to a grid handle + auto dstGrid = handle.grid();// get a pointer to the destination grid + \endcode + + \brief Convert a base-pointer to an openvdb grid, denoted srcGrid, to a nanovdb + grid of the same type, e.g. float -> float or openvdb::Vec3f -> nanovdb::Vec3f + \code + auto handle = nanovdb::openToNanoVDB(*srcGrid);// convert source grid to a grid handle + auto dstGrid = handle.grid();// get a pointer to the destination grid + \endcode + + \brief Converts any existing grid to a NanoVDB grid, for example: + nanovdb::tools::build::Grid -> nanovdb::Grid + nanovdb::Grid -> nanovdb::Grid + nanovdb::Grid -> nanovdb::Grid + openvdb::Grid -> nanovdb::Grid + openvdb::Grid -> nanovdb::Grid + openvdb::Grid -> nanovdb::Grid + openvdb::Grid -> nanovdb::Grid + + \note This files replaces GridBuilder.h, IndexGridBuilder.h and OpenToNanoVDB.h +*/ + +#ifndef NANOVDB_TOOLS_CREATENANOGRID_H_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_CREATENANOGRID_H_HAS_BEEN_INCLUDED + +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) +#include +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // for nanovdb::math::DitherLUT + +#include +#include +#include +#include // for memcpy +#include + +namespace nanovdb {// ============================================================================ + +namespace tools {// ============================================================================== + +// Forward declarations (defined below) +template class CreateNanoGrid; +class AbsDiff; +template struct MapToNano; + +//================================================================================================ + +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) +/// @brief Forward declaration of free-standing function that converts an OpenVDB GridBase into a NanoVDB GridHandle +/// @tparam BufferT Type of the buffer used to allocate the destination grid +/// @param base Shared pointer to a base openvdb grid to be converted +/// @param sMode Mode for computing statistics of the destination grid +/// @param cMode Mode for computing checksums of the destination grid +/// @param verbose Mode of verbosity +/// @return Handle to the destination 
NanoGrid +template +GridHandle +openToNanoVDB(const openvdb::GridBase::Ptr& base, + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + int verbose = 0); +#endif + +//================================================================================================ + +/// @brief Freestanding function that creates a NanoGrid from any source grid +/// @tparam SrcGridT Type of in input (source) grid, e.g. openvdb::Grid or nanovdb::Grid +/// @tparam DstBuildT Type of values in the output (destination) nanovdb Grid, e.g. float or nanovdb::Fp16 +/// @tparam BufferT Type of the buffer used ti allocate the destination grid +/// @param srcGrid Input (source) grid to be converted +/// @param sMode Mode for computing statistics of the destination grid +/// @param cMode Mode for computing checksums of the destination grid +/// @param verbose Mode of verbosity +/// @param buffer Instance of a buffer used for allocation +/// @return Handle to the destination NanoGrid +template::type, + typename BufferT = HostBuffer> +typename util::disable_if::is_index || BuildTraits::is_Fp, GridHandle>::type +createNanoGrid(const SrcGridT &srcGrid, + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + int verbose = 0, + const BufferT &buffer = BufferT()); + +//================================================================================================ + +/// @brief Freestanding function that creates a NanoGrid or NanoGrid from any source grid +/// @tparam SrcGridT Type of in input (source) grid, e.g. openvdb::Grid or nanovdb::Grid +/// @tparam DstBuildT If ValueIndex all (active and inactive) values are indexed and if +/// it is ValueOnIndex only active values are indexed. +/// @tparam BufferT BufferT Type of the buffer used ti allocate the destination grid +/// @param channels If non-zero the values (active or all) in @c srcGrid are encoded as blind +/// data in the output index grid. @c channels indicates the number of copies +/// of these blind data +/// @param includeStats If true all tree nodes will includes indices for stats, i.e. min/max/avg/std-div +/// @param includeTiles If false on values in leaf nodes are indexed +/// @param verbose Mode of verbosity +/// @param buffer Instance of a buffer used for allocation +/// @return Handle to the destination NanoGrid where T = ValueIndex or ValueOnIndex +template::type, + typename BufferT = HostBuffer> +typename util::enable_if::is_index, GridHandle>::type +createNanoGrid(const SrcGridT &srcGrid, + uint32_t channels = 0u, + bool includeStats = true, + bool includeTiles = true, + int verbose = 0, + const BufferT &buffer = BufferT()); + +//================================================================================================ + +/// @brief Freestanding function to create a NanoGrid from any source grid +/// @tparam SrcGridT Type of in input (source) grid, e.g. openvdb::Grid or nanovdb::Grid +/// @tparam DstBuildT = FpN, i.e. variable bit-width of the output grid +/// @tparam OracleT Type of the oracle used to determine the local bit-width, i.e. 
N in FpN +/// @tparam BufferT Type of the buffer used to allocate the destination grid +/// @param srcGrid Input (source) grid to be converted +/// @param ditherOn switch to enable or disable dithering of quantization error +/// @param sMode Mode for computing statistics of the destination grid +/// @param cMode Mode for computing checksums of the destination grid +/// @param verbose Mode of verbosity +/// @param oracle Instance of a oracle used to determine the local bit-width, i.e. N in FpN +/// @param buffer Instance of a buffer used for allocation +/// @return Handle to the destination NanoGrid +template::type, + typename OracleT = AbsDiff, + typename BufferT = HostBuffer> +typename util::enable_if::value, GridHandle>::type +createNanoGrid(const SrcGridT &srcGrid, + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + bool ditherOn = false, + int verbose = 0, + const OracleT &oracle = OracleT(), + const BufferT &buffer = BufferT()); + +//================================================================================================ + +/// @brief Freestanding function to create a NanoGrid from any source grid, X=4,8,16 +/// @tparam SrcGridT Type of in input (source) grid, e.g. openvdb::Grid or nanovdb::Grid +/// @tparam DstBuildT = Fp4, Fp8 or Fp16, i.e. quantization bit-width of the output grid +/// @tparam BufferT Type of the buffer used to allocate the destination grid +/// @param srcGrid Input (source) grid to be converted +/// @param ditherOn switch to enable or disable dithering of quantization error +/// @param sMode Mode for computing statistics of the destination grid +/// @param cMode Mode for computing checksums of the destination grid +/// @param verbose Mode of verbosity +/// @param buffer Instance of a buffer used for allocation +/// @return Handle to the destination NanoGrid +template::type, + typename BufferT = HostBuffer> +typename util::enable_if::is_FpX, GridHandle>::type +createNanoGrid(const SrcGridT &srcGrid, + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + bool ditherOn = false, + int verbose = 0, + const BufferT &buffer = BufferT()); + +//================================================================================================ + +/// @brief Compression oracle based on absolute difference +class AbsDiff +{ + float mTolerance;// absolute error tolerance +public: + /// @note The default value of -1 means it's un-initialized! + AbsDiff(float tolerance = -1.0f) : mTolerance(tolerance) {} + AbsDiff(const AbsDiff&) = default; + ~AbsDiff() = default; + operator bool() const {return mTolerance>=0.0f;} + void init(nanovdb::GridClass gClass, float background) { + if (gClass == GridClass::LevelSet) { + static const float halfWidth = 3.0f; + mTolerance = 0.1f * background / halfWidth;// range of ls: [-3dx; 3dx] + } else if (gClass == GridClass::FogVolume) { + mTolerance = 0.01f;// range of FOG volumes: [0;1] + } else { + mTolerance = 0.0f; + } + } + void setTolerance(float tolerance) { mTolerance = tolerance; } + float getTolerance() const { return mTolerance; } + /// @brief Return true if the approximate value is within the accepted + /// absolute error bounds of the exact value. 
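The FpN overload above is paired with this AbsDiff oracle (or the RelDiff variant further down): for each leaf node the oracle decides whether a candidate bit-width keeps the quantization error acceptable. A hedged sketch of a variable bit-rate conversion — SrcGridT/srcGrid are the same assumptions as in the examples at the top of this file, and it uses the CreateNanoGrid converter class declared above and defined later in the file rather than the free function, so no statistics/checksum arguments need to be spelled out:

\code
// Sketch: FpN conversion driven by an AbsDiff tolerance (not part of the header).
nanovdb::tools::AbsDiff oracle(0.01f);                 // accept approximations within +/-0.01 of the exact value
nanovdb::tools::CreateNanoGrid<SrcGridT> converter(srcGrid);
converter.enableDithering(true);                       // optional: randomize the quantization error
auto  handle  = converter.getHandle<nanovdb::FpN>(oracle);
auto* dstGrid = handle.grid<nanovdb::FpN>();
\endcode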
+ /// + /// @details Required member method + bool operator()(float exact, float approx) const + { + return math::Abs(exact - approx) <= mTolerance; + } +};// AbsDiff + +inline std::ostream& operator<<(std::ostream& os, const AbsDiff& diff) +{ + os << "Absolute tolerance: " << diff.getTolerance(); + return os; +} + +//================================================================================================ + +/// @brief Compression oracle based on relative difference +class RelDiff +{ + float mTolerance;// relative error tolerance +public: + /// @note The default value of -1 means it's un-initialized! + RelDiff(float tolerance = -1.0f) : mTolerance(tolerance) {} + RelDiff(const RelDiff&) = default; + ~RelDiff() = default; + operator bool() const {return mTolerance>=0.0f;} + void setTolerance(float tolerance) { mTolerance = tolerance; } + float getTolerance() const { return mTolerance; } + /// @brief Return true if the approximate value is within the accepted + /// relative error bounds of the exact value. + /// + /// @details Required member method + bool operator()(float exact, float approx) const + { + return math::Abs(exact - approx)/math::Max(math::Abs(exact), math::Abs(approx)) <= mTolerance; + } +};// RelDiff + +inline std::ostream& operator<<(std::ostream& os, const RelDiff& diff) +{ + os << "Relative tolerance: " << diff.getTolerance(); + return os; +} + +//================================================================================================ + +/// @brief The NodeAccessor provides a uniform API for accessing nodes got NanoVDB, OpenVDB and build Grids +/// +/// @note General implementation that works with nanovdb::tools::build::Grid +template +class NodeAccessor +{ +public: + static constexpr bool IS_OPENVDB = false; + static constexpr bool IS_NANOVDB = false; + using BuildType = typename GridT::BuildType; + using ValueType = typename GridT::ValueType; + using GridType = GridT; + using TreeType = typename GridT::TreeType; + using RootType = typename TreeType::RootNodeType; + template + using NodeType = typename NodeTrait::type; + NodeAccessor(const GridT &grid) : mMgr(const_cast(grid)) {} + const GridType& grid() const {return mMgr.grid();} + const TreeType& tree() const {return mMgr.tree();} + const RootType& root() const {return mMgr.root();} + uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); } + template + const NodeType& node(uint32_t i) const {return mMgr.template node(i); } + const std::string& getName() const {return this->grid().getName();}; + bool hasLongGridName() const {return this->grid().getName().length() >= GridData::MaxNameSize;} + const nanovdb::Map& map() const {return this->grid().map();} + GridClass gridClass() const {return this->grid().gridClass();} +private: + build::NodeManager mMgr; +};// NodeAccessor + +//================================================================================================ + +/// @brief Template specialization for nanovdb::Grid which is special since its NodeManage +/// uses a handle in order to support node access on the GPU! 
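All NodeAccessor specializations (the generic one above, the NanoVDB one that follows, and the OpenVDB ones further down) expose the same small surface — grid(), tree(), root(), nodeCount(level), node<LEVEL>(i), getName(), map(), gridClass() — and that surface is all CreateNanoGrid relies on. Code written against it is therefore source-grid agnostic; printTreeShape below is a hypothetical helper, not part of this header, and requires <iostream>:

\code
// Sketch: a source-grid-agnostic helper written against the NodeAccessor surface.
template<typename SrcGridT>
void printTreeShape(const SrcGridT& srcGrid)
{
    nanovdb::tools::NodeAccessor<SrcGridT> acc(srcGrid);
    std::cout << acc.getName()    << ": "
              << acc.nodeCount(2) << " upper, "
              << acc.nodeCount(1) << " lower, "
              << acc.nodeCount(0) << " leaf nodes\n";
}
\endcode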
+template +class NodeAccessor< NanoGrid > +{ +public: + static constexpr bool IS_OPENVDB = false; + static constexpr bool IS_NANOVDB = true; + using BuildType = BuildT; + using BufferType = HostBuffer; + using GridType = NanoGrid; + using ValueType = typename GridType::ValueType; + using TreeType = typename GridType::TreeType; + using RootType = typename TreeType::RootType; + template + using NodeType = typename NodeTrait::type; + NodeAccessor(const GridType &grid) + : mHandle(createNodeManager(grid)) + , mMgr(*(mHandle.template mgr())) {} + const GridType& grid() const {return mMgr.grid();} + const TreeType& tree() const {return mMgr.tree();} + const RootType& root() const {return mMgr.root();} + uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); } + template + const NodeType& node(uint32_t i) const {return mMgr.template node(i); } + std::string getName() const {return std::string(this->grid().gridName());}; + bool hasLongGridName() const {return this->grid().hasLongGridName();} + const nanovdb::Map& map() const {return this->grid().map();} + GridClass gridClass() const {return this->grid().gridClass();} +private: + NodeManagerHandle mHandle; + const NodeManager &mMgr; +};// NodeAccessor + +//================================================================================================ + +/// @brief Trait that maps any type to the corresponding nanovdb type +/// @tparam T Type to be mapped +template +struct MapToNano { using type = T; }; + +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) + +template<> +struct MapToNano {using type = nanovdb::ValueMask;}; +template +struct MapToNano>{using type = nanovdb::math::Vec3;}; +template +struct MapToNano>{using type = nanovdb::math::Vec4;}; +template<> +struct MapToNano {using type = uint32_t;}; +template<> +struct MapToNano {using type = uint32_t;}; + +/// Templated Grid with default 32->16->8 configuration +template +using OpenLeaf = openvdb::tree::LeafNode; +template +using OpenLower = openvdb::tree::InternalNode,4>; +template +using OpenUpper = openvdb::tree::InternalNode,5>; +template +using OpenRoot = openvdb::tree::RootNode>; +template +using OpenTree = openvdb::tree::Tree>; +template +using OpenGrid = openvdb::Grid>; + +//================================================================================================ + +/// @brief Template specialization for openvdb::Grid +template +class NodeAccessor> +{ +public: + static constexpr bool IS_OPENVDB = true; + static constexpr bool IS_NANOVDB = false; + using BuildType = BuildT; + using GridType = OpenGrid; + using ValueType = typename GridType::ValueType; + using TreeType = OpenTree; + using RootType = OpenRoot; + template + using NodeType = typename NodeTrait::type; + NodeAccessor(const GridType &grid) : mMgr(const_cast(grid)) { + const auto mat4 = this->grid().transform().baseMap()->getAffineMap()->getMat4(); + mMap.set(mat4, mat4.inverse()); + } + const GridType& grid() const {return mMgr.grid();} + const TreeType& tree() const {return mMgr.tree();} + const RootType& root() const {return mMgr.root();} + uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); } + template + const NodeType& node(uint32_t i) const {return mMgr.template node(i); } + std::string getName() const { return this->grid().getName(); }; + bool hasLongGridName() const {return this->grid().getName().length() >= GridData::MaxNameSize;} + const nanovdb::Map& map() const {return mMap;} + GridClass gridClass() const { + switch (this->grid().getGridClass()) { + case 
openvdb::GRID_LEVEL_SET: + if (!util::is_floating_point::value) OPENVDB_THROW(openvdb::ValueError, "processGrid: Level sets are expected to be floating point types"); + return GridClass::LevelSet; + case openvdb::GRID_FOG_VOLUME: + return GridClass::FogVolume; + case openvdb::GRID_STAGGERED: + return GridClass::Staggered; + default: + return GridClass::Unknown; + } + } +private: + build::NodeManager mMgr; + nanovdb::Map mMap; +};// NodeAccessor> + +//================================================================================================ + +/// @brief Template specialization for openvdb::tools::PointIndexGrid +template <> +class NodeAccessor +{ +public: + static constexpr bool IS_OPENVDB = true; + static constexpr bool IS_NANOVDB = false; + using BuildType = openvdb::PointIndex32; + using GridType = openvdb::tools::PointIndexGrid; + using TreeType = openvdb::tools::PointIndexTree; + using RootType = typename TreeType::RootNodeType; + using ValueType = typename GridType::ValueType; + template + using NodeType = typename NodeTrait::type; + NodeAccessor(const GridType &grid) : mMgr(const_cast(grid)) { + const auto mat4 = this->grid().transform().baseMap()->getAffineMap()->getMat4(); + mMap.set(mat4, mat4.inverse()); + } + const GridType& grid() const {return mMgr.grid();} + const TreeType& tree() const {return mMgr.tree();} + const RootType& root() const {return mMgr.root();} + uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); } + template + const NodeType& node(uint32_t i) const {return mMgr.template node(i); } + std::string getName() const { return this->grid().getName(); }; + bool hasLongGridName() const {return this->grid().getName().length() >= GridData::MaxNameSize;} + const nanovdb::Map& map() const {return mMap;} + GridClass gridClass() const {return GridClass::PointIndex;} +private: + build::NodeManager mMgr; + nanovdb::Map mMap; +};// NodeAccessor + +//================================================================================================ + +// @brief Template specialization for openvdb::points::PointDataGrid +template <> +class NodeAccessor +{ +public: + static constexpr bool IS_OPENVDB = true; + static constexpr bool IS_NANOVDB = false; + using BuildType = openvdb::PointDataIndex32; + using GridType = openvdb::points::PointDataGrid; + using TreeType = openvdb::points::PointDataTree; + using RootType = typename TreeType::RootNodeType; + using ValueType = typename GridType::ValueType; + template + using NodeType = typename NodeTrait::type; + NodeAccessor(const GridType &grid) : mMgr(const_cast(grid)) { + const auto mat4 = this->grid().transform().baseMap()->getAffineMap()->getMat4(); + mMap.set(mat4, mat4.inverse()); + } + const GridType& grid() const {return mMgr.grid();} + const TreeType& tree() const {return mMgr.tree();} + const RootType& root() const {return mMgr.root();} + uint64_t nodeCount(int level) const { return mMgr.nodeCount(level); } + template + const NodeType& node(uint32_t i) const {return mMgr.template node(i); } + std::string getName() const { return this->grid().getName(); }; + bool hasLongGridName() const {return this->grid().getName().length() >= GridData::MaxNameSize;} + const nanovdb::Map& map() const {return mMap;} + GridClass gridClass() const {return GridClass::PointData;} +private: + build::NodeManager mMgr; + nanovdb::Map mMap; +};// NodeAccessor + +#endif + +//================================================================================================ + +/// @brief Creates any nanovdb Grid from any source grid 
(certain combinations are obviously not allowed) +template +class CreateNanoGrid +{ +public: + // SrcGridT can be either openvdb::Grid, nanovdb::Grid or nanovdb::tools::build::Grid + using SrcNodeAccT = NodeAccessor; + using SrcBuildT = typename SrcNodeAccT::BuildType; + using SrcValueT = typename SrcNodeAccT::ValueType; + using SrcTreeT = typename SrcNodeAccT::TreeType; + using SrcRootT = typename SrcNodeAccT::RootType; + template + using SrcNodeT = typename NodeTrait::type; + + /// @brief Constructor from a source grid + /// @param srcGrid Source grid of type SrcGridT + CreateNanoGrid(const SrcGridT &srcGrid); + + /// @brief Constructor from a source node accessor (defined above) + /// @param srcNodeAcc Source node accessor of type SrcNodeAccT + CreateNanoGrid(const SrcNodeAccT &srcNodeAcc); + + /// @brief Set the level of verbosity + /// @param mode level of verbosity, mode=0 means quiet + void setVerbose(int mode = 1) { mVerbose = mode; } + + /// @brief Enable or disable dithering, i.e. randomization of the quantization error. + /// @param on enable or disable dithering + /// @warning Dithering only has an affect when DstBuildT = {Fp4, Fp8, Fp16, FpN} + void enableDithering(bool on = true) { mDitherOn = on; } + + /// @brief Set the mode used for computing statistics of the destination grid + /// @param mode specify the mode of statistics + void setStats(StatsMode mode = StatsMode::Default) { mStats = mode; } + + /// @brief Set the mode used for computing checksums of the destination grid + /// @param mode specify the mode of checksum + void setChecksum(CheckMode mode = CheckMode::Default) { mChecksum = mode; } + + /// @brief Converts the source grid into a nanovdb grid with the specified destination build type + /// @tparam DstBuildT build type of the destination, output, grid + /// @tparam BufferT Type of the buffer used for allocating the destination grid + /// @param buffer instance of the buffer use for allocation + /// @return Return an instance of a GridHandle (invoking move semantics) + /// @note This version is when DstBuildT != {FpN, ValueIndex, ValueOnIndex} + template::type, typename BufferT = HostBuffer> + typename util::disable_if::value || + BuildTraits::is_index, GridHandle>::type + getHandle(const BufferT &buffer = BufferT()); + + /// @brief Converts the source grid into a nanovdb grid with variable bit quantization + /// @tparam DstBuildT FpN, i.e. the destination grid uses variable bit quantization + /// @tparam OracleT Type of oracle used to determine the N in FpN + /// @tparam BufferT Type of the buffer used for allocating the destination grid + /// @param oracle Instance of the oracle used to determine the N in FpN + /// @param buffer instance of the buffer use for allocation + /// @return Return an instance of a GridHandle (invoking move semantics) + /// @note This version assumes DstBuildT == FpN + template::type, typename OracleT = AbsDiff, typename BufferT = HostBuffer> + typename util::enable_if::value, GridHandle>::type + getHandle(const OracleT &oracle = OracleT(), + const BufferT &buffer = BufferT()); + + /// @brief Converts the source grid into a nanovdb grid with indices to external arrays of values + /// @tparam DstBuildT ValueIndex or ValueOnIndex, i.e. 
index all or just active values + /// @tparam BufferT Type of the buffer used for allocating the destination grid + /// @param channels Number of copies of values encoded as blind data in the destination grid + /// @param includeStats Specify if statics should be indexed + /// @param includeTiles Specify if tile values, i.e. non-leaf-node-values, should be indexed + /// @param buffer instance of the buffer use for allocation + /// @return Return an instance of a GridHandle (invoking move semantics) + template::type, typename BufferT = HostBuffer> + typename util::enable_if::is_index, GridHandle>::type + getHandle(uint32_t channels = 0u, + bool includeStats = true, + bool includeTiles = true, + const BufferT &buffer = BufferT()); + + /// @brief Add blind data to the destination grid + /// @param name String name of the blind data + /// @param dataSemantic Semantics of the blind data + /// @param dataClass Class of the blind data + /// @param dataType Type of the blind data + /// @param count Element count of the blind data + /// @param size Size of each element of the blind data + /// @return Return the index used to access the blind data + uint64_t addBlindData(const std::string& name, + GridBlindDataSemantic dataSemantic, + GridBlindDataClass dataClass, + GridType dataType, + size_t count, size_t size) + { + const size_t order = mBlindMetaData.size(); + mBlindMetaData.emplace(name, dataSemantic, dataClass, dataType, order, count, size); + return order; + } + + /// @brief This method only has affect when getHandle was called with DstBuildT = ValueIndex or ValueOnIndex + /// @return Return the number of indexed values. If called before getHandle was called with + /// DstBuildT = ValueIndex or ValueOnIndex the return value is zero. Else it is a value larger than zero. + uint64_t valueCount() const {return mValIdx[0].empty() ? 0u : mValIdx[0].back();} + + /// @brief Copy values from the source grid into a provided buffer + /// @tparam DstBuildT Must be ValueIndex or ValueOnIndex, i.e. 
a index grid + /// @param buffer point in which to write values + template + typename util::enable_if::is_index>::type + copyValues(SrcValueT *buffer); + +private: + + // ========================================================= + + template + typename util::enable_if::value&&LEVEL==0), typename NodeTrait, LEVEL>::type*>::type + dstNode(uint64_t i) const { + static_assert(LEVEL==0 || LEVEL==1 || LEVEL==2, "Expected LEVEL== {0,1,2}"); + using NodeT = typename NodeTrait, LEVEL>::type; + return util::PtrAdd(mBufferPtr, mOffset[5-LEVEL]) + i; + } + template + typename util::enable_if::value && LEVEL==0, NanoLeaf*>::type + dstNode(uint64_t i) const {return util::PtrAdd>(mBufferPtr, mCodec[i].offset);} + + template NanoRoot* dstRoot() const {return util::PtrAdd>(mBufferPtr, mOffset.root);} + template NanoTree* dstTree() const {return util::PtrAdd>(mBufferPtr, mOffset.tree);} + template NanoGrid* dstGrid() const {return util::PtrAdd>(mBufferPtr, mOffset.grid);} + GridBlindMetaData* dstMeta(uint32_t i) const { return util::PtrAdd(mBufferPtr, mOffset.meta) + i;}; + + // ========================================================= + + template + typename util::disable_if::value || BuildTraits::is_index>::type + preProcess(); + + template + typename util::enable_if::is_index>::type + preProcess(uint32_t channels); + + template + typename util::enable_if::value>::type + preProcess(OracleT oracle); + + // ========================================================= + + // Below are private methods use to serialize nodes into NanoVDB + template + GridHandle initHandle(const BufferT& buffer); + + // ========================================================= + + template + inline typename util::enable_if::is_index>::type + postProcess(uint32_t channels); + + template + inline typename util::disable_if::is_index>::type + postProcess(); + + // ======================================================== + + template + typename util::disable_if::is_special>::type + processLeafs(); + + template + typename util::enable_if::is_index>::type + processLeafs(); + + template + typename util::enable_if::is_FpX>::type + processLeafs(); + + template + typename util::enable_if::value>::type + processLeafs(); + + template + typename util::enable_if::value>::type + processLeafs(); + + template + typename util::enable_if::value>::type + processLeafs(); + + // ========================================================= + + template + typename util::enable_if::is_index>::type + processInternalNodes(); + + template + typename util::enable_if::is_index>::type + processInternalNodes(); + + // ========================================================= + + template + typename util::enable_if::is_index>::type + processRoot(); + + template + typename util::enable_if::is_index>::type + processRoot(); + + // ========================================================= + + template + void processTree(); + + template + void processGrid(); + + template + typename util::enable_if::is_index, uint64_t>::type + countTileValues(uint64_t valueCount); + + template + typename util::enable_if::is_index, uint64_t>::type + countValues(); + +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) + template + typename util::disable_if::value || + util::is_same::value, uint64_t>::type + countPoints() const; + + template + typename util::enable_if::value || + util::is_same::value, uint64_t>::type + countPoints() const; + + template + typename util::enable_if::value>::type + copyPointAttribute(size_t attIdx, AttT *attPtr); +#else + uint64_t countPoints() const 
{return 0u;} +#endif + + void* mBufferPtr;// pointer to the beginning of the destination nanovdb grid buffer + struct BufferOffsets { + uint64_t grid, tree, root, upper, lower, leaf, meta, blind, size; + uint64_t operator[](int i) const { return *(reinterpret_cast(this)+i); } + } mOffset; + int mVerbose; + uint64_t mLeafNodeSize;// non-trivial when DstBuiltT = FpN + + std::unique_ptr mSrcNodeAccPtr;// placeholder for potential local instance + const SrcNodeAccT &mSrcNodeAcc; + struct BlindMetaData; // forward declaration + std::set mBlindMetaData; // sorted according to BlindMetaData.order + struct Codec { float min, max; uint64_t offset; uint8_t log2; };// used for adaptive bit-rate quantization + std::unique_ptr mCodec;// defines a codec per leaf node when DstBuildT = FpN + StatsMode mStats; + CheckMode mChecksum; + bool mDitherOn, mIncludeStats, mIncludeTiles; + std::vector mValIdx[3];// store id of first value in node +}; // CreateNanoGrid + +//================================================================================================ + +template +CreateNanoGrid::CreateNanoGrid(const SrcGridT &srcGrid) + : mVerbose(0) + , mSrcNodeAccPtr(new SrcNodeAccT(srcGrid)) + , mSrcNodeAcc(*mSrcNodeAccPtr) + , mStats(StatsMode::Default) + , mChecksum(CheckMode::Default) + , mDitherOn(false) + , mIncludeStats(true) + , mIncludeTiles(true) +{ +} + +//================================================================================================ + +template +CreateNanoGrid::CreateNanoGrid(const SrcNodeAccT &srcNodeAcc) + : mVerbose(0) + , mSrcNodeAccPtr(nullptr) + , mSrcNodeAcc(srcNodeAcc) + , mStats(StatsMode::Default) + , mChecksum(CheckMode::Default) + , mDitherOn(false) + , mIncludeStats(true) + , mIncludeTiles(true) +{ +} + +//================================================================================================ + +template +struct CreateNanoGrid::BlindMetaData +{ + BlindMetaData(const std::string& name,// name + used to derive GridBlindDataSemantic + const std::string& type,// used to derive GridType of blind data + GridBlindDataClass dataClass, + size_t i, size_t valueCount, size_t valueSize) + : metaData(reinterpret_cast(new char[sizeof(GridBlindMetaData)])) + , order(i)// sorted id of meta data + , size(math::AlignUp(valueCount * valueSize)) + { + util::memzero(metaData, sizeof(GridBlindMetaData));// zero out all meta data + if (name.length()>=GridData::MaxNameSize) throw std::runtime_error("blind data name exceeds limit"); + std::memcpy(metaData->mName, name.c_str(), name.length() + 1); + metaData->mValueCount = valueCount; + metaData->mSemantic = BlindMetaData::mapToSemantics(name); + metaData->mDataClass = dataClass; + metaData->mDataType = BlindMetaData::mapToType(type); + metaData->mValueSize = valueSize; + NANOVDB_ASSERT(metaData->isValid()); + } + BlindMetaData(const std::string& name,// only name + GridBlindDataSemantic dataSemantic, + GridBlindDataClass dataClass, + GridType dataType, + size_t i, size_t valueCount, size_t valueSize) + : metaData(reinterpret_cast(new char[sizeof(GridBlindMetaData)])) + , order(i)// sorted id of meta data + , size(math::AlignUp(valueCount * valueSize)) + { + std::memset(metaData, 0, sizeof(GridBlindMetaData));// zero out all meta data + if (name.length()>=GridData::MaxNameSize) throw std::runtime_error("blind data name exceeds character limit"); + std::memcpy(metaData->mName, name.c_str(), name.length() + 1); + metaData->mValueCount = valueCount; + metaData->mSemantic = dataSemantic; + metaData->mDataClass = dataClass; + 
metaData->mDataType = dataType; + metaData->mValueSize = valueSize; + NANOVDB_ASSERT(metaData->isValid()); + } + ~BlindMetaData(){ delete [] reinterpret_cast(metaData); } + bool operator<(const BlindMetaData& other) const { return order < other.order; } // required by std::set + static GridType mapToType(const std::string& name) + { + GridType type = GridType::Unknown; + if ("uint32_t" == name) { + type = GridType::UInt32; + } else if ("float" == name) { + type = GridType::Float; + } else if ("vec3s"== name) { + type = GridType::Vec3f; + } else if ("int32" == name) { + type = GridType::Int32; + } else if ("int64" == name) { + type = GridType::Int64; + } + return type; + } + static GridBlindDataSemantic mapToSemantics(const std::string& name) + { + GridBlindDataSemantic semantic = GridBlindDataSemantic::Unknown; + if ("P" == name) { + semantic = GridBlindDataSemantic::PointPosition; + } else if ("V" == name) { + semantic = GridBlindDataSemantic::PointVelocity; + } else if ("Cd" == name) { + semantic = GridBlindDataSemantic::PointColor; + } else if ("N" == name) { + semantic = GridBlindDataSemantic::PointNormal; + } else if ("id" == name) { + semantic = GridBlindDataSemantic::PointId; + } + return semantic; + } + GridBlindMetaData *metaData; + const size_t order, size; +}; // CreateNanoGrid::BlindMetaData + +//================================================================================================ + +template +template +typename util::disable_if::value || + BuildTraits::is_index, GridHandle>::type +CreateNanoGrid::getHandle(const BufferT& pool) +{ + this->template preProcess(); + auto handle = this->template initHandle(pool); + this->template postProcess(); + return handle; +} // CreateNanoGrid::getHandle + +//================================================================================================ + +template +template +typename util::enable_if::value, GridHandle>::type +CreateNanoGrid::getHandle(const OracleT& oracle, const BufferT& pool) +{ + this->template preProcess(oracle); + auto handle = this->template initHandle(pool); + this->template postProcess(); + return handle; +} // CreateNanoGrid::getHandle + +//================================================================================================ + +template +template +typename util::enable_if::is_index, GridHandle>::type +CreateNanoGrid::getHandle(uint32_t channels, + bool includeStats, + bool includeTiles, + const BufferT &pool) +{ + mIncludeStats = includeStats; + mIncludeTiles = includeTiles; + this->template preProcess(channels); + auto handle = this->template initHandle(pool); + this->template postProcess(channels); + return handle; +}// CreateNanoGrid::getHandle + +//================================================================================================ + +template +template +GridHandle CreateNanoGrid::initHandle(const BufferT& pool) +{ + mOffset.grid = 0;// grid is always stored at the start of the buffer! 
+ mOffset.tree = NanoGrid::memUsage(); // grid ends and tree begins + mOffset.root = mOffset.tree + NanoTree::memUsage(); // tree ends and root node begins + mOffset.upper = mOffset.root + NanoRoot::memUsage(mSrcNodeAcc.root().getTableSize()); // root node ends and upper internal nodes begin + mOffset.lower = mOffset.upper + NanoUpper::memUsage()*mSrcNodeAcc.nodeCount(2); // upper internal nodes ends and lower internal nodes begin + mOffset.leaf = mOffset.lower + NanoLower::memUsage()*mSrcNodeAcc.nodeCount(1); // lower internal nodes ends and leaf nodes begin + mOffset.meta = mOffset.leaf + mLeafNodeSize;// leaf nodes end and blind meta data begins + mOffset.blind = mOffset.meta + sizeof(GridBlindMetaData)*mBlindMetaData.size(); // meta data ends and blind data begins + mOffset.size = mOffset.blind;// end of buffer + for (const auto& b : mBlindMetaData) mOffset.size += b.size; // accumulate all the blind data + + auto buffer = BufferT::create(mOffset.size, &pool); + mBufferPtr = buffer.data(); + + // Concurrent processing of all tree levels! + util::invoke( [&](){this->template processLeafs();}, + [&](){this->template processInternalNodes();}, + [&](){this->template processInternalNodes();}, + [&](){this->template processRoot();}, + [&](){this->template processTree();}, + [&](){this->template processGrid();} ); + + return GridHandle(std::move(buffer)); +} // CreateNanoGrid::initHandle + +//================================================================================================ + +template +template +inline typename util::disable_if::value || BuildTraits::is_index>::type +CreateNanoGrid::preProcess() +{ + if (const uint64_t pointCount = this->countPoints()) { +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) + if constexpr(util::is_same::value) { + if (!mBlindMetaData.empty()) throw std::runtime_error("expected no blind meta data"); + this->addBlindData("index", + GridBlindDataSemantic::PointId, + GridBlindDataClass::IndexArray, + GridType::UInt32, + pointCount, + sizeof(uint32_t)); + } else if constexpr(util::is_same::value) { + if (!mBlindMetaData.empty()) throw std::runtime_error("expected no blind meta data"); + auto &srcLeaf = mSrcNodeAcc.template node<0>(0); + const auto& attributeSet = srcLeaf.attributeSet(); + const auto& descriptor = attributeSet.descriptor(); + const auto& nameMap = descriptor.map(); + for (auto it = nameMap.begin(); it != nameMap.end(); ++it) { + const size_t index = it->second; + auto& attArray = srcLeaf.constAttributeArray(index); + mBlindMetaData.emplace(it->first, // name used to derive semantics + descriptor.valueType(index), // type + it->first == "id" ? 
GridBlindDataClass::IndexArray : GridBlindDataClass::AttributeArray, // class + index, // order + pointCount, // element count + attArray.valueTypeSize()); // element size + } + } +#endif + } + if (mSrcNodeAcc.hasLongGridName()) { + this->addBlindData("grid name", + GridBlindDataSemantic::Unknown, + GridBlindDataClass::GridName, + GridType::Unknown, + mSrcNodeAcc.getName().length() + 1, 1); + } + mLeafNodeSize = mSrcNodeAcc.nodeCount(0)*NanoLeaf::DataType::memUsage(); +}// CreateNanoGrid::preProcess + +//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +CreateNanoGrid::preProcess(OracleT oracle) +{ + static_assert(util::is_same::value, "preProcess: expected SrcValueT == float"); + + const size_t leafCount = mSrcNodeAcc.nodeCount(0); + if (leafCount==0) { + mLeafNodeSize = 0u; + return; + } + mCodec.reset(new Codec[leafCount]); + + if constexpr(util::is_same::value) { + if (!oracle) oracle.init(mSrcNodeAcc.gridClass(), mSrcNodeAcc.root().background()); + } + + math::DitherLUT lut(mDitherOn); + util::forEach(0, leafCount, 4, [&](const util::Range1D &r) { + for (auto i=r.begin(); i!=r.end(); ++i) { + const auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + float &min = mCodec[i].min = std::numeric_limits::max(); + float &max = mCodec[i].max = -min; + for (int j=0; j<512; ++j) { + float v = srcLeaf.getValue(j); + if (vmax) max = v; + } + const float range = max - min; + uint8_t &logBitWidth = mCodec[i].log2 = 0;// 0,1,2,3,4 => 1,2,4,8,16 bits + while (range > 0.0f && logBitWidth < 4u) { + const uint32_t mask = (uint32_t(1) << (uint32_t(1) << logBitWidth)) - 1u; + const float encode = mask/range; + const float decode = range/mask; + int j = 0; + do { + const float exact = srcLeaf.getValue(j);//data[j];// exact value + const uint32_t code = uint32_t(encode*(exact - min) + lut(j)); + const float approx = code * decode + min;// approximate value + j += oracle(exact, approx) ? 1 : 513; + } while(j < 512); + if (j == 512) break; + ++logBitWidth; + } + } + }); + + auto getOffset = [&](size_t i){ + --i; + return mCodec[i].offset + NanoLeaf::DataType::memUsage(1u << mCodec[i].log2); + }; + mCodec[0].offset = NanoGrid::memUsage() + + NanoTree::memUsage() + + NanoRoot::memUsage(mSrcNodeAcc.root().getTableSize()) + + NanoUpper::memUsage()*mSrcNodeAcc.nodeCount(2) + + NanoLower::memUsage()*mSrcNodeAcc.nodeCount(1); + for (size_t i=1; iaddBlindData("grid name", + GridBlindDataSemantic::Unknown, + GridBlindDataClass::GridName, + GridType::Unknown, + mSrcNodeAcc.getName().length() + 1, 1); + } +}// CreateNanoGrid::preProcess + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index, uint64_t>::type +CreateNanoGrid::countTileValues(uint64_t valueCount) +{ + const uint64_t stats = mIncludeStats ? 
4u : 0u;// minimum, maximum, average, and deviation + mValIdx[LEVEL].clear(); + mValIdx[LEVEL].resize(mSrcNodeAcc.nodeCount(LEVEL) + 1, stats);// minimum 1 entry + util::forEach(1, mValIdx[LEVEL].size(), 8, [&](const util::Range1D& r){ + for (auto i = r.begin(); i!=r.end(); ++i) { + auto &srcNode = mSrcNodeAcc.template node(i-1); + if constexpr(BuildTraits::is_onindex) {// resolved at compile time + mValIdx[LEVEL][i] += srcNode.getValueMask().countOn(); + } else { + static const uint64_t maxTileCount = uint64_t(1u) << 3*srcNode.LOG2DIM; + mValIdx[LEVEL][i] += maxTileCount - srcNode.getChildMask().countOn(); + } + } + }); + mValIdx[LEVEL][0] = valueCount; + for (size_t i=1; i + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index, uint64_t>::type +CreateNanoGrid::countValues() +{ + const uint64_t stats = mIncludeStats ? 4u : 0u;// minimum, maximum, average, and deviation + uint64_t valueCount = 1u;// offset 0 corresponds to the background value + if (mIncludeTiles) { + if constexpr(BuildTraits::is_onindex) { + for (auto it = mSrcNodeAcc.root().cbeginValueOn(); it; ++it) ++valueCount; + } else { + for (auto it = mSrcNodeAcc.root().cbeginValueAll(); it; ++it) ++valueCount; + } + valueCount += stats;// optionally append stats for the root node + valueCount = countTileValues(valueCount); + valueCount = countTileValues(valueCount); + } + mValIdx[0].clear(); + mValIdx[0].resize(mSrcNodeAcc.nodeCount(0) + 1, 512u + stats);// minimum 1 entry + if constexpr(BuildTraits::is_onindex) { + util::forEach(1, mValIdx[0].size(), 8, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) { + mValIdx[0][i] = stats; + mValIdx[0][i] += mSrcNodeAcc.template node<0>(i-1).getValueMask().countOn(); + } + }); + } + mValIdx[0][0] = valueCount; + util::prefixSum(mValIdx[0], true);// inclusive prefix sum + return mValIdx[0].back(); +}// CreateNanoGrid::countValues() + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index>::type +CreateNanoGrid::preProcess(uint32_t channels) +{ + const uint64_t valueCount = this->template countValues(); + mLeafNodeSize = mSrcNodeAcc.nodeCount(0)*NanoLeaf::DataType::memUsage(); + + uint32_t order = mBlindMetaData.size(); + char str[16]; + for (uint32_t i=0; i()), + GridBlindDataClass::AttributeArray, + order++, + valueCount, + sizeof(SrcValueT)); + } + if (mSrcNodeAcc.hasLongGridName()) { + this->addBlindData("grid name", + GridBlindDataSemantic::Unknown, + GridBlindDataClass::GridName, + GridType::Unknown, + mSrcNodeAcc.getName().length() + 1, 1); + } +}// preProcess + +//================================================================================================ + +template +template +inline typename util::disable_if::is_special>::type +CreateNanoGrid::processLeafs() +{ + using DstDataT = typename NanoLeaf::DataType; + using DstValueT = typename DstDataT::ValueType; + static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); + util::forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const util::Range1D& r) { + auto *dstLeaf = this->template dstNode(r.begin()); + for (auto i = r.begin(); i != r.end(); ++i, ++dstLeaf) { + auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + if (DstDataT::padding()>0u) { + util::memzero(dstLeaf, DstDataT::memUsage()); + } else { + dstLeaf->mBBoxDif[0] = dstLeaf->mBBoxDif[1] = 
dstLeaf->mBBoxDif[2] = 0u; + dstLeaf->mFlags = 0u;// enable rendering, no bbox, no stats + dstLeaf->mMinimum = dstLeaf->mMaximum = typename DstDataT::ValueType(); + dstLeaf->mAverage = dstLeaf->mStdDevi = 0; + } + dstLeaf->mBBoxMin = srcLeaf.origin(); // copy origin of node + dstLeaf->mValueMask = srcLeaf.getValueMask(); // copy value mask + DstValueT *dst = dstLeaf->mValues; + if constexpr(util::is_same::value && SrcNodeAccT::IS_OPENVDB) { + const SrcValueT *src = srcLeaf.buffer().data(); + for (auto *end = dst + 512u; dst != end; dst += 4, src += 4) { + dst[0] = src[0]; // copy *all* voxel values in sets of four, i.e. loop-unrolling + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + } + } else { + for (uint32_t j=0; j<512u; ++j) *dst++ = static_cast(srcLeaf.getValue(j)); + } + } + }); +} // CreateNanoGrid::processLeafs + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index>::type +CreateNanoGrid::processLeafs() +{ + using DstDataT = typename NanoLeaf::DataType; + static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); + static_assert(DstDataT::padding()==0u, "Expected leaf nodes to have no padding"); + + util::forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const util::Range1D& r) { + const uint8_t flags = mIncludeStats ? 16u : 0u;// 4th bit indicates stats + DstDataT *dstLeaf = this->template dstNode(r.begin());// fixed size + for (auto i = r.begin(); i != r.end(); ++i, ++dstLeaf) { + auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + dstLeaf->mBBoxMin = srcLeaf.origin(); // copy origin of node + dstLeaf->mBBoxDif[0] = dstLeaf->mBBoxDif[1] = dstLeaf->mBBoxDif[2] = 0u; + dstLeaf->mFlags = flags; + dstLeaf->mValueMask = srcLeaf.getValueMask(); // copy value mask + dstLeaf->mOffset = mValIdx[0][i]; + if constexpr(BuildTraits::is_onindex) { + const uint64_t *w = dstLeaf->mValueMask.words(); +#ifdef USE_OLD_VALUE_ON_INDEX + int32_t sum = CountOn(*w++); + uint8_t *p = reinterpret_cast(&dstLeaf->mPrefixSum), *q = p + 7; + for (int j=0; j<7; ++j) { + *p++ = sum & 255u; + *q |= (sum >> 8) << j; + sum += CountOn(*w++); + } +#else + uint64_t &prefixSum = dstLeaf->mPrefixSum, sum = util::countOn(*w++); + prefixSum = sum; + for (int n = 9; n < 55; n += 9) {// n=i*9 where i=1,2,..6 + sum += util::countOn(*w++); + prefixSum |= sum << n;// each pre-fixed sum is encoded in 9 bits + } +#endif + } else { + dstLeaf->mPrefixSum = 0u; + } + if constexpr(BuildTraits::is_indexmask) dstLeaf->mMask = dstLeaf->mValueMask; + } + }); +} // CreateNanoGrid::processLeafs + +//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +CreateNanoGrid::processLeafs() +{ + using DstDataT = typename NanoLeaf::DataType; + static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); + util::forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const util::Range1D& r) { + auto *dstLeaf = this->template dstNode(r.begin()); + for (auto i = r.begin(); i != r.end(); ++i, ++dstLeaf) { + auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + if (DstDataT::padding()>0u) { + util::memzero(dstLeaf, DstDataT::memUsage()); + } else { + dstLeaf->mBBoxDif[0] = dstLeaf->mBBoxDif[1] = dstLeaf->mBBoxDif[2] = 0u; + dstLeaf->mFlags = 0u;// enable rendering, no bbox, no stats + dstLeaf->mPadding[0] = dstLeaf->mPadding[1] = 0u; + } + dstLeaf->mBBoxMin = srcLeaf.origin(); // copy 
origin of node + dstLeaf->mValueMask = srcLeaf.getValueMask(); // copy value mask + } + }); +} // CreateNanoGrid::processLeafs + +//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +CreateNanoGrid::processLeafs() +{ + using DstDataT = typename NanoLeaf::DataType; + static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); + util::forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const util::Range1D& r) { + auto *dstLeaf = this->template dstNode(r.begin()); + for (auto i = r.begin(); i != r.end(); ++i, ++dstLeaf) { + auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + if (DstDataT::padding()>0u) { + util::memzero(dstLeaf, DstDataT::memUsage()); + } else { + dstLeaf->mBBoxDif[0] = dstLeaf->mBBoxDif[1] = dstLeaf->mBBoxDif[2] = 0u; + dstLeaf->mFlags = 0u;// enable rendering, no bbox, no stats + } + dstLeaf->mBBoxMin = srcLeaf.origin(); // copy origin of node + dstLeaf->mValueMask = srcLeaf.getValueMask(); // copy value mask + if constexpr(!util::is_same::value) { + for (int j=0; j<512; ++j) dstLeaf->mValues.set(j, static_cast(srcLeaf.getValue(j))); + } else if constexpr(SrcNodeAccT::IS_OPENVDB) { + dstLeaf->mValues = *reinterpret_cast*>(srcLeaf.buffer().data()); + } else if constexpr(SrcNodeAccT::IS_NANOVDB) { + dstLeaf->mValues = srcLeaf.data()->mValues; + } else {// tools::Leaf + dstLeaf->mValues = srcLeaf.mValues; // copy value mask + } + } + }); +} // CreateNanoGrid::processLeafs + +//================================================================================================ + +template +template +inline typename util::enable_if::is_FpX>::type +CreateNanoGrid::processLeafs() +{ + using DstDataT = typename NanoLeaf::DataType; + static_assert(DstDataT::FIXED_SIZE, "Expected destination LeafNode to have fixed size"); + using ArrayT = typename DstDataT::ArrayType; + static_assert(util::is_same::value, "Expected ValueT == float"); + using FloatT = typename std::conditional=16, double, float>::type;// 16 compression and higher requires double + static constexpr FloatT UNITS = FloatT((1 << DstDataT::bitWidth()) - 1);// # of unique non-zero values + math::DitherLUT lut(mDitherOn); + + util::forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const util::Range1D& r) { + auto *dstLeaf = this->template dstNode(r.begin()); + for (auto i = r.begin(); i != r.end(); ++i, ++dstLeaf) { + auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + if (DstDataT::padding()>0u) { + util::memzero(dstLeaf, DstDataT::memUsage()); + } else { + dstLeaf->mFlags = dstLeaf->mBBoxDif[2] = dstLeaf->mBBoxDif[1] = dstLeaf->mBBoxDif[0] = 0u; + dstLeaf->mDev = dstLeaf->mAvg = dstLeaf->mMax = dstLeaf->mMin = 0u; + } + dstLeaf->mBBoxMin = srcLeaf.origin(); // copy origin of node + dstLeaf->mValueMask = srcLeaf.getValueMask(); // copy value mask + // compute extrema values + float min = std::numeric_limits::max(), max = -min; + for (uint32_t j=0; j<512u; ++j) { + const float v = srcLeaf.getValue(j); + if (v < min) min = v; + if (v > max) max = v; + } + dstLeaf->init(min, max, DstDataT::bitWidth()); + // perform quantization relative to the values in the current leaf node + const FloatT encode = UNITS/(max-min); + uint32_t offset = 0; + auto quantize = [&]()->ArrayT{ + const ArrayT tmp = static_cast(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); + ++offset; + return tmp; + }; + auto *code = reinterpret_cast(dstLeaf->mCode); + if (util::is_same::value) {// resolved at compile-time + for 
(uint32_t j=0; j<128u; ++j) { + auto tmp = quantize(); + *code++ = quantize() << 4 | tmp; + tmp = quantize(); + *code++ = quantize() << 4 | tmp; + } + } else { + for (uint32_t j=0; j<128u; ++j) { + *code++ = quantize(); + *code++ = quantize(); + *code++ = quantize(); + *code++ = quantize(); + } + } + } + }); +} // CreateNanoGrid::processLeafs + +//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +CreateNanoGrid::processLeafs() +{ + static_assert(util::is_same::value, "Expected SrcValueT == float"); + math::DitherLUT lut(mDitherOn); + util::forEach(0, mSrcNodeAcc.nodeCount(0), 8, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) { + auto &srcLeaf = mSrcNodeAcc.template node<0>(i); + auto *dstLeaf = this->template dstNode(i); + dstLeaf->mBBoxMin = srcLeaf.origin(); // copy origin of node + dstLeaf->mBBoxDif[0] = dstLeaf->mBBoxDif[1] = dstLeaf->mBBoxDif[2] = 0u; + const uint8_t logBitWidth = mCodec[i].log2; + dstLeaf->mFlags = logBitWidth << 5;// pack logBitWidth into 3 MSB of mFlag + dstLeaf->mValueMask = srcLeaf.getValueMask(); // copy value mask + const float min = mCodec[i].min, max = mCodec[i].max; + dstLeaf->init(min, max, uint8_t(1) << logBitWidth); + // perform quantization relative to the values in the current leaf node + uint32_t offset = 0; + float encode = 0.0f; + auto quantize = [&]()->uint8_t{ + const uint8_t tmp = static_cast(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); + ++offset; + return tmp; + }; + auto *dst = reinterpret_cast(dstLeaf+1); + switch (logBitWidth) { + case 0u: {// 1 bit + encode = 1.0f/(max - min); + for (int j=0; j<64; ++j) { + uint8_t a = 0; + for (int k=0; k<8; ++k) a |= quantize() << k; + *dst++ = a; + } + } + break; + case 1u: {// 2 bits + encode = 3.0f/(max - min); + for (int j=0; j<128; ++j) { + auto a = quantize(); + a |= quantize() << 2; + a |= quantize() << 4; + *dst++ = quantize() << 6 | a; + } + } + break; + case 2u: {// 4 bits + encode = 15.0f/(max - min); + for (int j=0; j<128; ++j) { + auto a = quantize(); + *dst++ = quantize() << 4 | a; + a = quantize(); + *dst++ = quantize() << 4 | a; + } + } + break; + case 3u: {// 8 bits + encode = 255.0f/(max - min); + for (int j=0; j<128; ++j) { + *dst++ = quantize(); + *dst++ = quantize(); + *dst++ = quantize(); + *dst++ = quantize(); + } + } + break; + default: {// 16 bits - special implementation using higher bit-precision + auto *dst = reinterpret_cast(dstLeaf+1); + const double encode = 65535.0/(max - min);// note that double is required! 
+ for (int j=0; j<128; ++j) { + *dst++ = uint16_t(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); ++offset; + *dst++ = uint16_t(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); ++offset; + *dst++ = uint16_t(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); ++offset; + *dst++ = uint16_t(encode * (srcLeaf.getValue(offset) - min) + lut(offset)); ++offset; + } + } + }// end switch + } + });// kernel +} // CreateNanoGrid::processLeafs + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index>::type +CreateNanoGrid::processInternalNodes() +{ + using DstNodeT = typename NanoNode::type; + using DstValueT = typename DstNodeT::ValueType; + using DstChildT = typename NanoNode::type; + static_assert(LEVEL == 1 || LEVEL == 2, "Expected internal node"); + + const uint64_t nodeCount = mSrcNodeAcc.nodeCount(LEVEL); + if (nodeCount > 0) {// compute and temporarily encode IDs of child nodes + uint64_t childCount = 0; + auto *dstNode = this->template dstNode(0); + for (uint64_t i=0; i(static_cast(i)).getChildMask().countOn(); + } + } + + util::forEach(0, nodeCount, 4, [&](const util::Range1D& r) { + auto *dstNode = this->template dstNode(r.begin()); + for (auto i = r.begin(); i != r.end(); ++i, ++dstNode) { + auto &srcNode = mSrcNodeAcc.template node(i); + uint64_t childID = dstNode->mFlags; + if (DstNodeT::DataType::padding()>0u) { + util::memzero(dstNode, DstNodeT::memUsage()); + } else { + dstNode->mFlags = 0;// enable rendering, no bbox, no stats + dstNode->mMinimum = dstNode->mMaximum = typename DstNodeT::ValueType(); + dstNode->mAverage = dstNode->mStdDevi = 0; + } + dstNode->mBBox[0] = srcNode.origin(); // copy origin of node + dstNode->mValueMask = srcNode.getValueMask(); // copy value mask + dstNode->mChildMask = srcNode.getChildMask(); // copy child mask + for (auto it = srcNode.cbeginChildAll(); it; ++it) { + SrcValueT value{}; // default initialization + if (it.probeChild(value)) { + DstChildT *dstChild = this->template dstNode(childID++);// might be Leaf + dstNode->setChild(it.pos(), dstChild); + } else { + dstNode->setValue(it.pos(), static_cast(value)); + } + } + } + }); +} // CreateNanoGrid::processInternalNodes + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index>::type +CreateNanoGrid::processInternalNodes() +{ + using DstNodeT = typename NanoNode::type; + using DstChildT = typename NanoNode::type; + static_assert(LEVEL == 1 || LEVEL == 2, "Expected internal node"); + static_assert(DstNodeT::DataType::padding()==0u, "Expected internal nodes to have no padding"); + + const uint64_t nodeCount = mSrcNodeAcc.nodeCount(LEVEL); + if (nodeCount > 0) {// compute and temporarily encode IDs of child nodes + uint64_t childCount = 0; + auto *dstNode = this->template dstNode(0); + for (uint64_t i=0; i(i).getChildMask().countOn(); + } + } + + util::forEach(0, nodeCount, 4, [&](const util::Range1D& r) { + auto *dstNode = this->template dstNode(r.begin()); + for (auto i = r.begin(); i != r.end(); ++i, ++dstNode) { + auto &srcNode = mSrcNodeAcc.template node(i); + uint64_t childID = dstNode->mFlags; + dstNode->mFlags = 0u; + dstNode->mBBox[0] = srcNode.origin(); // copy origin of node + dstNode->mValueMask = srcNode.getValueMask(); // copy value mask + dstNode->mChildMask = srcNode.getChildMask(); // copy child mask + uint64_t n = mIncludeTiles ? 
mValIdx[LEVEL][i] : 0u; + for (auto it = srcNode.cbeginChildAll(); it; ++it) { + SrcValueT value; + if (it.probeChild(value)) { + DstChildT *dstChild = this->template dstNode(childID++);// might be Leaf + dstNode->setChild(it.pos(), dstChild); + } else { + uint64_t m = 0u; + if (mIncludeTiles && !((BuildTraits::is_onindex) && dstNode->mValueMask.isOff(it.pos()))) m = n++; + dstNode->setValue(it.pos(), m); + } + } + if (mIncludeTiles && mIncludeStats) {// stats are always placed after the tile values + dstNode->mMinimum = n++; + dstNode->mMaximum = n++; + dstNode->mAverage = n++; + dstNode->mStdDevi = n++; + } else {// if not tiles or stats set stats to the background offset + dstNode->mMinimum = 0u; + dstNode->mMaximum = 0u; + dstNode->mAverage = 0u; + dstNode->mStdDevi = 0u; + } + } + }); +} // CreateNanoGrid::processInternalNodes + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index>::type +CreateNanoGrid::processRoot() +{ + using DstRootT = NanoRoot; + using DstValueT = typename DstRootT::ValueType; + auto &srcRoot = mSrcNodeAcc.root(); + auto *dstRoot = this->template dstRoot(); + const uint32_t tableSize = srcRoot.getTableSize(); + if (DstRootT::DataType::padding()>0) util::memzero(dstRoot, DstRootT::memUsage(tableSize)); + dstRoot->mTableSize = tableSize; + dstRoot->mMinimum = dstRoot->mMaximum = dstRoot->mBackground = srcRoot.background(); + dstRoot->mBBox = CoordBBox(); // // set to an empty bounding box + if (tableSize==0) return; + auto *dstChild = this->template dstNode(0);// fixed size and linear in memory + auto *dstTile = dstRoot->tile(0);// fixed size and linear in memory + for (auto it = srcRoot.cbeginChildAll(); it; ++it, ++dstTile) { + SrcValueT value; + if (it.probeChild(value)) { + dstTile->setChild(it.getCoord(), dstChild++, dstRoot); + } else { + dstTile->setValue(it.getCoord(), it.isValueOn(), static_cast(value)); + } + } +} // CreateNanoGrid::processRoot + +//================================================================================================ + +template +template +inline typename util::enable_if::is_index>::type +CreateNanoGrid::processRoot() +{ + using DstRootT = NanoRoot; + auto &srcRoot = mSrcNodeAcc.root(); + auto *dstRoot = this->template dstRoot(); + const uint32_t tableSize = srcRoot.getTableSize(); + if (DstRootT::DataType::padding()>0) util::memzero(dstRoot, DstRootT::memUsage(tableSize)); + dstRoot->mTableSize = tableSize; + dstRoot->mBackground = 0u; + uint64_t valueCount = 0u;// the first entry is always the background value + dstRoot->mBBox = CoordBBox(); // set to an empty/invalid bounding box + + if (tableSize>0) { + auto *dstChild = this->template dstNode(0);// fixed size and linear in memory + auto *dstTile = dstRoot->tile(0);// fixed size and linear in memory + for (auto it = srcRoot.cbeginChildAll(); it; ++it, ++dstTile) { + SrcValueT tmp; + if (it.probeChild(tmp)) { + dstTile->setChild(it.getCoord(), dstChild++, dstRoot); + } else { + dstTile->setValue(it.getCoord(), it.isValueOn(), 0u); + if (mIncludeTiles && !((BuildTraits::is_onindex) && !dstTile->state)) dstTile->value = ++valueCount; + } + } + } + if (mIncludeTiles && mIncludeStats) {// stats are always placed after the tile values + dstRoot->mMinimum = ++valueCount; + dstRoot->mMaximum = ++valueCount; + dstRoot->mAverage = ++valueCount; + dstRoot->mStdDevi = ++valueCount; + } else if (dstRoot->padding()==0) { + dstRoot->mMinimum = 0u; + dstRoot->mMaximum = 0u; + 
dstRoot->mAverage = 0u; + dstRoot->mStdDevi = 0u; + } +} // CreateNanoGrid::processRoot + +//================================================================================================ + +template +template +void CreateNanoGrid::processTree() +{ + const uint64_t nodeCount[3] = {mSrcNodeAcc.nodeCount(0), mSrcNodeAcc.nodeCount(1), mSrcNodeAcc.nodeCount(2)}; + auto *dstTree = this->template dstTree(); + dstTree->setRoot( this->template dstRoot() ); + dstTree->setFirstNode(nodeCount[2] ? this->template dstNode(0) : nullptr); + dstTree->setFirstNode(nodeCount[1] ? this->template dstNode(0) : nullptr); + dstTree->setFirstNode(nodeCount[0] ? this->template dstNode(0) : nullptr); + + dstTree->mNodeCount[0] = static_cast(nodeCount[0]); + dstTree->mNodeCount[1] = static_cast(nodeCount[1]); + dstTree->mNodeCount[2] = static_cast(nodeCount[2]); + + // Count number of active leaf level tiles + dstTree->mTileCount[0] = util::reduce(util::Range1D(0,nodeCount[1]), uint32_t(0), [&](util::Range1D &r, uint32_t sum){ + for (auto i=r.begin(); i!=r.end(); ++i) sum += mSrcNodeAcc.template node<1>(i).getValueMask().countOn(); + return sum;}, std::plus()); + + // Count number of active lower internal node tiles + dstTree->mTileCount[1] = util::reduce(util::Range1D(0,nodeCount[2]), uint32_t(0), [&](util::Range1D &r, uint32_t sum){ + for (auto i=r.begin(); i!=r.end(); ++i) sum += mSrcNodeAcc.template node<2>(i).getValueMask().countOn(); + return sum;}, std::plus()); + + // Count number of active upper internal node tiles + dstTree->mTileCount[2] = 0; + for (auto it = mSrcNodeAcc.root().cbeginValueOn(); it; ++it) dstTree->mTileCount[2] += 1; + + // Count number of active voxels + dstTree->mVoxelCount = util::reduce(util::Range1D(0, nodeCount[0]), uint64_t(0), [&](util::Range1D &r, uint64_t sum){ + for (auto i=r.begin(); i!=r.end(); ++i) sum += mSrcNodeAcc.template node<0>(i).getValueMask().countOn(); + return sum;}, std::plus()); + + dstTree->mVoxelCount += uint64_t(dstTree->mTileCount[0]) << 9;// = 3 * 3 + dstTree->mVoxelCount += uint64_t(dstTree->mTileCount[1]) << 21;// = 3 * (3+4) + dstTree->mVoxelCount += uint64_t(dstTree->mTileCount[2]) << 36;// = 3 * (3+4+5) + +} // CreateNanoGrid::processTree + +//================================================================================================ + +template +template +void CreateNanoGrid::processGrid() +{ + auto* dstGrid = this->template dstGrid(); + dstGrid->init({GridFlags::IsBreadthFirst}, mOffset.size, mSrcNodeAcc.map(), + toGridType(), toGridClass(mSrcNodeAcc.gridClass())); + dstGrid->mBlindMetadataCount = static_cast(mBlindMetaData.size()); + dstGrid->mData1 = this->valueCount(); + +// if (!isValid(dstGrid->mGridType, dstGrid->mGridClass)) { +//#if 1 +// char str[30]; +// fprintf(stderr,"Warning: Strange combination of GridType(\"%s\") and GridClass(\"%s\"). Consider changing GridClass to \"Unknown\"\n", +// toStr(str, dstGrid->mGridType), toStr(str + 15, dstGrid->mGridClass)); +//#else +// throw std::runtime_error("Invalid combination of GridType("+std::to_string(int(dstGrid->mGridType))+ +// ") and GridClass("+std::to_string(int(dstGrid->mGridClass))+"). 
See NanoVDB.h for details!"); +//#endif +// } + util::memzero(dstGrid->mGridName, GridData::MaxNameSize);// initialize mGridName to zero + strncpy(dstGrid->mGridName, mSrcNodeAcc.getName().c_str(), GridData::MaxNameSize-1); + if (mSrcNodeAcc.hasLongGridName()) dstGrid->setLongGridNameOn();// grid name is long so store it as blind data + + // Partially process blind meta data - they will be complete in postProcess + if (mBlindMetaData.size()>0) { + auto *metaData = this->dstMeta(0); + dstGrid->mBlindMetadataOffset = util::PtrDiff(metaData, dstGrid); + dstGrid->mBlindMetadataCount = static_cast(mBlindMetaData.size()); + char *blindData = util::PtrAdd(mBufferPtr, mOffset.blind); + for (const auto &b : mBlindMetaData) { + std::memcpy(metaData, b.metaData, sizeof(GridBlindMetaData)); + metaData->setBlindData(blindData);// sets metaData.mOffset + if (metaData->mDataClass == GridBlindDataClass::GridName) strcpy(blindData, mSrcNodeAcc.getName().c_str()); + ++metaData; + blindData += b.size; + } + mBlindMetaData.clear(); + } +} // CreateNanoGrid::processGrid + +//================================================================================================ + +template +template +inline typename util::disable_if::is_index>::type +CreateNanoGrid::postProcess() +{ + if constexpr(util::is_same::value) mCodec.reset(); + auto *dstGrid = this->template dstGrid(); + updateGridStats(dstGrid, mStats); +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) + auto *metaData = this->dstMeta(0); + if constexpr(util::is_same::value || + util::is_same::value) { + static_assert(util::is_same::value, "expected DstBuildT==uint32_t"); + auto *dstData0 = this->template dstNode(0)->data(); + dstData0->mMinimum = 0; // start of prefix sum + dstData0->mMaximum = dstData0->mValues[511u]; + for (uint64_t i=1, n=mSrcNodeAcc.nodeCount(0); imMinimum = dstData0->mMinimum + dstData0->mMaximum; + dstData1->mMaximum = dstData1->mValues[511u]; + dstData0 = dstData1; + } + for (size_t i = 0, n = dstGrid->blindDataCount(); i < n; ++i, ++metaData) { + if constexpr(util::is_same::value) { + if (metaData->mDataClass != GridBlindDataClass::IndexArray) continue; + if (metaData->mDataType == GridType::UInt32) { + uint32_t *blindData = const_cast(metaData->template getBlindData()); + util::forEach(0, mSrcNodeAcc.nodeCount(0), 16, [&](const auto& r) { + auto *dstLeaf = this->template dstNode(r.begin()); + for (auto j = r.begin(); j != r.end(); ++j, ++dstLeaf) { + uint32_t* p = blindData + dstLeaf->mMinimum; + for (uint32_t idx : mSrcNodeAcc.template node<0>(j).indices()) *p++ = idx; + } + }); + } + } else {// if constexpr(util::is_same::value) + if (metaData->mDataClass != GridBlindDataClass::AttributeArray) continue; + if (auto *blindData = dstGrid->template getBlindData(i)) { + this->template copyPointAttribute(i, blindData); + } else if (auto *blindData = dstGrid->template getBlindData(i)) { + this->template copyPointAttribute(i, reinterpret_cast(blindData)); + } else if (auto *blindData = dstGrid->template getBlindData(i)) { + this->template copyPointAttribute(i, blindData); + } else if (auto *blindData = dstGrid->template getBlindData(i)) { + this->template copyPointAttribute(i, blindData); + } else { + char str[16]; + std::cerr << "unsupported point attribute \"" << toStr(str, metaData->mDataType) << "\"\n"; + } + }// if + }// loop + } else { // if + (void)metaData; + } +#endif + updateChecksum(dstGrid, mChecksum); +}// CreateNanoGrid::postProcess + 
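+// A minimal sketch of the adaptive FpN path implemented above, assuming OpenVDB
+// support is enabled, that "srcGrid" is a hypothetical openvdb::FloatGrid, and
+// that an absolute-difference tolerance oracle such as AbsDiff (declared earlier
+// in this header) is used:
+//
+//   nanovdb::tools::CreateNanoGrid<openvdb::FloatGrid> converter(srcGrid);
+//   converter.enableDithering(true);// optional: dither during quantization
+//   auto handle = converter.getHandle<nanovdb::FpN>(nanovdb::tools::AbsDiff(0.01f));
+//
+// preProcess(oracle) picks the smallest per-leaf bit width accepted by the oracle,
+// initHandle() lays out the destination buffer, and postProcess() updates the
+// grid statistics and checksum.
+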
+//================================================================================================ + +template +template +inline typename util::enable_if::is_index>::type +CreateNanoGrid::postProcess(uint32_t channels) +{ + char str[16]; + const std::string typeName = toStr(str, toGridType()); + const uint64_t valueCount = this->valueCount(); + auto *dstGrid = this->template dstGrid(); + for (uint32_t i=0; ifindBlindData(name.c_str()); + if (j<0) throw std::runtime_error("missing " + name); + auto *metaData = this->dstMeta(j);// partially set in processGrid + metaData->mDataClass = GridBlindDataClass::ChannelArray; + metaData->mDataType = toGridType(); + SrcValueT *blindData = const_cast(metaData->template getBlindData()); + if (i>0) {// concurrent copy from previous channel + util::forEach(0,valueCount,1024,[&](const util::Range1D &r){ + SrcValueT *dst=blindData+r.begin(), *end=dst+r.size(), *src=dst-valueCount; + while(dst!=end) *dst++ = *src++; + }); + } else { + this->template copyValues(blindData); + } + }// loop over channels + updateGridStats(this->template dstGrid(), std::min(StatsMode::BBox, mStats)); + updateChecksum(dstGrid, mChecksum); +}// CreateNanoGrid::postProcess + +//================================================================================================ + +template +template +typename util::enable_if::is_index>::type +CreateNanoGrid::copyValues(SrcValueT *buffer) +{// copy values from the source grid into the provided buffer + assert(mBufferPtr && buffer); + using StatsT = typename FloatTraits::FloatType; + + if (this->valueCount()==0) this->template countValues(); + + auto copyNodeValues = [&](const auto &node, SrcValueT *v) { + if constexpr(BuildTraits::is_onindex) { + for (auto it = node.cbeginValueOn(); it; ++it) *v++ = *it; + } else { + for (auto it = node.cbeginValueAll(); it; ++it) *v++ = *it; + } + if (mIncludeStats) { + if constexpr(SrcNodeAccT::IS_NANOVDB) {// resolved at compile time + *v++ = node.minimum(); + *v++ = node.maximum(); + if constexpr(util::is_same::value) { + *v++ = node.average(); + *v++ = node.stdDeviation(); + } else {// eg when SrcValueT=Vec3f and StatsT=float + *v++ = SrcValueT(node.average()); + *v++ = SrcValueT(node.stdDeviation()); + } + } else {// openvdb and nanovdb::tools::build::Grid have no stats + *v++ = buffer[0];// background + *v++ = buffer[0];// background + *v++ = buffer[0];// background + *v++ = buffer[0];// background + } + } + };// copyNodeValues + + const SrcRootT &root = mSrcNodeAcc.root(); + buffer[0] = root.background();// Value array always starts with the background value + if (mIncludeTiles) { + copyNodeValues(root, buffer + 1u); + util::forEach(0, mSrcNodeAcc.nodeCount(2), 1, [&](const util::Range1D& r) { + for (auto i = r.begin(); i!=r.end(); ++i) { + copyNodeValues(mSrcNodeAcc.template node<2>(i), buffer + mValIdx[2][i]); + } + }); + util::forEach(0, mSrcNodeAcc.nodeCount(1), 1, [&](const util::Range1D& r) { + for (auto i = r.begin(); i!=r.end(); ++i) { + copyNodeValues(mSrcNodeAcc.template node<1>(i), buffer + mValIdx[1][i]); + } + }); + } + util::forEach(0, mSrcNodeAcc.nodeCount(0), 4, [&](const util::Range1D& r) { + for (auto i = r.begin(); i!=r.end(); ++i) { + copyNodeValues(mSrcNodeAcc.template node<0>(i), buffer + mValIdx[0][i]); + } + }); +}// CreateNanoGrid::copyValues + + +//================================================================================================ + +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) + +template +template +typename util::disable_if::value || + 
util::is_same::value, uint64_t>::type +CreateNanoGrid::countPoints() const +{ + static_assert(util::is_same::value, "expected default template parameter"); + return 0u; +}// CreateNanoGrid::countPoints + +template +template +typename util::enable_if::value || + util::is_same::value, uint64_t>::type +CreateNanoGrid::countPoints() const +{ + static_assert(util::is_same::value, "expected default template parameter"); + return util::reduce(0, mSrcNodeAcc.nodeCount(0), 8, uint64_t(0), [&](auto &r, uint64_t sum) { + for (auto i=r.begin(); i!=r.end(); ++i) sum += mSrcNodeAcc.template node<0>(i).getLastValue(); + return sum;}, std::plus()); +}// CreateNanoGrid::countPoints + +template +template +typename util::enable_if::value>::type +CreateNanoGrid::copyPointAttribute(size_t attIdx, AttT *attPtr) +{ + static_assert(util::is_same::value, "Expected default parameter"); + using HandleT = openvdb::points::AttributeHandle; + util::forEach(0, mSrcNodeAcc.nodeCount(0), 16, [&](const auto& r) { + auto *dstLeaf = this->template dstNode(r.begin()); + for (auto i = r.begin(); i != r.end(); ++i, ++dstLeaf) { + auto& srcLeaf = mSrcNodeAcc.template node<0>(i); + HandleT handle(srcLeaf.constAttributeArray(attIdx)); + AttT *p = attPtr + dstLeaf->mMinimum; + for (auto iter = srcLeaf.beginIndexOn(); iter; ++iter) *p++ = handle.get(*iter); + } + }); +}// CreateNanoGrid::copyPointAttribute + +#endif + +//================================================================================================ + +template +typename util::disable_if::is_index || BuildTraits::is_Fp, GridHandle>::type +createNanoGrid(const SrcGridT &srcGrid, + StatsMode sMode, + CheckMode cMode, + int verbose, + const BufferT &buffer) +{ + CreateNanoGrid converter(srcGrid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.setVerbose(verbose); + return converter.template getHandle(buffer); +}// createNanoGrid + +//================================================================================================ + +template +typename util::enable_if::is_index, GridHandle>::type +createNanoGrid(const SrcGridT &srcGrid, + uint32_t channels, + bool includeStats, + bool includeTiles, + int verbose, + const BufferT &buffer) +{ + CreateNanoGrid converter(srcGrid); + converter.setVerbose(verbose); + return converter.template getHandle(channels, includeStats, includeTiles, buffer); +} + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createNanoGrid(const SrcGridT &srcGrid, + StatsMode sMode, + CheckMode cMode, + bool ditherOn, + int verbose, + const OracleT &oracle, + const BufferT &buffer) +{ + CreateNanoGrid converter(srcGrid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + converter.setVerbose(verbose); + return converter.template getHandle(oracle, buffer); +}// createNanoGrid + +//================================================================================================ + +template +typename util::enable_if::is_FpX, GridHandle>::type +createNanoGrid(const SrcGridT &srcGrid, + StatsMode sMode, + CheckMode cMode, + bool ditherOn, + int verbose, + const BufferT &buffer) +{ + CreateNanoGrid converter(srcGrid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + converter.setVerbose(verbose); + return converter.template getHandle(buffer); +}// createNanoGrid + 
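+// Minimal usage sketches for the factory functions above, assuming the defaulted
+// arguments declared with the forward declarations earlier in this header and a
+// hypothetical openvdb::FloatGrid named "srcGrid":
+//
+//   // float -> float conversion with default stats, checksum and host buffer
+//   auto handle = nanovdb::tools::createNanoGrid(srcGrid);
+//
+//   // float -> ValueIndex conversion with one value channel stored as blind data
+//   auto index = nanovdb::tools::createNanoGrid<openvdb::FloatGrid,
+//                                               nanovdb::ValueIndex>(srcGrid, 1u);
+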
+//================================================================================================ + +#if defined(NANOVDB_USE_OPENVDB) && !defined(__CUDACC__) +template +GridHandle +openToNanoVDB(const openvdb::GridBase::Ptr& base, + StatsMode sMode, + CheckMode cMode, + int verbose) +{ + // We need to define these types because they are not defined in OpenVDB + using openvdb_Vec4fTree = typename openvdb::tree::Tree4::Type; + using openvdb_Vec4dTree = typename openvdb::tree::Tree4::Type; + using openvdb_Vec4fGrid = openvdb::Grid; + using openvdb_Vec4dGrid = openvdb::Grid; + using openvdb_UInt32Grid = openvdb::Grid; + + if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, sMode, cMode, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, sMode, cMode, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, sMode, cMode, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, sMode, cMode, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, sMode, cMode, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, sMode, cMode, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, sMode, cMode, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, sMode, cMode, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, sMode, cMode, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, sMode, cMode, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, sMode, cMode, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, sMode, cMode, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, sMode, cMode, verbose); + } else { + OPENVDB_THROW(openvdb::RuntimeError, "Unrecognized OpenVDB grid type"); + } +}// openToNanoVDB +#endif + +}// namespace tools =============================================================================== + +} // namespace nanovdb + +#endif // NANOVDB_TOOLS_CREATENANOGRID_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/tools/CreatePrimitives.h b/external/nanovdb/tools/CreatePrimitives.h new file mode 100644 index 00000000..95b84918 --- /dev/null +++ b/external/nanovdb/tools/CreatePrimitives.h @@ -0,0 +1,1752 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/CreatePrimitives.h + + \author Ken Museth + + \date June 26, 2020 + + \brief Generates volumetric primitives, e.g. sphere, torus etc, as NanoVDB grid. + + \note This has no dependency on openvdb. 
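+
+  \par Example:
+  A minimal sketch; the defaulted arguments are documented with each factory
+  function declared below.
+  \code
+  // narrow-band level set of a sphere with a radius of 100 world units
+  auto handle = nanovdb::tools::createLevelSetSphere<float>(100.0);
+  const nanovdb::FloatGrid* grid = handle.grid<float>();
+  \endcode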
+*/ + +#ifndef NANOVDB_TOOLS_PRIMITIVES_H_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_PRIMITIVES_H_HAS_BEEN_INCLUDED + +#define NANOVDB_PARALLEL_PRIMITIVES + +#include +#include +#include // for util::forEach and util::Range + +namespace nanovdb { + +namespace tools {// =================================================== + +/// @brief Returns a handle to a narrow-band level set of a sphere +/// +/// @param radius Radius of sphere in world units +/// @param center Center of sphere in world units +/// @param voxelSize Size of a voxel in world units +/// @param halfWidth Half-width of narrow band in voxel units +/// @param origin Origin of grid in world units +/// @param name Name of the grid +/// @param sMode Mode of computation for the statistics. +/// @param cMode Mode of computation for the checksum. +/// @param tolerance Global error tolerance use when VoxelT = FpN +/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} +/// @param buffer Buffer used for memory allocation by the handle +/// +/// @details The @c BuildT template parameter must be one of the following: +/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance +/// argument is only used when BuildT is set to FpN. +template +typename util::enable_if::value, GridHandle>::type +createLevelSetSphere(double radius = 100.0, + const Vec3d& center = Vec3d(0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0), + const std::string& name = "sphere_ls", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + const BufferT& buffer = BufferT()); + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetSphere(double radius = 100.0, + const Vec3d& center = Vec3d(0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0), + const std::string& name = "sphere_ls", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + bool ditherOn = false, + const BufferT& buffer = BufferT()); + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetSphere(double radius = 100.0, + const Vec3d& center = Vec3d(0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0), + const std::string& name = "sphere_ls_FpN", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + float tolerance = -1.0f, + bool ditherOn = false, + const BufferT& buffer = BufferT()); + +//================================================================================================ + +/// @brief Returns a handle to a sparse fog volume of a sphere such +/// that the exterior is 0 and inactive, the interior is active +/// with values varying smoothly from 0 at the surface of the +/// sphere to 1 at the halfWidth and interior of the sphere. +/// +/// @param radius Radius of sphere in world units +/// @param center Center of sphere in world units +/// @param voxelSize Size of a voxel in world units +/// @param halfWidth Half-width of narrow band in voxel units +/// @param origin Origin of grid in world units +/// @param name Name of the grid +/// @param sMode Mode of computation for the statistics. +/// @param cMode Mode of computation for the checksum. 
+/// @param tolerance Global error tolerance use when VoxelT = FpN +/// @param ditherOn If true dithering will be applied when BuildT = {Fp4,Fp8,Fp16,FpN} +/// @param buffer Buffer used for memory allocation by the handle +/// +/// @details The @c BuildT template parameter must be one of the following: +/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance +/// argument is only used when BuildT is set to FpN. +template +typename util::disable_if::value, GridHandle>::type +createFogVolumeSphere(double radius = 100.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "sphere_fog", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + const BufferT& buffer = BufferT()); + +template +typename util::enable_if::value, GridHandle>::type +createFogVolumeSphere(double radius = 100.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "sphere_fog", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + float tolerance = -1.0f, + bool ditherOn = false, + const BufferT& buffer = BufferT()); + +//================================================================================================ + +/// @brief Returns a handle to a PointDataGrid containing points scattered +/// on the surface of a sphere. +/// +/// @param pointsPerVoxel Number of point per voxel on on the surface +/// @param radius Radius of sphere in world units +/// @param center Center of sphere in world units +/// @param voxelSize Size of a voxel in world units +/// @param origin Origin of grid in world units +/// @param name Name of the grid +/// @param mode Mode of computation for the checksum. +/// @param buffer Buffer used for memory allocation by the handle +/// +/// @details The @c BuildT template parameter must be float (default) or double. +template +typename util::disable_if::value, GridHandle>::type +createPointSphere(int pointsPerVoxel = 1, + double radius = 100.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "sphere_points", + CheckMode mode = CheckMode::Default, + const BufferT& buffer = BufferT()); + +//================================================================================================ + +/// @brief Returns a handle to a narrow-band level set of a torus in the xz-plane +/// +/// @param majorRadius Major radius of torus in world units +/// @param minorRadius Minor radius of torus in world units +/// @param center Center of torus in world units +/// @param voxelSize Size of a voxel in world units +/// @param halfWidth Half-width of narrow band in voxel units +/// @param origin Origin of grid in world units +/// @param name Name of the grid +/// @param sMode Mode of computation for the statistics. +/// @param cMode Mode of computation for the checksum. +/// @param tolerance Global error tolerance use when VoxelT = FpN +/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} +/// @param buffer Buffer used for memory allocation by the handle +/// +/// @details The @c BuildT template parameter must be one of the following: +/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance +/// argument is only used when BuildT is set to FpN. 
+template +typename util::disable_if::value, GridHandle>::type +createLevelSetTorus(double majorRadius = 100.0, + double minorRadius = 50.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "torus_ls", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + const BufferT& buffer = BufferT()); + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetTorus(double majorRadius = 100.0, + double minorRadius = 50.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "torus_ls", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + float tolerance = -1.0f, + bool ditherOn = false, + const BufferT& buffer = BufferT()); + +//================================================================================================ + +/// @brief Returns a handle to a sparse fog volume of a torus in the xz-plane such +/// that the exterior is 0 and inactive, the interior is active +/// with values varying smoothly from 0 at the surface of the +/// torus to 1 at the halfWidth and interior of the torus. +/// +/// @param majorRadius Major radius of torus in world units +/// @param minorRadius Minor radius of torus in world units +/// @param center Center of torus in world units +/// @param voxelSize Size of a voxel in world units +/// @param halfWidth Half-width of narrow band in voxel units +/// @param origin Origin of grid in world units +/// @param name Name of the grid +/// @param sMode Mode of computation for the statistics. +/// @param cMode Mode of computation for the checksum. +/// @param tolerance Global error tolerance use when VoxelT = FpN +/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} +/// @param buffer Buffer used for memory allocation by the handle +/// +/// @details The @c BuildT template parameter must be one of the following: +/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance +/// argument is only used when BuildT is set to FpN. +template +typename util::disable_if::value, GridHandle>::type +createFogVolumeTorus(double majorRadius = 100.0, + double minorRadius = 50.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "torus_fog", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + const BufferT& buffer = BufferT()); + +template +typename util::enable_if::value, GridHandle>::type +createFogVolumeTorus(double majorRadius = 100.0, + double minorRadius = 50.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "torus_fog_FpN", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + float tolerance = -1.0f, + bool ditherOn = false, + const BufferT& buffer = BufferT()); + +//================================================================================================ + +/// @brief Returns a handle to a PointDataGrid containing points scattered +/// on the surface of a torus. 
+/// +/// @param pointsPerVoxel Number of point per voxel on on the surface +/// @param majorRadius Major radius of torus in world units +/// @param minorRadius Minor radius of torus in world units +/// @param center Center of torus in world units +/// @param voxelSize Size of a voxel in world units +/// @param origin Origin of grid in world units +/// @param name Name of the grid +/// @param cMode Mode of computation for the checksum. +/// @param buffer Buffer used for memory allocation by the handle +// +/// @details The @c BuildT template parameter must be float (default) or double. +template +typename util::disable_if::value, GridHandle>::type +createPointTorus(int pointsPerVoxel = 1, // half-width of narrow band in voxel units + double majorRadius = 100.0, // major radius of torus in world units + double minorRadius = 50.0, // minor radius of torus in world units + const Vec3d& center = Vec3d(0.0), // center of torus in world units + double voxelSize = 1.0, // size of a voxel in world units + const Vec3d& origin = Vec3d(0.0f), // origin of grid in world units + const std::string& name = "torus_points", // name of grid + CheckMode cMode = CheckMode::Default, + const BufferT& buffer = BufferT()); + +//================================================================================================ + +/// @brief Returns a handle to a narrow-band level set of a box +/// +/// @param width Width of box in world units +/// @param height Height of box in world units +/// @param depth Depth of box in world units +/// @param center Center of box in world units +/// @param voxelSize Size of a voxel in world units +/// @param halfWidth Half-width of narrow band in voxel units +/// @param origin Origin of grid in world units +/// @param name Name of the grid +/// @param sMode Mode of computation for the statistics. +/// @param cMode Mode of computation for the checksum. +/// @param tolerance Global error tolerance use when VoxelT = FpN +/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} +/// @param buffer Buffer used for memory allocation by the handle +/// +/// @details The @c BuildT template parameter must be one of the following: +/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance +/// argument is only used when BuildT is set to FpN. 
+template +typename util::disable_if::value, GridHandle>::type +createLevelSetBox(double width = 40.0, + double height = 60.0, + double depth = 100.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "box_ls", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + const BufferT& buffer = BufferT()); + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetBox(double width = 40.0, + double height = 60.0, + double depth = 100.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "box_ls_FpN", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + float tolerance = -1.0f, + bool ditherOn = false, + const BufferT& buffer = BufferT()); + +//================================================================================================ + +/// @brief Returns a handle to a sparse fog volume of a box such +/// that the exterior is 0 and inactive, the interior is active +/// with values varying smoothly from 0 at the surface of the +/// box to 1 at the halfWidth and interior of the box. +/// +/// @param width Width of box in world units +/// @param height Height of box in world units +/// @param depth Depth of box in world units +/// @param center Center of box in world units +/// @param voxelSize Size of a voxel in world units +/// @param halfWidth Half-width of narrow band in voxel units +/// @param origin Origin of grid in world units +/// @param name Name of the grid +/// @param sMode Mode of computation for the statistics. +/// @param cMode Mode of computation for the checksum. +/// @param tolerance Global error tolerance use when VoxelT = FpN +/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} +/// @param buffer Buffer used for memory allocation by the handle +/// +/// @details The @c BuildT template parameter must be one of the following: +/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance +/// argument is only used when BuildT is set to FpN. 
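+///
+/// @par Example
+/// An illustrative sketch of the fog-volume value range described above (assumes the
+/// default float build type, HostBuffer, and the default unit voxel size and origin):
+/// @code
+/// auto handle = nanovdb::tools::createFogVolumeBox(40.0, 60.0, 100.0);
+/// const auto* grid = handle.grid<float>();
+/// auto acc = grid->getAccessor();
+/// // Deep inside the box the density saturates at 1; outside it reads the inactive 0 background.
+/// float inside  = acc.getValue(nanovdb::Coord(0, 0, 0));   // ~1.0f at the box center
+/// float outside = acc.getValue(nanovdb::Coord(100, 0, 0)); // 0.0f, well outside the box
+/// @endcode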
+template +typename util::disable_if::value, GridHandle>::type +createFogVolumeBox(double width = 40.0, + double height = 60.0, + double depth = 100.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "box_fog", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + const BufferT& buffer = BufferT()); + +template +typename util::enable_if::value, GridHandle>::type +createFogVolumeBox(double width = 40.0, + double height = 60.0, + double depth = 100.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "box_fog_FpN", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + float tolerance = -1.0f, + bool ditherOn = false, + const BufferT& buffer = BufferT()); + +//================================================================================================ + +/// @brief Returns a handle to a narrow-band level set of a octahedron +/// +/// @param scale Scale of octahedron in world units +/// @param center Center of octahedron in world units +/// @param voxelSize Size of a voxel in world units +/// @param halfWidth Half-width of narrow band in voxel units +/// @param origin Origin of grid in world units +/// @param name Name of the grid +/// @param sMode Mode of computation for the statistics. +/// @param cMode Mode of computation for the checksum. +/// @param tolerance Global error tolerance use when VoxelT = FpN +/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} +/// @param buffer Buffer used for memory allocation by the handle +/// +/// @details The @c BuildT template parameter must be one of the following: +/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance +/// argument is only used when BuildT is set to FpN. +template +typename util::disable_if::value, GridHandle>::type +createLevelSetOctahedron(double scale = 100.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "octadedron_ls", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + const BufferT& buffer = BufferT()); + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetOctahedron(double scale = 100.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "octadedron_ls_FpN", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + float tolerance = -1.0f, + bool ditherOn = false, + const BufferT& buffer = BufferT()); + +//================================================================================================ + +/// @brief Returns a handle to a sparse fog volume of an octahedron such +/// that the exterior is 0 and inactive, the interior is active +/// with values varying smoothly from 0 at the surface of the +/// octahedron to 1 at the halfWidth and interior of the octahedron. +/// +/// @param scale Scale of octahedron in world units +/// @param center Center of box in world units +/// @param voxelSize Size of a voxel in world units +/// @param halfWidth Half-width of narrow band in voxel units +/// @param origin Origin of grid in world units +/// @param name Name of the grid +/// @param sMode Mode of computation for the statistics. 
+/// @param cMode Mode of computation for the checksum. +/// @param tolerance Global error tolerance use when VoxelT = FpN +/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} +/// @param buffer Buffer used for memory allocation by the handle +/// +/// @details The @c BuildT template parameter must be one of the following: +/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance +/// argument is only used when BuildT is set to FpN. +template +typename util::disable_if::value, GridHandle>::type +createFogVolumeOctahedron(double scale = 100.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "octadedron_fog", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + const BufferT& buffer = BufferT()); + +template +typename util::enable_if::value, GridHandle>::type +createFogVolumeOctahedron(double scale = 100.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "octadedron_fog_FpN", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + float tolerance = -1.0f, + bool ditherOn = false, + const BufferT& buffer = BufferT()); + +//================================================================================================ + +/// @brief Returns a handle to a narrow-band level set of a bounding-box (= wireframe of a box) +/// +/// @param width Width of box in world units +/// @param height Height of box in world units +/// @param depth Depth of box in world units +/// @param thickness Thickness of the wire in world units +/// @param center Center of bbox in world units +/// @param voxelSize Size of a voxel in world units +/// @param halfWidth Half-width of narrow band in voxel units +/// @param origin Origin of grid in world units +/// @param name Name of the grid +/// @param sMode Mode of computation for the statistics. +/// @param cMode Mode of computation for the checksum. +/// @param tolerance Global error tolerance use when VoxelT = FpN +/// @param ditherOn If true dithering will be applied when VoxelT = {Fp4,Fp8,Fp16,FpN} +/// @param buffer Buffer used for memory allocation by the handle +/// +/// @details The @c BuildT template parameter must be one of the following: +/// float (default), double, Fp4, Fp8, Fp16 or FpN. The @c tolerance +/// argument is only used when BuildT is set to FpN. 
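+///
+/// @par Example
+/// An illustrative sketch (assumes the default float build type and HostBuffer; the
+/// voxel size is chosen so the wire thickness stays above the ~1.5 voxel minimum
+/// enforced by the implementation):
+/// @code
+/// // Wireframe of a 40 x 60 x 100 box with 2 world-unit thick wires and 0.5 world-unit voxels.
+/// auto handle = nanovdb::tools::createLevelSetBBox(40.0, 60.0, 100.0, 2.0,
+///                                                  nanovdb::Vec3d(0.0), 0.5);
+/// @endcode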
+template +typename util::disable_if::value, GridHandle>::type +createLevelSetBBox(double width = 40.0, + double height = 60.0, + double depth = 100.0, + double thickness = 10.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "bbox_ls", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + const BufferT& buffer = BufferT()); + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetBBox(double width = 40.0, + double height = 60.0, + double depth = 100.0, + double thickness = 10.0, + const Vec3d& center = Vec3d(0.0), + double voxelSize = 1.0, + double halfWidth = 3.0, + const Vec3d& origin = Vec3d(0.0), + const std::string& name = "bbox_ls_FpN", + StatsMode sMode = StatsMode::Default, + CheckMode cMode = CheckMode::Default, + float tolerance = -1.0f, + bool ditherOn = false, + const BufferT& buffer = BufferT()); + + +//================================================================================================ + +/// @brief Returns a handle to a PointDataGrid containing points scattered +/// on the surface of a box. +/// +/// @param pointsPerVoxel Number of point per voxel on on the surface +/// @param width Width of box in world units +/// @param height Height of box in world units +/// @param depth Depth of box in world units +/// @param center Center of box in world units +/// @param voxelSize Size of a voxel in world units +/// @param origin Origin of grid in world units +/// @param name Name of the grid +/// @param mode Mode of computation for the checksum. +/// @param buffer Buffer used for memory allocation by the handle +template +typename util::disable_if::value, GridHandle>::type +createPointBox(int pointsPerVoxel = 1, // half-width of narrow band in voxel units + double width = 40.0, // width of box in world units + double height = 60.0, // height of box in world units + double depth = 100.0, // depth of box in world units + const Vec3d& center = Vec3d(0.0), // center of box in world units + double voxelSize = 1.0, // size of a voxel in world units + const Vec3d& origin = Vec3d(0.0), // origin of grid in world units + const std::string& name = "box_points", // name of grid + CheckMode mode = CheckMode::Default, + const BufferT& buffer = BufferT()); + +//================================================================================================ + +/// @brief Given an input NanoVDB voxel grid this methods returns a GridHandle to another NanoVDB +/// PointDataGrid with points scattered in the active leaf voxels of in input grid. Note, the +/// coordinates of the points are encoded as blind data in world-space. +/// +/// @param srcGrid Const input grid used to determine the active voxels to scatter points into +/// @param pointsPerVoxel Number of point per voxel on on the surface +/// @param name Name of the grid +/// @param mode Mode of computation for the checksum. 
+/// @param buffer Buffer used for memory allocation by the handle +template +inline GridHandle +createPointScatter(const NanoGrid& srcGrid, // source grid used to scatter points into + int pointsPerVoxel = 1, // half-width of narrow band in voxel units + const std::string& name = "point_scatter", // name of grid + CheckMode mode = CheckMode::Default, + const BufferT& buffer = BufferT()); + +//================================================================================================ + +namespace { + +/// @brief Returns a shared pointer to a build::Grid containing a narrow-band SDF values for a sphere +/// +/// @brief Note, this is not (yet) a valid level set SDF field since values inside sphere (and outside +/// the narrow band) are still undefined. Call builder::sdfToLevelSet() to set those +/// values or alternatively call builder::levelSetToFog to generate a FOG volume. +/// +/// @details The @c BuildT template parameter must be one of the following: +/// float (default), double, Fp4, Fp8, Fp16 or FpN. +template +std::shared_ptr> +initSphere(double radius, // radius of sphere in world units + const Vec3d& center, // center of sphere in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin) // origin of grid in world units +{ + using GridT = build::Grid; + using ValueT = typename BuildToValueMap::type; + static_assert(util::is_floating_point::value, "initSphere: expect floating point"); + if (!(radius > 0)) + throw std::runtime_error("Sphere: radius must be positive!"); + if (!(voxelSize > 0)) + throw std::runtime_error("Sphere: voxelSize must be positive!"); + if (!(halfWidth > 0)) + throw std::runtime_error("Sphere: halfWidth must be positive!"); + + auto grid = std::make_shared(ValueT(halfWidth * voxelSize)); + grid->setTransform(voxelSize, origin); + + // Define radius of sphere with narrow-band in voxel units + const ValueT r0 = radius / ValueT(voxelSize), rmax = r0 + ValueT(halfWidth); + + // Radius below the Nyquist frequency + if (r0 < ValueT(1.5f)) return grid; + + // Define center of sphere in voxel units + const math::Vec3 c(ValueT(center[0] - origin[0]) / ValueT(voxelSize), + ValueT(center[1] - origin[1]) / ValueT(voxelSize), + ValueT(center[2] - origin[2]) / ValueT(voxelSize)); + + // Define bounds of the voxel coordinates + const int imin = math::Floor(c[0] - rmax), imax = math::Ceil(c[0] + rmax); + const int jmin = math::Floor(c[1] - rmax), jmax = math::Ceil(c[1] + rmax); + const int kmin = math::Floor(c[2] - rmax), kmax = math::Ceil(c[2] + rmax); + + const util::Range<1,int> range(imin, imax+1, 32); + + auto kernel = [&](const util::Range<1,int> &r) { + auto acc = grid->getWriteAccessor(); + Coord ijk; + int &i = ijk[0], &j = ijk[1], &k = ijk[2], m = 1; + // Compute signed distances to sphere using leapfrogging in k + for (i = r.begin(); i < r.end(); ++i) { + const auto x2 = math::Pow2(ValueT(i) - c[0]); + for (j = jmin; j <= jmax; ++j) { + const auto x2y2 = math::Pow2(ValueT(j) - c[1]) + x2; + for (k = kmin; k <= kmax; k += m) { + m = 1; + const auto v = math::Sqrt(x2y2 + math::Pow2(ValueT(k) - c[2])) - r0; // Distance in voxel units + const auto d = v < 0 ? 
-v : v; + if (d < halfWidth) { // inside narrow band + acc.setValue(ijk, ValueT(voxelSize) * v); // distance in world units + } else { // outside narrow band + m += math::Floor(d - halfWidth); // leapfrog + } + } //end leapfrog over k + } //end loop over j + } //end loop over i + };// kernel +#ifdef NANOVDB_PARALLEL_PRIMITIVES + util::forEach(range, kernel); +#else + kernel(range); +#endif + return grid; +} // initSphere + +template +std::shared_ptr> +initTorus(double radius1, // major radius of torus in world units + double radius2, // minor radius of torus in world units + const Vec3d& center, // center of torus in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin) // origin of grid in world units +{ + using GridT = build::Grid; + using ValueT = typename BuildToValueMap::type; + static_assert(util::is_floating_point::value, "initTorus: expect floating point"); + if (!(radius2 > 0)) + throw std::runtime_error("Torus: radius2 must be positive!"); + if (!(radius1 > radius2)) + throw std::runtime_error("Torus: radius1 must be larger than radius2!"); + if (!(voxelSize > 0)) + throw std::runtime_error("Torus: voxelSize must be positive!"); + if (!(halfWidth > 0)) + throw std::runtime_error("Torus: halfWidth must be positive!"); + + auto grid = std::make_shared(ValueT(halfWidth * voxelSize)); + grid->setTransform(voxelSize, origin); + + // Define size of torus with narrow-band in voxel units + const ValueT r1 = radius1 / ValueT(voxelSize), r2 = radius2 / ValueT(voxelSize), rmax1 = r1 + r2 + ValueT(halfWidth), rmax2 = r2 + ValueT(halfWidth); + + // Radius below the Nyquist frequency + if (r2 < ValueT(1.5)) return grid; + + // Define center of torus in voxel units + const math::Vec3 c(ValueT(center[0] - origin[0]) / ValueT(voxelSize), + ValueT(center[1] - origin[1]) / ValueT(voxelSize), + ValueT(center[2] - origin[2]) / ValueT(voxelSize)); + + // Define bounds of the voxel coordinates + const int imin = math::Floor(c[0] - rmax1), imax = math::Ceil(c[0] + rmax1); + const int jmin = math::Floor(c[1] - rmax2), jmax = math::Ceil(c[1] + rmax2); + const int kmin = math::Floor(c[2] - rmax1), kmax = math::Ceil(c[2] + rmax1); + + const util::Range<1,int> range(imin, imax+1, 32); + auto kernel = [&](const util::Range<1,int> &r) { + auto acc = grid->getWriteAccessor(); + Coord ijk; + int &i = ijk[0], &j = ijk[1], &k = ijk[2], m = 1; + // Compute signed distances to torus using leapfrogging in k + for (i = r.begin(); i < r.end(); ++i) { + const auto x2 = math::Pow2(ValueT(i) - c[0]); + for (k = kmin; k <= kmax; ++k) { + const auto x2z2 = math::Pow2(math::Sqrt(math::Pow2(ValueT(k) - c[2]) + x2) - r1); + for (j = jmin; j <= jmax; j += m) { + m = 1; + const auto v = math::Sqrt(x2z2 + math::Pow2(ValueT(j) - c[1])) - r2; // Distance in voxel units + const auto d = v < 0 ? 
-v : v; + if (d < halfWidth) { // inside narrow band + acc.setValue(ijk, ValueT(voxelSize) * v); // distance in world units + } else { // outside narrow band + m += math::Floor(d - halfWidth); // leapfrog + } + } //end leapfrog over k + } //end loop over j + } //end loop over i + }; // kernel + +#ifdef NANOVDB_PARALLEL_PRIMITIVES + util::forEach(range, kernel); +#else + kernel(range); +#endif + + return grid; +} // initTorus + +template +std::shared_ptr> +initBox(double width, // major radius of torus in world units + double height, // minor radius of torus in world units + double depth, + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin) // origin of grid in world units +{ + using GridT = build::Grid; + using ValueT = typename BuildToValueMap::type; + static_assert(util::is_floating_point::value, "initBox: expect floating point"); + using Vec3T = math::Vec3; + if (!(width > 0)) + throw std::runtime_error("Box: width must be positive!"); + if (!(height > 0)) + throw std::runtime_error("Box: height must be positive!"); + if (!(depth > 0)) + throw std::runtime_error("Box: depth must be positive!"); + + if (!(voxelSize > 0)) + throw std::runtime_error("Box: voxelSize must be positive!"); + if (!(halfWidth > 0)) + throw std::runtime_error("Box: halfWidth must be positive!"); + + auto grid = std::make_shared(ValueT(halfWidth * voxelSize)); + grid->setTransform(voxelSize, origin); + + // Define size of box with narrow-band in voxel units + const Vec3T r(width / (2 * ValueT(voxelSize)), + height / (2 * ValueT(voxelSize)), + depth / (2 * ValueT(voxelSize))); + + // Below the Nyquist frequency + if (r.min() < ValueT(1.5)) return grid; + + // Define center of box in voxel units + const Vec3T c(ValueT(center[0] - origin[0]) / ValueT(voxelSize), + ValueT(center[1] - origin[1]) / ValueT(voxelSize), + ValueT(center[2] - origin[2]) / ValueT(voxelSize)); + + // Define utility functions + auto Pos = [](ValueT x) { return x > 0 ? x : 0; }; + auto Neg = [](ValueT x) { return x < 0 ? 
x : 0; }; + + // Define bounds of the voxel coordinates + const math::BBox b(c - r - Vec3T(ValueT(halfWidth)), c + r + Vec3T(ValueT(halfWidth))); + const CoordBBox bbox(Coord(math::Floor(b[0][0]), math::Floor(b[0][1]), math::Floor(b[0][2])), + Coord(math::Ceil(b[1][0]), math::Ceil(b[1][1]), math::Ceil(b[1][2]))); + const util::Range<1,int> range(bbox[0][0], bbox[1][0]+1, 32); + + // Compute signed distances to box using leapfrogging in k + auto kernel = [&](const util::Range<1,int> &ra) { + auto acc = grid->getWriteAccessor(); + int m = 1; + for (Coord p(ra.begin(),bbox[0][1],bbox[0][2]); p[0] < ra.end(); ++p[0]) { + const auto q1 = math::Abs(ValueT(p[0]) - c[0]) - r[0]; + const auto x2 = math::Pow2(Pos(q1)); + for (p[1] = bbox[0][1]; p[1] <= bbox[1][1]; ++p[1]) { + const auto q2 = math::Abs(ValueT(p[1]) - c[1]) - r[1]; + const auto q0 = math::Max(q1, q2); + const auto x2y2 = x2 + math::Pow2(Pos(q2)); + for (p[2] = bbox[0][2]; p[2] <= bbox[1][2]; p[2] += m) { + m = 1; + const auto q3 = math::Abs(ValueT(p[2]) - c[2]) - r[2]; + const auto v = math::Sqrt(x2y2 + math::Pow2(Pos(q3))) + Neg(math::Max(q0, q3)); // Distance in voxel units + const auto d = math::Abs(v); + if (d < halfWidth) { // inside narrow band + acc.setValue(p, ValueT(voxelSize) * v); // distance in world units + } else { // outside narrow band + m += math::Floor(d - halfWidth); // leapfrog + } + } //end leapfrog over k + } //end loop over j + } //end loop over i + }; // kernel +#ifdef NANOVDB_PARALLEL_PRIMITIVES + util::forEach(range, kernel); +#else + kernel(range); +#endif + return grid; +} // initBox + +template +std::shared_ptr> +initBBox(double width, // width of the bbox in world units + double height, // height of the bbox in world units + double depth, // depth of the bbox in world units + double thickness, // thickness of the wire in world units + const Vec3d& center, // center of bbox in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin) // origin of grid in world units +{ + using GridT = build::Grid; + using ValueT = typename BuildToValueMap::type; + static_assert(util::is_floating_point::value, "initBBox: expect floating point"); + using Vec3T = math::Vec3; + if (!(width > 0)) + throw std::runtime_error("BBox: width must be positive!"); + if (!(height > 0)) + throw std::runtime_error("BBox: height must be positive!"); + if (!(depth > 0)) + throw std::runtime_error("BBox: depth must be positive!"); + if (!(thickness > 0)) + throw std::runtime_error("BBox: thickness must be positive!"); + if (!(voxelSize > 0.0)) + throw std::runtime_error("BBox: voxelSize must be positive!"); + + + auto grid = std::make_shared(ValueT(halfWidth * voxelSize)); + grid->setTransform(voxelSize, origin); + + // Define size of bbox with narrow-band in voxel units + const Vec3T r(width / (2 * ValueT(voxelSize)), + height / (2 * ValueT(voxelSize)), + depth / (2 * ValueT(voxelSize))); + const ValueT e = thickness / ValueT(voxelSize); + + // Below the Nyquist frequency + if (r.min() < ValueT(1.5) || e < ValueT(1.5)) return grid; + + // Define center of bbox in voxel units + const Vec3T c(ValueT(center[0] - origin[0]) / ValueT(voxelSize), + ValueT(center[1] - origin[1]) / ValueT(voxelSize), + ValueT(center[2] - origin[2]) / ValueT(voxelSize)); + + // Define utility functions + auto Pos = [](ValueT x) { return x > 0 ? x : 0; }; + auto Neg = [](ValueT x) { return x < 0 ? 
x : 0; }; + + // Define bounds of the voxel coordinates + const math::BBox b(c - r - Vec3T(e + ValueT(halfWidth)), c + r + Vec3T(e + ValueT(halfWidth))); + const CoordBBox bbox(Coord(math::Floor(b[0][0]), math::Floor(b[0][1]), math::Floor(b[0][2])), + Coord(math::Ceil(b[1][0]), math::Ceil(b[1][1]), math::Ceil(b[1][2]))); + const util::Range<1,int> range(bbox[0][0], bbox[1][0]+1, 32); + + // Compute signed distances to bbox using leapfrogging in k + auto kernel = [&](const util::Range<1,int> &ra) { + auto acc = grid->getWriteAccessor(); + int m = 1; + for (Coord p(ra.begin(),bbox[0][1],bbox[0][2]); p[0] < ra.end(); ++p[0]) { + const ValueT px = math::Abs(ValueT(p[0]) - c[0]) - r[0]; + const ValueT qx = math::Abs(ValueT(px) + e) - e; + const ValueT px2 = math::Pow2(Pos(px)); + const ValueT qx2 = math::Pow2(Pos(qx)); + for (p[1] = bbox[0][1]; p[1] <= bbox[1][1]; ++p[1]) { + const ValueT py = math::Abs(ValueT(p[1]) - c[1]) - r[1]; + const ValueT qy = math::Abs(ValueT(py) + e) - e; + const ValueT qy2 = math::Pow2(Pos(qy)); + const ValueT px2qy2 = px2 + qy2; + const ValueT qx2py2 = qx2 + math::Pow2(Pos(py)); + const ValueT qx2qy2 = qx2 + qy2; + const ValueT a[3] = {math::Max(px, qy), math::Max(qx, py), math::Max(qx, qy)}; + for (p[2] = bbox[0][2]; p[2] <= bbox[1][2]; p[2] += m) { + m = 1; + const ValueT pz = math::Abs(ValueT(p[2]) - c[2]) - r[2]; + const ValueT qz = math::Abs(ValueT(pz) + e) - e; + const ValueT qz2 = math::Pow2(Pos(qz)); + const ValueT s1 = math::Sqrt(px2qy2 + qz2) + Neg(math::Max(a[0], qz)); + const ValueT s2 = math::Sqrt(qx2py2 + qz2) + Neg(math::Max(a[1], qz)); + const ValueT s3 = math::Sqrt(qx2qy2 + math::Pow2(Pos(pz))) + Neg(math::Max(a[2], pz)); + const ValueT v = math::Min(s1, math::Min(s2, s3)); // Distance in voxel units + const ValueT d = math::Abs(v); + if (d < halfWidth) { // inside narrow band + acc.setValue(p, ValueT(voxelSize) * v); // distance in world units + } else { // outside narrow band + m += math::Floor(d - halfWidth); // leapfrog + } + } //end leapfrog over k + } //end loop over j + } //end loop over i + }; //kernel +#ifdef NANOVDB_PARALLEL_PRIMITIVES + util::forEach(range, kernel); +#else + kernel(range); +#endif + + return grid; +} // initBBox + +template +std::shared_ptr> +initOctahedron(double scale, // scale of the octahedron in world units + const Vec3d& center, // center of octahedron in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin) // origin of grid in world units +{ + using GridT = build::Grid; + using ValueT = typename BuildToValueMap::type; + using Vec3T = math::Vec3; + static_assert(util::is_floating_point::value, "initOctahedron: expect floating point"); + + if (!(scale > 0)) throw std::runtime_error("Octahedron: width must be positive!"); + if (!(voxelSize > 0)) throw std::runtime_error("Octahedron: voxelSize must be positive!"); + + auto grid = std::make_shared(ValueT(halfWidth * voxelSize)); + grid->setTransform(voxelSize, origin); + + // Define size of octahedron with narrow-band in voxel units + const ValueT s = scale / (2 * ValueT(voxelSize)); + + // Below the Nyquist frequency + if ( s < ValueT(1.5) ) return grid; + + // Define center of octahedron in voxel units + const Vec3T c(ValueT(center[0] - origin[0]) / ValueT(voxelSize), + ValueT(center[1] - origin[1]) / ValueT(voxelSize), + ValueT(center[2] - origin[2]) / ValueT(voxelSize)); + + // Define utility functions + auto sdf = [&s](ValueT x, ValueT y, ValueT z) { + const ValueT d 
= ValueT(0.5)*(z - y + s); + if (d < ValueT(0)) { + return Vec3T(x, y - s, z).length(); + } else if (d > s) { + return Vec3T(x, y, z - s).length(); + } + return Vec3T(x, y - s + d, z - d).length(); + }; + + // Define bounds of the voxel coordinates + const math::BBox b(c - Vec3T(s + ValueT(halfWidth)), c + Vec3T(s + ValueT(halfWidth))); + const CoordBBox bbox(Coord(math::Floor(b[0][0]), math::Floor(b[0][1]), math::Floor(b[0][2])), + Coord(math::Ceil(b[1][0]), math::Ceil(b[1][1]), math::Ceil(b[1][2]))); + const util::Range<1,int> range(bbox[0][0], bbox[1][0]+1, 32); + + // Compute signed distances to octahedron using leapfrogging in k + auto kernel = [&](const util::Range<1,int> &ra) { + auto acc = grid->getWriteAccessor(); + int m = 1; + static const ValueT a = math::Sqrt(ValueT(1)/ValueT(3)); + for (Coord p(ra.begin(),bbox[0][1],bbox[0][2]); p[0] < ra.end(); ++p[0]) { + const ValueT px = math::Abs(ValueT(p[0]) - c[0]); + for (p[1] = bbox[0][1]; p[1] <= bbox[1][1]; ++p[1]) { + const ValueT py = math::Abs(ValueT(p[1]) - c[1]); + for (p[2] = bbox[0][2]; p[2] <= bbox[1][2]; p[2] += m) { + m = 1; + const ValueT pz = math::Abs(ValueT(p[2]) - c[2]); + ValueT d = px + py + pz - s; + ValueT v; + if (ValueT(3)*px < d) { + v = sdf(px, py, pz); + } else if (ValueT(3)*py < d) { + v = sdf(py, pz, px); + } else if (ValueT(3)*pz < d) { + v = sdf(pz, px, py); + } else { + v = a * d; + } + d = math::Abs(v); + if (d < halfWidth) { // inside narrow band + acc.setValue(p, ValueT(voxelSize) * v); // distance in world units + } else { // outside narrow band + m += math::Floor(d - halfWidth); // leapfrog + } + } //end leapfrog over k + } //end loop over j + } //end loop over i + };// kernel +#ifdef NANOVDB_PARALLEL_PRIMITIVES + util::forEach(range, kernel); +#else + kernel(range); +#endif + return grid; +} // initOctahedron + +} // unnamed namespace + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetSphere(double radius, // radius of sphere in world units + const Vec3d& center, // center of sphere in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initSphere(radius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createLevelSetSphere + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetSphere(double radius, // radius of sphere in world units + const Vec3d& center, // center of sphere in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + 
bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initSphere(radius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createLevelSetSphere + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetSphere(double radius, // radius of sphere in world units + const Vec3d& center, // center of sphere in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance,// only used if VoxelT = FpN + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initSphere(radius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createLevelSetSphere + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createFogVolumeSphere(double radius, // radius of sphere in world units + const Vec3d& center, // center of sphere in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initSphere(radius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createFogVolumeSphere + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createFogVolumeSphere(double radius, // radius of sphere in world units + const Vec3d& center, // center of sphere in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance,// only used if VoxelT = FpN + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = 
initSphere(radius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createFogVolumeSphere + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createPointSphere(int pointsPerVoxel, // number of points to be scattered in each active voxel + double radius, // radius of sphere in world units + const Vec3d& center, // center of sphere in world units + double voxelSize, // size of a voxel in world units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + auto sphereHandle = createLevelSetSphere(radius, center, voxelSize, 0.5, origin, "dummy", + StatsMode::BBox, CheckMode::Disable, buffer); + assert(sphereHandle); + auto* sphereGrid = sphereHandle.template grid(); + assert(sphereGrid); + auto pointHandle = createPointScatter(*sphereGrid, pointsPerVoxel, name, cMode, buffer); + assert(pointHandle); + return pointHandle; +} // createPointSphere + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createLevelSetTorus(double majorRadius, // major radius of torus in world units + double minorRadius, // minor radius of torus in world units + const Vec3d& center, // center of torus in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initTorus(majorRadius, minorRadius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createLevelSetTorus + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetTorus(double majorRadius, // major radius of torus in world units + double minorRadius, // minor radius of torus in world units + const Vec3d& center, // center of torus in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance, + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initTorus(majorRadius, minorRadius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager 
mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createLevelSetTorus + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createFogVolumeTorus(double majorRadius, // major radius of torus in world units + double minorRadius, // minor radius of torus in world units + const Vec3d& center, // center of torus in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initTorus(majorRadius, minorRadius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createFogVolumeTorus + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createFogVolumeTorus(double majorRadius, // major radius of torus in world units + double minorRadius, // minor radius of torus in world units + const Vec3d& center, // center of torus in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance, + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initTorus(majorRadius, minorRadius, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createFogVolumeTorus + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createPointTorus(int pointsPerVoxel, // number of points to be scattered in each active voxel + double majorRadius, // major radius of torus in world units + double minorRadius, // minor radius of torus in world units + const Vec3d& center, // center of torus in world units + double voxelSize, // size of a voxel in world units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + auto torusHandle = createLevelSetTorus(majorRadius, minorRadius, 
center, voxelSize, 0.5f, origin, + "dummy", StatsMode::BBox, CheckMode::Disable, buffer); + assert(torusHandle); + auto* torusGrid = torusHandle.template grid(); + assert(torusGrid); + auto pointHandle = createPointScatter(*torusGrid, pointsPerVoxel, name, cMode, buffer); + assert(pointHandle); + return pointHandle; +} // createPointTorus + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createLevelSetBox(double width, // width of box in world units + double height, // height of box in world units + double depth, // depth of box in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initBox(width, height, depth, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createLevelSetBox + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetBox(double width, // width of box in world units + double height, // height of box in world units + double depth, // depth of box in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance, + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initBox(width, height, depth, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createLevelSetBox + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createLevelSetOctahedron(double scale, // scale of the octahedron in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initOctahedron(scale, center, voxelSize, halfWidth, origin); + grid->mName = name; + 
build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createLevelSetOctahedron + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetOctahedron(double scale, // scale of the octahedron in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance, + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initOctahedron(scale, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createLevelSetOctahedron + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createLevelSetBBox(double width, // width of bbox in world units + double height, // height of bbox in world units + double depth, // depth of bbox in world units + double thickness, // thickness of the wire in world units + const Vec3d& center, // center of bbox in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initBBox(width, height, depth, thickness, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createLevelSetBBox + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createLevelSetBBox(double width, // width of bbox in world units + double height, // height of bbox in world units + double depth, // depth of bbox in world units + double thickness, // thickness of the wire in world units + const Vec3d& center, // center of bbox in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance, + bool ditherOn, + const BufferT& buffer) +{ + using GridT = 
build::Grid; + auto grid = initBBox(width, height, depth, thickness, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createLevelSetBBox + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createFogVolumeBox(double width, // width of box in world units + double height, // height of box in world units + double depth, // depth of box in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initBox(width, height, depth, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createFogVolumeBox + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createFogVolumeBox(double width, // width of box in world units + double height, // height of box in world units + double depth, // depth of box in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance, + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initBox(width, height, depth, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createFogVolumeBox + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createFogVolumeOctahedron(double scale, // scale of octahedron in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + 
CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initOctahedron(scale, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + auto handle = converter.template getHandle(buffer); + assert(handle); + return handle; +} // createFogVolumeOctahedron + +//================================================================================================ + +template +typename util::enable_if::value, GridHandle>::type +createFogVolumeOctahedron(double scale, // scale of octahedron in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + double halfWidth, // half-width of narrow band in voxel units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + StatsMode sMode, // mode of computation for the statistics + CheckMode cMode, // mode of computation for the checksum + float tolerance, + bool ditherOn, + const BufferT& buffer) +{ + using GridT = build::Grid; + auto grid = initOctahedron(scale, center, voxelSize, halfWidth, origin); + grid->mName = name; + build::NodeManager mgr(*grid); + build::sdfToLevelSet(mgr); + build::levelSetToFog(mgr, false); + CreateNanoGrid converter(*grid); + converter.setStats(sMode); + converter.setChecksum(cMode); + converter.enableDithering(ditherOn); + AbsDiff oracle(tolerance); + auto handle = converter.template getHandle(oracle, buffer); + assert(handle); + return handle; +} // createFogVolumeOctahedron + +//================================================================================================ + +template +typename util::disable_if::value, GridHandle>::type +createPointBox(int pointsPerVoxel, // number of points to be scattered in each active voxel + double width, // width of box in world units + double height, // height of box in world units + double depth, // depth of box in world units + const Vec3d& center, // center of box in world units + double voxelSize, // size of a voxel in world units + const Vec3d& origin, // origin of grid in world units + const std::string& name, // name of grid + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + auto boxHandle = createLevelSetBox(width, height, depth, center, voxelSize, 0.5, origin, "dummy", + StatsMode::BBox, CheckMode::Disable, buffer); + assert(boxHandle); + auto* boxGrid = boxHandle.template grid(); + assert(boxGrid); + auto pointHandle = createPointScatter(*boxGrid, pointsPerVoxel, name, cMode, buffer); + assert(pointHandle); + return pointHandle; +} // createPointBox + +//================================================================================================ + +template +inline GridHandle +createPointScatter(const NanoGrid& srcGrid, // origin of grid in world units + int pointsPerVoxel, // number of points to be scattered in each active voxel + const std::string& name, // name of grid + CheckMode cMode, // mode of computation for the checksum + const BufferT& buffer) +{ + using ValueT = typename BuildToValueMap::type; + static_assert(util::is_floating_point::value, "createPointScatter: expect floating point"); + using Vec3T = math::Vec3; + if (pointsPerVoxel < 1) { + throw std::runtime_error("createPointScatter: Expected at least one point per voxel"); + } + if 
(!srcGrid.isLevelSet()) { + throw std::runtime_error("createPointScatter: Expected a level set grid"); + } + if (!srcGrid.hasBBox()) { + throw std::runtime_error("createPointScatter: ActiveVoxelCount is required"); + } + const uint64_t pointCount = pointsPerVoxel * srcGrid.activeVoxelCount(); + if (pointCount == 0) { + throw std::runtime_error("createPointScatter: No particles to scatter"); + } + std::vector xyz; + xyz.reserve(pointCount); + using DstGridT = build::Grid; + DstGridT dstGrid(std::numeric_limits::max(), name, GridClass::PointData); + dstGrid.mMap = srcGrid.map(); + auto dstAcc = dstGrid.getAccessor(); + std::srand(1234); + const ValueT s = 1 / (1 + ValueT(RAND_MAX)); // scale so s*rand() is in ] 0, 1 [ + // return a point with random local voxel coordinates (-0.5 to +0.5) + auto randomPoint = [&s](){return s * Vec3T(rand(), rand(), rand()) - Vec3T(0.5);}; + const auto& srcTree = srcGrid.tree(); + auto srcMgrHandle = createNodeManager(srcGrid); + auto *srcMgr = srcMgrHandle.template mgr(); + assert(srcMgr); + for (uint32_t i = 0, end = srcTree.nodeCount(0); i < end; ++i) { + auto& srcLeaf = srcMgr->leaf(i); + auto* dstLeaf = dstAcc.setValue(srcLeaf.origin(), pointsPerVoxel); // allocates leaf node + dstLeaf->mValueMask = srcLeaf.valueMask(); + for (uint32_t j = 0, m = 0; j < 512; ++j) { + if (dstLeaf->mValueMask.isOn(j)) { + const Vec3f ijk = dstLeaf->offsetToGlobalCoord(j).asVec3s();// floating-point representatrion of index coorindates + for (int n = 0; n < pointsPerVoxel; ++n) xyz.push_back(srcGrid.indexToWorld(randomPoint() + ijk)); + m += pointsPerVoxel; + }// active voxels + dstLeaf->mValues[j] = m; + }// loop over all voxels + }// loop over leaf nodes + assert(pointCount == xyz.size()); + CreateNanoGrid converter(dstGrid); + converter.setStats(StatsMode::MinMax); + converter.setChecksum(CheckMode::Disable); + + converter.addBlindData(name, + GridBlindDataSemantic::WorldCoords, + GridBlindDataClass::AttributeArray, + toGridType(), + pointCount, + sizeof(Vec3T)); + auto handle = converter.template getHandle(buffer); + assert(handle); + + auto* grid = handle.template grid(); + assert(grid && grid->template isSequential<0>()); + auto &tree = grid->tree(); + if (tree.nodeCount(0) == 0) throw std::runtime_error("Expect leaf nodes!"); + auto *leafData = tree.getFirstLeaf()->data(); + leafData[0].mMinimum = 0; // start of prefix sum + for (uint32_t i = 1, n = tree.nodeCount(0); i < n; ++i) { + leafData[i].mMinimum = leafData[i - 1].mMinimum + leafData[i - 1].mMaximum; + } + if (Vec3T *blindData = grid->template getBlindData(0)) { + memcpy(blindData, xyz.data(), xyz.size() * sizeof(Vec3T)); + } else { + throw std::runtime_error("Blind data pointer was NULL"); + } + updateChecksum(grid, cMode); + return handle; +} // createPointScatter + +}// namespace tools + +} // namespace nanovdb + +#endif // NANOVDB_TOOLS_PRIMITIVES_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/tools/GridBuilder.h b/external/nanovdb/tools/GridBuilder.h new file mode 100644 index 00000000..428215ba --- /dev/null +++ b/external/nanovdb/tools/GridBuilder.h @@ -0,0 +1,2315 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/GridBuilder.h + + \author Ken Museth + + \date June 26, 2020 + + \brief This file defines a minimum set of tree nodes and tools that + can be used (instead of OpenVDB) to build nanovdb grids on the CPU. 
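+
+    \par Example
+    A minimal, illustrative sketch of the intended workflow (identifiers follow the
+    usage found elsewhere in these headers; the final conversion step relies on the
+    CreateNanoGrid tool, which is assumed to be available):
+    \code{.cpp}
+    using SrcGridT = nanovdb::tools::build::Grid<float>;
+    SrcGridT grid(0.0f, "example");                  // background value and grid name
+    grid.setTransform(1.0, nanovdb::Vec3d(0.0));     // voxel size and origin in world units
+    auto acc = grid.getWriteAccessor();
+    acc.setValue(nanovdb::Coord(1, 2, 3), 1.0f);     // activate and set a single voxel
+    nanovdb::tools::CreateNanoGrid<SrcGridT> converter(grid);
+    auto handle = converter.getHandle<float>();      // GridHandle owning the compact NanoVDB grid
+    \endcode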
+*/ + +#ifndef NANOVDB_TOOLS_BUILD_GRIDBUILDER_H_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_BUILD_GRIDBUILDER_H_HAS_BEEN_INCLUDED + +#include + +#include +#include +#include // for stringstream +#include +#include // for memcpy +#include +#include +#include + +#include +#include +#include + +namespace nanovdb { + +namespace tools::build { + +// ----------------------------> Froward decelerations of random access methods <-------------------------------------- + +template struct GetValue; +template struct SetValue; +template struct TouchLeaf; +template struct GetState; +template struct ProbeValue; + +// ----------------------------> RootNode <-------------------------------------- + +template +struct RootNode +{ + using ValueType = typename ChildT::ValueType; + using BuildType = typename ChildT::BuildType; + using ChildNodeType = ChildT; + using LeafNodeType = typename ChildT::LeafNodeType; + static constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf + struct Tile { + Tile(ChildT* c = nullptr) : child(c) {} + Tile(const ValueType& v, bool s) : child(nullptr), value(v), state(s) {} + bool isChild() const { return child!=nullptr; } + bool isValue() const { return child==nullptr; } + bool isActive() const { return child==nullptr && state; } + ChildT* child; + ValueType value; + bool state; + }; + using MapT = std::map; + MapT mTable; + ValueType mBackground; + + Tile* probeTile(const Coord &ijk) { + auto iter = mTable.find(CoordToKey(ijk)); + return iter == mTable.end() ? nullptr : &(iter->second); + } + + const Tile* probeTile(const Coord &ijk) const { + auto iter = mTable.find(CoordToKey(ijk)); + return iter == mTable.end() ? nullptr : &(iter->second); + } + + class ChildIterator + { + const RootNode *mParent; + typename MapT::const_iterator mIter; + public: + ChildIterator() : mParent(nullptr), mIter() {} + ChildIterator(const RootNode *parent) : mParent(parent), mIter(parent->mTable.begin()) { + while (mIter!=parent->mTable.end() && mIter->second.child==nullptr) ++mIter; + } + ChildIterator& operator=(const ChildIterator&) = default; + ChildT& operator*() const {NANOVDB_ASSERT(*this); return *mIter->second.child;} + ChildT* operator->() const {NANOVDB_ASSERT(*this); return mIter->second.child;} + Coord getOrigin() const { NANOVDB_ASSERT(*this); return mIter->first;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mIter->first;} + operator bool() const {return mParent && mIter!=mParent->mTable.end();} + ChildIterator& operator++() { + NANOVDB_ASSERT(mParent); + ++mIter; + while (mIter!=mParent->mTable.end() && mIter->second.child==nullptr) ++mIter; + return *this; + } + ChildIterator operator++(int) { + auto tmp = *this; + ++(*this); + return tmp; + } + uint32_t pos() const { + NANOVDB_ASSERT(mParent); + return uint32_t(std::distance(mParent->mTable.begin(), mIter)); + } + }; // Member class ChildIterator + + ChildIterator cbeginChild() const {return ChildIterator(this);} + ChildIterator cbeginChildOn() const {return ChildIterator(this);}// match openvdb + + class ValueIterator + { + const RootNode *mParent; + typename MapT::const_iterator mIter; + public: + ValueIterator() : mParent(nullptr), mIter() {} + ValueIterator(const RootNode *parent) : mParent(parent), mIter(parent->mTable.begin()) { + while (mIter!=parent->mTable.end() && mIter->second.child!=nullptr) ++mIter; + } + ValueIterator& operator=(const ValueIterator&) = default; + ValueType operator*() const {NANOVDB_ASSERT(*this); return mIter->second.value;} + bool isActive() const {NANOVDB_ASSERT(*this); return 
mIter->second.state;} + Coord getOrigin() const { NANOVDB_ASSERT(*this); return mIter->first;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mIter->first;} + operator bool() const {return mParent && mIter!=mParent->mTable.end();} + ValueIterator& operator++() { + NANOVDB_ASSERT(mParent); + ++mIter; + while (mIter!=mParent->mTable.end() && mIter->second.child!=nullptr) ++mIter; + return *this;; + } + ValueIterator operator++(int) { + auto tmp = *this; + ++(*this); + return tmp; + } + uint32_t pos() const { + NANOVDB_ASSERT(mParent); + return uint32_t(std::distance(mParent->mTable.begin(), mIter)); + } + }; // Member class ValueIterator + + ValueIterator beginValue() {return ValueIterator(this);} + ValueIterator cbeginValueAll() const {return ValueIterator(this);} + + class ValueOnIterator + { + const RootNode *mParent; + typename MapT::const_iterator mIter; + public: + ValueOnIterator() : mParent(nullptr), mIter() {} + ValueOnIterator(const RootNode *parent) : mParent(parent), mIter(parent->mTable.begin()) { + while (mIter!=parent->mTable.end() && (mIter->second.child!=nullptr || !mIter->second.state)) ++mIter; + } + ValueOnIterator& operator=(const ValueOnIterator&) = default; + ValueType operator*() const {NANOVDB_ASSERT(*this); return mIter->second.value;} + Coord getOrigin() const { NANOVDB_ASSERT(*this); return mIter->first;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mIter->first;} + operator bool() const {return mParent && mIter!=mParent->mTable.end();} + ValueOnIterator& operator++() { + NANOVDB_ASSERT(mParent); + ++mIter; + while (mIter!=mParent->mTable.end() && (mIter->second.child!=nullptr || !mIter->second.state)) ++mIter; + return *this;; + } + ValueOnIterator operator++(int) { + auto tmp = *this; + ++(*this); + return tmp; + } + uint32_t pos() const { + NANOVDB_ASSERT(mParent); + return uint32_t(std::distance(mParent->mTable.begin(), mIter)); + } + }; // Member class ValueOnIterator + + ValueOnIterator beginValueOn() {return ValueOnIterator(this);} + ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);} + + class TileIterator + { + const RootNode *mParent; + typename MapT::const_iterator mIter; + public: + TileIterator() : mParent(nullptr), mIter() {} + TileIterator(const RootNode *parent) : mParent(parent), mIter(parent->mTable.begin()) { + NANOVDB_ASSERT(mParent); + } + TileIterator& operator=(const TileIterator&) = default; + const Tile& operator*() const {NANOVDB_ASSERT(*this); return mIter->second;} + const Tile* operator->() const {NANOVDB_ASSERT(*this); return &(mIter->second);} + Coord getOrigin() const { NANOVDB_ASSERT(*this); return mIter->first;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mIter->first;} + operator bool() const {return mParent && mIter!=mParent->mTable.end();} + const ChildT* probeChild(ValueType &value) { + NANOVDB_ASSERT(*this); + const ChildT *child = mIter->second.child; + if (child==nullptr) value = mIter->second.value; + return child; + } + bool isValueOn() const {return mIter->second.child==nullptr && mIter->second.state;} + TileIterator& operator++() { + NANOVDB_ASSERT(mParent); + ++mIter; + return *this; + } + TileIterator operator++(int) { + auto tmp = *this; + ++(*this); + return tmp; + } + uint32_t pos() const { + NANOVDB_ASSERT(mParent); + return uint32_t(std::distance(mParent->mTable.begin(), mIter)); + } + }; // Member class TileIterator + + TileIterator beginTile() {return TileIterator(this);} + TileIterator cbeginChildAll() const {return TileIterator(this);} + + //class 
DenseIterator : public TileIterator + + RootNode(const ValueType& background) : mBackground(background) {} + RootNode(const RootNode&) = delete; // disallow copy-construction + RootNode(RootNode&&) = default; // allow move construction + RootNode& operator=(const RootNode&) = delete; // disallow copy assignment + RootNode& operator=(RootNode&&) = default; // allow move assignment + + ~RootNode() { this->clear(); } + + uint32_t tileCount() const { return uint32_t(mTable.size()); } + uint32_t getTableSize() const { return uint32_t(mTable.size()); }// match openvdb + const ValueType& background() const {return mBackground;} + + void nodeCount(std::array &count) const + { + for (auto it = this->cbeginChild(); it; ++it) { + count[ChildT::LEVEL] += 1; + it->nodeCount(count); + } + } + + bool empty() const { return mTable.empty(); } + + void clear() + { + for (auto iter = mTable.begin(); iter != mTable.end(); ++iter) delete iter->second.child; + mTable.clear(); + } + + static Coord CoordToKey(const Coord& ijk) { return ijk & ~ChildT::MASK; } + +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + template + auto get(const Coord& ijk, ArgsT&&... args) const + { + if (const Tile *tile = this->probeTile(ijk)) { + if (auto *child = tile->child) return child->template get(ijk, args...); + return OpT::get(*tile, args...); + } + return OpT::get(*this, args...); + } + template + auto set(const Coord& ijk, ArgsT&&... args) + { + ChildT* child = nullptr; + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + NANOVDB_ASSERT(child); + return child->template set(ijk, args...); + } + template + auto getAndCache(const Coord& ijk, const AccT& acc, ArgsT&&... args) const + { + if (const Tile *tile = this->probeTile(ijk)) { + if (auto *child = tile->child) { + acc.insert(ijk, child); + return child->template get(ijk, args...); + } + return OpT::get(*tile, args...); + } + return OpT::get(*this, args...); + } + + template + auto setAndCache(const Coord& ijk, const AccT& acc, ArgsT&&... args) + { + ChildT* child = nullptr; + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + NANOVDB_ASSERT(child); + acc.insert(ijk, child); + return child->template setAndCache(ijk, acc, args...); + } + ValueType getValue(const Coord& ijk) const {return this->template get>(ijk);} + ValueType getValue(int i, int j, int k) const {return this->template get>(Coord(i,j,k));} + ValueType operator()(const Coord& ijk) const {return this->template get>(ijk);} + ValueType operator()(int i, int j, int k) const {return this->template get>(Coord(i,j,k));} + void setValue(const Coord& ijk, const ValueType& value) {this->template set>(ijk, value);} + bool probeValue(const Coord& ijk, ValueType& value) const {return this->template get>(ijk, value);} + bool isActive(const Coord& ijk) const {return this->template get>(ijk);} +#else + ValueType getValue(const Coord& ijk) const + { +#if 1 + if (auto *tile = this->probeTile(ijk)) return tile->child ? 
tile->child->getValue(ijk) : tile->value; + return mBackground; +#else + auto iter = mTable.find(CoordToKey(ijk)); + if (iter == mTable.end()) { + return mBackground; + } else if (iter->second.child) { + return iter->second.child->getValue(ijk); + } else { + return iter->second.value; + } +#endif + } + ValueType getValue(int i, int j, int k) const {return this->getValue(Coord(i,j,k));} + + void setValue(const Coord& ijk, const ValueType& value) + { + ChildT* child = nullptr; + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + NANOVDB_ASSERT(child); + child->setValue(ijk, value); + } + + template + bool isActiveAndCache(const Coord& ijk, AccT& acc) const + { + auto iter = mTable.find(CoordToKey(ijk)); + if (iter == mTable.end()) + return false; + if (iter->second.child) { + acc.insert(ijk, iter->second.child); + return iter->second.child->isActiveAndCache(ijk, acc); + } + return iter->second.state; + } + + template + ValueType getValueAndCache(const Coord& ijk, AccT& acc) const + { + auto iter = mTable.find(CoordToKey(ijk)); + if (iter == mTable.end()) + return mBackground; + if (iter->second.child) { + acc.insert(ijk, iter->second.child); + return iter->second.child->getValueAndCache(ijk, acc); + } + return iter->second.value; + } + + template + void setValueAndCache(const Coord& ijk, const ValueType& value, AccT& acc) + { + ChildT* child = nullptr; + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + NANOVDB_ASSERT(child); + acc.insert(ijk, child); + child->setValueAndCache(ijk, value, acc); + } + template + void setValueOnAndCache(const Coord& ijk, AccT& acc) + { + ChildT* child = nullptr; + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + NANOVDB_ASSERT(child); + acc.insert(ijk, child); + child->setValueOnAndCache(ijk, acc); + } + template + void touchLeafAndCache(const Coord &ijk, AccT& acc) + { + ChildT* child = nullptr; + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + acc.insert(ijk, child); + child->touchLeafAndCache(ijk, acc); + } +#endif// NANOVDB_NEW_ACCESSOR_METHODS + + template + uint32_t nodeCount() const + { + static_assert(util::is_same::value, "Root::getNodes: Invalid type"); + static_assert(NodeT::LEVEL < LEVEL, "Root::getNodes: LEVEL error"); + uint32_t sum = 0; + for (auto iter = mTable.begin(); iter != mTable.end(); ++iter) { + if 
(iter->second.child == nullptr) continue; // skip tiles + if constexpr(util::is_same::value) { //resolved at compile-time + ++sum; + } else { + sum += iter->second.child->template nodeCount(); + } + } + return sum; + } + + template + void getNodes(std::vector& array) + { + static_assert(util::is_same::value, "Root::getNodes: Invalid type"); + static_assert(NodeT::LEVEL < LEVEL, "Root::getNodes: LEVEL error"); + for (auto iter = mTable.begin(); iter != mTable.end(); ++iter) { + if (iter->second.child == nullptr) + continue; + if constexpr(util::is_same::value) { //resolved at compile-time + array.push_back(reinterpret_cast(iter->second.child)); + } else { + iter->second.child->getNodes(array); + } + } + } + + void addChild(ChildT*& child) + { + NANOVDB_ASSERT(child); + const Coord key = CoordToKey(child->mOrigin); + auto iter = mTable.find(key); + if (iter != mTable.end() && iter->second.child != nullptr) { // existing child node + delete iter->second.child; + iter->second.child = child; + } else { + mTable[key] = Tile(child); + } + child = nullptr; + } + + /// @brief Add a tile containing voxel (i, j, k) at the specified tree level, + /// creating a new branch if necessary. Delete any existing lower-level nodes + /// that contain (x, y, z). + /// @tparam level tree level at which the tile is inserted. Must be 1, 2 or 3. + /// @param ijk Index coordinate that map to the tile being inserted + /// @param value Value of the tile + /// @param state Binary state of the tile + template + void addTile(const Coord& ijk, const ValueType& value, bool state) + { + static_assert(level > 0 && level <= LEVEL, "invalid template value of level"); + const Coord key = CoordToKey(ijk); + auto iter = mTable.find(key); + if constexpr(level == LEVEL) { + if (iter == mTable.end()) { + mTable[key] = Tile(value, state); + } else if (iter->second.child == nullptr) { + iter->second.value = value; + iter->second.state = state; + } else { + delete iter->second.child; + iter->second.child = nullptr; + iter->second.value = value; + iter->second.state = state; + } + } else if constexpr(level < LEVEL) { + ChildT* child = nullptr; + if (iter == mTable.end()) { + child = new ChildT(ijk, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(ijk, iter->second.value, iter->second.state); + iter->second.child = child; + } + child->template addTile(ijk, value, state); + } + } + + template + void addNode(NodeT*& node) + { + if constexpr(util::is_same::value) { //resolved at compile-time + this->addChild(reinterpret_cast(node)); + } else { + ChildT* child = nullptr; + const Coord key = CoordToKey(node->mOrigin); + auto iter = mTable.find(key); + if (iter == mTable.end()) { + child = new ChildT(node->mOrigin, mBackground, false); + mTable[key] = Tile(child); + } else if (iter->second.child != nullptr) { + child = iter->second.child; + } else { + child = new ChildT(node->mOrigin, iter->second.value, iter->second.state); + iter->second.child = child; + } + child->addNode(node); + } + } + + void merge(RootNode &other) + { + for (auto iter1 = other.mTable.begin(); iter1 != other.mTable.end(); ++iter1) { + if (iter1->second.child == nullptr) continue;// ignore input tiles + auto iter2 = mTable.find(iter1->first); + if (iter2 == mTable.end() || iter2->second.child == nullptr) { + mTable[iter1->first] = Tile(iter1->second.child); + iter1->second.child = nullptr; + } else { + iter2->second.child->merge(*iter1->second.child); + } + } + 
other.clear(); + } + + template + typename util::enable_if::value>::type + signedFloodFill(T outside); + +}; // tools::build::RootNode + +//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +RootNode::signedFloodFill(T outside) +{ + std::map nodeKeys; + for (auto iter = mTable.begin(); iter != mTable.end(); ++iter) { + if (iter->second.child == nullptr) + continue; + nodeKeys.insert(std::pair(iter->first, iter->second.child)); + } + + // We employ a simple z-scanline algorithm that inserts inactive tiles with + // the inside value if they are sandwiched between inside child nodes only! + auto b = nodeKeys.begin(), e = nodeKeys.end(); + if (b == e) + return; + for (auto a = b++; b != e; ++a, ++b) { + Coord d = b->first - a->first; // delta of neighboring coordinates + if (d[0] != 0 || d[1] != 0 || d[2] == int(ChildT::DIM)) + continue; // not same z-scanline or neighbors + const ValueType fill[] = {a->second->getLastValue(), b->second->getFirstValue()}; + if (!(fill[0] < 0) || !(fill[1] < 0)) + continue; // scanline isn't inside + Coord c = a->first + Coord(0u, 0u, ChildT::DIM); + for (; c[2] != b->first[2]; c[2] += ChildT::DIM) { + const Coord key = RootNode::CoordToKey(c); + mTable[key] = typename RootNode::Tile(-outside, false); // inactive tile + } + } +} // tools::build::RootNode::signedFloodFill + +// ----------------------------> InternalNode <-------------------------------------- + +template +struct InternalNode +{ + using ValueType = typename ChildT::ValueType; + using BuildType = typename ChildT::BuildType; + using ChildNodeType = ChildT; + using LeafNodeType = typename ChildT::LeafNodeType; + static constexpr uint32_t LOG2DIM = ChildT::LOG2DIM + 1; + static constexpr uint32_t TOTAL = LOG2DIM + ChildT::TOTAL; //dimension in index space + static constexpr uint32_t DIM = 1u << TOTAL; + static constexpr uint32_t SIZE = 1u << (3 * LOG2DIM); //number of tile values (or child pointers) + static constexpr uint32_t MASK = DIM - 1; + static constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf + static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node + using MaskT = Mask; + template + using MaskIterT = typename MaskT::template Iterator; + using NanoNodeT = typename NanoNode::Type; + + struct Tile { + Tile(ChildT* c = nullptr) : child(c) {} + Tile(const ValueType& v) : value(v) {} + union{ + ChildT* child; + ValueType value; + }; + }; + Coord mOrigin; + MaskT mValueMask; + MaskT mChildMask; + Tile mTable[SIZE]; + + union { + NanoNodeT *mDstNode; + uint64_t mDstOffset; + }; + + /// @brief Visits child nodes of this node only + class ChildIterator : public MaskIterT + { + using BaseT = MaskIterT; + const InternalNode *mParent; + public: + ChildIterator() : BaseT(), mParent(nullptr) {} + ChildIterator(const InternalNode* parent) : BaseT(parent->mChildMask.beginOn()), mParent(parent) {} + ChildIterator& operator=(const ChildIterator&) = default; + const ChildT& operator*() const {NANOVDB_ASSERT(*this); return *mParent->mTable[BaseT::pos()].child;} + const ChildT* operator->() const {NANOVDB_ASSERT(*this); return mParent->mTable[BaseT::pos()].child;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return (*this)->origin();} + }; // Member class ChildIterator + + ChildIterator beginChild() {return ChildIterator(this);} + ChildIterator cbeginChildOn() const {return ChildIterator(this);}// match openvdb + + /// 
@brief Visits all tile values in this node, i.e. both inactive and active tiles + class ValueIterator : public MaskIterT + { + using BaseT = MaskIterT; + const InternalNode *mParent; + public: + ValueIterator() : BaseT(), mParent(nullptr) {} + ValueIterator(const InternalNode* parent) : BaseT(parent->mChildMask.beginOff()), mParent(parent) {} + ValueIterator& operator=(const ValueIterator&) = default; + ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->mTable[BaseT::pos()].value;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + bool isActive() const { NANOVDB_ASSERT(*this); return mParent->mValueMask.isOn(BaseT::pos());} + }; // Member class ValueIterator + + ValueIterator beginValue() {return ValueIterator(this);} + ValueIterator cbeginValueAll() const {return ValueIterator(this);} + + /// @brief Visits active tile values of this node only + class ValueOnIterator : public MaskIterT + { + using BaseT = MaskIterT; + const InternalNode *mParent; + public: + ValueOnIterator() : BaseT(), mParent(nullptr) {} + ValueOnIterator(const InternalNode* parent) : BaseT(parent->mValueMask.beginOn()), mParent(parent) {} + ValueOnIterator& operator=(const ValueOnIterator&) = default; + ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->mTable[BaseT::pos()].value;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOnIterator + + ValueOnIterator beginValueOn() {return ValueOnIterator(this);} + ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);} + + /// @brief Visits all tile values and child nodes of this node + class DenseIterator : public MaskT::DenseIterator + { + using BaseT = typename MaskT::DenseIterator; + const InternalNode *mParent; + public: + DenseIterator() : BaseT(), mParent(nullptr) {} + DenseIterator(const InternalNode* parent) : BaseT(0), mParent(parent) {} + DenseIterator& operator=(const DenseIterator&) = default; + ChildT* probeChild(ValueType& value) const + { + NANOVDB_ASSERT(mParent && bool(*this)); + ChildT *child = nullptr; + if (mParent->mChildMask.isOn(BaseT::pos())) { + child = mParent->mTable[BaseT::pos()].child; + } else { + value = mParent->mTable[BaseT::pos()].value; + } + return child; + } + Coord getCoord() const { NANOVDB_ASSERT(mParent && bool(*this)); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class DenseIterator + + DenseIterator beginDense() {return DenseIterator(this);} + DenseIterator cbeginChildAll() const {return DenseIterator(this);}// matches openvdb + + InternalNode(const Coord& origin, const ValueType& value, bool state) + : mOrigin(origin & ~MASK) + , mValueMask(state) + , mChildMask() + , mDstOffset(0) + { + for (uint32_t i = 0; i < SIZE; ++i) mTable[i].value = value; + } + InternalNode(const InternalNode&) = delete; // disallow copy-construction + InternalNode(InternalNode&&) = delete; // disallow move construction + InternalNode& operator=(const InternalNode&) = delete; // disallow copy assignment + InternalNode& operator=(InternalNode&&) = delete; // disallow move assignment + ~InternalNode() + { + for (auto iter = mChildMask.beginOn(); iter; ++iter) { + delete mTable[*iter].child; + } + } + const MaskT& getValueMask() const {return mValueMask;} + const MaskT& valueMask() const {return mValueMask;} + const MaskT& getChildMask() const {return mChildMask;} + const MaskT& childMask() const {return mChildMask;} + const Coord& origin() const {return 
mOrigin;} + + void nodeCount(std::array &count) const + { + count[ChildT::LEVEL] += mChildMask.countOn(); + if constexpr(ChildT::LEVEL>0) { + for (auto it = const_cast(this)->beginChild(); it; ++it) it->nodeCount(count); + } + } + + static uint32_t CoordToOffset(const Coord& ijk) + { + return (((ijk[0] & int32_t(MASK)) >> ChildT::TOTAL) << (2 * LOG2DIM)) + + (((ijk[1] & int32_t(MASK)) >> ChildT::TOTAL) << (LOG2DIM)) + + ((ijk[2] & int32_t(MASK)) >> ChildT::TOTAL); + } + + static Coord OffsetToLocalCoord(uint32_t n) + { + NANOVDB_ASSERT(n < SIZE); + const uint32_t m = n & ((1 << 2 * LOG2DIM) - 1); + return Coord(n >> 2 * LOG2DIM, m >> LOG2DIM, m & ((1 << LOG2DIM) - 1)); + } + + void localToGlobalCoord(Coord& ijk) const + { + ijk <<= ChildT::TOTAL; + ijk += mOrigin; + } + + Coord offsetToGlobalCoord(uint32_t n) const + { + Coord ijk = InternalNode::OffsetToLocalCoord(n); + this->localToGlobalCoord(ijk); + return ijk; + } + + ValueType getFirstValue() const { return mChildMask.isOn(0) ? mTable[0].child->getFirstValue() : mTable[0].value; } + ValueType getLastValue() const { return mChildMask.isOn(SIZE - 1) ? mTable[SIZE - 1].child->getLastValue() : mTable[SIZE - 1].value; } + + template + auto get(const Coord& ijk, ArgsT&&... args) const + { + const uint32_t n = CoordToOffset(ijk); + if (mChildMask.isOn(n)) return mTable[n].child->template get(ijk, args...); + return OpT::get(*this, n, args...); + } + + template + auto set(const Coord& ijk, ArgsT&&... args) + { + const uint32_t n = CoordToOffset(ijk); + ChildT* child = nullptr; + if (mChildMask.isOn(n)) { + child = mTable[n].child; + } else { + child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n)); + mTable[n].child = child; + mChildMask.setOn(n); + } + NANOVDB_ASSERT(child); + return child->template set(ijk, args...); + } + + template + auto getAndCache(const Coord& ijk, const AccT& acc, ArgsT&&... args) const + { + const uint32_t n = CoordToOffset(ijk); + if (mChildMask.isOff(n)) return OpT::get(*this, n, args...); + ChildT* child = mTable[n].child; + acc.insert(ijk, child); + if constexpr(ChildT::LEVEL == 0) { + return child->template get(ijk, args...); + } else { + return child->template getAndCache(ijk, acc, args...); + } + } + + template + auto setAndCache(const Coord& ijk, const AccT& acc, ArgsT&&... 
args) + { + const uint32_t n = CoordToOffset(ijk); + ChildT* child = nullptr; + if (mChildMask.isOn(n)) { + child = mTable[n].child; + } else { + child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n)); + mTable[n].child = child; + mChildMask.setOn(n); + } + NANOVDB_ASSERT(child); + acc.insert(ijk, child); + if constexpr(ChildT::LEVEL == 0) { + return child->template set(ijk, args...); + } else { + return child->template setAndCache(ijk, acc, args...); + } + } + +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + ValueType getValue(const Coord& ijk) const {return this->template get>(ijk);} + LeafNodeType& setValue(const Coord& ijk, const ValueType& value){return this->template set>(ijk, value);} +#else + ValueType getValue(const Coord& ijk) const + { + const uint32_t n = CoordToOffset(ijk); + if (mChildMask.isOn(n)) { + return mTable[n].child->getValue(ijk); + } + return mTable[n].value; + } + void setValue(const Coord& ijk, const ValueType& value) + { + const uint32_t n = CoordToOffset(ijk); + ChildT* child = nullptr; + if (mChildMask.isOn(n)) { + child = mTable[n].child; + } else { + child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n)); + mTable[n].child = child; + mChildMask.setOn(n); + } + child->setValue(ijk, value); + } + + template + ValueType getValueAndCache(const Coord& ijk, AccT& acc) const + { + const uint32_t n = CoordToOffset(ijk); + if (mChildMask.isOn(n)) { + acc.insert(ijk, const_cast(mTable[n].child)); + return mTable[n].child->getValueAndCache(ijk, acc); + } + return mTable[n].value; + } + + template + void setValueAndCache(const Coord& ijk, const ValueType& value, AccT& acc) + { + const uint32_t n = CoordToOffset(ijk); + ChildT* child = nullptr; + if (mChildMask.isOn(n)) { + child = mTable[n].child; + } else { + child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n)); + mTable[n].child = child; + mChildMask.setOn(n); + } + acc.insert(ijk, child); + child->setValueAndCache(ijk, value, acc); + } + + template + void setValueOnAndCache(const Coord& ijk, AccT& acc) + { + const uint32_t n = CoordToOffset(ijk); + ChildT* child = nullptr; + if (mChildMask.isOn(n)) { + child = mTable[n].child; + } else { + child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n)); + mTable[n].child = child; + mChildMask.setOn(n); + } + acc.insert(ijk, child); + child->setValueOnAndCache(ijk, acc); + } + + template + void touchLeafAndCache(const Coord &ijk, AccT& acc) + { + const uint32_t n = CoordToOffset(ijk); + ChildT* child = nullptr; + if (mChildMask.isOn(n)) { + child = mTable[n].child; + } else { + child = new ChildT(ijk, mTable[n].value, mValueMask.isOn(n)); + mTable[n].child = child; + mChildMask.setOn(n); + } + acc.insert(ijk, child); + if constexpr(LEVEL>1) child->touchLeafAndCache(ijk, acc); + } + template + bool isActiveAndCache(const Coord& ijk, AccT& acc) const + { + const uint32_t n = CoordToOffset(ijk); + if (mChildMask.isOn(n)) { + acc.insert(ijk, const_cast(mTable[n].child)); + return mTable[n].child->isActiveAndCache(ijk, acc); + } + return mValueMask.isOn(n); + } +#endif + + template + uint32_t nodeCount() const + { + static_assert(util::is_same::value, "Node::getNodes: Invalid type"); + NANOVDB_ASSERT(NodeT::LEVEL < LEVEL); + uint32_t sum = 0; + if constexpr(util::is_same::value) { // resolved at compile-time + sum += mChildMask.countOn(); + } else if constexpr(LEVEL>1) { + for (auto iter = mChildMask.beginOn(); iter; ++iter) { + sum += mTable[*iter].child->template nodeCount(); + } + } + return sum; + } + + template + void getNodes(std::vector& array) + { + 
static_assert(util::is_same::value, "Node::getNodes: Invalid type"); + NANOVDB_ASSERT(NodeT::LEVEL < LEVEL); + for (auto iter = mChildMask.beginOn(); iter; ++iter) { + if constexpr(util::is_same::value) { // resolved at compile-time + array.push_back(reinterpret_cast(mTable[*iter].child)); + } else if constexpr(LEVEL>1) { + mTable[*iter].child->getNodes(array); + } + } + } + + void addChild(ChildT*& child) + { + NANOVDB_ASSERT(child && (child->mOrigin & ~MASK) == this->mOrigin); + const uint32_t n = CoordToOffset(child->mOrigin); + if (mChildMask.isOn(n)) { + delete mTable[n].child; + } else { + mChildMask.setOn(n); + } + mTable[n].child = child; + child = nullptr; + } + + /// @brief Add a tile containing voxel (i, j, k) at the specified tree level, + /// creating a new branch if necessary. Delete any existing lower-level nodes + /// that contain (x, y, z). + /// @tparam level tree level at which the tile is inserted. Must be 1 or 2. + /// @param ijk Index coordinate that map to the tile being inserted + /// @param value Value of the tile + /// @param state Binary state of the tile + template + void addTile(const Coord& ijk, const ValueType& value, bool state) + { + static_assert(level > 0 && level <= LEVEL, "invalid template value of level"); + const uint32_t n = CoordToOffset(ijk); + if constexpr(level == LEVEL) { + if (mChildMask.isOn(n)) { + delete mTable[n].child; + mTable[n] = Tile(value); + } else { + mValueMask.set(n, state); + mTable[n].value = value; + } + } else if constexpr(level < LEVEL) { + ChildT* child = nullptr; + if (mChildMask.isOn(n)) { + child = mTable[n].child; + } else { + child = new ChildT(ijk, value, state); + mTable[n].child = child; + mChildMask.setOn(n); + } + child->template addTile(ijk, value, state); + } + } + + template + void addNode(NodeT*& node) + { + if constexpr(util::is_same::value) { //resolved at compile-time + this->addChild(reinterpret_cast(node)); + } else if constexpr(LEVEL>1) { + const uint32_t n = CoordToOffset(node->mOrigin); + ChildT* child = nullptr; + if (mChildMask.isOn(n)) { + child = mTable[n].child; + } else { + child = new ChildT(node->mOrigin, mTable[n].value, mValueMask.isOn(n)); + mTable[n].child = child; + mChildMask.setOn(n); + } + child->addNode(node); + } + } + + void merge(InternalNode &other) + { + for (auto iter = other.mChildMask.beginOn(); iter; ++iter) { + const uint32_t n = *iter; + if (mChildMask.isOn(n)) { + mTable[n].child->merge(*other.mTable[n].child); + } else { + mTable[n].child = other.mTable[n].child; + other.mChildMask.setOff(n); + mChildMask.setOn(n); + } + } + } + + template + typename util::enable_if::value>::type + signedFloodFill(T outside); + +}; // tools::build::InternalNode + +//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +InternalNode::signedFloodFill(T outside) +{ + const uint32_t first = *mChildMask.beginOn(); + if (first < NUM_VALUES) { + bool xInside = mTable[first].child->getFirstValue() < 0; + bool yInside = xInside, zInside = xInside; + for (uint32_t x = 0; x != (1 << LOG2DIM); ++x) { + const uint32_t x00 = x << (2 * LOG2DIM); // offset for block(x, 0, 0) + if (mChildMask.isOn(x00)) { + xInside = mTable[x00].child->getLastValue() < 0; + } + yInside = xInside; + for (uint32_t y = 0; y != (1u << LOG2DIM); ++y) { + const uint32_t xy0 = x00 + (y << LOG2DIM); // offset for block(x, y, 0) + if (mChildMask.isOn(xy0)) + yInside = mTable[xy0].child->getLastValue() < 0; + zInside = 
yInside; + for (uint32_t z = 0; z != (1 << LOG2DIM); ++z) { + const uint32_t xyz = xy0 + z; // offset for block(x, y, z) + if (mChildMask.isOn(xyz)) { + zInside = mTable[xyz].child->getLastValue() < 0; + } else { + mTable[xyz].value = zInside ? -outside : outside; + } + } + } + } + } +} // tools::build::InternalNode::signedFloodFill + +// ----------------------------> LeafNode <-------------------------------------- + +template +struct LeafNode +{ + using BuildType = BuildT; + using ValueType = typename BuildToValueMap::type; + using LeafNodeType = LeafNode; + static constexpr uint32_t LOG2DIM = 3; + static constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes + static constexpr uint32_t DIM = 1u << TOTAL; + static constexpr uint32_t SIZE = 1u << 3 * LOG2DIM; // total number of voxels represented by this node + static constexpr uint32_t MASK = DIM - 1; // mask for bit operations + static constexpr uint32_t LEVEL = 0; // level 0 = leaf + static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node + using NodeMaskType = Mask; + template + using MaskIterT = typename Mask::template Iterator; + using NanoLeafT = typename NanoNode::Type; + + Coord mOrigin; + Mask mValueMask; + ValueType mValues[SIZE]; + union { + NanoLeafT *mDstNode; + uint64_t mDstOffset; + }; + + /// @brief Visits all active values in a leaf node + class ValueOnIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode *mParent; + public: + ValueOnIterator() : BaseT(), mParent(nullptr) {} + ValueOnIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOn()), mParent(parent) {} + ValueOnIterator& operator=(const ValueOnIterator&) = default; + ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->mValues[BaseT::pos()];} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOnIterator + + ValueOnIterator beginValueOn() {return ValueOnIterator(this);} + ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);} + + /// @brief Visits all inactive values in a leaf node + class ValueOffIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode *mParent; + public: + ValueOffIterator() : BaseT(), mParent(nullptr) {} + ValueOffIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOff()), mParent(parent) {} + ValueOffIterator& operator=(const ValueOffIterator&) = default; + ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->mValues[BaseT::pos()];} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOffIterator + + ValueOffIterator beginValueOff() {return ValueOffIterator(this);} + ValueOffIterator cbeginValueOff() const {return ValueOffIterator(this);} + + /// @brief Visits all values in a leaf node, i.e. 
both active and inactive values + class ValueIterator + { + const LeafNode *mParent; + uint32_t mPos; + public: + ValueIterator() : mParent(nullptr), mPos(1u << 3 * LOG2DIM) {} + ValueIterator(const LeafNode* parent) : mParent(parent), mPos(0) {NANOVDB_ASSERT(parent);} + ValueIterator& operator=(const ValueIterator&) = default; + ValueType operator*() const { NANOVDB_ASSERT(*this); return mParent->mValues[mPos];} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(mPos);} + bool isActive() const { NANOVDB_ASSERT(*this); return mParent->mValueMask.isOn(mPos);} + operator bool() const {return mPos < SIZE;} + ValueIterator& operator++() {++mPos; return *this;} + ValueIterator operator++(int) { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ValueIterator + + ValueIterator beginValue() {return ValueIterator(this);} + ValueIterator cbeginValueAll() const {return ValueIterator(this);} + + LeafNode(const Coord& ijk, const ValueType& value, bool state) + : mOrigin(ijk & ~MASK) + , mValueMask(state) //invalid + , mDstOffset(0) + { + ValueType* target = mValues; + uint32_t n = SIZE; + while (n--) { + *target++ = value; + } + } + LeafNode(const LeafNode&) = delete; // disallow copy-construction + LeafNode(LeafNode&&) = delete; // disallow move construction + LeafNode& operator=(const LeafNode&) = delete; // disallow copy assignment + LeafNode& operator=(LeafNode&&) = delete; // disallow move assignment + ~LeafNode() = default; + + const Mask& getValueMask() const {return mValueMask;} + const Mask& valueMask() const {return mValueMask;} + const Coord& origin() const {return mOrigin;} + + /// @brief Return the linear offset corresponding to the given coordinate + static uint32_t CoordToOffset(const Coord& ijk) + { + return ((ijk[0] & int32_t(MASK)) << (2 * LOG2DIM)) + + ((ijk[1] & int32_t(MASK)) << LOG2DIM) + + (ijk[2] & int32_t(MASK)); + } + + static Coord OffsetToLocalCoord(uint32_t n) + { + NANOVDB_ASSERT(n < SIZE); + const int32_t m = n & ((1 << 2 * LOG2DIM) - 1); + return Coord(n >> 2 * LOG2DIM, m >> LOG2DIM, m & int32_t(MASK)); + } + + void localToGlobalCoord(Coord& ijk) const + { + ijk += mOrigin; + } + + Coord offsetToGlobalCoord(uint32_t n) const + { + Coord ijk = LeafNode::OffsetToLocalCoord(n); + this->localToGlobalCoord(ijk); + return ijk; + } + + ValueType getFirstValue() const { return mValues[0]; } + ValueType getLastValue() const { return mValues[SIZE - 1]; } + const ValueType& getValue(uint32_t i) const {return mValues[i];} + const ValueType& getValue(const Coord& ijk) const {return mValues[CoordToOffset(ijk)];} + + template + auto get(const Coord& ijk, ArgsT&&... args) const {return OpT::get(*this, CoordToOffset(ijk), args...);} + + template + auto set(const Coord& ijk, ArgsT&&... 
args) {return OpT::set(*this, CoordToOffset(ijk), args...);} + +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + template + const ValueType& getValueAndCache(const Coord& ijk, const AccT&) const + { + return mValues[CoordToOffset(ijk)]; + } + + template + void setValueAndCache(const Coord& ijk, const ValueType& value, const AccT&) + { + const uint32_t n = CoordToOffset(ijk); + mValueMask.setOn(n); + mValues[n] = value; + } + + template + void setValueOnAndCache(const Coord& ijk, const AccT&) + { + const uint32_t n = CoordToOffset(ijk); + mValueMask.setOn(n); + } + + template + bool isActiveAndCache(const Coord& ijk, const AccT&) const + { + return mValueMask.isOn(CoordToOffset(ijk)); + } +#endif + + void setValue(uint32_t n, const ValueType& value) + { + mValueMask.setOn(n); + mValues[n] = value; + } + void setValue(const Coord& ijk, const ValueType& value){this->setValue(CoordToOffset(ijk), value);} + + void merge(LeafNode &other) + { + other.mValueMask -= mValueMask; + for (auto iter = other.mValueMask.beginOn(); iter; ++iter) { + const uint32_t n = *iter; + mValues[n] = other.mValues[n]; + } + mValueMask |= other.mValueMask; + } + + template + typename util::enable_if::value>::type + signedFloodFill(T outside); + +}; // tools::build::LeafNode + +//================================================================================================ + +template <> +struct LeafNode +{ + using ValueType = bool; + using BuildType = ValueMask; + using LeafNodeType = LeafNode; + static constexpr uint32_t LOG2DIM = 3; + static constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes + static constexpr uint32_t DIM = 1u << TOTAL; + static constexpr uint32_t SIZE = 1u << 3 * LOG2DIM; // total number of voxels represented by this node + static constexpr uint32_t MASK = DIM - 1; // mask for bit operations + static constexpr uint32_t LEVEL = 0; // level 0 = leaf + static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node + using NodeMaskType = Mask; + template + using MaskIterT = typename Mask::template Iterator; + using NanoLeafT = typename NanoNode::Type; + + Coord mOrigin; + Mask mValueMask; + union { + NanoLeafT *mDstNode; + uint64_t mDstOffset; + }; + + /// @brief Visits all active values in a leaf node + class ValueOnIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode *mParent; + public: + ValueOnIterator() : BaseT(), mParent(nullptr) {} + ValueOnIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOn()), mParent(parent) {} + ValueOnIterator& operator=(const ValueOnIterator&) = default; + bool operator*() const {NANOVDB_ASSERT(*this); return true;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOnIterator + + ValueOnIterator beginValueOn() {return ValueOnIterator(this);} + ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);} + + /// @brief Visits all inactive values in a leaf node + class ValueOffIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode *mParent; + public: + ValueOffIterator() : BaseT(), mParent(nullptr) {} + ValueOffIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOff()), mParent(parent) {} + ValueOffIterator& operator=(const ValueOffIterator&) = default; + bool operator*() const {NANOVDB_ASSERT(*this); return false;} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOffIterator + + 
ValueOffIterator beginValueOff() {return ValueOffIterator(this);} + ValueOffIterator cbeginValueOff() const {return ValueOffIterator(this);} + + /// @brief Visits all values in a leaf node, i.e. both active and inactive values + class ValueIterator + { + const LeafNode *mParent; + uint32_t mPos; + public: + ValueIterator() : mParent(nullptr), mPos(1u << 3 * LOG2DIM) {} + ValueIterator(const LeafNode* parent) : mParent(parent), mPos(0) {NANOVDB_ASSERT(parent);} + ValueIterator& operator=(const ValueIterator&) = default; + bool operator*() const { NANOVDB_ASSERT(*this); return mParent->mValueMask.isOn(mPos);} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(mPos);} + bool isActive() const { NANOVDB_ASSERT(*this); return mParent->mValueMask.isOn(mPos);} + operator bool() const {return mPos < SIZE;} + ValueIterator& operator++() {++mPos; return *this;} + ValueIterator operator++(int) { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ValueIterator + + ValueIterator beginValue() {return ValueIterator(this);} + ValueIterator cbeginValueAll() const {return ValueIterator(this);} + + LeafNode(const Coord& ijk, const ValueType&, bool state) + : mOrigin(ijk & ~MASK) + , mValueMask(state) //invalid + , mDstOffset(0) + { + } + LeafNode(const LeafNode&) = delete; // disallow copy-construction + LeafNode(LeafNode&&) = delete; // disallow move construction + LeafNode& operator=(const LeafNode&) = delete; // disallow copy assignment + LeafNode& operator=(LeafNode&&) = delete; // disallow move assignment + ~LeafNode() = default; + + const Mask& valueMask() const {return mValueMask;} + const Mask& getValueMask() const {return mValueMask;} + const Coord& origin() const {return mOrigin;} + + /// @brief Return the linear offset corresponding to the given coordinate + static uint32_t CoordToOffset(const Coord& ijk) + { + return ((ijk[0] & int32_t(MASK)) << (2 * LOG2DIM)) + + ((ijk[1] & int32_t(MASK)) << LOG2DIM) + + (ijk[2] & int32_t(MASK)); + } + + static Coord OffsetToLocalCoord(uint32_t n) + { + NANOVDB_ASSERT(n < SIZE); + const int32_t m = n & ((1 << 2 * LOG2DIM) - 1); + return Coord(n >> 2 * LOG2DIM, m >> LOG2DIM, m & int32_t(MASK)); + } + + void localToGlobalCoord(Coord& ijk) const {ijk += mOrigin;} + + Coord offsetToGlobalCoord(uint32_t n) const + { + Coord ijk = LeafNode::OffsetToLocalCoord(n); + this->localToGlobalCoord(ijk); + return ijk; + } + + bool getFirstValue() const { return mValueMask.isOn(0); } + bool getLastValue() const { return mValueMask.isOn(SIZE - 1); } + bool getValue(uint32_t i) const {return mValueMask.isOn(i);} + bool getValue(const Coord& ijk) const {return mValueMask.isOn(CoordToOffset(ijk));} + + template + auto get(const Coord& ijk, ArgsT&&... args) const {return OpT::get(*this, CoordToOffset(ijk), args...);} + + template + auto set(const Coord& ijk, ArgsT&&... 
args) {return OpT::set(*this, CoordToOffset(ijk), args...);} + +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + template + bool getValueAndCache(const Coord& ijk, const AccT&) const + { + return mValueMask.isOn(CoordToOffset(ijk)); + } + + template + void setValueAndCache(const Coord& ijk, bool, const AccT&) + { + const uint32_t n = CoordToOffset(ijk); + mValueMask.setOn(n); + } + + template + void setValueOnAndCache(const Coord& ijk, const AccT&) + { + const uint32_t n = CoordToOffset(ijk); + mValueMask.setOn(n); + } + + template + bool isActiveAndCache(const Coord& ijk, const AccT&) const + { + return mValueMask.isOn(CoordToOffset(ijk)); + } +#endif + + void setValue(uint32_t n, bool) {mValueMask.setOn(n);} + void setValue(const Coord& ijk) {mValueMask.setOn(CoordToOffset(ijk));} + + void merge(LeafNode &other) + { + mValueMask |= other.mValueMask; + } + +}; // tools::build::LeafNode + +//================================================================================================ + +template <> +struct LeafNode +{ + using ValueType = bool; + using BuildType = ValueMask; + using LeafNodeType = LeafNode; + static constexpr uint32_t LOG2DIM = 3; + static constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes + static constexpr uint32_t DIM = 1u << TOTAL; + static constexpr uint32_t SIZE = 1u << 3 * LOG2DIM; // total number of voxels represented by this node + static constexpr uint32_t MASK = DIM - 1; // mask for bit operations + static constexpr uint32_t LEVEL = 0; // level 0 = leaf + static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node + using NodeMaskType = Mask; + template + using MaskIterT = typename Mask::template Iterator; + using NanoLeafT = typename NanoNode::Type; + + Coord mOrigin; + Mask mValueMask, mValues; + union { + NanoLeafT *mDstNode; + uint64_t mDstOffset; + }; + + /// @brief Visits all active values in a leaf node + class ValueOnIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode *mParent; + public: + ValueOnIterator() : BaseT(), mParent(nullptr) {} + ValueOnIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOn()), mParent(parent) {} + ValueOnIterator& operator=(const ValueOnIterator&) = default; + bool operator*() const {NANOVDB_ASSERT(*this); return mParent->mValues.isOn(BaseT::pos());} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOnIterator + + ValueOnIterator beginValueOn() {return ValueOnIterator(this);} + ValueOnIterator cbeginValueOn() const {return ValueOnIterator(this);} + + /// @brief Visits all inactive values in a leaf node + class ValueOffIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode *mParent; + public: + ValueOffIterator() : BaseT(), mParent(nullptr) {} + ValueOffIterator(const LeafNode* parent) : BaseT(parent->mValueMask.beginOff()), mParent(parent) {} + ValueOffIterator& operator=(const ValueOffIterator&) = default; + bool operator*() const {NANOVDB_ASSERT(*this); return mParent->mValues.isOn(BaseT::pos());} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + }; // Member class ValueOffIterator + + ValueOffIterator beginValueOff() {return ValueOffIterator(this);} + ValueOffIterator cbeginValueOff() const {return ValueOffIterator(this);} + + /// @brief Visits all values in a leaf node, i.e. 
both active and inactive values + class ValueIterator + { + const LeafNode *mParent; + uint32_t mPos; + public: + ValueIterator() : mParent(nullptr), mPos(1u << 3 * LOG2DIM) {} + ValueIterator(const LeafNode* parent) : mParent(parent), mPos(0) {NANOVDB_ASSERT(parent);} + ValueIterator& operator=(const ValueIterator&) = default; + bool operator*() const { NANOVDB_ASSERT(*this); return mParent->mValues.isOn(mPos);} + Coord getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(mPos);} + bool isActive() const { NANOVDB_ASSERT(*this); return mParent->mValueMask.isOn(mPos);} + operator bool() const {return mPos < SIZE;} + ValueIterator& operator++() {++mPos; return *this;} + ValueIterator operator++(int) { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ValueIterator + + ValueIterator beginValue() {return ValueIterator(this);} + ValueIterator cbeginValueAll() const {return ValueIterator(this);} + + LeafNode(const Coord& ijk, bool value, bool state) + : mOrigin(ijk & ~MASK) + , mValueMask(state) + , mValues(value) + , mDstOffset(0) + { + } + LeafNode(const LeafNode&) = delete; // disallow copy-construction + LeafNode(LeafNode&&) = delete; // disallow move construction + LeafNode& operator=(const LeafNode&) = delete; // disallow copy assignment + LeafNode& operator=(LeafNode&&) = delete; // disallow move assignment + ~LeafNode() = default; + + const Mask& valueMask() const {return mValueMask;} + const Mask& getValueMask() const {return mValueMask;} + const Coord& origin() const {return mOrigin;} + + /// @brief Return the linear offset corresponding to the given coordinate + static uint32_t CoordToOffset(const Coord& ijk) + { + return ((ijk[0] & int32_t(MASK)) << (2 * LOG2DIM)) + + ((ijk[1] & int32_t(MASK)) << LOG2DIM) + + (ijk[2] & int32_t(MASK)); + } + + static Coord OffsetToLocalCoord(uint32_t n) + { + NANOVDB_ASSERT(n < SIZE); + const int32_t m = n & ((1 << 2 * LOG2DIM) - 1); + return Coord(n >> 2 * LOG2DIM, m >> LOG2DIM, m & int32_t(MASK)); + } + + void localToGlobalCoord(Coord& ijk) const + { + ijk += mOrigin; + } + + Coord offsetToGlobalCoord(uint32_t n) const + { + Coord ijk = LeafNode::OffsetToLocalCoord(n); + this->localToGlobalCoord(ijk); + return ijk; + } + bool getFirstValue() const { return mValues.isOn(0); } + bool getLastValue() const { return mValues.isOn(SIZE - 1); } + + bool getValue(uint32_t i) const {return mValues.isOn(i);} + bool getValue(const Coord& ijk) const + { + return mValues.isOn(CoordToOffset(ijk)); + } +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + template + bool isActiveAndCache(const Coord& ijk, const AccT&) const + { + return mValueMask.isOn(CoordToOffset(ijk)); + } + + template + bool getValueAndCache(const Coord& ijk, const AccT&) const + { + return mValues.isOn(CoordToOffset(ijk)); + } + + template + void setValueAndCache(const Coord& ijk, bool value, const AccT&) + { + const uint32_t n = CoordToOffset(ijk); + mValueMask.setOn(n); + mValues.setOn(n); + } + + template + void setValueOnAndCache(const Coord& ijk, const AccT&) + { + const uint32_t n = CoordToOffset(ijk); + mValueMask.setOn(n); + } +#endif + + void setValue(uint32_t n, bool value) + { + mValueMask.setOn(n); + mValues.set(n, value); + } + void setValue(const Coord& ijk, bool value) {return this->setValue(CoordToOffset(ijk), value);} + + void merge(LeafNode &other) + { + mValues |= other.mValues; + mValueMask |= other.mValueMask; + } + +}; // tools::build::LeafNode + 
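The node classes above are normally driven through tools::build::Grid and its cached accessor rather than instantiated directly. The short sketch below is inserted here for illustration only; it is not part of the upstream header being added, the template arguments are written out explicitly (the diff text shows them stripped), and the exact spellings are assumed from the surrounding code (Grid's constructor, Tree::getAccessor, ValueAccessor::setValue/getValue, Tree::nodeCount).

// Illustrative usage sketch only (not part of the vendored header).
// At this point in the file we are inside namespace nanovdb::tools::build,
// so Grid, Coord and GridClass resolve without further qualification.
inline void gridBuilderUsageSketch()
{
    Grid<float> grid(/*background=*/0.0f, "example", GridClass::FogVolume);
    auto acc = grid.getAccessor();                 // caches the node path for repeated access
    for (int i = 0; i < 8; ++i)
        acc.setValue(Coord(i, 0, 0), 1.0f);        // sets the value and activates the voxel
    const float v = acc.getValue(Coord(3, 0, 0));  // 1.0f here, background (0.0f) elsewhere
    const auto counts = grid.nodeCount();          // per-level node counts: {leaf, lower, upper}
    (void)v; (void)counts;
}

Once populated this way, such a build grid is what the CreateNanoGrid converter shown earlier in this diff consumes to produce a device-ready NanoVDB grid handle.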
+//================================================================================================ + +template +template +inline typename util::enable_if::value>::type +LeafNode::signedFloodFill(T outside) +{ + const uint32_t first = *mValueMask.beginOn(); + if (first < SIZE) { + bool xInside = mValues[first] < 0, yInside = xInside, zInside = xInside; + for (uint32_t x = 0; x != DIM; ++x) { + const uint32_t x00 = x << (2 * LOG2DIM); + if (mValueMask.isOn(x00)) + xInside = mValues[x00] < 0; // element(x, 0, 0) + yInside = xInside; + for (uint32_t y = 0; y != DIM; ++y) { + const uint32_t xy0 = x00 + (y << LOG2DIM); + if (mValueMask.isOn(xy0)) + yInside = mValues[xy0] < 0; // element(x, y, 0) + zInside = yInside; + for (uint32_t z = 0; z != (1 << LOG2DIM); ++z) { + const uint32_t xyz = xy0 + z; // element(x, y, z) + if (mValueMask.isOn(xyz)) { + zInside = mValues[xyz] < 0; + } else { + mValues[xyz] = zInside ? -outside : outside; + } + } + } + } + } +} // tools::build::LeafNode::signedFloodFill + +// ----------------------------> ValueAccessor <-------------------------------------- + +template +struct ValueAccessor +{ + using ValueType = typename BuildToValueMap::type; + using LeafT = LeafNode; + using Node1 = InternalNode; + using Node2 = InternalNode; + using RootNodeType = RootNode; + using LeafNodeType = typename RootNodeType::LeafNodeType; + + ValueAccessor(RootNodeType& root) + : mRoot(root) + , mKeys{Coord(math::Maximum::value()), Coord(math::Maximum::value()), Coord(math::Maximum::value())} + , mNode{nullptr, nullptr, nullptr} + { + } + ValueAccessor(ValueAccessor&&) = default; // allow move construction + ValueAccessor(const ValueAccessor&) = delete; // disallow copy construction + ValueType getValue(int i, int j, int k) const {return this->getValue(Coord(i,j,k));} + template + bool isCached(const Coord& ijk) const + { + return (ijk[0] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][0] && + (ijk[1] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][1] && + (ijk[2] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][2]; + } + + template + auto get(const Coord& ijk, ArgsT&&... args) const + { + if (this->template isCached(ijk)) { + return ((const LeafT*)mNode[0])->template get(ijk, args...); + } else if (this->template isCached(ijk)) { + return ((const Node1*)mNode[1])->template getAndCache(ijk, *this, args...); + } else if (this->template isCached(ijk)) { + return ((const Node2*)mNode[2])->template getAndCache(ijk, *this, args...); + } + return mRoot.template getAndCache(ijk, *this, args...); + } + + template + auto set(const Coord& ijk, ArgsT&&... 
args) const + { + if (this->template isCached(ijk)) { + return ((LeafT*)mNode[0])->template set(ijk, args...); + } else if (this->template isCached(ijk)) { + return ((Node1*)mNode[1])->template setAndCache(ijk, *this, args...); + } else if (this->template isCached(ijk)) { + return ((Node2*)mNode[2])->template setAndCache(ijk, *this, args...); + } + return mRoot.template setAndCache(ijk, *this, args...); + } + +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + ValueType getValue(const Coord& ijk) const {return this->template get>(ijk);} + LeafT* setValue(const Coord& ijk, const ValueType& value) {return this->template set>(ijk, value);} + LeafT* setValueOn(const Coord& ijk) {return this->template set>(ijk);} + LeafT& touchLeaf(const Coord& ijk) {return this->template set>(ijk);} + bool isActive(const Coord& ijk) const {return this->template get>(ijk);} +#else + ValueType getValue(const Coord& ijk) const + { + if (this->template isCached(ijk)) { + return ((LeafT*)mNode[0])->getValueAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + return ((Node1*)mNode[1])->getValueAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + return ((Node2*)mNode[2])->getValueAndCache(ijk, *this); + } + return mRoot.getValueAndCache(ijk, *this); + } + + /// @brief Sets value in a leaf node and returns it. + LeafT* setValue(const Coord& ijk, const ValueType& value) + { + if (this->template isCached(ijk)) { + ((LeafT*)mNode[0])->setValueAndCache(ijk, value, *this); + } else if (this->template isCached(ijk)) { + ((Node1*)mNode[1])->setValueAndCache(ijk, value, *this); + } else if (this->template isCached(ijk)) { + ((Node2*)mNode[2])->setValueAndCache(ijk, value, *this); + } else { + mRoot.setValueAndCache(ijk, value, *this); + } + NANOVDB_ASSERT(this->isCached(ijk)); + return (LeafT*)mNode[0]; + } + void setValueOn(const Coord& ijk) + { + if (this->template isCached(ijk)) { + ((LeafT*)mNode[0])->setValueOnAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + ((Node1*)mNode[1])->setValueOnAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + ((Node2*)mNode[2])->setValueOnAndCache(ijk, *this); + } else { + mRoot.setValueOnAndCache(ijk, *this); + } + } + void touchLeaf(const Coord& ijk) const + { + if (this->template isCached(ijk)) { + return; + } else if (this->template isCached(ijk)) { + ((Node1*)mNode[1])->touchLeafAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + ((Node2*)mNode[2])->touchLeafAndCache(ijk, *this); + } else { + mRoot.touchLeafAndCache(ijk, *this); + } + } + bool isActive(const Coord& ijk) const + { + if (this->template isCached(ijk)) { + return ((LeafT*)mNode[0])->isActiveAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + return ((Node1*)mNode[1])->isActiveAndCache(ijk, *this); + } else if (this->template isCached(ijk)) { + return ((Node2*)mNode[2])->isActiveAndCache(ijk, *this); + } + return mRoot.isActiveAndCache(ijk, *this); + } +#endif + + bool isValueOn(const Coord& ijk) const { return this->isActive(ijk); } + template + void insert(const Coord& ijk, NodeT* node) const + { + mKeys[NodeT::LEVEL] = ijk & ~NodeT::MASK; + mNode[NodeT::LEVEL] = node; + } + RootNodeType& mRoot; + mutable Coord mKeys[3]; + mutable void* mNode[3]; +}; // tools::build::ValueAccessor + +// ----------------------------> Tree <-------------------------------------- + +template +struct Tree +{ + using ValueType = typename BuildToValueMap::type; + using Node0 = LeafNode; + using Node1 = InternalNode; + using Node2 = InternalNode; + using 
RootNodeType = RootNode; + using LeafNodeType = typename RootNodeType::LeafNodeType; + struct WriteAccessor; + + RootNodeType mRoot; + std::mutex mMutex; + + Tree(const ValueType &background) : mRoot(background) {} + Tree(const Tree&) = delete; // disallow copy construction + Tree(Tree&&) = delete; // disallow move construction + Tree& tree() {return *this;} + RootNodeType& root() {return mRoot;} + ValueType getValue(const Coord& ijk) const {return mRoot.getValue(ijk);} + ValueType getValue(int i, int j, int k) const {return this->getValue(Coord(i,j,k));} + void setValue(const Coord& ijk, const ValueType &value) {mRoot.setValue(ijk, value);} + std::array nodeCount() const + { + std::array count{0,0,0}; + mRoot.nodeCount(count); + return count; + } + /// @brief regular accessor for thread-safe reading and non-thread-safe writing + ValueAccessor getAccessor() { return ValueAccessor(mRoot); } + /// @brief special accessor for thread-safe writing only + WriteAccessor getWriteAccessor() { return WriteAccessor(mRoot, mMutex); } +};// tools::build::Tree + +// ----------------------------> Tree::WriteAccessor <-------------------------------------- + +template +struct Tree::WriteAccessor +{ + using AccT = ValueAccessor; + using ValueType = typename AccT::ValueType; + using LeafT = typename AccT::LeafT; + using Node1 = typename AccT::Node1; + using Node2 = typename AccT::Node2; + using RootNodeType = typename AccT::RootNodeType; + + WriteAccessor(RootNodeType& parent, std::mutex &mx) + : mParent(parent) + , mRoot(parent.mBackground) + , mAcc(mRoot) + , mMutex(mx) + { + } + WriteAccessor(const WriteAccessor&) = delete; // disallow copy construction + WriteAccessor(WriteAccessor&&) = default; // allow move construction + ~WriteAccessor() { this->merge(); } + void merge() + { + mMutex.lock(); + mParent.merge(mRoot); + mMutex.unlock(); + } + inline void setValueOn(const Coord& ijk) {mAcc.setValueOn(ijk);} + inline void setValue(const Coord& ijk, const ValueType &value) {mAcc.setValue(ijk, value);} + + RootNodeType &mParent, mRoot; + AccT mAcc; + std::mutex &mMutex; +}; // tools::build::Tree::WriteAccessor + +// ----------------------------> Grid <-------------------------------------- + +template +struct Grid : public Tree +{ + using BuildType = BuildT; + using ValueType = typename BuildToValueMap::type; + using TreeType = Tree; + using Node0 = LeafNode; + using Node1 = InternalNode; + using Node2 = InternalNode; + using RootNodeType = RootNode; + + GridClass mGridClass; + GridType mGridType; + Map mMap; + std::string mName; + + Grid(const ValueType &background, const std::string &name = "", GridClass gClass = GridClass::Unknown) + : TreeType(background) + , mGridClass(gClass) + , mGridType(toGridType()) + , mName(name) + { + mMap.set(1.0, Vec3d(0.0), 1.0); + } + TreeType& tree() {return *this;} + const GridType& gridType() const { return mGridType; } + const GridClass& gridClass() const { return mGridClass; } + const Map& map() const { return mMap; } + void setTransform(double scale=1.0, const Vec3d &translation = Vec3d(0.0)) {mMap.set(scale, translation, 1.0);} + const std::string& gridName() const { return mName; } + const std::string& getName() const { return mName; } + void setName(const std::string &name) { mName = name; } + /// @brief Sets grids values in domain of the @a bbox to those returned by the specified @a func with the + /// expected signature [](const Coord&)->ValueType. 
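+ ///
+ /// A minimal usage sketch (illustrative only; the level-set radius, voxel size and
+ /// bounding box below are arbitrary assumptions, not upstream documentation):
+ /// @code
+ /// nanovdb::tools::build::FloatGrid grid(/*background=*/3.0f, "sphere", nanovdb::GridClass::LevelSet);
+ /// grid.setTransform(/*voxel size=*/0.5);
+ /// auto sdf = [](const nanovdb::Coord &ijk) { return ijk.asVec3s().length() - 50.0f; };
+ /// grid(sdf, nanovdb::CoordBBox(nanovdb::Coord(-64), nanovdb::Coord(64)));
+ /// @endcode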
+ /// + /// @note If @a func returns a value equal to the background value of the input grid at a + /// specific voxel coordinate, then the active state of that coordinate is off! Else the value + /// value is set and the active state is on. This is done to allow for sparse grids to be generated. + /// + /// @param func Functor used to evaluate the grid values in the @a bbox + /// @param bbox Coordinate bounding-box over which the grid values will be set. + /// @param delta Specifies a lower threshold value for rendering (optional). Typically equals the voxel size + /// for level sets and otherwise it's zero. + template + void operator()(const Func& func, const CoordBBox& bbox, ValueType delta = ValueType(0)); +};// tools::build::Grid + +template +template +void Grid::operator()(const Func& func, const CoordBBox& bbox, ValueType delta) +{ + auto &root = this->tree().root(); +#if __cplusplus >= 201703L + static_assert(util::is_same::type>::value, "GridBuilder: mismatched ValueType"); +#else// invoke_result was introduced in C++17 and result_of was removed in C++20 + static_assert(util::is_same::type>::value, "GridBuilder: mismatched ValueType"); +#endif + const CoordBBox leafBBox(bbox[0] >> Node0::TOTAL, bbox[1] >> Node0::TOTAL); + std::mutex mutex; + util::forEach(leafBBox, [&](const CoordBBox& b) { + Node0* leaf = nullptr; + for (auto it = b.begin(); it; ++it) { + Coord min(*it << Node0::TOTAL), max(min + Coord(Node0::DIM - 1)); + const CoordBBox b(min.maxComponent(bbox.min()), + max.minComponent(bbox.max()));// crop + if (leaf == nullptr) { + leaf = new Node0(b[0], root.mBackground, false); + } else { + leaf->mOrigin = b[0] & ~Node0::MASK; + NANOVDB_ASSERT(leaf->mValueMask.isOff()); + } + leaf->mDstOffset = 0;// no prune + for (auto ijk = b.begin(); ijk; ++ijk) { + const auto v = func(*ijk);// call functor + if (v != root.mBackground) leaf->setValue(*ijk, v);// don't insert background values + } + if (!leaf->mValueMask.isOff()) {// has active values + if (leaf->mValueMask.isOn()) {// only active values + const auto first = leaf->getFirstValue(); + int n=1; + while (n<512) {// 8^3 = 512 + if (leaf->mValues[n++] != first) break; + } + if (n == 512) leaf->mDstOffset = 1;// prune below + } + std::lock_guard guard(mutex); + NANOVDB_ASSERT(leaf != nullptr); + root.addNode(leaf); + NANOVDB_ASSERT(leaf == nullptr); + } + }// loop over sub-part of leafBBox + if (leaf) delete leaf; + }); + + // Prune leaf and tile nodes + for (auto it2 = root.mTable.begin(); it2 != root.mTable.end(); ++it2) { + if (auto *upper = it2->second.child) {//upper level internal node + for (auto it1 = upper->mChildMask.beginOn(); it1; ++it1) { + auto *lower = upper->mTable[*it1].child;// lower level internal node + for (auto it0 = lower->mChildMask.beginOn(); it0; ++it0) { + auto *leaf = lower->mTable[*it0].child;// leaf nodes + if (leaf->mDstOffset) { + lower->mTable[*it0].value = leaf->getFirstValue(); + lower->mChildMask.setOff(*it0); + lower->mValueMask.setOn(*it0); + delete leaf; + } + }// loop over leaf nodes + if (lower->mChildMask.isOff()) {//only tiles + const auto first = lower->getFirstValue(); + int n=1; + while (n < 4096) {// 16^3 = 4096 + if (lower->mTable[n++].value != first) break; + } + if (n == 4096) {// identical tile values so prune + upper->mTable[*it1].value = first; + upper->mChildMask.setOff(*it1); + upper->mValueMask.setOn(*it1); + delete lower; + } + } + }// loop over lower internal nodes + if (upper->mChildMask.isOff()) {//only tiles + const auto first = upper->getFirstValue(); + int n=1; + 
while (n < 32768) {// 32^3 = 32768 + if (upper->mTable[n++].value != first) break; + } + if (n == 32768) {// identical tile values so prune + it2->second.value = first; + it2->second.state = upper->mValueMask.isOn(); + it2->second.child = nullptr; + delete upper; + } + } + }// is child node of the root + }// loop over root table +}// tools::build::Grid::operator() + +//================================================================================================ + +template +using BuildLeaf = LeafNode; +template +using BuildLower = InternalNode>; +template +using BuildUpper = InternalNode>; +template +using BuildRoot = RootNode>; +template +using BuildTile = typename BuildRoot::Tile; + +using FloatGrid = Grid; +using Fp4Grid = Grid; +using Fp8Grid = Grid; +using Fp16Grid = Grid; +using FpNGrid = Grid; +using DoubleGrid = Grid; +using Int32Grid = Grid; +using UInt32Grid = Grid; +using Int64Grid = Grid; +using Vec3fGrid = Grid; +using Vec3dGrid = Grid; +using Vec4fGrid = Grid; +using Vec4dGrid = Grid; +using MaskGrid = Grid; +using IndexGrid = Grid; +using OnIndexGrid = Grid; +using BoolGrid = Grid; + +// ----------------------------> NodeManager <-------------------------------------- + +// GridT can be openvdb::Grid and nanovdb::tools::build::Grid +template +class NodeManager +{ +public: + + using ValueType = typename GridT::ValueType; + using BuildType = typename GridT::BuildType; + using GridType = GridT; + using TreeType = typename GridT::TreeType; + using RootNodeType = typename TreeType::RootNodeType; + static_assert(RootNodeType::LEVEL == 3, "NodeManager expected LEVEL=3"); + using Node2 = typename RootNodeType::ChildNodeType; + using Node1 = typename Node2::ChildNodeType; + using Node0 = typename Node1::ChildNodeType; + + NodeManager(GridT &grid) : mGrid(grid) {this->init();} + void init() + { + mArray0.clear(); + mArray1.clear(); + mArray2.clear(); + auto counts = mGrid.tree().nodeCount(); + mArray0.reserve(counts[0]); + mArray1.reserve(counts[1]); + mArray2.reserve(counts[2]); + + for (auto it2 = mGrid.tree().root().cbeginChildOn(); it2; ++it2) { + Node2 &upper = const_cast(*it2); + mArray2.emplace_back(&upper); + for (auto it1 = upper.cbeginChildOn(); it1; ++it1) { + Node1 &lower = const_cast(*it1); + mArray1.emplace_back(&lower); + for (auto it0 = lower.cbeginChildOn(); it0; ++it0) { + Node0 &leaf = const_cast(*it0); + mArray0.emplace_back(&leaf); + }// loop over leaf nodes + }// loop over lower internal nodes + }// loop over root node + } + + /// @brief Return the number of tree nodes at the specified level + /// @details 0 is leaf, 1 is lower internal, and 2 is upper internal level + uint64_t nodeCount(int level) const + { + NANOVDB_ASSERT(level==0 || level==1 || level==2); + return level==0 ? mArray0.size() : level==1 ? 
mArray1.size() : mArray2.size(); + } + + template + typename util::enable_if::type node(int i) {return *mArray0[i];} + template + typename util::enable_if::type node(int i) const {return *mArray0[i];} + template + typename util::enable_if::type node(int i) {return *mArray1[i];} + template + typename util::enable_if::type node(int i) const {return *mArray1[i];} + template + typename util::enable_if::type node(int i) {return *mArray2[i];} + template + typename util::enable_if::type node(int i) const {return *mArray2[i];} + + /// @brief Return the i'th leaf node with respect to breadth-first ordering + const Node0& leaf(uint32_t i) const { return *mArray0[i]; } + Node0& leaf(uint32_t i) { return *mArray0[i]; } + uint64_t leafCount() const {return mArray0.size();} + + /// @brief Return the i'th lower internal node with respect to breadth-first ordering + const Node1& lower(uint32_t i) const { return *mArray1[i]; } + Node1& lower(uint32_t i) { return *mArray1[i]; } + uint64_t lowerCount() const {return mArray1.size();} + + /// @brief Return the i'th upper internal node with respect to breadth-first ordering + const Node2& upper(uint32_t i) const { return *mArray2[i]; } + Node2& upper(uint32_t i) { return *mArray2[i]; } + uint64_t upperCount() const {return mArray2.size();} + + RootNodeType& root() {return mGrid.tree().root();} + const RootNodeType& root() const {return mGrid.tree().root();} + + TreeType& tree() {return mGrid.tree();} + const TreeType& tree() const {return mGrid.tree();} + + GridType& grid() {return mGrid;} + const GridType& grid() const {return mGrid;} + +protected: + + GridT &mGrid; + std::vector mArray0; // leaf nodes + std::vector mArray1; // lower internal nodes + std::vector mArray2; // upper internal nodes + +};// NodeManager + +template +typename util::enable_if::value>::type +sdfToLevelSet(NodeManagerT &mgr) +{ + mgr.grid().mGridClass = GridClass::LevelSet; + // Note that the bottom-up flood filling is essential + const auto outside = mgr.root().mBackground; + util::forEach(0, mgr.leafCount(), 8, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) mgr.leaf(i).signedFloodFill(outside); + }); + util::forEach(0, mgr.lowerCount(), 1, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) mgr.lower(i).signedFloodFill(outside); + }); + util::forEach(0, mgr.upperCount(), 1, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) mgr.upper(i).signedFloodFill(outside); + }); + mgr.root().signedFloodFill(outside); +}// sdfToLevelSet + +template +void levelSetToFog(NodeManagerT &mgr, bool rebuild = true) +{ + using ValueType = typename NodeManagerT::ValueType; + mgr.grid().mGridClass = GridClass::FogVolume; + const ValueType d = -mgr.root().mBackground, w = 1.0f / d; + //std::atomic_bool prune{false}; + std::atomic prune{false}; + auto op = [&](ValueType& v) -> bool { + if (v > ValueType(0)) { + v = ValueType(0); + return false; + } + v = v > d ? 
v * w : ValueType(1); + return true; + }; + util::forEach(0, mgr.leafCount(), 8, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) { + auto& leaf = mgr.leaf(i); + for (uint32_t i = 0; i < 512u; ++i) leaf.mValueMask.set(i, op(leaf.mValues[i])); + } + }); + util::forEach(0, mgr.lowerCount(), 1, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) { + auto& node = mgr.lower(i); + for (uint32_t i = 0; i < 4096u; ++i) { + if (node.mChildMask.isOn(i)) { + auto* leaf = node.mTable[i].child; + if (leaf->mValueMask.isOff()) {// prune leaf node + node.mTable[i].value = leaf->getFirstValue(); + node.mChildMask.setOff(i); + delete leaf; + prune = true; + } + } else { + node.mValueMask.set(i, op(node.mTable[i].value)); + } + } + } + }); + util::forEach(0, mgr.upperCount(), 1, [&](const util::Range1D& r) { + for (auto i = r.begin(); i != r.end(); ++i) { + auto& node = mgr.upper(i); + for (uint32_t i = 0; i < 32768u; ++i) { + if (node.mChildMask.isOn(i)) {// prune lower internal node + auto* child = node.mTable[i].child; + if (child->mChildMask.isOff() && child->mValueMask.isOff()) { + node.mTable[i].value = child->getFirstValue(); + node.mChildMask.setOff(i); + delete child; + prune = true; + } + } else { + node.mValueMask.set(i, op(node.mTable[i].value)); + } + } + } + }); + + for (auto it = mgr.root().mTable.begin(); it != mgr.root().mTable.end(); ++it) { + auto* child = it->second.child; + if (child == nullptr) { + it->second.state = op(it->second.value); + } else if (child->mChildMask.isOff() && child->mValueMask.isOff()) { + it->second.value = child->getFirstValue(); + it->second.state = false; + it->second.child = nullptr; + delete child; + prune = true; + } + } + if (rebuild && prune) mgr.init(); +}// levelSetToFog + +// ----------------------------> Implementations of random access methods <-------------------------------------- + +template +struct TouchLeaf { + static BuildLeaf& set(BuildLeaf &leaf, uint32_t) {return leaf;} +};// TouchLeaf + +/// @brief Implements Tree::getValue(Coord), i.e. return the value associated with a specific coordinate @c ijk. +/// @tparam BuildT Build type of the grid being called +/// @details The value at a coordinate maps to the background, a tile value or a leaf value. +template +struct GetValue { + static auto get(const BuildRoot &root) {return root.mBackground;} + static auto get(const BuildTile &tile) {return tile.value;} + static auto get(const BuildUpper &node, uint32_t n) {return node.mTable[n].value;} + static auto get(const BuildLower &node, uint32_t n) {return node.mTable[n].value;} + static auto get(const BuildLeaf &leaf, uint32_t n) {return leaf.getValue(n);} +};// GetValue + +/// @brief Implements Tree::isActive(Coord) +/// @tparam T Build type of the grid being called +template +struct GetState { + static bool get(const BuildRoot&) {return false;} + static bool get(const BuildTile &tile) {return tile.state;} + static bool get(const BuildUpper &node, uint32_t n) {return node.mValueMask.isOn(n);} + static bool get(const BuildLower &node, uint32_t n) {return node.mValueMask.isOn(n);} + static bool get(const BuildLeaf &leaf, uint32_t n) {return leaf.mValueMask.isOn(n);} +};// GetState + +/// @brief Set the value and its state at the leaf level mapped to by ijk, and create the leaf node and branch if needed. 
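+///
+/// For reference, when NANOVDB_NEW_ACCESSOR_METHODS is defined the accessor convenience
+/// methods above dispatch to these functors, e.g. (illustrative sketch only):
+/// @code
+/// nanovdb::tools::build::FloatGrid grid(0.0f);
+/// auto acc = grid.getAccessor();
+/// acc.setValue(nanovdb::Coord(1, 2, 3), 1.0f);   // dispatches to SetValue<float>
+/// float v = acc.getValue(nanovdb::Coord(1, 2, 3)); // dispatches to GetValue<float>
+/// @endcode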
+/// @tparam T BuildType of the corresponding tree +template +struct SetValue { + static BuildLeaf* set(BuildLeaf &leaf, uint32_t n) { + leaf.mValueMask.setOn(n);// always set the active bit + return &leaf; + } + static BuildLeaf* set(BuildLeaf &leaf, uint32_t n, const typename BuildLeaf::ValueType &v) { + leaf.setValue(n, v); + return &leaf; + } +};// SetValue + +/// @brief Implements Tree::probeLeaf(Coord) +/// @tparam T Build type of the grid being called +template +struct ProbeValue { + using ValueT = typename BuildLeaf::ValueType; + static bool get(const BuildRoot &root, ValueT &v) { + v = root.mBackground; + return false; + } + static bool get(const BuildTile &tile, ValueT &v) { + v = tile.value; + return tile.state; + } + static bool get(const BuildUpper &node, uint32_t n, ValueT &v) { + v = node.mTable[n].value; + return node.mValueMask.isOn(n); + } + static bool get(const BuildLower &node, uint32_t n, ValueT &v) { + v = node.mTable[n].value; + return node.mValueMask.isOn(n); + } + static bool get(const BuildLeaf &leaf, uint32_t n, ValueT &v) { + v = leaf.getValue(n); + return leaf.isActive(n); + } +};// ProbeValue + +} // namespace tools::build + +} // namespace nanovdb + +#endif // NANOVDB_TOOLS_BUILD_GRIDBUILDER_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/tools/GridChecksum.h b/external/nanovdb/tools/GridChecksum.h new file mode 100644 index 00000000..62323c30 --- /dev/null +++ b/external/nanovdb/tools/GridChecksum.h @@ -0,0 +1,427 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/GridChecksum.h + + \author Ken Museth + + \brief Computes a pair of uint32_t checksums, of a Grid, by means of 32 bit Cyclic Redundancy Check (CRC32) + + \details A CRC32 is the 32 bit remainder, or residue, of binary division of a message, by a polynomial. + + + \note before v32.6.0: checksum[0] = Grid+Tree+Root, checksum[1] = nodes + after v32.6.0: checksum[0] = Grid+Tree, checksum[1] = nodes + blind data in 4K blocks + + When serialized: + [Grid,Tree][Root][ROOT TILES...][Node<5>...][Node<4>...][Leaf<3>...][BlindMeta...][BlindData...] + checksum[2] before v32.6.0: <------------- [0] ------------><-------------- [1] ---------------> + checksum[2] after v32.6.0: <---[0]---><----------------------------------------[1]----------------------------------------> +*/ + +#ifndef NANOVDB_TOOLS_GRIDCHECKSUM_H_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_GRIDCHECKSUM_H_HAS_BEEN_INCLUDED + +#include // for std::generate +#include +#include +#include +#include // offsetof macro +#include +#include +#include // for std::unique_ptr + +#include +#include +#include + +// Define log of block size for FULL CRC32 computation. +// A value of 12 corresponds to a block size of 4KB (2^12 = 4096). +#define NANOVDB_CRC32_LOG2_BLOCK_SIZE 12 + +namespace nanovdb {// ================================================================== + +namespace tools {// ==================================================================== + +/// @brief Compute the (2 x CRC32) checksum of the specified @c gridData +/// @param gridData Base pointer to the grid from which the checksum is computed. +/// @param mode Defines the mode of computation for the checksum. 
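+///
+/// Typical round-trip through the checksum API (illustrative sketch, not upstream documentation):
+/// @code
+/// bool refreshAndVerify(nanovdb::GridData *gridData)
+/// {
+///     nanovdb::tools::updateChecksum(gridData, nanovdb::CheckMode::Full);
+///     return nanovdb::tools::validateChecksum(gridData, nanovdb::CheckMode::Full);
+/// }
+/// @endcode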
+/// @return Return the (2 x CRC32) checksum of the specified @c gridData +Checksum evalChecksum(const GridData *gridData, CheckMode mode = CheckMode::Default); + +/// @brief Extract the checksum of a grid +/// @param gridData Base pointer to grid with a checksum +/// @return Checksum encoded in the specified grid +inline Checksum getChecksum(const GridData *gridData) +{ + NANOVDB_ASSERT(gridData); + return gridData->mChecksum; +} + +/// @brief Return true if the checksum of @c gridData matches the expected +/// value already encoded into the grid's meta data. +/// @tparam BuildT Template parameter used to build NanoVDB grid. +/// @param grid Grid whose checksum is validated. +/// @param mode Defines the mode of computation for the checksum. +bool validateChecksum(const GridData *gridData, CheckMode mode = CheckMode::Default); + +/// @brief Updates the checksum of a grid +/// @param grid Grid whose checksum will be updated. +/// @param mode Defines the mode of computation for the checksum. +inline void updateChecksum(GridData *gridData, CheckMode mode) +{ + NANOVDB_ASSERT(gridData); + gridData->mChecksum = evalChecksum(gridData, mode); +} + +/// @brief Updates the checksum of a grid by preserving its mode +/// @param gridData Base pointer to grid +inline void updateChecksum(GridData *gridData) +{ + updateChecksum(gridData, gridData->mChecksum.mode()); +} + +}// namespace tools + +namespace util { + +/// @brief Initiate single entry in look-up-table for CRC32 computations +/// @param lut pointer of size 256 for look-up-table +/// @param n entry in table (assumed n < 256) +inline __hostdev__ void initCrc32Lut(uint32_t lut[256], uint32_t n) +{ + lut[n] = n; + uint32_t &cs = lut[n]; + for (int i = 0; i < 8; ++i) cs = (cs >> 1) ^ ((cs & 1) ? 0xEDB88320 : 0); +} + +/// @brief Initiate entire look-up-table for CRC32 computations +/// @param lut pointer of size 256 for look-up-table +inline __hostdev__ void initCrc32Lut(uint32_t lut[256]){for (uint32_t n = 0u; n < 256u; ++n) initCrc32Lut(lut, n);} + +/// @brief Create and initiate entire look-up-table for CRC32 computations +/// @return returns a unique pointer to the lookup table of size 256. 
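+///
+/// Usage sketch (illustrative; the message below is an arbitrary assumption):
+/// @code
+/// auto lut = nanovdb::util::createCrc32Lut();
+/// const char msg[] = "NanoVDB";
+/// const uint32_t cs = nanovdb::util::crc32(msg, sizeof(msg) - 1, lut.get());
+/// @endcode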
+inline std::unique_ptr createCrc32Lut() +{ + std::unique_ptr lut(new uint32_t[256]); + initCrc32Lut(lut.get()); + return lut; +} + +/// @brief Compute crc32 checksum of @c data of @c size bytes (without a lookup table)) +/// @param data pointer to beginning of data +/// @param size byte size of data +/// @param crc initial value of crc32 checksum +/// @return return crc32 checksum of @c data +inline __hostdev__ uint32_t crc32(const void* data, size_t size, uint32_t crc = 0) +{ + NANOVDB_ASSERT(data); + crc = ~crc; + for (auto *p = (const uint8_t*)data, *q = p + size; p != q; ++p) { + crc ^= *p; + for (int j = 0; j < 8; ++j) crc = (crc >> 1) ^ (0xEDB88320 & (-(crc & 1))); + } + return ~crc; +} + +/// @brief Compute crc32 checksum of data between @c begin and @c end +/// @param begin points to beginning of data +/// @param end points to end of @data, (exclusive) +/// @param crc initial value of crc32 checksum +/// @return return crc32 checksum +inline __hostdev__ uint32_t crc32(const void *begin, const void *end, uint32_t crc = 0) +{ + NANOVDB_ASSERT(begin && end); + NANOVDB_ASSERT(end >= begin); + return crc32(begin, (const char*)end - (const char*)begin, crc); +} + +/// @brief Compute crc32 checksum of @c data with @c size bytes using a lookup table +/// @param data pointer to begenning of data +/// @param size byte size +/// @param lut pointer to loopup table for accelerated crc32 computation +/// @param crc initial value of the checksum +/// @return crc32 checksum of @c data with @c size bytes +inline __hostdev__ uint32_t crc32(const void *data, size_t size, const uint32_t lut[256], uint32_t crc = 0) +{ + NANOVDB_ASSERT(data); + crc = ~crc; + for (auto *p = (const uint8_t*)data, *q = p + size; p != q; ++p) crc = lut[(crc ^ *p) & 0xFF] ^ (crc >> 8); + return ~crc; +} + +/// @brief Compute crc32 checksum of data between @c begin and @c end using a lookup table +/// @param begin points to beginning of data +/// @param end points to end of @data, (exclusive) +/// @param lut pointer to loopup table for accelerated crc32 computation +/// @param crc initial value of crc32 checksum +/// @return return crc32 checksum +inline __hostdev__ uint32_t crc32(const void *begin, const void *end, const uint32_t lut[256], uint32_t crc = 0) +{ + NANOVDB_ASSERT(begin && end); + NANOVDB_ASSERT(end >= begin); + return crc32(begin, (const char*)end - (const char*)begin, lut, crc); +}// uint32_t util::crc32(const void *begin, const void *end, const uint32_t lut[256], uint32_t crc = 0) + +/// @brief +/// @param data +/// @param size +/// @param lut +/// @return +inline uint32_t blockedCrc32(const void *data, size_t size, const uint32_t *lut) +{ + if (size == 0 ) return ~uint32_t(0); + const uint64_t blockCount = size >> NANOVDB_CRC32_LOG2_BLOCK_SIZE;// number of 4 KB (4096 byte) blocks + std::unique_ptr checksums(new uint32_t[blockCount]); + forEach(0, blockCount, 64, [&](const Range1D &r) { + uint32_t blockSize = 1 << NANOVDB_CRC32_LOG2_BLOCK_SIZE, *p = checksums.get() + r.begin(); + for (auto i = r.begin(); i != r.end(); ++i) { + if (i+1 == blockCount) blockSize += static_cast(size - (blockCount<...][Node<4>...][Leaf<3>...][BlindMeta...][BlindData...] 
+// checksum[2] before v32.6.0: <------------- [0] ------------><-------------- [1] ---------------> +// checksum[]2 after v32.6.0: <---[0]---><----------------------------------------[1]----------------------------------------> + +// ----------------------------> crc32Head <-------------------------------------- + +/// @brief +/// @tparam ValueT +/// @param grid +/// @param mode +/// @return +inline __hostdev__ uint32_t crc32Head(const GridData *gridData, const uint32_t *lut) +{ + NANOVDB_ASSERT(gridData); + const uint8_t *begin = (const uint8_t*)(gridData), *mid = begin + sizeof(GridData) + sizeof(TreeData); + if (gridData->mVersion <= Version(32,6,0)) mid = (const uint8_t*)(gridData->template nodePtr<2>()); + return util::crc32(begin + 16u, mid, lut);// exclude GridData::mMagic and GridData::mChecksum +}// uint32_t crc32Head(const GridData *gridData, const uint32_t *lut) + +/// @brief +/// @param gridData +/// @return +inline __hostdev__ uint32_t crc32Head(const GridData *gridData) +{ + NANOVDB_ASSERT(gridData); + const uint8_t *begin = (const uint8_t*)(gridData), *mid = begin + sizeof(GridData) + sizeof(TreeData); + if (gridData->mVersion <= Version(32,6,0)) mid = (const uint8_t*)(gridData->template nodePtr<2>()); + return util::crc32(begin + 16, mid);// exclude GridData::mMagic and GridData::mChecksum +}// uint32_t crc32Head(const GridData *gridData) + +// ----------------------------> crc32TailOld <-------------------------------------- + +// Old checksum +template +uint32_t crc32TailOld(const NanoGrid *grid, const uint32_t *lut) +{ + NANOVDB_ASSERT(grid->mVersion <= Version(32,6,0)); + const auto &tree = grid->tree(); + auto nodeMgrHandle = createNodeManager(*grid); + auto *nodeMgr = nodeMgrHandle.template mgr(); + assert(nodeMgr && isAligned(nodeMgr)); + const auto nodeCount = tree.nodeCount(0) + tree.nodeCount(1) + tree.nodeCount(2); + std::vector checksums(nodeCount, 0); + util::forEach(0, tree.nodeCount(2), 1,[&](const util::Range1D &r) {// process upper internal nodes + uint32_t *p = checksums.data() + r.begin(); + for (auto i = r.begin(); i != r.end(); ++i) { + const auto &node = nodeMgr->upper(static_cast(i)); + *p++ = util::crc32(&node, node.memUsage(), lut); + } + }); + util::forEach(0, tree.nodeCount(1), 1, [&](const util::Range1D &r) { // process lower internal nodes + uint32_t *p = checksums.data() + r.begin() + tree.nodeCount(2); + for (auto i = r.begin(); i != r.end(); ++i) { + const auto &node = nodeMgr->lower(static_cast(i)); + *p++ = util::crc32(&node, node.memUsage(), lut); + } + }); + util::forEach(0, tree.nodeCount(0), 8, [&](const util::Range1D &r) { // process leaf nodes + uint32_t *p = checksums.data() + r.begin() + tree.nodeCount(1) + tree.nodeCount(2); + for (auto i = r.begin(); i != r.end(); ++i) { + const auto &leaf = nodeMgr->leaf(static_cast(i)); + *p++ = util::crc32(&leaf, leaf.memUsage(), lut); + } + }); + return util::crc32(checksums.data(), sizeof(uint32_t)*checksums.size(), lut); +}// uint32_t crc32TailOld(const NanoGrid *grid, const uint32_t *lut) + +struct Crc32TailOld { + template + static uint32_t known(const GridData *gridData, const uint32_t *lut) + { + return crc32TailOld((const NanoGrid*)gridData, lut); + } + static uint32_t unknown(const GridData*, const uint32_t*) + { + throw std::runtime_error("Cannot call Crc32TailOld with grid of unknown type"); + return 0u;//dummy + } +};// struct Crc32TailOld + +inline uint32_t crc32Tail(const GridData *gridData, const uint32_t *lut) +{ + NANOVDB_ASSERT(gridData); + if (gridData->mVersion > 
Version(32,6,0)) { + const uint8_t *begin = (const uint8_t*)(gridData); + return util::blockedCrc32(begin + sizeof(GridData) + sizeof(TreeData), begin + gridData->mGridSize, lut); + } else { + return callNanoGrid(gridData, lut); + } +}// uint32_t crc32Tail(const GridData *gridData, const uint32_t *lut) + +template +uint32_t crc32Tail(const NanoGrid *grid, const uint32_t *lut) +{ + NANOVDB_ASSERT(grid); + if (grid->mVersion > Version(32,6,0)) { + const uint8_t *begin = (const uint8_t*)(grid); + return util::blockedCrc32(begin + sizeof(GridData) + sizeof(TreeData), begin + grid->mGridSize, lut); + } else { + return crc32TailOld(grid, lut); + } +}// uint32_t crc32Tail(const NanoGrid *gridData, const uint32_t *lut) + +// ----------------------------> evalChecksum <-------------------------------------- + +/// @brief +/// @tparam ValueT +/// @param grid +/// @param mode +/// @return +template +Checksum evalChecksum(const NanoGrid *grid, CheckMode mode) +{ + NANOVDB_ASSERT(grid); + Checksum cs; + if (mode != CheckMode::Empty) { + auto lut = util::createCrc32Lut(); + cs.head() = crc32Head(grid, lut.get()); + if (mode == CheckMode::Full) cs.tail() = crc32Tail(grid, lut.get()); + } + return cs; +}// checksum(const NanoGrid*, CheckMode) + +template +[[deprecated("Use evalChecksum(const NanoGrid *grid, CheckMode mode) instead")]] +Checksum checksum(const NanoGrid *grid, CheckMode mode){return evalChecksum(grid, mode);} + +inline Checksum evalChecksum(const GridData *gridData, CheckMode mode) +{ + NANOVDB_ASSERT(gridData); + Checksum cs; + if (mode != CheckMode::Disable) { + auto lut = util::createCrc32Lut(); + cs.head() = crc32Head(gridData, lut.get()); + if (mode == CheckMode::Full) cs.tail() = crc32Tail(gridData, lut.get()); + } + return cs; +}// evalChecksum(GridData *data, CheckMode mode) + +[[deprecated("Use evalChecksum(const NanoGrid*, CheckMode) instead")]] +inline Checksum checksum(const GridData *gridData, CheckMode mode){return evalChecksum(gridData, mode);} + +template +[[deprecated("Use checksum(const NanoGrid*, CheckMode) instead")]] +Checksum checksum(const NanoGrid &grid, CheckMode mode){return checksum(&grid, mode);} + +// ----------------------------> validateChecksum <-------------------------------------- + +/// @brief +/// @tparam ValueT +/// @param grid +/// @param mode +/// @return +template +bool validateChecksum(const NanoGrid *grid, CheckMode mode) +{ + if (grid->mChecksum.isEmpty() || mode == CheckMode::Empty) return true; + auto lut = util::createCrc32Lut(); + bool checkHead = grid->mChecksum.head() == crc32Head(grid->data(), lut.get()); + if (grid->mChecksum.isHalf() || mode == CheckMode::Half || !checkHead) { + return checkHead; + } else { + return grid->mChecksum.tail() == crc32Tail(grid, lut.get()); + } +} + +/// @brief +/// @tparam ValueT +/// @param grid +/// @param mode +/// @return +inline bool validateChecksum(const GridData *gridData, CheckMode mode) +{ + if (gridData->mChecksum.isEmpty()|| mode == CheckMode::Empty) return true; + auto lut = util::createCrc32Lut(); + bool checkHead = gridData->mChecksum.head() == crc32Head(gridData, lut.get()); + if (gridData->mChecksum.isHalf() || mode == CheckMode::Half || !checkHead) { + return checkHead; + } else { + return gridData->mChecksum.tail() == crc32Tail(gridData, lut.get()); + } +}// bool validateChecksum(const GridData *gridData, CheckMode mode) + +template +[[deprecated("Use validateChecksum(const NanoGrid*, CheckMode) instead")]] +bool validateChecksum(const NanoGrid &grid, CheckMode mode){return 
validateChecksum(&grid, mode);} + +// ----------------------------> updateChecksum <-------------------------------------- + +/// @brief +/// @tparam ValueT +/// @param grid +/// @param mode +template +void updateChecksum(NanoGrid *grid, CheckMode mode){grid->mChecksum = evalChecksum(grid, mode);} + +template +void updateChecksum(NanoGrid *grid){grid->mChecksum = evalChecksum(grid, grid->mChecksum.mode());} + +// deprecated method that takes a reference vs a pointer +template +[[deprecated("Use updateChecksum(const NanoGrid*, CheckMode) instead")]] +void updateChecksum(NanoGrid &grid, CheckMode mode){updateChecksum(&grid, mode);} + +// ----------------------------> updateGridCount <-------------------------------------- + +/// @brief Updates the ground index and count, as well as the head checksum if needed +/// @param data Pointer to grid data +/// @param gridIndex New value of the index +/// @param gridCount New value of the grid count +inline void updateGridCount(GridData *data, uint32_t gridIndex, uint32_t gridCount) +{ + NANOVDB_ASSERT(data && gridIndex < gridCount); + if (data->mGridIndex != gridIndex || data->mGridCount != gridCount) { + data->mGridIndex = gridIndex; + data->mGridCount = gridCount; + if (!data->mChecksum.isEmpty()) data->mChecksum.head() = crc32Head(data); + } +} + +} // namespace tools ====================================================================== + + +} // namespace nanovdb ==================================================================== + +#endif // NANOVDB_TOOLS_GRIDCHECKSUM_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/tools/GridStats.h b/external/nanovdb/tools/GridStats.h new file mode 100644 index 00000000..fac54b20 --- /dev/null +++ b/external/nanovdb/tools/GridStats.h @@ -0,0 +1,877 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/GridStats.h + + \author Ken Museth + + \date August 29, 2020 + + \brief Re-computes min/max/avg/var/bbox information for each node in a + pre-existing NanoVDB grid. +*/ + +#ifndef NANOVDB_TOOLS_GRIDSTATS_H_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_GRIDSTATS_H_HAS_BEEN_INCLUDED + +#include + +#ifdef NANOVDB_USE_TBB +#include +#include +#endif + +#if defined(__CUDACC__) +#include // for cuda::std::numeric_limits +#else +#include // for std::numeric_limits +#endif + +#include +#include + +namespace nanovdb { + +namespace tools {//======================================================================= + +/// @brief Grid flags which indicate what extra information is present in the grid buffer +enum class StatsMode : uint32_t { + Disable = 0,// disable the computation of any type of statistics (obviously the FASTEST!) + BBox = 1,// only compute the bbox of active values per node and total activeVoxelCount + MinMax = 2,// additionally compute extrema values + All = 3,// compute all of the statics, i.e. bbox, min/max, average and standard deviation + Default = 3,// default computational mode for statistics + End = 4, +}; + +/// @brief Re-computes the min/max, stats and bbox information for an existing NanoVDB Grid +/// @param grid Grid whose stats to update +/// @param mode Mode of computation for the statistics. +template +void updateGridStats(NanoGrid* grid, StatsMode mode = StatsMode::Default); + +template::Rank> +class Extrema; + +/// @brief Determine the extrema of all the values in a grid that +/// intersects the specified bounding box. 
+/// @tparam BuildT Build type of the input grid +/// @param grid typed grid +/// @param bbox index bounding box in which min/max are computed +/// @return Extream of values insixe @c bbox +template +Extrema::ValueType> +getExtrema(const NanoGrid& grid, const CoordBBox &bbox); + +//================================================================================================ + +/// @brief Template specialization of Extrema on scalar value types, i.e. rank = 0 +template +class Extrema +{ +protected: + ValueT mMin, mMax; + +public: + using ValueType = ValueT; + __hostdev__ Extrema() +#if defined(__CUDACC__) + // note "::cuda" is needed since we also define a cuda namespace + : mMin(::cuda::std::numeric_limits::max()) + , mMax(::cuda::std::numeric_limits::lowest()) +#else + : mMin(std::numeric_limits::max()) + , mMax(std::numeric_limits::lowest()) +#endif + { + } + __hostdev__ Extrema(const ValueT& v) + : mMin(v) + , mMax(v) + { + } + __hostdev__ Extrema(const ValueT& a, const ValueT& b) + : mMin(a) + , mMax(b) + { + } + __hostdev__ Extrema& min(const ValueT& v) + { + if (v < mMin) mMin = v; + return *this; + } + __hostdev__ Extrema& max(const ValueT& v) + { + if (v > mMax) mMax = v; + return *this; + } + __hostdev__ Extrema& add(const ValueT& v) + { + this->min(v); + this->max(v); + return *this; + } + __hostdev__ Extrema& add(const ValueT& v, uint64_t) { return this->add(v); } + __hostdev__ Extrema& add(const Extrema& other) + { + this->min(other.mMin); + this->max(other.mMax); + return *this; + } + __hostdev__ const ValueT& min() const { return mMin; } + __hostdev__ const ValueT& max() const { return mMax; } + __hostdev__ operator bool() const { return mMin <= mMax; } + __hostdev__ static constexpr bool hasMinMax() { return !util::is_same::value; } + __hostdev__ static constexpr bool hasAverage() { return false; } + __hostdev__ static constexpr bool hasStdDeviation() { return false; } + __hostdev__ static constexpr bool hasStats() { return !util::is_same::value; } + __hostdev__ static constexpr size_t size() { return 0; } + + template + __hostdev__ void setStats(NodeT &node) const + { + node.setMin(this->min()); + node.setMax(this->max()); + } +}; // Extrema + +/// @brief Template specialization of Extrema on vector value types, i.e. 
rank = 1 +template +class Extrema +{ +protected: + using Real = typename VecT::ValueType; // this works with both nanovdb and openvdb vectors + struct Pair + { + Real scalar; + VecT vector; + + __hostdev__ Pair(Real s)// is only used by Extrema() default c-tor + : scalar(s) + , vector(s) + { + } + __hostdev__ Pair(const VecT& v) + : scalar(v.lengthSqr()) + , vector(v) + { + } + __hostdev__ bool operator<(const Pair& rhs) const { return scalar < rhs.scalar; } + } mMin, mMax; + __hostdev__ Extrema& add(const Pair& p) + { + if (p < mMin) mMin = p; + if (mMax < p) mMax = p; + return *this; + } + +public: + using ValueType = VecT; + __hostdev__ Extrema() +#if defined(__CUDACC__) + // note "::cuda" is needed since we also define a cuda namespace + : mMin(::cuda::std::numeric_limits::max()) + , mMax(::cuda::std::numeric_limits::lowest()) +#else + : mMin(std::numeric_limits::max()) + , mMax(std::numeric_limits::lowest()) +#endif + { + } + __hostdev__ Extrema(const VecT& v) + : mMin(v) + , mMax(v) + { + } + __hostdev__ Extrema(const VecT& a, const VecT& b) + : mMin(a) + , mMax(b) + { + } + __hostdev__ Extrema& min(const VecT& v) + { + Pair tmp(v); + if (tmp < mMin) mMin = tmp; + return *this; + } + __hostdev__ Extrema& max(const VecT& v) + { + Pair tmp(v); + if (mMax < tmp) mMax = tmp; + return *this; + } + __hostdev__ Extrema& add(const VecT& v) { return this->add(Pair(v)); } + __hostdev__ Extrema& add(const VecT& v, uint64_t) { return this->add(Pair(v)); } + __hostdev__ Extrema& add(const Extrema& other) + { + if (other.mMin < mMin) mMin = other.mMin; + if (mMax < other.mMax) mMax = other.mMax; + return *this; + } + __hostdev__ const VecT& min() const { return mMin.vector; } + __hostdev__ const VecT& max() const { return mMax.vector; } + __hostdev__ operator bool() const { return !(mMax < mMin); } + __hostdev__ static constexpr bool hasMinMax() { return !util::is_same::value; } + __hostdev__ static constexpr bool hasAverage() { return false; } + __hostdev__ static constexpr bool hasStdDeviation() { return false; } + __hostdev__ static constexpr bool hasStats() { return !util::is_same::value; } + __hostdev__ static constexpr size_t size() { return 0; } + + template + __hostdev__ void setStats(NodeT &node) const + { + node.setMin(this->min()); + node.setMax(this->max()); + } +}; // Extrema + +//================================================================================================ + +template::Rank> +class Stats; + +/// @brief This class computes statistics (minimum value, maximum +/// value, mean, variance and standard deviation) of a population +/// of floating-point values. +/// +/// @details variance = Mean[ (X-Mean[X])^2 ] = Mean[X^2] - Mean[X]^2, +/// standard deviation = sqrt(variance) +/// +/// @note This class employs incremental computation and double precision. 
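+///
+/// A small usage sketch (illustrative; the sample values are arbitrary):
+/// @code
+/// nanovdb::tools::Stats<float> stats;
+/// for (float v : {1.0f, 2.0f, 4.0f}) stats.add(v);
+/// const double mean  = stats.avg();    // incremental (Welford-style) mean
+/// const double sigma = stats.stdDev(); // sqrt of the (biased) population variance
+/// @endcode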
+template +class Stats : public Extrema +{ +protected: + using BaseT = Extrema; + using RealT = double; // for accuracy the internal precission must be 64 bit floats + size_t mSize; + double mAvg, mAux; + +public: + using ValueType = ValueT; + __hostdev__ Stats() + : BaseT() + , mSize(0) + , mAvg(0.0) + , mAux(0.0) + { + } + __hostdev__ Stats(const ValueT& val) + : BaseT(val) + , mSize(1) + , mAvg(RealT(val)) + , mAux(0.0) + { + } + /// @brief Add a single sample + __hostdev__ Stats& add(const ValueT& val) + { + BaseT::add(val); + mSize += 1; + const double delta = double(val) - mAvg; + mAvg += delta / double(mSize); + mAux += delta * (double(val) - mAvg); + return *this; + } + /// @brief Add @a n samples with constant value @a val. + __hostdev__ Stats& add(const ValueT& val, uint64_t n) + { + const double denom = 1.0 / double(mSize + n); + const double delta = double(val) - mAvg; + mAvg += denom * delta * double(n); + mAux += denom * delta * delta * double(mSize) * double(n); + BaseT::add(val); + mSize += n; + return *this; + } + + /// Add the samples from the other Stats instance. + __hostdev__ Stats& add(const Stats& other) + { + if (other.mSize > 0) { + const double denom = 1.0 / double(mSize + other.mSize); + const double delta = other.mAvg - mAvg; + mAvg += denom * delta * double(other.mSize); + mAux += other.mAux + denom * delta * delta * double(mSize) * double(other.mSize); + BaseT::add(other); + mSize += other.mSize; + } + return *this; + } + + __hostdev__ static constexpr bool hasMinMax() { return !util::is_same::value; } + __hostdev__ static constexpr bool hasAverage() { return !util::is_same::value; } + __hostdev__ static constexpr bool hasStdDeviation() { return !util::is_same::value; } + __hostdev__ static constexpr bool hasStats() { return !util::is_same::value; } + + __hostdev__ size_t size() const { return mSize; } + + //@{ + /// Return the arithmetic mean, i.e. average, value. + __hostdev__ double avg() const { return mAvg; } + __hostdev__ double mean() const { return mAvg; } + //@} + + //@{ + /// @brief Return the population variance. + /// + /// @note The unbiased sample variance = population variance * num/(num-1) + __hostdev__ double var() const { return mSize < 2 ? 0.0 : mAux / double(mSize); } + __hostdev__ double variance() const { return this->var(); } + //@} + + //@{ + /// @brief Return the standard deviation (=Sqrt(variance)) as + /// defined from the (biased) population variance. + __hostdev__ double std() const { return sqrt(this->var()); } + __hostdev__ double stdDev() const { return this->std(); } + //@} + + template + __hostdev__ void setStats(NodeT &node) const + { + node.setMin(this->min()); + node.setMax(this->max()); + node.setAvg(this->avg()); + node.setDev(this->std()); + } +}; // end Stats + +/// @brief This class computes statistics (minimum value, maximum +/// value, mean, variance and standard deviation) of a population +/// of floating-point values. +/// +/// @details variance = Mean[ (X-Mean[X])^2 ] = Mean[X^2] - Mean[X]^2, +/// standard deviation = sqrt(variance) +/// +/// @note This class employs incremental computation and double precision. 
+template +class Stats : public Extrema +{ +protected: + using BaseT = Extrema; + using RealT = double; // for accuracy the internal precision must be 64 bit floats + size_t mSize; + double mAvg, mAux; + +public: + using ValueType = ValueT; + __hostdev__ Stats() + : BaseT() + , mSize(0) + , mAvg(0.0) + , mAux(0.0) + { + } + /// @brief Add a single sample + __hostdev__ Stats& add(const ValueT& val) + { + typename BaseT::Pair tmp(val); + BaseT::add(tmp); + mSize += 1; + const double delta = tmp.scalar - mAvg; + mAvg += delta / double(mSize); + mAux += delta * (tmp.scalar - mAvg); + return *this; + } + /// @brief Add @a n samples with constant value @a val. + __hostdev__ Stats& add(const ValueT& val, uint64_t n) + { + typename BaseT::Pair tmp(val); + const double denom = 1.0 / double(mSize + n); + const double delta = tmp.scalar - mAvg; + mAvg += denom * delta * double(n); + mAux += denom * delta * delta * double(mSize) * double(n); + BaseT::add(tmp); + mSize += n; + return *this; + } + + /// Add the samples from the other Stats instance. + __hostdev__ Stats& add(const Stats& other) + { + if (other.mSize > 0) { + const double denom = 1.0 / double(mSize + other.mSize); + const double delta = other.mAvg - mAvg; + mAvg += denom * delta * double(other.mSize); + mAux += other.mAux + denom * delta * delta * double(mSize) * double(other.mSize); + BaseT::add(other); + mSize += other.mSize; + } + return *this; + } + + __hostdev__ static constexpr bool hasMinMax() { return !util::is_same::value; } + __hostdev__ static constexpr bool hasAverage() { return !util::is_same::value; } + __hostdev__ static constexpr bool hasStdDeviation() { return !util::is_same::value; } + __hostdev__ static constexpr bool hasStats() { return !util::is_same::value; } + + __hostdev__ size_t size() const { return mSize; } + + //@{ + /// Return the arithmetic mean, i.e. average, value. + __hostdev__ double avg() const { return mAvg; } + __hostdev__ double mean() const { return mAvg; } + //@} + + //@{ + /// @brief Return the population variance. + /// + /// @note The unbiased sample variance = population variance * num/(num-1) + __hostdev__ double var() const { return mSize < 2 ? 0.0 : mAux / double(mSize); } + __hostdev__ double variance() const { return this->var(); } + //@} + + //@{ + /// @brief Return the standard deviation (=Sqrt(variance)) as + /// defined from the (biased) population variance. 
+ __hostdev__ double std() const { return sqrt(this->var()); } + __hostdev__ double stdDev() const { return this->std(); } + //@} + + template + __hostdev__ void setStats(NodeT &node) const + { + node.setMin(this->min()); + node.setMax(this->max()); + node.setAvg(this->avg()); + node.setDev(this->std()); + } +}; // end Stats + +/// @brief No-op Stats class +template +struct NoopStats +{ + using ValueType = ValueT; + __hostdev__ NoopStats() {} + __hostdev__ NoopStats(const ValueT&) {} + __hostdev__ NoopStats& add(const ValueT&) { return *this; } + __hostdev__ NoopStats& add(const ValueT&, uint64_t) { return *this; } + __hostdev__ NoopStats& add(const NoopStats&) { return *this; } + __hostdev__ static constexpr size_t size() { return 0; } + __hostdev__ static constexpr bool hasMinMax() { return false; } + __hostdev__ static constexpr bool hasAverage() { return false; } + __hostdev__ static constexpr bool hasStdDeviation() { return false; } + __hostdev__ static constexpr bool hasStats() { return false; } + template + __hostdev__ void setStats(NodeT&) const{} +}; // end NoopStats + +//================================================================================================ + +/// @brief Allows for the construction of NanoVDB grids without any dependency +template> +class GridStats +{ + struct NodeStats; + using TreeT = typename GridT::TreeType; + using ValueT = typename TreeT::ValueType; + using BuildT = typename TreeT::BuildType; + using Node0 = typename TreeT::Node0; // leaf + using Node1 = typename TreeT::Node1; // lower + using Node2 = typename TreeT::Node2; // upper + using RootT = typename TreeT::Node3; // root + static_assert(util::is_same::value, "Mismatching type"); + + ValueT mDelta; // skip rendering of node if: node.max < -mDelta || node.min > mDelta + + void process( GridT& );// process grid and all tree nodes + void process( TreeT& );// process Tree, root node and child nodes + void process( RootT& );// process root node and child nodes + NodeStats process( Node0& );// process leaf node + + template + NodeStats process( NodeT& );// process internal node and child nodes + + template + void setStats(DataT*, const Extrema&); + template + void setStats(DataT*, const Stats&); + template + void setStats(DataT*, const NoopStats&) {} + + template + typename std::enable_if::value>::type + setFlag(const T&, const T&, FlagT& flag) const { flag &= ~FlagT(1); } // unset 1st bit to enable rendering + + template + typename std::enable_if::value>::type + setFlag(const T& min, const T& max, FlagT& flag) const; + +public: + GridStats() = default; + + void update(GridT& grid, ValueT delta = ValueT(0)); + +}; // GridStats + +template +struct GridStats::NodeStats +{ + StatsT stats; + CoordBBox bbox; + + NodeStats(): stats(), bbox() {}//activeCount(0), bbox() {}; + + NodeStats& add(const NodeStats &other) + { + stats.add( other.stats );// no-op for NoopStats?! 
+ bbox[0].minComponent(other.bbox[0]); + bbox[1].maxComponent(other.bbox[1]); + return *this; + } +};// GridStats::NodeStats + +//================================================================================================ + +template +void GridStats::update(GridT& grid, ValueT delta) +{ + mDelta = delta; // delta = voxel size for level sets, else 0 + this->process( grid ); +} + +//================================================================================================ + +template +template +inline void GridStats:: + setStats(DataT* data, const Extrema& e) +{ + data->setMin(e.min()); + data->setMax(e.max()); +} + +template +template +inline void GridStats:: + setStats(DataT* data, const Stats& s) +{ + data->setMin(s.min()); + data->setMax(s.max()); + data->setAvg(s.avg()); + data->setDev(s.std()); +} + +//================================================================================================ + +template +template +inline typename std::enable_if::value>::type +GridStats:: + setFlag(const T& min, const T& max, FlagT& flag) const +{ + if (mDelta > 0 && (min > mDelta || max < -mDelta)) {// LS: min > dx || max < -dx + flag |= FlagT(1u);// set 1st bit to disable rendering + } else { + flag &= ~FlagT(1u);// unset 1st bit to enable rendering + } +} + +//================================================================================================ + +template +void GridStats::process( GridT &grid ) +{ + this->process( grid.tree() );// this processes tree, root and all nodes + + // set world space AABB + auto& data = *grid.data(); + const auto& indexBBox = grid.tree().root().bbox(); + if (indexBBox.empty()) { + data.mWorldBBox = Vec3dBBox(); + data.setBBoxOn(false); + } else { + // Note that below max is offset by one since CoordBBox.max is inclusive + // while bbox.max is exclusive. However, min is inclusive in both + // CoordBBox and Vec3dBBox. This also guarantees that a grid with a single + // active voxel, does not have an empty world bbox! E.g. if a grid with a + // unit index-to-world transformation only contains the active voxel (0,0,0) + // then indeBBox = (0,0,0) -> (0,0,0) and then worldBBox = (0.0, 0.0, 0.0) + // -> (1.0, 1.0, 1.0). This is a consequence of the different definitions + // of index and world bounding boxes inherited from OpenVDB! 
+ grid.mWorldBBox = CoordBBox(indexBBox[0], indexBBox[1].offsetBy(1)).transform(grid.map()); + grid.setBBoxOn(true); + } + + // set bit flags + data.setMinMaxOn(StatsT::hasMinMax()); + data.setAverageOn(StatsT::hasAverage()); + data.setStdDeviationOn(StatsT::hasStdDeviation()); +} // GridStats::process( Grid ) + +//================================================================================================ + +template +inline void GridStats::process( typename GridT::TreeType &tree ) +{ + this->process( tree.root() ); +} + +//================================================================================================ + +template +void GridStats::process(RootT &root) +{ + using ChildT = Node2; + auto &data = *root.data(); + if (data.mTableSize == 0) { // empty root node + data.mMinimum = data.mMaximum = data.mBackground; + data.mAverage = data.mStdDevi = 0; + data.mBBox = CoordBBox(); + } else { + NodeStats total; + for (uint32_t i = 0; i < data.mTableSize; ++i) { + auto* tile = data.tile(i); + if (tile->isChild()) { // process child node + total.add( this->process( *data.getChild(tile) ) ); + } else if (tile->state) { // active tile + const Coord ijk = tile->origin(); + total.bbox[0].minComponent(ijk); + total.bbox[1].maxComponent(ijk + Coord(ChildT::DIM - 1)); + if (StatsT::hasStats()) { // resolved at compile time + total.stats.add(tile->value, ChildT::NUM_VALUES); + } + } + } + this->setStats(&data, total.stats); + if (total.bbox.empty()) { + std::cerr << "\nWarning in GridStats: input tree only contained inactive root tiles!" + << "\nWhile not strictly an error it's rather suspicious!\n"; + } + data.mBBox = total.bbox; + } +} // GridStats::process( RootNode ) + +//================================================================================================ + +template +template +typename GridStats::NodeStats +GridStats::process(NodeT &node) +{ + static_assert(util::is_same::value || util::is_same::value, "Incorrect node type"); + using ChildT = typename NodeT::ChildNodeType; + + NodeStats total; + auto* data = node.data(); + + // Serial processing of active tiles + if (const auto tileCount = data->mValueMask.countOn()) { + //total.activeCount = tileCount * ChildT::NUM_VALUES; // active tiles + for (auto it = data->mValueMask.beginOn(); it; ++it) { + if (StatsT::hasStats()) { // resolved at compile time + total.stats.add( data->mTable[*it].value, ChildT::NUM_VALUES ); + } + const Coord ijk = node.offsetToGlobalCoord(*it); + total.bbox[0].minComponent(ijk); + total.bbox[1].maxComponent(ijk + Coord(int32_t(ChildT::DIM) - 1)); + } + } + + // Serial or parallel processing of child nodes + if (const size_t childCount = data->mChildMask.countOn()) { +#ifndef NANOVDB_USE_TBB + for (auto it = data->mChildMask.beginOn(); it; ++it) { + total.add( this->process( *data->getChild(*it) ) ); + } +#else + std::unique_ptr childNodes(new ChildT*[childCount]); + ChildT **ptr = childNodes.get(); + for (auto it = data->mChildMask.beginOn(); it; ++it) { + *ptr++ = data->getChild( *it ); + } + using RangeT = tbb::blocked_range; + total.add( tbb::parallel_reduce(RangeT(0, childCount), NodeStats(), + [&](const RangeT &r, NodeStats local)->NodeStats { + for(size_t i=r.begin(); i!=r.end(); ++i){ + local.add( this->process( *childNodes[i] ) ); + } + return local;}, + [](NodeStats a, const NodeStats &b)->NodeStats { return a.add( b ); } + )); +#endif + } + + data->mBBox = total.bbox; + if (total.bbox.empty()) { + data->mFlags |= uint32_t(1); // set 1st bit on to disable rendering of node + data->mFlags &= 
~uint32_t(2); // set 2nd bit off since node does not contain active values + } else { + data->mFlags |= uint32_t(2); // set 2nd bit on since node contains active values + if (StatsT::hasStats()) { // resolved at compile time + this->setStats(data, total.stats); + this->setFlag(data->mMinimum, data->mMaximum, data->mFlags); + } + } + return total; +} // GridStats::process( InternalNode ) + +//================================================================================================ + +template +typename GridStats::NodeStats +GridStats::process(Node0 &leaf) +{ + NodeStats local; + if (leaf.updateBBox()) {// optionally update active bounding box (updates data->mFlags) + local.bbox[0] = local.bbox[1] = leaf.mBBoxMin; + local.bbox[1] += Coord(leaf.mBBoxDif[0], leaf.mBBoxDif[1], leaf.mBBoxDif[2]); + if (StatsT::hasStats()) {// resolved at compile time + for (auto it = leaf.cbeginValueOn(); it; ++it) local.stats.add(*it); + this->setStats(&leaf, local.stats); + this->setFlag(leaf.getMin(), leaf.getMax(), leaf.mFlags); + } + } + return local; +} // GridStats::process( LeafNode ) + +//================================================================================================ + +template +void updateGridStats(NanoGrid* grid, StatsMode mode) +{ + NANOVDB_ASSERT(grid); + using GridT = NanoGrid; + using ValueT = typename GridT::ValueType; + if (mode == StatsMode::Disable) { + return; + } else if (mode == StatsMode::BBox || util::is_same::value) { + GridStats > stats; + stats.update(*grid); + } else if (mode == StatsMode::MinMax) { + GridStats > stats; + stats.update(*grid); + } else if (mode == StatsMode::All) { + GridStats > stats; + stats.update(*grid); + } else { + throw std::runtime_error("gridStats: Unsupported statistics mode."); + } +}// updateGridStats + +template +[[deprecated("Use nanovdb::tools::updateGridStats(NanoGrid*, StatsMode) instead")]] +void gridStats(NanoGrid& grid, StatsMode mode = StatsMode::Default) +{ + updateGridStats(&grid, mode); +} + +//================================================================================================ + +namespace { + +// returns a bitmask (of size 32^3 or 16^3) that marks all the entries +// in a node table that intersects with the specified bounding box. +template +Mask getBBoxMask(const CoordBBox &bbox, const NodeT* node) +{ + Mask mask;// typically 32^3 or 16^3 bit mask + auto b = CoordBBox::createCube(node->origin(), node->dim()); + assert( bbox.hasOverlap(b) ); + if ( bbox.isInside(b) ) { + mask.setOn();//node is completely inside the bbox so early out + } else { + b.intersect(bbox);// trim bounding box + // transform bounding box from global to local coordinates + b.min() &= NodeT::DIM-1u; + b.min() >>= NodeT::ChildNodeType::TOTAL; + b.max() &= NodeT::DIM-1u; + b.max() >>= NodeT::ChildNodeType::TOTAL; + assert( !b.empty() ); + auto it = b.begin();// iterates over all the child nodes or tiles that intersects bbox + for (const Coord& ijk = *it; it; ++it) { + mask.setOn(ijk[2] + (ijk[1] << NodeT::LOG2DIM) + (ijk[0] << 2*NodeT::LOG2DIM)); + } + } + return mask; +}// getBBoxMask + +}// end of unnamed namespace + +/// @brief return the extrema of all the values in a grid that +/// intersects the specified bounding box. 
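+///
+/// Usage sketch (illustrative; the grid reference and bounding box are assumptions):
+/// @code
+/// void printValueRange(const nanovdb::NanoGrid<float> &grid)
+/// {
+///     const nanovdb::CoordBBox bbox(nanovdb::Coord(0), nanovdb::Coord(100));
+///     const auto ext = nanovdb::tools::getExtrema(grid, bbox);
+///     printf("min=%f max=%f\n", ext.min(), ext.max());
+/// }
+/// @endcode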
+template +Extrema::ValueType> +getExtrema(const NanoGrid& grid, const CoordBBox &bbox) +{ + using GridT = NanoGrid; + using ValueT = typename GridT::ValueType; + using TreeT = typename GridTree::type; + using RootT = typename NodeTrait::type;// root node + using Node2 = typename NodeTrait::type;// upper internal node + using Node1 = typename NodeTrait::type;// lower internal node + using Node0 = typename NodeTrait::type;// leaf node + + Extrema extrema; + const RootT &root = grid.tree().root(); + const auto &bbox3 = root.bbox(); + if (bbox.isInside(bbox3)) {// bbox3 is contained inside bbox + extrema.min(root.minimum()); + extrema.max(root.maximum()); + extrema.add(root.background()); + } else if (bbox.hasOverlap(bbox3)) { + const auto *data3 = root.data(); + for (uint32_t i=0; imTableSize; ++i) { + const auto *tile = data3->tile(i); + CoordBBox bbox2 = CoordBBox::createCube(tile->origin(), Node2::dim()); + if (!bbox.hasOverlap(bbox2)) continue; + if (tile->isChild()) { + const Node2 *node2 = data3->getChild(tile); + if (bbox.isInside(bbox2)) { + extrema.min(node2->minimum()); + extrema.max(node2->maximum()); + } else {// partial intersections at level 2 + auto *data2 = node2->data(); + const auto bboxMask2 = getBBoxMask(bbox, node2); + for (auto it2 = bboxMask2.beginOn(); it2; ++it2) { + if (data2->mChildMask.isOn(*it2)) { + const Node1* node1 = data2->getChild(*it2); + CoordBBox bbox1 = CoordBBox::createCube(node1->origin(), Node1::dim()); + if (bbox.isInside(bbox1)) { + extrema.min(node1->minimum()); + extrema.max(node1->maximum()); + } else {// partial intersection at level 1 + auto *data1 = node1->data(); + const auto bboxMask1 = getBBoxMask(bbox, node1); + for (auto it1 = bboxMask1.beginOn(); it1; ++it1) { + if (data1->mChildMask.isOn(*it1)) { + const Node0* node0 = data1->getChild(*it1); + CoordBBox bbox0 = CoordBBox::createCube(node0->origin(), Node0::dim()); + if (bbox.isInside(bbox0)) { + extrema.min(node0->minimum()); + extrema.max(node0->maximum()); + } else {// partial intersection at level 0 + auto *data0 = node0->data(); + const auto bboxMask0 = getBBoxMask(bbox, node0); + for (auto it0 = bboxMask0.beginOn(); it0; ++it0) { + extrema.add(data0->getValue(*it0)); + } + }// end partial intersection at level 0 + } else {// tile at level 1 + extrema.add(data1->mTable[*it1].value); + } + } + }// end of partial intersection at level 1 + } else {// tile at level 2 + extrema.add(data2->mTable[*it2].value); + } + }// loop over tiles and nodes at level 2 + }// end of partial intersection at level 1 + } else {// tile at root level + extrema.add(tile->value); + } + }// loop over root table + } else {// bbox does not overlap the grid + extrema.add(root.background()); + } + return extrema; +}// getExtrema + +}// namespace tools + +} // namespace nanovdb + +#endif // NANOVDB_TOOLS_GRIDSTATS_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/tools/GridValidator.h b/external/nanovdb/tools/GridValidator.h new file mode 100644 index 00000000..6a8565cb --- /dev/null +++ b/external/nanovdb/tools/GridValidator.h @@ -0,0 +1,244 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/GridValidator.h + + \author Ken Museth + + \date August 30, 2020 + + \brief Checks the validity of an existing NanoVDB grid. 
+ + \note before v32.6.0: checksum[0] = Grid+Tree+Root, checksum[1] = nodes + after v32.6.0: checksum[0] = Grid+Tree, checksum[1] = nodes + blind data in 4K blocks + + When serialized: + [Grid,Tree][Root][ROOT TILES...][Node<5>...][Node<4>...][Leaf<3>...][BlindMeta...][BlindData...] +*/ + +#ifndef NANOVDB_TOOLS_GRID_VALIDATOR_H_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_GRID_VALIDATOR_H_HAS_BEEN_INCLUDED + +#include // for std::cerr + +#include +#include + +namespace nanovdb { + +namespace tools { + +/// @brief Performs several validation tests on a grid pointer. +/// @tparam ValueT Build type of the input grid +/// @param grid const point to the grid that needs validation +/// @param mode Mode of the validation check (defined in GridChecksum.h) +/// @param verbose If true information about the first failed test is printed to std::cerr +/// @return Return true if the specified grid passes several validation tests. +template +bool isValid(const NanoGrid *grid, CheckMode mode, bool verbose = false); + +/// @brief Return true if the specified grid passes several validation tests. +/// @tparam ValueT Build type of the input grid +/// @param grid Grid to validate +/// @param detailed If true the validation test is detailed and relatively slow. +/// @param verbose If true information about the first failed test is printed to std::cerr +/// @note This method has been deprecated by the one defined above +template +[[deprecated("Use isValue(const NanoGrid*, CheckMode, bool) instead.")]] +bool isValid(const NanoGrid &grid, bool detailed = true, bool verbose = false) +{ + return isValid(&grid, detailed ? CheckMode::Full : CheckMode::Half, verbose); +} + +//================================================================================================ + +/// @brief validate grid +template +__hostdev__ char* checkGrid(const NanoGrid *grid, char *error, CheckMode mode = CheckMode::Full) +{ + *error = '\0';// reset error string + char str[32];// temporary buffer for toStr + + // check Grid + if (grid == nullptr) { + return util::sprint(error, "Invalid pointer: Grid is NULL"); + } else if (!isAligned(grid)) { + return util::sprint(error, "Invalid pointer: Grid is misaligned"); + } else if (grid->mMagic != NANOVDB_MAGIC_NUMB && grid->mMagic != NANOVDB_MAGIC_GRID) { + return util::sprint(error, "Invalid magic number: ", toStr(str, toMagic(grid->mMagic))); + } else if (!grid->mVersion.isCompatible()) { + return util::sprint(error, "Incompatible version number: ", toStr(str, grid->mVersion)); + } else if (grid->mGridCount == 0) { + return util::sprint(error, "Zero grid count"); + } else if (grid->mGridIndex >= grid->mGridCount) { + return util::sprint(error, "grid index(", int(grid->mGridIndex), ") >= grid count(", int(grid->mGridCount), ")"); + } else if (grid->mGridClass >= GridClass::End) { + return util::sprint(error, "Invalid GridClass(", toStr(str, grid->mGridClass), ")"); + } else if (grid->mGridType >= GridType::End) { + return util::sprint(error, "Invalid GridType(", toStr(str, grid->mGridType), ")"); + } else if (grid->mGridType != toGridType()) { + return util::sprint(error, "Invalid combination of BuildType(", toStr(str, toGridType()), ") and GridType(", toStr(str+16, grid->mGridType), ")"); + } else if (!isValid(grid->mGridType, grid->mGridClass)) { + return util::sprint(error, "Invalid combination of GridType(", toStr(str, grid->mGridType), ") and GridClass(", toStr(str+16,grid->mGridClass), ")"); + } + + // check Tree + auto &tree = grid->tree(); + if (auto *p = tree.getRoot()) { + if 
(!isAligned(p)) return util::strcpy(error, "Invalid pointer: Root is misaligned"); + } else { + return util::strcpy(error, "Invalid pointer: Root is NULL"); + } + + // check Root + auto &root = tree.root(); + auto *rootData = root.data(); + if (rootData == nullptr) { + return util::strcpy(error, "Invalid pointer: Root is NULL"); + } else if (!isAligned((const void*)rootData)) { + return util::strcpy(error, "Invalid pointer: Root is misaligned"); + } else if ( (const uint8_t*)(rootData) < (const uint8_t*)(&tree+1)) { + return util::strcpy(error, "Invalid root pointer (should be located after the Grid and Tree)"); + } else if ( (const void*)(rootData) > util::PtrAdd(rootData, root.memUsage())) { + return util::strcpy(error, "Invalid root pointer (appears to be located after the end of the buffer)"); + } else {// check root tiles + const void *bounds[2] = {rootData + 1, util::PtrAdd(rootData, root.memUsage())}; + for (uint32_t i = 0; imTableSize; ++i) { + const void *tile = rootData->tile(i); + if ( tile < bounds[0] ) { + return util::strcpy(error, "Invalid root tile pointer (below lower bound"); + } else if (tile >= bounds[1]) { + return util::strcpy(error, "Invalid root tile pointer (above higher bound"); + } + } + } + if (mode == CheckMode::Half) return error; + + // check nodes + const bool test = grid->isBreadthFirst(); + auto *n0 = tree.template getFirstNode<0>(); + auto *n1 = tree.template getFirstNode<1>(); + auto *n2 = tree.template getFirstNode<2>(); + const void *bounds[3][2] = {{n0, util::PtrAdd(n0, grid->gridSize())}, {n1, n0}, {n2, n1}}; + + auto check = [&](const void *ptr, int level) -> bool { + if (ptr==nullptr) { + util::strcpy(error, "Invalid node pointer: node is NULL"); + } else if (!isAligned(ptr)) { + util::strcpy(error, "Invalid node pointer: node is misaligned"); + } else if (test && level == 0 && (const void*)(n0++) != ptr) { + util::strcpy(error, "Leaf node is not stored breadth-first"); + } else if (test && level == 1 && (const void*)(n1++) != ptr) { + util::strcpy(error, "Lower node is not stored breadth-first"); + } else if (test && level == 2 && (const void*)(n2++) != ptr) { + util::strcpy(error, "Upper node is not stored breadth-first"); + } else if ( ptr < bounds[level][0] ) { + util::strcpy(error, "Invalid node pointer: below lower bound"); + } else if ( ptr >= bounds[level][1] ) { + util::strcpy(error, "Invalid node pointer: above higher bound"); + } + return !util::empty(error); + }; + + for (auto it2 = root.cbeginChild(); it2; ++it2) { + if (check(&*it2, 2)) return error; + for (auto it1 = it2->cbeginChild(); it1; ++it1) { + if (check(&*it1, 1)) return error; + for (auto it0 = it1->cbeginChild(); it0; ++it0) if (check(&*it0, 0)) return error; + }// loop over child nodes of the upper internal node + }// loop over child nodes of the root node + + return error; +} // checkGrid + +//================================================================================================ + +template +bool isValid(const NanoGrid *grid, CheckMode mode, bool verbose) +{ + std::unique_ptr strUP(new char[100]); + char *str = strUP.get(); + + tools::checkGrid(grid, str, mode); + + if (util::empty(str) && !validateChecksum(grid, mode)) util::strcpy(str, "Mis-matching checksum"); + if (verbose && !util::empty(str)) std::cerr << "Validation failed: " << str << std::endl; + + return util::empty(str); +}// isValid + +//================================================================================================ + +struct IsNanoGridValid { + template + static bool known(const 
GridData *gridData, CheckMode mode, bool verbose) + { + return tools::isValid((const NanoGrid*)gridData, mode, verbose); + } + static bool unknown(const GridData *gridData, CheckMode, bool verbose) + { + if (verbose) { + char str[16]; + std::cerr << "Unsupported GridType: \"" << toStr(str, gridData->mGridType) << "\"\n" << std::endl; + } + return false; + } +};// IsNanoGridValid + +/// @brief Validate a specific grid in a GridHandle +/// @tparam GridHandleT Type of GridHandle +/// @param handle GridHandle containing host grids +/// @param gridID linear index of the grid to be validated +/// @param mode node of validation tests +/// @param verbose if true information is printed if the grid fails a validation test +/// @return true if grid @c gridID passes all the validation tests +template +bool validateGrid(const GridHandleT &handle, uint32_t gridID, CheckMode mode, bool verbose) +{ + if (mode == CheckMode::Disable) { + return true; + } else if (gridID >= handle.gridCount()) { + if (verbose) std::cerr << "grid index " << gridID << " exceeds available grid count " << handle.gridCount() << std::endl; + return false; + } + return callNanoGrid(handle.gridData(gridID), mode, verbose); +}// validateGrid + +//================================================================================================ + +/// @brief Validate all the grids in a GridHandle +/// @tparam GridHandleT Type of GridHandle +/// @param handle GridHandle containing host grids (0,1...,N) +/// @param mode node of validation tests +/// @param verbose if true information is printed if a grid fails a validation test +/// @return true if all grids pass alle the validation tests +template +bool validateGrids(const GridHandleT &handle, CheckMode mode, bool verbose) +{ + if (mode == CheckMode::Disable) return true; + for (uint32_t gridID=0; gridID +[[deprecated("Use nanovdb:tools::checkGrid instead.")]] +__hostdev__ char* checkGrid(const NanoGrid *grid, char *error, CheckMode mode = CheckMode::Full) +{ + return tools::checkGrid(grid, error, mode); +} + +template +[[deprecated("Use nanovdb:tools::isValid instead.")]] +bool isValid(const NanoGrid *grid, CheckMode mode, bool verbose = false) +{ + return tools::isValid(grid, mode, verbose); +} + +}// namespace nanovdb + +#endif // NANOVDB_TOOLS_GRID_VALIDATOR_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/tools/NanoToOpenVDB.h b/external/nanovdb/tools/NanoToOpenVDB.h new file mode 100644 index 00000000..5966ece9 --- /dev/null +++ b/external/nanovdb/tools/NanoToOpenVDB.h @@ -0,0 +1,366 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/NanoToOpenVDB.h + + \author Ken Museth + + \date May 6, 2020 + + \brief This class will deserialize an NanoVDB grid into an OpenVDB grid. + + \todo Add support for PointIndexGrid and PointDataGrid +*/ + +#include // manages and streams the raw memory buffer of a NanoVDB grid. 
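As a usage reference for the host-side validator entry points declared in GridValidator.h above, a minimal sketch might look as follows; the handle, its buffer type, and how it was loaded are assumptions for illustration only:

#include <nanovdb/GridHandle.h>
#include <nanovdb/HostBuffer.h>
#include <nanovdb/tools/GridValidator.h>

// Run the full (slow) validation on every grid stored in a host-side handle,
// printing the first failed test per grid to std::cerr.
bool checkAll(const nanovdb::GridHandle<nanovdb::HostBuffer>& handle)
{
    return nanovdb::tools::validateGrids(handle, nanovdb::CheckMode::Full, /*verbose=*/true);
}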
+#include +#include + +#include + +#ifndef NANOVDB_TOOLS_NANOTOOPENVDB_H_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_NANOTOOPENVDB_H_HAS_BEEN_INCLUDED + +template +struct ConvertTrait {using Type = T;}; + +template +struct ConvertTrait> {using Type = openvdb::math::Vec3;}; + +template +struct ConvertTrait> {using Type = openvdb::math::Vec4;}; + +template<> +struct ConvertTrait {using Type = float;}; + +template<> +struct ConvertTrait {using Type = float;}; + +template<> +struct ConvertTrait {using Type = float;}; + +template<> +struct ConvertTrait {using Type = float;}; + +template<> +struct ConvertTrait {using Type = openvdb::ValueMask;}; + +namespace nanovdb { + +namespace tools { + +/// @brief Forward declaration of free-standing function that de-serializes a typed NanoVDB grid into an OpenVDB Grid +template +typename openvdb::Grid::Type>::Type>::Ptr +nanoToOpenVDB(const NanoGrid& grid, int verbose = 0); + +/// @brief Forward declaration of free-standing function that de-serializes a NanoVDB GridHandle into an OpenVDB GridBase +template +openvdb::GridBase::Ptr +nanoToOpenVDB(const GridHandle& handle, int verbose = 0, uint32_t n = 0); + +/// @brief This class will serialize an OpenVDB grid into a NanoVDB grid managed by a GridHandle. +template +class NanoToOpenVDB +{ + using NanoNode0 = nanovdb::LeafNode; // note that it's using openvdb coord nd mask types! + using NanoNode1 = nanovdb::InternalNode; + using NanoNode2 = nanovdb::InternalNode; + using NanoRootT = nanovdb::RootNode; + using NanoTreeT = nanovdb::Tree; + using NanoGridT = nanovdb::Grid; + using NanoValueT = typename NanoGridT::ValueType; + + using OpenBuildT = typename ConvertTrait::Type; // e.g. float -> float but nanovdb::math::Vec3 -> openvdb::Vec3 + using OpenNode0 = openvdb::tree::LeafNode; // leaf + using OpenNode1 = openvdb::tree::InternalNode; // lower + using OpenNode2 = openvdb::tree::InternalNode; // upper + using OpenRootT = openvdb::tree::RootNode; + using OpenTreeT = openvdb::tree::Tree; + using OpenGridT = openvdb::Grid; + using OpenValueT = typename OpenGridT::ValueType; + +public: + /// @brief Construction from an existing const OpenVDB Grid. + NanoToOpenVDB(){}; + + /// @brief Return a shared pointer to a NanoVDB grid constructed from the specified OpenVDB grid + typename OpenGridT::Ptr operator()(const NanoGrid& grid, int verbose = 0); + +private: + + template + OpenNodeT* processNode(const NanoNodeT*); + + OpenNode2* process(const NanoNode2* node) {return this->template processNode(node);} + OpenNode1* process(const NanoNode1* node) {return this->template processNode(node);} + + template + typename std::enable_if::value && + !std::is_same::value && + !std::is_same::value && + !std::is_same::value && + !std::is_same::value && + !std::is_same::value, + OpenNode0*>::type + process(const NanoLeafT* node); + + template + typename std::enable_if::value || + std::is_same::value || + std::is_same::value || + std::is_same::value, + OpenNode0*>::type + process(const NanoLeafT* node); + + template + typename std::enable_if::value, + OpenNode0*>::type + process(const NanoLeafT* node); + + template + typename std::enable_if::value, + OpenNode0*>::type + process(const NanoLeafT* node); + + /// converts nanovdb value types to openvdb value types, e.g. 
nanovdb::Vec3f& -> openvdb::Vec3f& + static const OpenValueT& Convert(const NanoValueT &v) {return reinterpret_cast(v);} + static const OpenValueT* Convert(const NanoValueT *v) {return reinterpret_cast(v);} + +}; // NanoToOpenVDB class + +template +typename NanoToOpenVDB::OpenGridT::Ptr +NanoToOpenVDB::operator()(const NanoGrid& grid, int /*verbose*/) +{ + // since the input nanovdb grid might use nanovdb types (Coord, Mask, Vec3) we cast to use openvdb types + const NanoGridT *srcGrid = reinterpret_cast(&grid); + + auto dstGrid = openvdb::createGrid(Convert(srcGrid->tree().background())); + dstGrid->setName(srcGrid->gridName()); // set grid name + switch (srcGrid->gridClass()) { // set grid class + case nanovdb::GridClass::LevelSet: + dstGrid->setGridClass(openvdb::GRID_LEVEL_SET); + break; + case nanovdb::GridClass::FogVolume: + dstGrid->setGridClass(openvdb::GRID_FOG_VOLUME); + break; + case nanovdb::GridClass::Staggered: + dstGrid->setGridClass(openvdb::GRID_STAGGERED); + break; + case nanovdb::GridClass::PointIndex: + throw std::runtime_error("NanoToOpenVDB does not yet support PointIndexGrids"); + case nanovdb::GridClass::PointData: + throw std::runtime_error("NanoToOpenVDB does not yet support PointDataGrids"); + default: + dstGrid->setGridClass(openvdb::GRID_UNKNOWN); + } + // set transform + const nanovdb::Map& nanoMap = reinterpret_cast(srcGrid)->mMap; + auto mat = openvdb::math::Mat4::identity(); + mat.setMat3(openvdb::math::Mat3(nanoMap.mMatD)); + mat.transpose(); // the 3x3 in nanovdb is transposed relative to openvdb's 3x3 + mat.setTranslation(openvdb::math::Vec3(nanoMap.mVecD)); + dstGrid->setTransform(openvdb::math::Transform::createLinearTransform(mat)); // calls simplify! + + // process root node + auto &root = dstGrid->tree().root(); + auto *data = srcGrid->tree().root().data(); + for (uint32_t i=0; imTableSize; ++i) { + auto *tile = data->tile(i); + if (tile->isChild()) { + root.addChild( this->process( data->getChild(tile)) ); + } else { + root.addTile(tile->origin(), Convert(tile->value), tile->state); + } + } + + return dstGrid; +} + +template +template +DstNodeT* +NanoToOpenVDB::processNode(const SrcNodeT *srcNode) +{ + DstNodeT *dstNode = new DstNodeT(); // un-initialized for fast construction + dstNode->setOrigin(srcNode->origin()); + const auto& childMask = srcNode->childMask(); + const_cast(dstNode->getValueMask()) = srcNode->valueMask(); + const_cast(dstNode->getChildMask()) = childMask; + auto* dstTable = const_cast(dstNode->getTable()); + auto* srcData = srcNode->data(); + std::vector> childNodes; + const auto childCount = childMask.countOn(); + childNodes.reserve(childCount); + for (uint32_t n = 0; n < DstNodeT::NUM_VALUES; ++n) { + if (childMask.isOn(n)) { + childNodes.emplace_back(n, srcData->getChild(n)); + } else { + dstTable[n].setValue(Convert(srcData->mTable[n].value)); + } + } + auto kernel = [&](const auto& r) { + for (auto i = r.begin(); i != r.end(); ++i) { + auto &p = childNodes[i]; + dstTable[p.first].setChild( this->process(p.second) ); + } + }; + +#if 0 + kernel(Range1D(0, childCount)); +#else + util::forEach(0, childCount, 1, kernel); +#endif + return dstNode; +} // processNode + +template +template +inline typename std::enable_if::value && + !std::is_same::value && + !std::is_same::value && + !std::is_same::value && + !std::is_same::value && + !std::is_same::value, + typename NanoToOpenVDB::OpenNode0*>::type +NanoToOpenVDB::process(const NanoLeafT *srcNode) +{ + static_assert(std::is_same::value, "NanoToOpenVDB::process assert failed"); + 
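    // Note: the raw value copy below is manually unrolled by four; for a standard
    // 8^3 leaf OpenNode0::SIZE is 512, so the loop performs 128 iterations. This
    // overload only handles plain (uncompressed) value types, as selected by the
    // enable_if above.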
OpenNode0* dstNode = new OpenNode0(); // un-initialized for fast construction + dstNode->setOrigin(srcNode->origin()); + dstNode->setValueMask(srcNode->valueMask()); + + const auto* src = Convert(srcNode->data()->mValues);// doesn't work for compressed data, bool or ValueMask + for (auto *dst = dstNode->buffer().data(), *end = dst + OpenNode0::SIZE; dst != end; dst += 4, src += 4) { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + } + + return dstNode; +} // process(NanoNode0) + +template +template +inline typename std::enable_if::value || + std::is_same::value || + std::is_same::value || + std::is_same::value, + typename NanoToOpenVDB::OpenNode0*>::type +NanoToOpenVDB::process(const NanoLeafT *srcNode) +{ + static_assert(std::is_same::value, "NanoToOpenVDB::process assert failed"); + OpenNode0* dstNode = new OpenNode0(); // un-initialized for fast construction + dstNode->setOrigin(srcNode->origin()); + dstNode->setValueMask(srcNode->valueMask()); + float *dst = dstNode->buffer().data(); + for (int i=0; i!=512; i+=4) { + *dst++ = srcNode->getValue(i); + *dst++ = srcNode->getValue(i+1); + *dst++ = srcNode->getValue(i+2); + *dst++ = srcNode->getValue(i+3); + } + + return dstNode; +} // process(NanoNode0) + +template +template +inline typename std::enable_if::value, + typename NanoToOpenVDB::OpenNode0*>::type +NanoToOpenVDB::process(const NanoLeafT *srcNode) +{ + static_assert(std::is_same::value, "NanoToOpenVDB::process assert failed"); + OpenNode0* dstNode = new OpenNode0(); // un-initialized for fast construction + dstNode->setOrigin(srcNode->origin()); + dstNode->setValueMask(srcNode->valueMask()); + + return dstNode; +} // process(NanoNode0) + +template +template +inline typename std::enable_if::value, + typename NanoToOpenVDB::OpenNode0*>::type +NanoToOpenVDB::process(const NanoLeafT *srcNode) +{ + static_assert(std::is_same::value, "NanoToOpenVDB::process assert failed"); + OpenNode0* dstNode = new OpenNode0(); // un-initialized for fast construction + dstNode->setOrigin(srcNode->origin()); + dstNode->setValueMask(srcNode->valueMask()); + reinterpret_cast&>(dstNode->buffer()) = srcNode->data()->mValues; + + return dstNode; +} // process(NanoNode0) + +template +inline typename openvdb::Grid::Type>::Type>::Ptr +nanoToOpenVDB(const NanoGrid& grid, int verbose) +{ + NanoToOpenVDB tmp; + return tmp(grid, verbose); +} + +template +openvdb::GridBase::Ptr +nanoToOpenVDB(const GridHandle& handle, int verbose, uint32_t n) +{ + if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return 
tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else if (auto grid = handle.template grid(n)) { + return tools::nanoToOpenVDB(*grid, verbose); + } else { + OPENVDB_THROW(openvdb::RuntimeError, "Unsupported NanoVDB grid type!"); + } +}// tools::nanoToOpenVDB + +}// namespace tools + +/// @brief Forward declaration of free-standing function that de-serializes a typed NanoVDB grid into an OpenVDB Grid +template +[[deprecated("Use nanovdb::tools::nanoToOpenVDB instead.")]] +typename openvdb::Grid::Type>::Type>::Ptr +nanoToOpenVDB(const NanoGrid& grid, int verbose = 0) +{ + return tools::nanoToOpenVDB(grid, verbose); +} + +/// @brief Forward declaration of free-standing function that de-serializes a NanoVDB GridHandle into an OpenVDB GridBase +template +[[deprecated("Use nanovdb::tools::nanoToOpenVDB instead.")]] +openvdb::GridBase::Ptr +nanoToOpenVDB(const GridHandle& handle, int verbose = 0, uint32_t n = 0) +{ + return tools::nanoToOpenVDB(handle, verbose, n); +} + +} // namespace nanovdb + +#endif // NANOVDB_TOOLS_NANOTOOPENVDB_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/tools/cuda/AddBlindData.cuh b/external/nanovdb/tools/cuda/AddBlindData.cuh new file mode 100644 index 00000000..aab5796f --- /dev/null +++ b/external/nanovdb/tools/cuda/AddBlindData.cuh @@ -0,0 +1,146 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/cuda/AddBlindData.cuh + + \author Ken Museth + + \date August 3, 2023 + + \brief Defines function that appends blind device data to and existing device NanoGrid + + \warning The header file contains cuda device code so be sure + to only include it in .cu files (or other .cuh files) +*/ + +#ifndef NVIDIA_TOOLS_CUDA_ADDBLINDDATA_CUH_HAS_BEEN_INCLUDED +#define NVIDIA_TOOLS_CUDA_ADDBLINDDATA_CUH_HAS_BEEN_INCLUDED + +#include +#include +#include +#include +#include +#include + +#include // for std::strcpy + +namespace nanovdb {// ================================================ + +namespace tools::cuda {// ============================================ + +/// @brief This function appends blind data to and existing NanoGrid +/// @tparam BuildT Build type of the grid +/// @tparam BlindDataT Type of the blind data +/// @tparam BufferT Type of the buffer used for allocation +/// @param d_grid Pointer to device grid +/// @param d_blindData Pointer to device blind data +/// @param valueCount number of values in the blind data +/// @param blindClass class of the blind data +/// @param semantics semantics of the blind data +/// @param name optional name of the blind data +/// @param pool optional pool used for allocation +/// @param stream optional CUDA stream (defaults to CUDA stream 0) +/// @return GridHandle with blind data appended +template +GridHandle +addBlindData(const NanoGrid *d_grid, + const BlindDataT *d_blindData, + uint64_t valueCount, + GridBlindDataClass blindClass = GridBlindDataClass::Unknown, + GridBlindDataSemantic semantics = GridBlindDataSemantic::Unknown, + const char *name = "", + const BufferT &pool = BufferT(), + cudaStream_t stream = 0) +{ + // In: |-----------|--------- |-----------| + // old grid old meta old data + // Out: |-----------|----------|----------|-----------|------------| + // old grid old meta new meta old data new data + + static_assert(BufferTraits::hasDeviceDual, "Expected BufferT to 
support device allocation"); + + // extract byte sizes of the grid, blind meta data and blind data + enum {GRID=0, META=1, DATA=2, CHECKSUM=3}; + uint64_t tmp[4], *d_tmp; + cudaCheck(util::cuda::mallocAsync((void**)&d_tmp, 4*sizeof(uint64_t), stream)); + util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { + if (auto count = d_grid->blindDataCount()) { + d_tmp[GRID] = util::PtrDiff(&d_grid->blindMetaData(0), d_grid); + d_tmp[META] = count*sizeof(GridBlindMetaData); + d_tmp[DATA] = d_grid->gridSize() - d_tmp[GRID] - d_tmp[META]; + } else { + d_tmp[GRID] = d_grid->gridSize(); + d_tmp[META] = d_tmp[DATA] = 0u; + } + d_tmp[CHECKSUM] = d_grid->checksum().full(); + }); cudaCheckError(); + cudaCheck(cudaMemcpyAsync(&tmp, d_tmp, 4*sizeof(uint64_t), cudaMemcpyDeviceToHost, stream)); + + GridBlindMetaData metaData{int64_t(sizeof(GridBlindMetaData) + tmp[DATA]), valueCount, + sizeof(BlindDataT), semantics, blindClass, toGridType()}; + if (!metaData.isValid()) throw std::runtime_error("cudaAddBlindData: invalid combination of blind meta data"); + std::strcpy(metaData.mName, name); + auto buffer = BufferT::create(tmp[GRID] + tmp[META] + sizeof(GridBlindMetaData) + tmp[DATA] + metaData.blindDataSize(), &pool, false); + void *d_data = buffer.deviceData(); + + // 1: |-----------|----------| + // old grid old meta + cudaCheck(cudaMemcpyAsync(d_data, d_grid, tmp[GRID] + tmp[META], cudaMemcpyDeviceToDevice, stream)); + + // 2: |-----------|----------|----------| + // old grid old meta new meta + cudaCheck(cudaMemcpyAsync((char*)d_data + tmp[GRID] + tmp[META], &metaData, sizeof(GridBlindMetaData), cudaMemcpyHostToDevice, stream)); + + // 3: |-----------|----------|----------|-----------| + // old grid old meta new meta old data + cudaCheck(cudaMemcpyAsync((char*)d_data + tmp[GRID] + tmp[META] + sizeof(GridBlindMetaData), + (const char*)d_grid + tmp[GRID] + tmp[META], tmp[DATA], cudaMemcpyDeviceToDevice, stream)); + + // 4: |-----------|----------|----------|-----------|------------| + // old grid old meta new meta old data new data + const size_t dataSize = valueCount*sizeof(BlindDataT);// no padding + cudaCheck(cudaMemcpyAsync((char*)d_data + tmp[GRID] + tmp[META] + sizeof(GridBlindMetaData) + tmp[DATA], + d_blindData, dataSize, cudaMemcpyDeviceToDevice, stream)); + if (auto padding = metaData.blindDataSize() - dataSize) {// zero out possible padding + cudaCheck(cudaMemsetAsync((char*)d_data + tmp[GRID] + tmp[META] + sizeof(GridBlindMetaData) + tmp[DATA] + dataSize, 0, padding, stream)); + } + + // increment grid size and blind data counter in output grid + util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { + auto &grid = *reinterpret_cast*>(d_data); + grid.mBlindMetadataCount += 1; + grid.mBlindMetadataOffset = d_tmp[GRID]; + auto *meta = util::PtrAdd(d_data, grid.mBlindMetadataOffset);// points to first blind meta data + for (uint32_t i=0, n=grid.mBlindMetadataCount-1; imDataOffset += sizeof(GridBlindMetaData); + grid.mGridSize += sizeof(GridBlindMetaData) + meta->blindDataSize();// expansion with 32 byte alignment + }); cudaCheckError(); + cudaCheck(util::cuda::freeAsync(d_tmp, stream)); + + Checksum cs(tmp[CHECKSUM]); + cuda::updateChecksum(reinterpret_cast(d_data), cs.mode(), stream); + + return GridHandle(std::move(buffer)); +}// cudaAddBlindData + +}// namespace tools::cuda + +template +[[deprecated("Use nanovdb::cuda::addBlindData instead")]] +GridHandle +cudaAddBlindData(const NanoGrid *d_grid, + const BlindDataT *d_blindData, + uint64_t valueCount, + 
GridBlindDataClass blindClass = GridBlindDataClass::Unknown, + GridBlindDataSemantic semantics = GridBlindDataSemantic::Unknown, + const char *name = "", + const BufferT &pool = BufferT(), + cudaStream_t stream = 0) +{ + return tools::cuda::addBlindData(d_grid, d_blindData, valueCount, blindClass, semantics, name, pool, stream); +} + +}// namespace nanovdb + +#endif // NVIDIA_TOOLS_CUDA_ADDBLINDDATA_CUH_HAS_BEEN_INCLUDED \ No newline at end of file diff --git a/external/nanovdb/tools/cuda/GridChecksum.cuh b/external/nanovdb/tools/cuda/GridChecksum.cuh new file mode 100644 index 00000000..5cc964e5 --- /dev/null +++ b/external/nanovdb/tools/cuda/GridChecksum.cuh @@ -0,0 +1,441 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/cuda/GridChecksum.cuh + + \author Ken Museth + + \date September 28, 2023 + + \brief Compute CRC32 checksum of NanoVDB grids + + \note before v32.6.0: checksum[0] = Grid+Tree+Root, checksum[1] = nodes + after v32.6.0: checksum[0] = Grid+Tree, checksum[1] = nodes + blind data in 4K blocks + + When serialized: + [Grid,Tree][Root][ROOT TILES...][Node<5>...][Node<4>...][Leaf<3>...][BlindMeta...][BlindData...] + checksum[2] before v32.6.0: <------------- [0] ------------><-------------- [1] ---------------> + checksum[]2 after v32.6.0: <---[0]---><----------------------------------------[1]----------------------------------------> +*/ + +#ifndef NANOVDB_TOOLS_CUDA_GRIDCHECKSUM_CUH_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_CUDA_GRIDCHECKSUM_CUH_HAS_BEEN_INCLUDED + +#include +#include // required for instantiation of move c-tor of GridHandle +#include +#include +#include + +namespace nanovdb {// ======================================================================= + +namespace tools::cuda {// =================================================================== + +/// @brief Compute the (2 x CRC32) checksum of the specified @c d_gridData on the device +/// @param d_gridData Device base pointer to the grid from which the checksum is computed. +/// @param mode Defines the mode of computation for the checksum. +/// @param stream optional cuda stream (defaults to zero) +/// @return Return the (2 x CRC32) checksum of the specified @c d_gridData +Checksum evalChecksum(const GridData *d_gridData, CheckMode mode = CheckMode::Default, cudaStream_t stream = 0); + +/// @brief Extract the checksum of a device grid +/// @param d_gridData Device basepointer to grid with a checksum +/// @param stream optional cuda stream (defaults to zero) +/// @return Checksum encoded in the specified grid +Checksum getChecksum(const GridData *d_gridData, cudaStream_t stream = 0); + +/// @brief Return true if the checksum of @c d_gridData matches the expected +/// value already encoded into the grid's meta data. +/// @tparam BuildT Template parameter used to build NanoVDB grid. +/// @param d_gridData Grid whose checksum is validated. +/// @param mode Defines the mode of computation for the checksum. +/// @param stream optional cuda stream (defaults to zero) +bool validateChecksum(const GridData *d_gridData, CheckMode mode = CheckMode::Default, cudaStream_t stream = 0); + +/// @brief Update the checksum of a device grid +/// @param d_gridData device pointer to GridData +/// @param mode Mode of computation for the checksum. 
+/// @param stream optional cuda stream (defaults to zero) +void updateChecksum(GridData *d_gridData, CheckMode mode, cudaStream_t stream = 0); + +/// @brief Updates the checksum of a device grid by preserving its mode +/// @param d_gridData Device base pointer to grid +/// @param stream optional cuda stream (defaults to zero) +inline void updateChecksum(GridData *d_gridData, cudaStream_t stream = 0) +{ + updateChecksum(d_gridData, getChecksum(d_gridData, stream).mode(), stream); +} + +}// namespace tools::cuda + +namespace util::cuda { + +/// @brief Cuda kernel that computes CRC32 checksums of blocks of data using a look-up-table +/// @param d_data device pointer to raw data from wich to compute the CRC32 checksums +/// @param d_blockCRC device pointer to array of @c blockCount checksums for each block +/// @param blockCount number of blocks and checksums +/// @param blockSize size of each block in bytes +/// @param d_lut device pointer to CRC32 Lookup Table +template +__global__ void crc32Kernel(const T *d_data, uint32_t* d_blockCRC, uint32_t blockCount, uint32_t blockSize, const uint32_t *d_lut) +{ + const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < blockCount) d_blockCRC[tid] = crc32((const uint8_t*)d_data + tid * blockSize, blockSize, d_lut); +} + +/// @brief Cuda kernel that computes CRC32 checksums of blocks of data (without using a look-up-table) +/// @param d_data device pointer to raw data from wich to compute the CRC32 checksums +/// @param d_blockCRC device pointer to array of @c blockCount checksums for each block +/// @param blockCount number of blocks and checksums +/// @param blockSize size of each block in bytes +template +__global__ void crc32Kernel(const T *d_data, uint32_t* d_blockCRC, uint32_t blockCount, uint32_t blockSize) +{ + const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < blockCount) d_blockCRC[tid] = crc32((const uint8_t*)d_data + tid * blockSize, blockSize); +} + +/// @brief Host function to allocate and initiate a Look-Up-Table of size 256 for subsequent CRC32 computation on the device +/// @param extra number of extra elements in the LUT +/// @param stream optional cuda stream (defaults to zero) +/// @return returns a nanovdb::util::cuda::unique_ptr point to a lookup-table for CRC32 computation +inline unique_ptr createCrc32Lut(size_t extra = 0, cudaStream_t stream = 0) +{ + unique_ptr lut(256 + extra, stream); + uint32_t *d_lut = lut.get(); + lambdaKernel<<<1, 256, 0, stream>>>(256, [=] __device__(size_t tid) {initCrc32Lut(d_lut, tid);}); + cudaCheckError(); + return lut; +} + +/// @brief Compute CRC32 checksum of 4K block +/// @param d_data device pointer to start of data +/// @param size number of bytes +/// @param d_lut Look-Up-Table for CRC32 computation +/// @param stream optional cuda stream (defaults to zero) +inline void blockedCRC32(const void *d_data, size_t size, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) +{ + NANOVDB_ASSERT(d_data && d_lut && d_crc); + static constexpr unsigned int threadsPerBlock = 128;// seems faster than the old value of 256! 
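    // Strategy: split the byte range into 4 KB blocks (1 << NANOVDB_CRC32_LOG2_BLOCK_SIZE),
    // compute one CRC32 per block in parallel (the last block also absorbs the
    // remainder), and finally reduce by taking a CRC32 over the array of
    // per-block checksums.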
+ const uint64_t checksumCount = size >> NANOVDB_CRC32_LOG2_BLOCK_SIZE;// 4 KB (4096 byte) + unique_ptr buffer(checksumCount, stream);// for checksums of 4 KB blocks + uint32_t *d_checksums = buffer.get(); + lambdaKernel<<>>(checksumCount, [=] __device__(size_t tid) { + uint32_t blockSize = 1 << NANOVDB_CRC32_LOG2_BLOCK_SIZE; + if (tid+1 == checksumCount) blockSize += size - (checksumCount<>>(1, [=] __device__(size_t) {// Compute CRC32 of all the 4K blocks + *d_crc = crc32((const uint8_t*)d_checksums, checksumCount*sizeof(uint32_t), d_lut); + }); cudaCheckError(); +}// void cudaBlockedCRC32(const void *d_data, size_t size, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) + +/// @brief Compute CRC32 checksum of 4K block +/// @param d_begin device pointer to start of data (inclusive) +/// @param d_end device pointer to end of data (exclusive) +/// @param d_lut pointer to Look-Up-Table for accelerated CRC32 computation +/// @param stream optional cuda stream (defaults to zero) +inline void blockedCRC32(const void *d_begin, const void *d_end, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) +{ + blockedCRC32(d_begin, PtrDiff(d_end, d_begin), d_lut, d_crc, stream); +} + +}// namespace util::cuda + +namespace tools::cuda { + +/// @brief +/// @param d_gridData +/// @param d_lut pointer to Look-Up-Table for accelerated CRC32 computation +/// @param d_crc +/// @param stream optional cuda stream (defaults to zero) +inline void crc32Head(const GridData *d_gridData, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) +{ + NANOVDB_ASSERT(d_gridData && d_lut && d_crc); + util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t){*d_crc = tools::crc32Head(d_gridData, d_lut);}); +}// void cudaCrc32Head(const GridData *d_gridData, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) + +/// @brief +/// @param d_gridData +/// @param gridData +/// @param d_lut pointer to Look-Up-Table for accelerated CRC32 computation +/// @param stream optional cuda stream (defaults to zero) +inline void crc32Tail(const GridData *d_gridData, const GridData *gridData, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) +{ + NANOVDB_ASSERT(d_gridData && gridData && d_lut && d_crc); + NANOVDB_ASSERT(gridData->mVersion > Version(32,6,0)); + const uint8_t *d_begin = (const uint8_t*)d_gridData; + util::cuda::blockedCRC32(d_begin + sizeof(GridData) + sizeof(TreeData), d_begin + gridData->mGridSize, d_lut, d_crc, stream); +} + +/// @brief +/// @tparam ValueT +/// @param d_grid +/// @param gridData +/// @param d_lut pointer to Look-Up-Table for accelerated CRC32 computation +/// @param d_crc +/// @param stream +template +void crc32TailOld(const NanoGrid *d_grid, const GridData *gridData, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) +{ + static constexpr unsigned int threadsPerBlock = 128;// seems faster than the old value of 256! 
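    // Pre-v32.6.0 layout: the tail checksum is defined over the tree nodes rather
    // than over 4 KB blocks, so a NodeManager is built on the device to enumerate
    // upper, lower and leaf nodes; one CRC32 is computed per node and a final CRC32
    // is taken over those per-node checksums below.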
+ auto nodeMgrHandle = nanovdb::cuda::createNodeManager(d_grid, nanovdb::cuda::DeviceBuffer(), stream); + auto *d_nodeMgr = nodeMgrHandle.template deviceMgr(); + NANOVDB_ASSERT(isAligned(d_nodeMgr)); + const uint32_t nodeCount[3]={gridData->template nodeCount<0>(), gridData->template nodeCount<1>(), gridData->template nodeCount<2>()}; + util::cuda::unique_ptr d_checksumsUP(nodeCount[0]+nodeCount[1]+nodeCount[2]); + uint32_t *d_checksums = d_checksumsUP.get(), *d_ptr = d_checksums; + + util::cuda::lambdaKernel<<>>(nodeCount[2], [=] __device__(size_t tid) { + auto &node = d_nodeMgr->upper(uint32_t(tid)); + d_ptr[tid] = util::crc32((const uint8_t*)&node, node.memUsage(), d_lut); + }); cudaCheckError(); + + d_ptr += nodeCount[2]; + util::cuda::lambdaKernel<<>>(nodeCount[1], [=] __device__(size_t tid) { + auto &node = d_nodeMgr->lower(uint32_t(tid)); + d_ptr[tid] = util::crc32((const uint8_t*)&node, node.memUsage(), d_lut); + }); cudaCheckError(); + + d_ptr += nodeCount[1]; + util::cuda::lambdaKernel<<>>(nodeCount[0], [=] __device__(size_t tid) { + auto &node = d_nodeMgr->leaf(uint32_t(tid)); + d_ptr[tid] = util::crc32((const uint8_t*)&node, node.memUsage(), d_lut); + }); cudaCheckError(); + + util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { + *d_crc = util::crc32(d_checksums, d_nodeMgr->tree().totalNodeCount()*sizeof(uint32_t), d_lut); + }); cudaCheckError(); +}// void cudaCrc32TailOld(const NanoGrid *d_grid, const GridData *gridData, uint32_t *d_lut, cudaStream_t stream) + +struct Crc32TailOld { + template + static void known(const GridData *d_gridData, const GridData *gridData, const uint32_t *d_lut, uint32_t *d_crc, cudaStream_t stream) + { + crc32TailOld((const NanoGrid*)d_gridData, gridData, d_lut, d_crc, stream); + } + static void unknown(const GridData*, const GridData*, const uint32_t*, uint32_t*, cudaStream_t) + { + throw std::runtime_error("Cannot call cudaCrc32TailOld with grid of unknown type"); + } +};// Crc32TailOld + +/// @brief +/// @param d_gridData +/// @param mode +/// @param stream +/// @return +inline Checksum evalChecksum(const GridData *d_gridData, CheckMode mode, cudaStream_t stream) +{ + static const int headSize = sizeof(GridData) + sizeof(TreeData); + NANOVDB_ASSERT(d_gridData); + Checksum cs; + if (mode != CheckMode::Empty) { + auto d_lut = util::cuda::createCrc32Lut(1, stream); + crc32Head(d_gridData, d_lut.get(), d_lut.get() + 256, stream); + cudaCheck(cudaMemcpyAsync(&(cs.head()), d_lut.get() + 256, headSize, cudaMemcpyDeviceToHost, stream)); + if (mode == CheckMode::Full) { + std::unique_ptr buffer(new char[headSize]); + auto *gridData = (GridData*)(buffer.get()); + cudaCheck(cudaMemcpyAsync(gridData, d_gridData, headSize, cudaMemcpyDeviceToHost, stream)); + if (gridData->mVersion > Version(32,6,0)) { + crc32Tail(d_gridData, gridData, d_lut.get(), d_lut.get() + 256, stream); + } else { + callNanoGrid(d_gridData, gridData, d_lut.get(), d_lut.get() + 256, stream); + } + cudaCheck(cudaMemcpyAsync(&(cs.tail()), d_lut.get() + 256, headSize, cudaMemcpyDeviceToHost, stream)); + } + } + return cs; +} + +/// @brief +/// @tparam BuildT +/// @param d_grid +/// @param mode +/// @param stream +/// @return +template +Checksum evalChecksum(const NanoGrid *d_grid, CheckMode mode, cudaStream_t stream = 0) +{ + static const int headSize = sizeof(GridData) + sizeof(TreeData); + NANOVDB_ASSERT(d_grid); + Checksum cs; + if (mode != CheckMode::Empty) { + auto d_lut = util::cuda::createCrc32Lut(1, stream); + crc32Head(d_grid, d_lut.get(), d_lut.get() + 256, 
stream); + cudaCheck(cudaMemcpyAsync(&(cs.head()), d_lut.get() + 256, headSize, cudaMemcpyDeviceToHost, stream)); + if (mode == CheckMode::Full) { + std::unique_ptr buffer(new char[headSize]); + auto *gridData = (GridData*)(buffer.get()); + cudaCheck(cudaMemcpyAsync(gridData, d_grid, headSize, cudaMemcpyDeviceToHost, stream)); + if (gridData->mVersion > Version(32,6,0)) { + crc32Tail(d_grid, gridData, d_lut.get(), d_lut.get() + 256, stream); + } else { + crc32TailOld(d_grid, gridData, d_lut.get(), d_lut.get() + 256, stream); + } + cudaCheck(cudaMemcpyAsync(&(cs.tail()), d_lut.get() + 256, headSize, cudaMemcpyDeviceToHost, stream)); + } + } + return cs; +} + +/// @brief +/// @param d_gridData +/// @param mode +/// @param stream +/// @return +inline bool validateChecksum(const GridData *d_gridData, CheckMode mode, cudaStream_t stream) +{ + static const int headSize = sizeof(GridData) + sizeof(TreeData); + NANOVDB_ASSERT(d_gridData); + if (mode == CheckMode::Empty) return true; + + // Copy just the GridData from the device to the host + std::unique_ptr buffer(new char[headSize]); + auto *gridData = (GridData*)(buffer.get()); + cudaCheck(cudaMemcpyAsync(gridData, d_gridData, headSize, cudaMemcpyDeviceToHost, stream)); + if (gridData->mChecksum.isEmpty()) return true;// checksum is empty so nothing to check + + // Allocate device LUT for CRC32 computation + auto d_lut = util::cuda::createCrc32Lut(1, stream);// unique pointer + uint32_t crc = 0, *d_crc = d_lut.get() + 256; + + // Check head checksum + crc32Head(d_gridData, d_lut.get(), d_crc, stream); + cudaCheck(cudaMemcpyAsync(&crc, d_crc, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + const bool checkHead = (crc == gridData->mChecksum.head()); + if (gridData->mChecksum.isHalf() || mode == CheckMode::Half || !checkHead) return checkHead; + + // Check tail checksum + if (gridData->mVersion > Version(32,6,0)) { + crc32Tail(d_gridData, gridData, d_lut.get(), d_crc, stream); + } else { + callNanoGrid(d_gridData, gridData, d_lut.get(), d_crc, stream); + } + cudaCheck(cudaMemcpyAsync(&crc, d_crc, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + return crc == gridData->mChecksum.tail(); +}// bool cudaValidateChecksum(const GridData *d_gridData, CheckMode mode, cudaStream_t stream = 0) + +/// @brief +/// @tparam BuildT +/// @param d_grid +/// @param mode +/// @param stream +/// @return +template +bool validateChecksum(const NanoGrid *d_grid, CheckMode mode, cudaStream_t stream = 0) +{ + static const int headSize = sizeof(GridData) + sizeof(TreeData); + NANOVDB_ASSERT(d_grid); + if (mode == CheckMode::Empty) return true; + + // Copy just the GridData from the device to the host + std::unique_ptr buffer(new char[headSize]); + auto *gridData = (GridData*)(buffer.get()); + cudaCheck(cudaMemcpyAsync(gridData, d_grid, headSize, cudaMemcpyDeviceToHost, stream)); + if (gridData->mChecksum.isEmpty()) return true;// checksum is empty so nothing to check + + // Allocate device LUT for CRC32 computation + auto d_lut = util::cuda::createCrc32Lut(1, stream);// unique pointer + uint32_t crc = 0, *d_crc = d_lut.get() + 256; + + // Check head checksum + crc32Head(d_grid, d_lut.get(), d_crc, stream); + cudaCheck(cudaMemcpyAsync(&crc, d_crc, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + const bool checkHead = (crc == gridData->mChecksum.head()); + if (gridData->mChecksum.isHalf() || mode == CheckMode::Half || !checkHead) return checkHead; + + // Check tail checksum + if (gridData->mVersion > Version(32,6,0)) { + crc32Tail(d_grid, gridData, 
d_lut.get(), d_crc, stream); + } else { + crc32TailOld(d_grid, gridData, d_lut.get(), d_crc, stream); + } + cudaCheck(cudaMemcpyAsync(&crc, d_crc, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + return crc == gridData->mChecksum.tail(); +}// bool cudaValidateChecksum(const GridData *d_gridData, CheckMode mode, cudaStream_t stream = 0) + +/// @brief Extract the checksum of a device grid +/// @param d_gridData Device pointer to grid with a checksum +/// @param stream optional cuda stream (defaults to zero) +inline Checksum getChecksum(const GridData *d_gridData, cudaStream_t stream) +{ + NANOVDB_ASSERT(d_gridData); + Checksum cs; + cudaCheck(cudaMemcpyAsync(&cs, (const uint8_t*)d_gridData + 8, sizeof(cs), cudaMemcpyDeviceToHost, stream)); + return cs; +} + +/// @brief Update the checksum of a device grid +/// @param d_gridData device pointer to GridData +/// @param mode Mode of computation for the checksum. +/// @param stream optional cuda stream (defaults to zero) +/// @return The actual mode used for checksum computation. Eg. if @c d_gridData is NULL (or @c mode = CheckMode::Empty) +/// then CheckMode::Empty is always returned. Else if the grid has no nodes or blind data CheckMode::Partial +/// is always returnd (even if @c mode = CheckMode::Full). +inline void updateChecksum(GridData *d_gridData, CheckMode mode, cudaStream_t stream) +{ + NANOVDB_ASSERT(d_gridData); + if (mode == CheckMode::Empty) return; + + // Allocate device LUT for CRC32 computation + auto d_lut = util::cuda::createCrc32Lut(0, stream);// unique pointers + + // Update head checksum + crc32Head(d_gridData, d_lut.get(), (uint32_t*)d_gridData + 2, stream); + + if (mode == CheckMode::Half) return; + + // Copy just the GridData from the device to the host + std::unique_ptr buffer(new char[sizeof(GridData) + sizeof(TreeData)]); + auto *gridData = (GridData*)(buffer.get()); + cudaCheck(cudaMemcpyAsync(gridData, d_gridData, sizeof(GridData) + sizeof(TreeData), cudaMemcpyDeviceToHost, stream)); + + // Update tail checksum + uint32_t *d_tail = (uint32_t*)d_gridData + 3; + if (gridData->mVersion > Version(32,6,0)) { + crc32Tail(d_gridData, gridData, d_lut.get(), d_tail, stream); + } else { + callNanoGrid(d_gridData, gridData, d_lut.get(), d_tail, stream); + } +}// cudaUpdateChecksum + +/// @brief +/// @tparam ValueT +/// @param d_grid +/// @param mode +/// @param stream +template +void updateChecksum(NanoGrid *d_grid, CheckMode mode, cudaStream_t stream = 0) +{ + NANOVDB_ASSERT(d_grid); + if (mode == CheckMode::Empty) return; + + // Allocate device LUT for CRC32 computation + auto d_lut = util::cuda::createCrc32Lut(0, stream);// unique pointers + + // Update head checksum + cuda::crc32Head(d_grid, d_lut.get(), (uint32_t*)d_grid + 2, stream); + if (mode == CheckMode::Half) return; + + // Copy just the GridData from the device to the host + std::unique_ptr buffer(new char[sizeof(GridData) + sizeof(TreeData)]); + auto *gridData = (GridData*)(buffer.get()); + cudaCheck(cudaMemcpyAsync(gridData, d_grid, sizeof(GridData) + sizeof(TreeData), cudaMemcpyDeviceToHost, stream)); + + // Update tail checksum + uint32_t *d_tail = (uint32_t*)d_grid + 3; + if (gridData->mVersion > Version(32,6,0)) { + crc32Tail(d_grid->data(), gridData, d_lut.get(), d_tail, stream); + } else { + crc32TailOld(d_grid, gridData, d_lut.get(), d_tail, stream); + } +} + +}// namespace tools::cuda // ================================================ + +}// namespace nanovdb // ==================================================== + +#endif // 
NANOVDB_TOOLS_CUDA_GRIDCHECKSUM_CUH_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/tools/cuda/GridStats.cuh b/external/nanovdb/tools/cuda/GridStats.cuh new file mode 100644 index 00000000..0ba570ac --- /dev/null +++ b/external/nanovdb/tools/cuda/GridStats.cuh @@ -0,0 +1,249 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/cuda/GridStats.cuh + + \author Ken Museth + + \date October 9, 2023 + + \brief Re-computes min/max/avg/var/bbox information for each node in a + pre-existing NanoVDB grid on the device. +*/ + +#ifndef NANOVDB_TOOLS_CUDA_GRIDSTATS_CUH_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_CUDA_GRIDSTATS_CUH_HAS_BEEN_INCLUDED + +#include +#include + +namespace nanovdb { + +namespace tools::cuda { + +/// @brief Update, i.e. re-compute, grid statistics like min/max, stats and bbox +/// information for an existing NanoVDB Grid. +/// @param grid Grid whose stats to update +/// @param mode Mode of computation for the statistics. +/// @param stream Optional cuda stream (defaults to zero) +template +void updateGridStats(NanoGrid *d_grid, StatsMode mode = StatsMode::Default, cudaStream_t stream = 0); + +//================================================================================================ + +/// @brief Allows for the construction of NanoVDB grids without any dependecy +template::ValueType>> +class GridStats +{ + using GridT = NanoGrid; + using TreeT = typename GridT::TreeType; + using ValueT = typename TreeT::ValueType; + using Node0 = typename TreeT::Node0; // leaf + using Node1 = typename TreeT::Node1; // lower + using Node2 = typename TreeT::Node2; // upper + using RootT = typename TreeT::Node3; // root + static_assert(util::is_same::value, "Mismatching type"); + + ValueT mDelta; // skip rendering of node if: node.max < -mDelta || node.min > mDelta + +public: + GridStats(ValueT delta = ValueT(0)) : mDelta(delta) {} + + void update(GridT *d_grid, cudaStream_t stream = 0); + +}; // cuda::GridStats + +//================================================================================================ + +namespace {// define cuda kernels in an unnamed namespace + +template +__global__ void processLeaf(NodeManager *d_nodeMgr, StatsT *d_stats) +{ + const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_nodeMgr->leafCount()) return; + auto &d_leaf = d_nodeMgr->leaf(tid); + + if (d_leaf.updateBBox()) {// updates active bounding box (also updates data->mFlags) and return true if non-empty + if constexpr(StatsT::hasStats()) { + StatsT stats; + for (auto it = d_leaf.cbeginValueOn(); it; ++it) stats.add(*it); + if constexpr(StatsT::hasAverage()) { + d_stats[tid] = stats; + *reinterpret_cast(&d_leaf.mMinimum) = tid; + } else { + stats.setStats(d_leaf); + } + } + } + d_leaf.mFlags &= ~uint8_t(1u);// enable rendering +}// processLeaf + +template +__global__ void processInternal(NodeManager *d_nodeMgr, StatsT *d_stats) +{ + using ChildT = typename NanoNode::type; + const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_nodeMgr->nodeCount(LEVEL)) return; + auto &d_node = d_nodeMgr->template node(tid); + auto &bbox = d_node.mBBox; + bbox = CoordBBox();// empty bbox + StatsT stats; + uint32_t childID = 0u; + + for (auto it = d_node.beginChild(); it; ++it) { + auto &child = *it; + bbox.expand( child.bbox() ); + if constexpr(StatsT::hasAverage()) { + childID = *reinterpret_cast(&child.mMinimum); + StatsT &s = d_stats[childID]; + s.setStats(child); + stats.add(s); + } else if 
constexpr(StatsT::hasMinMax()) { + stats.add(child.minimum()); + stats.add(child.maximum()); + } + } + for (auto it = d_node.cbeginValueOn(); it; ++it) { + const Coord ijk = it.getCoord(); + bbox[0].minComponent(ijk); + bbox[1].maxComponent(ijk + Coord(ChildT::DIM - 1)); + if constexpr(StatsT::hasStats()) stats.add(*it, ChildT::NUM_VALUES); + } + if constexpr(StatsT::hasAverage()) { + d_stats[childID] = stats; + *reinterpret_cast(&d_node.mMinimum) = childID; + } else if constexpr(StatsT::hasMinMax()) { + stats.setStats(d_node); + } + d_node.mFlags &= ~uint64_t(1u);// enable rendering +}// processInternal + +template +__global__ void processRootAndGrid(NodeManager *d_nodeMgr, StatsT *d_stats) +{ + using ChildT = NanoUpper; + using ValueT = typename ChildT::ValueType; + + // process root + auto &root = d_nodeMgr->root(); + root.mBBox = CoordBBox(); + if (root.isEmpty()) { + root.mMinimum = root.mMaximum = root.mBackground; + root.mAverage = root.mStdDevi = 0; + } else { + ValueT v; + StatsT s; + for (auto it = root.beginDense(); it; ++it) { + if (auto *child = it.probeChild(v)) { + root.mBBox.expand( child->bbox() ); + if constexpr(StatsT::hasAverage()) { + StatsT &stats = d_stats[*reinterpret_cast(&child->mMinimum)]; + stats.setStats(*child); + s.add(stats); + } else if constexpr(StatsT::hasMinMax()){ + s.add(child->minimum()); + s.add(child->maximum()); + } + } else if (it.isValueOn()) { + const Coord ijk = it.getCoord(); + root.mBBox[0].minComponent(ijk); + root.mBBox[1].maxComponent(ijk + Coord(ChildT::DIM - 1)); + if constexpr(StatsT::hasStats()) s.add(v, ChildT::NUM_VALUES); + } + } + s.setStats(root); + } + + // process Grid + auto& grid = d_nodeMgr->grid(); + const auto& indexBBox = root.bbox(); + if (indexBBox.empty()) { + grid.mWorldBBox = Vec3dBBox(); + grid.setBBoxOn(false); + } else { + // Note that below max is offset by one since CoordBBox.max is inclusive + // while bbox.max is exclusive. However, min is inclusive in both + // CoordBBox and Vec3dBBox. This also guarantees that a grid with a single + // active voxel, does not have an empty world bbox! E.g. if a grid with a + // unit index-to-world transformation only contains the active voxel (0,0,0) + // then indeBBox = (0,0,0) -> (0,0,0) and then worldBBox = (0.0, 0.0, 0.0) + // -> (1.0, 1.0, 1.0). This is a consequence of the different definitions + // of index and world bounding boxes inherited from OpenVDB! 
+ grid.mWorldBBox = CoordBBox(indexBBox[0], indexBBox[1].offsetBy(1)).transform(grid.map()); + grid.setBBoxOn(true); + } + + // set bit flags + grid.setMinMaxOn(StatsT::hasMinMax()); + grid.setAverageOn(StatsT::hasAverage()); + grid.setStdDeviationOn(StatsT::hasStdDeviation()); +}// processRootAndGrid + +}// cuda kernels are defined in an unnamed namespace + +//================================================================================================ + +template +void GridStats::update(NanoGrid *d_grid, cudaStream_t stream) +{ + static const uint32_t threadsPerBlock = 128; + auto blocksPerGrid = [&](uint32_t count)->uint32_t{return (count + (threadsPerBlock - 1)) / threadsPerBlock;}; + + auto nodeMgrHandle = nanovdb::cuda::createNodeManager(d_grid, CudaDeviceBuffer(), stream); + auto *d_nodeMgr = nodeMgrHandle.template deviceMgr(); + + uint32_t nodeCount[3];// {leaf, lower, upper} + cudaCheck(cudaMemcpyAsync(nodeCount, (char*)d_grid + sizeof(GridData) + 4*sizeof(uint64_t), 3*sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + //cudaStreamSynchronize(stream);// finish all device tasks in stream + + StatsT *d_stats = nullptr; + + if constexpr(StatsT::hasAverage()) cudaCheck(util::cuda::mallocAsync((void**)&d_stats, nodeCount[0]*sizeof(StatsT), stream)); + + processLeaf<<>>(d_nodeMgr, d_stats); + + processInternal<<>>(d_nodeMgr, d_stats); + + processInternal<<>>(d_nodeMgr, d_stats); + + processRootAndGrid<<<1, 1, 0, stream>>>(d_nodeMgr, d_stats); + + if constexpr(StatsT::hasAverage()) cudaCheck(util::cuda::freeAsync(d_stats, stream)); + +} // cuda::GridStats::update( Grid ) + +//================================================================================================ + +template +void updateGridStats(NanoGrid *d_grid, StatsMode mode, cudaStream_t stream) +{ + if (d_grid == nullptr && mode == StatsMode::Disable) { + return; + } else if (mode == StatsMode::BBox || util::is_same::value) { + GridStats > stats; + stats.update(d_grid, stream); + } else if (mode == StatsMode::MinMax) { + GridStats > stats; + stats.update(d_grid, stream); + } else if (mode == StatsMode::All) { + GridStats > stats; + stats.update(d_grid, stream); + } else { + throw std::runtime_error("GridStats: Unsupported statistics mode."); + } +}// cuda::updateGridStats + +}// namespace tools::cuda + +template +[[deprecated("Use nanovdb::cuda::updateGridStats instead")]] +void cudaGridStats(NanoGrid *d_grid, tools::StatsMode mode = tools::StatsMode::Default, cudaStream_t stream = 0) +{ + tools::cuda::updateGridStats(d_grid, mode, stream); +} + +} // namespace nanovdb + +#endif // NANOVDB_TOOLS_CUDA_GRIDSTATS_CUH_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/tools/cuda/GridValidator.cuh b/external/nanovdb/tools/cuda/GridValidator.cuh new file mode 100644 index 00000000..aaa28412 --- /dev/null +++ b/external/nanovdb/tools/cuda/GridValidator.cuh @@ -0,0 +1,59 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/cuda/GridValidator.cuh + + \author Ken Museth + + \date November 3, 2023 + + \brief Checks the validity of an existing NanoVDB device grid. +*/ + +#ifndef NANOVDB_TOOLS_CUDA_GRIDVALIDATOR_CUH_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_CUDA_GRIDVALIDATOR_CUH_HAS_BEEN_INCLUDED + +#include +#include +#include +#include + +namespace nanovdb { + +namespace tools::cuda { + +/// @brief Return true if the specified grid passes several validation tests. 
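/// @details A device-side sketch; the device grid pointer and CUDA stream (obtained
/// elsewhere, e.g. via GridHandle::deviceGrid and cudaStreamCreate) are assumed:
/// @code
/// bool ok = nanovdb::tools::cuda::isValid(d_grid, nanovdb::CheckMode::Full,
///                                         /*verbose=*/true, stream);
/// @endcode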
+/// +/// @param grid Grid to validate +/// @param detailed If true the validation test is detailed and relatively slow. +/// @param verbose If true information about the first failed test is printed to std::cerr +template +bool isValid(const NanoGrid *d_grid, CheckMode mode, bool verbose = false, cudaStream_t stream = 0) +{ + static const int size = 100; + std::unique_ptr strUP(new char[size]); + util::cuda::unique_ptr d_strUP(size); + char *str = strUP.get(), *d_str = d_strUP.get(); + + util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {nanovdb::tools::checkGrid(d_grid, d_str, mode);}); + cudaMemcpyAsync(str, d_str, size, cudaMemcpyDeviceToHost, stream); + + if (util::empty(str) && !cuda::validateChecksum(d_grid, mode)) util::strcpy(str, "Mis-matching checksum"); + if (verbose && !util::empty(str)) std::cerr << "Validation failed: " << str << std::endl; + + return util::empty(str); +}// tools::cuda::isValid + +}// namespace tools::cuda + +template +[[deprecated("Use cuda::isValid() instead.")]] +bool cudaIsValid(const NanoGrid *d_grid, CheckMode mode, bool verbose = false, cudaStream_t stream = 0) +{ + return tools::cuda::isValid(d_grid, mode, verbose, stream); +} + +} // namespace nanovdb + +#endif // NANOVDB_TOOLS_CUDA_GRIDVALIDATOR_CUH_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/tools/cuda/IndexToGrid.cuh b/external/nanovdb/tools/cuda/IndexToGrid.cuh new file mode 100644 index 00000000..d26b09a2 --- /dev/null +++ b/external/nanovdb/tools/cuda/IndexToGrid.cuh @@ -0,0 +1,407 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/cuda/IndexToGrid.cuh + + \author Ken Museth + + \date April 17, 2023 + + \brief Combines an IndexGrid and values into a regular Grid on the device + + \warning The header file contains cuda device code so be sure + to only include it in .cu files (or other .cuh files) +*/ + +#ifndef NVIDIA_TOOLS_CUDA_INDEXTOGRID_CUH_HAS_BEEN_INCLUDED +#define NVIDIA_TOOLS_CUDA_INDEXTOGRID_CUH_HAS_BEEN_INCLUDED + +#include +#include +#include +#include +#include + +namespace nanovdb {// ================================================================ + +namespace tools::cuda {// ============================================================ + +/// @brief Freestanding function that combines an IndexGrid and values into a regular Grid +/// @tparam DstBuildT Build time of the destination/output Grid +/// @tparam SrcBuildT Build type of the source/input IndexGrid +/// @tparam BufferT Type of the buffer used for allocation of the destination Grid +/// @param d_srcGrid Device pointer to source/input IndexGrid, i.e. SrcBuildT={ValueIndex,ValueOnIndex,ValueIndexMask,ValueOnIndexMask} +/// @param d_srcValues Device pointer to an array of values +/// @param pool Memory pool used to create a buffer for the destination/output Grid +/// @param stream optional CUDA stream (defaults to CUDA stream 0 +/// @note If d_srcGrid has stats (min,max,avg,std-div), the d_srcValues is also assumed +/// to have the same information, all of which are then copied to the destination/output grid. +/// An exception to this rule is if the type of d_srcValues is different from the stats type +/// NanoRoot::FloatType, e.g. if DstBuildT=Vec3f then NanoRoot::FloatType=float, +/// in which case average and standard-deviation is undefined in the output grid. 
+/// @return returns handle to grid that combined IndexGrid and values +template +typename util::enable_if::is_index, GridHandle>::type +indexToGrid(const NanoGrid *d_srcGrid, const typename BuildToValueMap::type *d_srcValues, const BufferT &pool = BufferT(), cudaStream_t stream = 0); + + +template +typename util::enable_if::is_index, GridHandle>::type +createNanoGrid(const NanoGrid *d_srcGrid, const typename BuildToValueMap::type *d_srcValues, const BufferT &pool = BufferT(), cudaStream_t stream = 0) +{ + return indexToGrid(d_srcGrid, d_srcValues, pool, stream); +} + +namespace {// anonymous namespace + +template +class IndexToGrid +{ + using SrcGridT = NanoGrid; +public: + struct NodeAccessor; + + /// @brief Constructor from a source IndeGrid + /// @param srcGrid Device pointer to IndexGrid used as the source + IndexToGrid(const SrcGridT *d_srcGrid, cudaStream_t stream = 0); + + ~IndexToGrid() {cudaCheck(util::cuda::freeAsync(mDevNodeAcc, mStream));} + + /// @brief Toggle on and off verbose mode + /// @param on if true verbose is turned on + void setVerbose(bool on = true) {mVerbose = on; } + + /// @brief Set the name of the destination/output grid + /// @param name Name used for the destination grid + void setGridName(const std::string &name) {mGridName = name;} + + /// @brief Combines the IndexGrid with values to produce a regular Grid + /// @tparam DstBuildT Template parameter of the destination grid and value type + /// @tparam BufferT Template parameter of the memory allocator + /// @param srcValues pointer to values that will be inserted into the output grid + /// @param buffer optional buffer used for memory allocation + /// @return A new GridHandle with the grid of type @c DstBuildT + template + GridHandle getHandle(const typename BuildToValueMap::type *srcValues, const BufferT &buffer = BufferT()); + +private: + cudaStream_t mStream{0}; + util::cuda::Timer mTimer; + std::string mGridName; + bool mVerbose{false}; + NodeAccessor mNodeAcc, *mDevNodeAcc; + + template + BufferT getBuffer(const BufferT &pool); +};// IndexToGrid + +//================================================================================================ + +template +struct IndexToGrid::NodeAccessor +{ + uint64_t grid, tree, root, node[3], meta, blind, size;// byte offsets, node: 0=leaf,1=lower, 2=upper + const SrcGridT *d_srcGrid;// device point to source IndexGrid + void *d_dstPtr;// device pointer to buffer with destination Grid + char *d_gridName; + uint32_t nodeCount[4];// 0=leaf, 1=lower, 2=upper, 3=root tiles + + __device__ const NanoGrid& srcGrid() const {return *d_srcGrid;} + __device__ const NanoTree& srcTree() const {return d_srcGrid->tree();} + __device__ const NanoRoot& srcRoot() const {return d_srcGrid->tree().root();} + template + __device__ const typename NanoNode::type& srcNode(int i) const { + return *(this->srcTree().template getFirstNode() + i); + } + + template + __device__ NanoGrid& dstGrid() const {return *util::PtrAdd>(d_dstPtr, grid);} + template + __device__ NanoTree& dstTree() const {return *util::PtrAdd>(d_dstPtr, tree);} + template + __device__ NanoRoot& dstRoot() const {return *util::PtrAdd>(d_dstPtr, root);} + template + __device__ typename NanoNode::type& dstNode(int i) const { + return *(util::PtrAdd::type>(d_dstPtr, node[LEVEL])+i); + } +};// IndexToGrid::NodeAccessor + +//================================================================================================ + +template +__global__ void processGridTreeRootKernel(typename IndexToGrid::NodeAccessor *nodeAcc, + const 
typename BuildToValueMap::type *srcValues) +{ + using SrcValueT = typename BuildToValueMap::type; + using DstStatsT = typename NanoRoot::FloatType; + + auto &srcGrid = nodeAcc->srcGrid(); + auto &dstGrid = nodeAcc->template dstGrid(); + auto &srcTree = srcGrid.tree(); + auto &dstTree = nodeAcc->template dstTree(); + auto &srcRoot = srcTree.root(); + auto &dstRoot = nodeAcc->template dstRoot(); + + // process Grid + *dstGrid.data() = *srcGrid.data(); + dstGrid.mGridType = toGridType(); + dstGrid.mData1 = 0u; + // we will recompute GridData::mChecksum later + + // process Tree + *dstTree.data() = *srcTree.data(); + dstTree.setRoot(&dstRoot); + dstTree.setFirstNode(&nodeAcc->template dstNode(0)); + dstTree.setFirstNode(&nodeAcc->template dstNode(0)); + dstTree.setFirstNode(&nodeAcc->template dstNode(0)); + + // process Root + dstRoot.mBBox = srcRoot.mBBox; + dstRoot.mTableSize = srcRoot.mTableSize; + dstRoot.mBackground = srcValues[srcRoot.mBackground]; + if (srcGrid.hasMinMax()) { + dstRoot.mMinimum = srcValues[srcRoot.mMinimum]; + dstRoot.mMaximum = srcValues[srcRoot.mMaximum]; + } + if constexpr(util::is_same::value) {// e.g. {float,float} or {Vec3f,float} + if (srcGrid.hasAverage()) dstRoot.mAverage = srcValues[srcRoot.mAverage]; + if (srcGrid.hasStdDeviation()) dstRoot.mStdDevi = srcValues[srcRoot.mStdDevi]; + } +}// processGridTreeRootKernel + +//================================================================================================ + +template +__global__ void processRootTilesKernel(typename IndexToGrid::NodeAccessor *nodeAcc, + const typename BuildToValueMap::type *srcValues) +{ + const auto tid = blockIdx.x; + + // Process children and tiles + const auto &srcTile = *nodeAcc->srcRoot().tile(tid); + auto &dstTile = *nodeAcc->template dstRoot().tile(tid); + dstTile.key = srcTile.key; + if (srcTile.child) { + dstTile.child = sizeof(NanoRoot) + sizeof(NanoRoot::Tile)*((srcTile.child - sizeof(NanoRoot))/sizeof(NanoRoot::Tile)); + dstTile.value = srcValues[0];// set to background + dstTile.state = false; + } else { + dstTile.child = 0;// i.e. no child node + dstTile.value = srcValues[srcTile.value]; + dstTile.state = srcTile.state; + } +}// processRootTilesKernel + +//================================================================================================ + +template +__global__ void processNodesKernel(typename IndexToGrid::NodeAccessor *nodeAcc, + const typename BuildToValueMap::type *srcValues) +{ + using SrcNodeT = typename NanoNode::type; + using DstNodeT = typename NanoNode::type; + using SrcChildT = typename SrcNodeT::ChildNodeType; + using DstChildT = typename DstNodeT::ChildNodeType; + using SrcValueT = typename BuildToValueMap::type; + using DstStatsT = typename NanoRoot::FloatType; + + auto &srcNode = nodeAcc->template srcNode(blockIdx.x); + auto &dstNode = nodeAcc->template dstNode(blockIdx.x); + + if (threadIdx.x == 0 && threadIdx.y == 0) { + dstNode.mBBox = srcNode.mBBox; + dstNode.mFlags = srcNode.mFlags; + dstNode.mValueMask = srcNode.mValueMask; + dstNode.mChildMask = srcNode.mChildMask; + auto &srcGrid = nodeAcc->srcGrid(); + if (srcGrid.hasMinMax()) { + dstNode.mMinimum = srcValues[srcNode.mMinimum]; + dstNode.mMaximum = srcValues[srcNode.mMaximum]; + } + if constexpr(util::is_same::value) {// e.g. 
{float,float} or {Vec3f,float} + if (srcGrid.hasAverage()) dstNode.mAverage = srcValues[srcNode.mAverage]; + if (srcGrid.hasStdDeviation()) dstNode.mStdDevi = srcValues[srcNode.mStdDevi]; + } + } + const uint64_t nodeSkip = nodeAcc->nodeCount[LEVEL] - blockIdx.x, srcOff = sizeof(SrcNodeT)*nodeSkip, dstOff = sizeof(DstNodeT)*nodeSkip;// offset to first node of child type + const int off = blockDim.x*blockDim.y*threadIdx.x + blockDim.x*threadIdx.y; + for (int threadIdx_z=0; threadIdx_z +__global__ void processLeafsKernel(typename IndexToGrid::NodeAccessor *nodeAcc, + const typename BuildToValueMap::type *srcValues) +{ + using SrcValueT = typename BuildToValueMap::type; + using DstStatsT = typename NanoRoot::FloatType; + static_assert(!BuildTraits::is_special, "Invalid destination type!"); + auto &srcLeaf = nodeAcc->template srcNode<0>(blockIdx.x); + auto &dstLeaf = nodeAcc->template dstNode(blockIdx.x); + if (threadIdx.x == 0 && threadIdx.y == 0) { + dstLeaf.mBBoxMin = srcLeaf.mBBoxMin; + for (int i=0; i<3; ++i) dstLeaf.mBBoxDif[i] = srcLeaf.mBBoxDif[i]; + dstLeaf.mFlags = srcLeaf.mFlags; + dstLeaf.mValueMask = srcLeaf.mValueMask; + /// + auto &srcGrid = nodeAcc->srcGrid(); + if (srcGrid.hasMinMax()) { + dstLeaf.mMinimum = srcValues[srcLeaf.getMin()]; + dstLeaf.mMaximum = srcValues[srcLeaf.getMax()]; + } + if constexpr(util::is_same::value) {// e.g. {float,float} or {Vec3f,float} + if (srcGrid.hasAverage()) dstLeaf.mAverage = srcValues[srcLeaf.getAvg()]; + if (srcGrid.hasStdDeviation()) dstLeaf.mStdDevi = srcValues[srcLeaf.getDev()]; + } + } + const int off = blockDim.x*blockDim.y*threadIdx.x + blockDim.x*threadIdx.y; + auto *dst = dstLeaf.mValues + off; + for (int threadIdx_z=0; threadIdx_z +__global__ void cpyNodeCountKernel(const NanoGrid *srcGrid, + typename IndexToGrid::NodeAccessor *nodeAcc) +{ + assert(srcGrid->isSequential()); + nodeAcc->d_srcGrid = srcGrid; + for (int i=0; i<3; ++i) nodeAcc->nodeCount[i] = srcGrid->tree().nodeCount(i); + nodeAcc->nodeCount[3] = srcGrid->tree().root().tileCount(); +} + +}// anonymous namespace + +//================================================================================================ + +template +IndexToGrid::IndexToGrid(const SrcGridT *d_srcGrid, cudaStream_t stream) + : mStream(stream), mTimer(stream) +{ + NANOVDB_ASSERT(d_srcGrid); + cudaCheck(util::cuda::mallocAsync((void**)&mDevNodeAcc, sizeof(NodeAccessor), mStream)); + cpyNodeCountKernel<<<1, 1, 0, mStream>>>(d_srcGrid, mDevNodeAcc); + cudaCheckError(); + cudaCheck(cudaMemcpyAsync(&mNodeAcc, mDevNodeAcc, sizeof(NodeAccessor), cudaMemcpyDeviceToHost, mStream));// mNodeAcc = *mDevNodeAcc +} + +//================================================================================================ + +template +template +GridHandle IndexToGrid::getHandle(const typename BuildToValueMap::type *srcValues, + const BufferT &pool) +{ + if (mVerbose) mTimer.start("Initiate buffer"); + auto buffer = this->template getBuffer(pool); + + if (mVerbose) mTimer.restart("Process grid,tree,root"); + processGridTreeRootKernel<<<1, 1, 0, mStream>>>(mDevNodeAcc, srcValues); + cudaCheckError(); + + if (mVerbose) mTimer.restart("Process root children and tiles"); + processRootTilesKernel<<>>(mDevNodeAcc, srcValues); + cudaCheckError(); + + cudaCheck(util::cuda::freeAsync(mNodeAcc.d_gridName, mStream)); + + if (mVerbose) mTimer.restart("Process upper internal nodes"); + processNodesKernel<<>>(mDevNodeAcc, srcValues); + cudaCheckError(); + + if (mVerbose) mTimer.restart("Process lower internal nodes"); + 
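+ // same conversion kernel as above, now instantiated for LEVEL=1 (the lower internal
+ // nodes); processNodesKernel indexes nodes by blockIdx.x, i.e. one CUDA block per node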
processNodesKernel<<>>(mDevNodeAcc, srcValues); + cudaCheckError(); + + if (mVerbose) mTimer.restart("Process leaf nodes"); + processLeafsKernel<<>>(mDevNodeAcc, srcValues); + if (mVerbose) mTimer.stop(); + cudaCheckError(); + + if (mVerbose) mTimer.restart("Compute checksums"); + updateChecksum((GridData*)mNodeAcc.d_dstPtr, mStream); + if (mVerbose) mTimer.stop(); + + //cudaStreamSynchronize(mStream);// finish all device tasks in mStream + return GridHandle(std::move(buffer)); +}// IndexToGrid::getHandle + +//================================================================================================ + +template +template +inline BufferT IndexToGrid::getBuffer(const BufferT &pool) +{ + mNodeAcc.grid = 0;// grid is always stored at the start of the buffer! + mNodeAcc.tree = NanoGrid::memUsage(); // grid ends and tree begins + mNodeAcc.root = mNodeAcc.tree + NanoTree::memUsage(); // tree ends and root node begins + mNodeAcc.node[2] = mNodeAcc.root + NanoRoot::memUsage(mNodeAcc.nodeCount[3]); // root node ends and upper internal nodes begin + mNodeAcc.node[1] = mNodeAcc.node[2] + NanoUpper::memUsage()*mNodeAcc.nodeCount[2]; // upper internal nodes ends and lower internal nodes begin + mNodeAcc.node[0] = mNodeAcc.node[1] + NanoLower::memUsage()*mNodeAcc.nodeCount[1]; // lower internal nodes ends and leaf nodes begin + mNodeAcc.meta = mNodeAcc.node[0] + NanoLeaf::DataType::memUsage()*mNodeAcc.nodeCount[0];// leaf nodes end and blind meta data begins + mNodeAcc.blind = mNodeAcc.meta + 0*sizeof(GridBlindMetaData); // meta data ends and blind data begins + mNodeAcc.size = mNodeAcc.blind;// end of buffer + auto buffer = BufferT::create(mNodeAcc.size, &pool, false, mStream); + mNodeAcc.d_dstPtr = buffer.deviceData(); + if (mNodeAcc.d_dstPtr == nullptr) throw std::runtime_error("Failed memory allocation on the device"); + + if (size_t size = mGridName.size()) { + cudaCheck(util::cuda::mallocAsync((void**)&mNodeAcc.d_gridName, size, mStream)); + cudaCheck(cudaMemcpyAsync(mNodeAcc.d_gridName, mGridName.data(), size, cudaMemcpyHostToDevice, mStream)); + } else { + mNodeAcc.d_gridName = nullptr; + } + cudaCheck(cudaMemcpyAsync(mDevNodeAcc, &mNodeAcc, sizeof(NodeAccessor), cudaMemcpyHostToDevice, mStream));// copy NodeAccessor CPU -> GPU + return buffer; +} + +//================================================================================================ + +template +typename util::enable_if::is_index, GridHandle>::type +indexToGrid(const NanoGrid *d_srcGrid, const typename BuildToValueMap::type *d_srcValues, const BufferT &pool, cudaStream_t stream) +{ + IndexToGrid converter(d_srcGrid, stream); + return converter.template getHandle(d_srcValues, pool); +} + +}// namespace tools::cuda ============================================================= + +template +[[deprecated("Use nanovdb::cuda::indexToGrid instead")]] +typename util::enable_if::is_index, GridHandle>::type +cudaIndexToGrid(const NanoGrid *d_srcGrid, const typename BuildToValueMap::type *d_srcValues, const BufferT &pool = BufferT(), cudaStream_t stream = 0) +{ + return tools::cuda::indexToGrid(d_srcGrid, d_srcValues, pool, stream); +} + + +template +[[deprecated("Use nanovdb::cuda::indexToGrid instead")]] +typename util::enable_if::is_index, GridHandle>::type +cudaCreateNanoGrid(const NanoGrid *d_srcGrid, const typename BuildToValueMap::type *d_srcValues, const BufferT &pool = BufferT(), cudaStream_t stream = 0) +{ + return tools::cuda::indexToGrid(d_srcGrid, d_srcValues, pool, stream); +} + +}// nanovdb namespace 
=================================================================== + +#endif // NVIDIA_TOOLS_CUDA_INDEXTOGRID_CUH_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/tools/cuda/PointsToGrid.cuh b/external/nanovdb/tools/cuda/PointsToGrid.cuh new file mode 100644 index 00000000..bcf335ef --- /dev/null +++ b/external/nanovdb/tools/cuda/PointsToGrid.cuh @@ -0,0 +1,1293 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/cuda/PointsToGrid.cuh + + \authors Greg Klar (initial version) and Ken Museth (final version) + + \brief Generates NanoVDB grids from a list of voxels or points on the device + + \warning The header file contains cuda device code so be sure + to only include it in .cu files (or other .cuh files) +*/ + +#ifndef NVIDIA_TOOLS_CUDA_POINTSTOGRID_CUH_HAS_BEEN_INCLUDED +#define NVIDIA_TOOLS_CUDA_POINTSTOGRID_CUH_HAS_BEEN_INCLUDED + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + Note: 4.29 billion (=2^32) coordinates of type Vec3f have a memory footprint of 48 GB! +*/ + +namespace nanovdb {// ================================================================================ + +namespace tools::cuda {// ============================================================================ + +/// @brief Generates a NanoGrid from a list of point coordinates on the device. This method is +/// mainly used as a means to build a BVH acceleration structure for points, e.g. for efficient rendering. +/// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world space. Dereferencing should return Vec3f or Vec3d. +/// @tparam BufferT Template type of buffer used for memory allocation on the device +/// @tparam AllocT Template type of optional device allocator for internal temporary memory +/// @param dWorldPoints Raw or fancy pointer to list of point coordinates in world space on the device +/// @param pointCount number of point in the list @c d_world +/// @param voxelSize Size of a voxel in world units used for the output grid +/// @param type Defined the way point information is represented in the output grid (see PointType enum NanoVDB.h) +/// Should not be PointType::Disable! +/// @param buffer Instance of the device buffer used for memory allocation +/// @param stream optional CUDA stream (defaults to CUDA stream 0) +/// @return Returns a handle with a grid of type NanoGrid where point information, e.g. coordinates, +/// are represented as blind data defined by @c type. +template +GridHandle +pointsToGrid(const PtrT dWorldPoints, + int pointCount, + double voxelSize, + PointType type = PointType::Default, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0); + +//----------------------------------------------------------------------------------------------------- + +/// @brief Generates a NanoGrid from a list of point coordinates on the device. This method is +/// mainly used as a means to build a BVH acceleration structure for points, e.g. for efficient rendering. +/// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world space. Dereferencing should return Vec3f or Vec3d. 
+/// @tparam BufferT Template type of buffer used for memory allocation on the device +/// @tparam AllocT Template type of optional device allocator for internal temporary memory +/// @param dWorldPoints Raw or fancy pointer to list of point coordinates in world space on the device +/// @param pointCount total number of point in the list @c d_world +/// @param maxPointsPerVoxel Max density of points per voxel, i.e. maximum number of points in any voxel +/// @param tolerance allow for point density to vary by the specified tolerance (defaults to 1). That is, the voxel size +/// is selected such that the max density is +/- the tolerance. +/// @param maxIterations Maximum number of iterations used to seach for a voxel size that produces a point density +/// with specified tolerance takes. +/// @param type Defined the way point information is represented in the output grid (see PointType enum in NanoVDB.h) +/// Should not be PointType::Disable! +/// @param buffer Instance of the device buffer used for memory allocation +/// @param stream optional CUDA stream (defaults to CUDA stream 0) +/// @return Returns a handle with a grid of type NanoGrid where point information, e.g. coordinates, +/// are represented as blind data defined by @c type. +template +GridHandle +pointsToGrid(const PtrT dWorldPoints, + int pointCount, + int maxPointPerVoxel, + int tolerance = 1, + int maxIterations = 10, + PointType type = PointType::Default, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0); + +//----------------------------------------------------------------------------------------------------- + +template +GridHandle +pointsToGrid(std::vector> pointSet, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0); + +//----------------------------------------------------------------------------------------------------- + +/// @brief Generates a NanoGrid of any type from a list of voxel coordinates on the device. Unlike @c cudaPointsToGrid +/// this method only builds the grid but does not encode the coordinates as blind data. It is mainly useful as a +/// means to generate a grid that is know to contain the voxels given in the list. +/// @tparam BuildT Template type of the return grid +/// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world space. Dereferencing should return Vec3f or Vec3d. 
+/// @tparam BufferT Template type of buffer used for memory allocation on the device +/// @tparam AllocT Template type of optional device allocator for internal temporary memory +/// @param dGridVoxels Raw or fancy pointer to list of voxel coordinates in grid (or index) space on the device +/// @param pointCount number of voxel in the list @c dGridVoxels +/// @param voxelSize Size of a voxel in world units used for the output grid +/// @param buffer Instance of the device buffer used for memory allocation +/// @return Returns a handle with the grid of type NanoGrid +template +GridHandle +voxelsToGrid(const PtrT dGridVoxels, + size_t voxelCount, + double voxelSize = 1.0, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0); + +//------------------------------------------------------------------------------------------------------- + +template +GridHandle +voxelsToGrid(std::vector> pointSet, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0); + +}// namespace tools::cuda ======================================================================== + +/// @brief Example class of a fancy pointer that can optionally be used as a template for writing +/// a custom fancy pointer that allows for particle coordinates to be arrange non-linearly +/// in memory. For instance with coordinates are interlaced with other dats, i.e. an array +/// of structs, a custom implementation of fancy_ptr::operator[](size_t i) can account for +/// strides that skip other interlaces data. +/// @tparam T Template type that specifies the type use for the coordinates of the points +template +class fancy_ptr +{ + const T* mPtr; +public: + /// @brief Default constructor. + /// @note This method is atcually not required by cuda::PointsToGrid + /// @param ptr Pointer to array of elements + __hostdev__ explicit fancy_ptr(const T* ptr = nullptr) : mPtr(ptr) {} + /// @brief Index acces into the array pointed to by the stored pointer. + /// @note This method is required by cuda::PointsToGrid! + /// @param i Unsigned index of the element to be returned + /// @return Const refernce to the element at the i'th poisiton + __hostdev__ inline const T& operator[](size_t i) const {return mPtr[i];} + /// @brief Dummy implementation required by pointer_traits. + /// @note Note that only the return type matters! + /// @details Unlike operator[] it is safe to assume that all pointer types have operator*, + /// which is why pointer_traits makes use of it to determine the element_type that + /// a pointer class is pointing to. E.g. operator[] is not always defined for std::shared_ptr! 
+ __hostdev__ inline const T& operator*() const {return *mPtr;} +};// fancy_ptr + +/// @brief Simple stand-alone function that can be used to conveniently construct a fancy_ptr +/// @tparam T Template type that specifies the type use for the coordinates of the points +/// @param ptr Raw pointer to data +/// @return a new instance of a fancy_ptr +template +fancy_ptr make_fancy(const T* ptr = nullptr) {return fancy_ptr(ptr);} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +/// @brief Trait of points, like type of pointer and size of the pointer type +template +struct pointer_traits; + +template +struct pointer_traits { + using element_type = T; + static constexpr size_t element_size = sizeof(T); +}; + +template +struct pointer_traits { + using element_type = typename util::remove_reference())>::type;// assumes T::operator*() exists! + static constexpr size_t element_size = sizeof(element_type); +}; + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +/// @brief computes the relative 8-bit voxel offsets from a world coordinate +/// @tparam Vec3T Type of the world coordinate +/// @param voxel 8-bit output coordinates that are relative to a voxel +/// @param world input world coordinates +/// @param indexToWorld Transform from index to world space +template +__hostdev__ inline static void worldToVoxel(Vec3u8 &voxel, const Vec3T &world, const Map &indexToWorld) +{ + const Vec3d ijk = indexToWorld.applyInverseMap(world);// world -> index + static constexpr double encode = double((1<<8) - 1); + voxel[0] = uint8_t( encode*(ijk[0] - math::Floor(ijk[0] + 0.5) + 0.5) ); + voxel[1] = uint8_t( encode*(ijk[1] - math::Floor(ijk[1] + 0.5) + 0.5) ); + voxel[2] = uint8_t( encode*(ijk[2] - math::Floor(ijk[2] + 0.5) + 0.5) ); +} + +/// @brief computes the relative 16-bit voxel offsets from a world coordinate +/// @tparam Vec3T Type of the world coordinate +/// @param voxel 16-bit output coordinates that are relative to a voxel +/// @param world input world coordinates +/// @param indexToWorld Transform from index to world space +template +__hostdev__ inline static void worldToVoxel(Vec3u16 &voxel, const Vec3T &world, const Map &indexToWorld) +{ + const Vec3d ijk = indexToWorld.applyInverseMap(world);// world -> index + static constexpr double encode = double((1<<16) - 1); + voxel[0] = uint16_t( encode*(ijk[0] - math::Floor(ijk[0] + 0.5) + 0.5) ); + voxel[1] = uint16_t( encode*(ijk[1] - math::Floor(ijk[1] + 0.5) + 0.5) ); + voxel[2] = uint16_t( encode*(ijk[2] - math::Floor(ijk[2] + 0.5) + 0.5) ); +} + +/// @brief computes the relative float voxel offsets from a world coordinate +/// @tparam Vec3T Type of the world coordinate +/// @param voxel float output coordinates that are relative to a voxel +/// @param world input world coordinates +/// @param indexToWorld Transform from index to world space +template +__hostdev__ inline static void worldToVoxel(Vec3f &voxel, const Vec3T &world, const Map &indexToWorld) +{ + const Vec3d ijk = indexToWorld.applyInverseMap(world);// world -> index + voxel[0] = float( ijk[0] - math::Floor(ijk[0] + 0.5) ); + voxel[1] = float( ijk[1] - math::Floor(ijk[1] + 0.5) ); + voxel[2] = float( ijk[2] - math::Floor(ijk[2] + 0.5) ); +} + 
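+// The three worldToVoxel overloads above quantize a point's offset from its voxel's integer
+// (index-space) coordinate: the signed fraction ijk - Floor(ijk + 0.5) lies in [-0.5, 0.5);
+// the 8- and 16-bit variants shift it to [0, 1) and scale by 255 or 65535, while the float
+// variant stores the fraction directly. The voxelToWorld overloads below invert the encoding.
+// Illustrative round trip (a sketch only, with hypothetical names and a unit-scale Map assumed):
+//   Map map(1.0);
+//   Vec3f p(1.3f, -2.7f, 0.25f);                  // point in world space
+//   Coord ijk = map.applyInverseMapF(p).round();  // voxel containing p
+//   Vec3u8 q;  worldToVoxel(q, p, map);           // 8-bit offset of p within that voxel
+//   Vec3f r = voxelToWorld<Vec3f>(q, ijk, map);   // reconstructs p to within ~1/255 of a voxel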
+//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +__hostdev__ inline static Vec3T voxelToWorld(const Vec3u8 &voxel, const Coord &ijk, const Map &map) +{ + static constexpr double decode = 1.0/double((1<<8) - 1); + if constexpr(util::is_same::value) { + return map.applyMap( Vec3d(ijk[0] + decode*voxel[0] - 0.5, ijk[1] + decode*voxel[1] - 0.5, ijk[2] + decode*voxel[2] - 0.5)); + } else { + return map.applyMapF(Vec3f(ijk[0] + decode*voxel[0] - 0.5f, ijk[1] + decode*voxel[1] - 0.5f, ijk[2] + decode*voxel[2] - 0.5f)); + } +} + +template +__hostdev__ inline static Vec3T voxelToWorld(const Vec3u16 &voxel, const Coord &ijk, const Map &map) +{ + static constexpr double decode = 1.0/double((1<<16) - 1); + if constexpr(util::is_same::value) { + return map.applyMap( Vec3d(ijk[0] + decode*voxel[0] - 0.5, ijk[1] + decode*voxel[1] - 0.5, ijk[2] + decode*voxel[2] - 0.5)); + } else { + return map.applyMapF(Vec3f(ijk[0] + decode*voxel[0] - 0.5f, ijk[1] + decode*voxel[1] - 0.5f, ijk[2] + decode*voxel[2] - 0.5f)); + } +} + +template +__hostdev__ inline static Vec3T voxelToWorld(const Vec3f &voxel, const Coord &ijk, const Map &map) +{ + if constexpr(util::is_same::value) { + return map.applyMap( Vec3d(ijk[0] + voxel[0], ijk[1] + voxel[1], ijk[2] + voxel[2])); + } else { + return map.applyMapF(Vec3f(ijk[0] + voxel[0], ijk[1] + voxel[1], ijk[2] + voxel[2])); + } +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +namespace tools::cuda { + +template +class PointsToGrid +{ +public: + + struct Data { + Map map; + void *d_bufferPtr; + uint64_t *d_keys, *d_tile_keys, *d_lower_keys, *d_leaf_keys;// device pointer to 64 bit keys + uint64_t grid, tree, root, upper, lower, leaf, meta, blind, size;// byte offsets to nodes in buffer + uint32_t *d_indx;// device pointer to point indices (or IDs) + uint32_t nodeCount[3], *pointsPerLeafPrefix, *pointsPerLeaf;// 0=leaf,1=lower, 2=upper + uint32_t voxelCount, *pointsPerVoxelPrefix, *pointsPerVoxel; + BitFlags<16> flags; + __hostdev__ NanoGrid& getGrid() const {return *util::PtrAdd>(d_bufferPtr, grid);} + __hostdev__ NanoTree& getTree() const {return *util::PtrAdd>(d_bufferPtr, tree);} + __hostdev__ NanoRoot& getRoot() const {return *util::PtrAdd>(d_bufferPtr, root);} + __hostdev__ NanoUpper& getUpper(int i) const {return *(util::PtrAdd>(d_bufferPtr, upper)+i);} + __hostdev__ NanoLower& getLower(int i) const {return *(util::PtrAdd>(d_bufferPtr, lower)+i);} + __hostdev__ NanoLeaf& getLeaf(int i) const {return *(util::PtrAdd>(d_bufferPtr, leaf)+i);} + __hostdev__ GridBlindMetaData& getMeta() const { return *util::PtrAdd(d_bufferPtr, meta);}; + template + __hostdev__ Vec3T& getPoint(int i) const {return *(util::PtrAdd(d_bufferPtr, blind)+i);} + };// Data + + /// @brief Map constructor, which other constructors might call + /// @param map Map to be used for the output device grid + /// @param stream optional CUDA stream (defaults to CUDA stream 0) + PointsToGrid(const Map &map, cudaStream_t stream = 0) + : mStream(stream) + , mPointType(util::is_same::value ? 
PointType::Default : PointType::Disable) + { + mData.map = map; + mData.flags.initMask({GridFlags::HasBBox, GridFlags::IsBreadthFirst}); + mDeviceData = mMemPool.template alloc(mStream); + } + + /// @brief Default constructor that calls the Map constructor defined above + /// @param scale Voxel size in world units + /// @param trans Translation of origin in world units + /// @param stream optional CUDA stream (defaults to CUDA stream 0) + PointsToGrid(const double scale = 1.0, const Vec3d &trans = Vec3d(0.0), cudaStream_t stream = 0) + : PointsToGrid(Map(scale, trans), stream){} + + /// @brief Constructor from a target maximum number of particles per voxel. Calls the Map constructor defined above + /// @param maxPointsPerVoxel Maximum number of points oer voxel + /// @param stream optional CUDA stream (defaults to CUDA stream 0) + PointsToGrid(int maxPointsPerVoxel, int tolerance = 1, int maxIterations = 10, cudaStream_t stream = 0) + : PointsToGrid(Map(1.0), stream) + { + mMaxPointsPerVoxel = maxPointsPerVoxel; + mTolerance = tolerance; + mMaxIterations = maxIterations; + } + + /// @brief Toggle on and off verbose mode + /// @param level Verbose level: 0=quiet, 1=timing, 2=benchmarking + void setVerbose(int level = 1) {mVerbose = level; mData.flags.setBit(7u, level); } + + /// @brief Set the mode for checksum computation, which is disabled by default + /// @param mode Mode of checksum computation + void setChecksum(CheckMode mode = CheckMode::Disable){mChecksum = mode;} + + /// @brief Toggle on and off the computation of a bounding-box + /// @param on If true bbox will be computed + void includeBBox(bool on = true) { mData.flags.setMask(GridFlags::HasBBox, on); } + + /// @brief Set the name of the output grid + /// @param name name of the output grid + void setGridName(const std::string &name) {mGridName = name;} + + // only available when BuildT == Point + template typename util::enable_if::value>::type + setPointType(PointType type) { mPointType = type; } + + /// @brief Creates a handle to a grid with the specified build type from a list of points in index or world space + /// @tparam BuildT Build type of the output grid, i.e NanoGrid + /// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world or index space. + /// @tparam BufferT Buffer type used for allocation of the grid handle + /// @param points device point to an array of points in world space + /// @param pointCount number of input points or voxels + /// @param buffer optional buffer (currently ignored) + /// @return returns a handle with a grid of type NanoGrid + template + GridHandle getHandle(const PtrT points, + size_t pointCount, + const BufferT &buffer = BufferT()); + + template + void countNodes(const PtrT points, size_t pointCount); + + template + void processGridTreeRoot(const PtrT points, size_t pointCount); + + void processUpperNodes(); + + void processLowerNodes(); + + template + void processLeafNodes(const PtrT points); + + template + void processPoints(const PtrT points, size_t pointCount); + + void processBBox(); + + // the following methods are only defined when BuildT == Point + template typename util::enable_if::value, uint32_t>::type + maxPointsPerVoxel() const {return mMaxPointsPerVoxel;} + template typename util::enable_if::value, uint32_t>::type + maxPointsPerLeaf() const {return mMaxPointsPerLeaf;} + +private: + static constexpr unsigned int mNumThreads = 128;// seems faster than the old value of 256! 
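+ // numBlocks(n) below returns ceil(n / mNumThreads), i.e. enough blocks of mNumThreads
+ // threads to cover n work items when launching the kernels in this class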
+ static unsigned int numBlocks(unsigned int n) {return (n + mNumThreads - 1) / mNumThreads;} + + cudaStream_t mStream{0}; + util::cuda::Timer mTimer; + PointType mPointType; + std::string mGridName; + int mVerbose{0}; + Data mData, *mDeviceData; + uint32_t mMaxPointsPerVoxel{0u}, mMaxPointsPerLeaf{0u}; + int mTolerance{1}, mMaxIterations{1}; + CheckMode mChecksum{CheckMode::Disable}; + + // wrapper of AllocT, defaulting to cub::CachingDeviceAllocator, which offers a shared scratch space + struct Allocator { + AllocT mAllocator; + void* d_scratch; + size_t scratchSize, actualScratchSize; + Allocator() : d_scratch(nullptr), scratchSize(0), actualScratchSize(0) {} + ~Allocator() { + if (scratchSize > 0) this->free(d_scratch);// a bug in cub makes this necessary + mAllocator.FreeAllCached(); + } + template + T* alloc(size_t count, cudaStream_t stream) { + T* d_ptr = nullptr; + cudaCheck(mAllocator.DeviceAllocate((void**)&d_ptr, sizeof(T)*count, stream)); + return d_ptr; + } + template + T* alloc(cudaStream_t stream) {return this->template alloc(1, stream);} + void free(void *d_ptr) {if (d_ptr) cudaCheck(mAllocator.DeviceFree(d_ptr));} + template + void free(void *d_ptr, T... other) { + if (d_ptr) cudaCheck(mAllocator.DeviceFree(d_ptr)); + this->free(other...); + } + void adjustScratch(cudaStream_t stream){ + if (scratchSize > actualScratchSize) { + if (actualScratchSize>0) cudaCheck(mAllocator.DeviceFree(d_scratch)); + cudaCheck(mAllocator.DeviceAllocate((void**)&d_scratch, scratchSize, stream)); + actualScratchSize = scratchSize; + } + } + } mMemPool; + + template + BufferT getBuffer(const PtrT points, size_t pointCount, const BufferT &buffer); +};// tools::cuda::PointsToGrid + +namespace kernels { +/// @details Used by cuda::PointsToGrid::processLeafNodes before the computation +/// of prefix-sum for index grid. +/// Moving this away from an implementation using the lambdaKernel wrapper +/// to fix the following on Windows platform: +/// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' +/// or 'else' block of a constexpr if statement. +/// function in a lambda through lambdaKernel wrapper defined in CudaUtils.h. +template +__global__ void fillValueIndexKernel(const size_t numItems, uint64_t* devValueIndex, typename PointsToGrid::Data* d_data) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= numItems) return; + devValueIndex[tid] = static_cast(d_data->getLeaf(tid).mValueMask.countOn()); +} + +/// @details Used by PointsToGrid::processLeafNodes for the computation +/// of prefix-sum for index grid. +/// Moving this away from an implementation using the lambdaKernel wrapper +/// to fix the following on Windows platform: +/// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' +/// or 'else' block of a constexpr if statement. 
+template +__global__ void leafPrefixSumKernel(const size_t numItems, uint64_t* devValueIndexPrefix, typename PointsToGrid::Data* d_data) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= numItems) return; + + auto &leaf = d_data->getLeaf(tid); + leaf.mOffset = 1u;// will be re-set below + const uint64_t *w = leaf.mValueMask.words(); + uint64_t &prefixSum = leaf.mPrefixSum, sum = util::countOn(*w++); + prefixSum = sum; + for (int n = 9; n < 55; n += 9) {// n=i*9 where i=1,2,..6 + sum += util::countOn(*w++); + prefixSum |= sum << n;// each pre-fixed sum is encoded in 9 bits + } + if (tid==0) { + d_data->getGrid().mData1 = 1u + devValueIndexPrefix[d_data->nodeCount[0]-1];// set total count + d_data->getTree().mVoxelCount = devValueIndexPrefix[d_data->nodeCount[0]-1]; + } else { + leaf.mOffset = 1u + devValueIndexPrefix[tid-1];// background is index 0 + } +} + +/// @details Used by PointsToGrid::processLeafNodes to make sure leaf.mMask - leaf.mValueMask. +/// Moving this away from an implementation using the lambdaKernel wrapper +/// to fix the following on Windows platform: +/// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' +/// or 'else' block of a constexpr if statement. +template +__global__ void setMaskEqValMaskKernel(const size_t numItems, typename PointsToGrid::Data* d_data) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= numItems) return; + auto &leaf = d_data->getLeaf(tid); + leaf.mMask = leaf.mValueMask; +} +} // namespace kernels + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +// Define utility macro used to call cub functions that use dynamic temporary storage +#ifndef CALL_CUBS +#ifdef _WIN32 +#define CALL_CUBS(func, ...) \ + cudaCheck(cub::func(nullptr, mMemPool.scratchSize, __VA_ARGS__, mStream)); \ + mMemPool.adjustScratch(mStream); \ + cudaCheck(cub::func(mMemPool.d_scratch, mMemPool.scratchSize, __VA_ARGS__, mStream)); +#else// fdef _WIN32 +#define CALL_CUBS(func, args...) 
\ + cudaCheck(cub::func(nullptr, mMemPool.scratchSize, args, mStream)); \ + mMemPool.adjustScratch(mStream); \ + cudaCheck(cub::func(mMemPool.d_scratch, mMemPool.scratchSize, args, mStream)); +#endif// ifdef _WIN32 +#endif// ifndef CALL_CUBS + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline GridHandle +PointsToGrid::getHandle(const PtrT points, + size_t pointCount, + const BufferT &pool) +{ + if (mVerbose==1) mTimer.start("\nCounting nodes"); + this->countNodes(points, pointCount); + + if (mVerbose==1) mTimer.restart("Initiate buffer"); + auto buffer = this->getBuffer(points, pointCount, pool); + + if (mVerbose==1) mTimer.restart("Process grid,tree,root"); + this->processGridTreeRoot(points, pointCount); + + if (mVerbose==1) mTimer.restart("Process upper nodes"); + this->processUpperNodes(); + + if (mVerbose==1) mTimer.restart("Process lower nodes"); + this->processLowerNodes(); + + if (mVerbose==1) mTimer.restart("Process leaf nodes"); + this->processLeafNodes(points); + + if (mVerbose==1) mTimer.restart("Process points"); + this->processPoints(points, pointCount); + + if (mVerbose==1) mTimer.restart("Process bbox"); + this->processBBox(); + if (mVerbose==1) mTimer.stop(); + + if (mVerbose==1) mTimer.restart("Computation of checksum"); + tools::cuda::updateChecksum((GridData*)buffer.deviceData(), mChecksum); + if (mVerbose==1) mTimer.stop(); + + return GridHandle(std::move(buffer)); +}// PointsToGrid::getHandle + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +// --- CUB helpers --- +template +struct ShiftRight +{ + __hostdev__ inline OutT operator()(const InT& v) const {return static_cast(v >> BitCount);} +}; + +template +struct ShiftRightIterator : public cub::TransformInputIterator, InT*> +{ + using BASE = cub::TransformInputIterator, InT*>; + __hostdev__ inline ShiftRightIterator(uint64_t* input_itr) : BASE(input_itr, ShiftRight()) {} +}; + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +void PointsToGrid::countNodes(const PtrT points, size_t pointCount) +{ + using Vec3T = typename util::remove_const::element_type>::type; + if constexpr(util::is_same::value) { + static_assert(util::is_same::value, "Point (vs voxels) coordinates should be represented as Vec3f or Vec3d"); + } else { + static_assert(util::is_same::value, "Voxel coordinates should be represented as Coord, Vec3f or Vec3d"); + } + + mMaxPointsPerVoxel = math::Min(mMaxPointsPerVoxel, pointCount); + int iterCounter = 0; + struct Foo {// pairs current voxel size, dx, with the corresponding particle density, i.e. maximum number of points per voxel + double dx; + uint32_t density; + bool operator<(const Foo &rhs) const {return density < rhs.density || (density == rhs.density && dx < rhs.dx);} + } min{0.0, 1}, max{0.0, 0};// min: as dx -> 0 density -> 1 point per voxel, max: density is 0 i.e. 
undefined + +jump:// this marks the beginning of the actual algorithm + + mData.d_keys = mMemPool.template alloc(pointCount, mStream); + mData.d_indx = mMemPool.template alloc(pointCount, mStream);// uint32_t can index 4.29 billion Coords, corresponding to 48 GB + cudaCheck(cudaMemcpyAsync(mDeviceData, &mData, sizeof(Data), cudaMemcpyHostToDevice, mStream));// copy mData from CPU -> GPU + + if (mVerbose==2) mTimer.start("\nAllocating arrays for keys and indices"); + auto *d_keys = mMemPool.template alloc(pointCount, mStream); + auto *d_indx = mMemPool.template alloc(pointCount, mStream); + + if (mVerbose==2) mTimer.restart("Generate tile keys"); + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, const Data *d_data, const PtrT points) { + auto coordToKey = [](const Coord &ijk)->uint64_t{ + // Note: int32_t has a range of -2^31 to 2^31 - 1 whereas uint32_t has a range of 0 to 2^32 - 1 + static constexpr int64_t offset = 1 << 31; + return (uint64_t(uint32_t(int64_t(ijk[2]) + offset) >> 12) ) | // z is the lower 21 bits + (uint64_t(uint32_t(int64_t(ijk[1]) + offset) >> 12) << 21) | // y is the middle 21 bits + (uint64_t(uint32_t(int64_t(ijk[0]) + offset) >> 12) << 42); // x is the upper 21 bits + };// coordToKey lambda functor + d_indx[tid] = uint32_t(tid); + uint64_t &key = d_keys[tid]; + if constexpr(util::is_same::value) {// points are in world space + if constexpr(util::is_same::value) { + key = coordToKey(d_data->map.applyInverseMapF(points[tid]).round()); + } else {// points are Vec3d + key = coordToKey(d_data->map.applyInverseMap(points[tid]).round()); + } + } else if constexpr(util::is_same::value) {// points Coord are in index space + key = coordToKey(points[tid]); + } else {// points are Vec3f or Vec3d in index space + key = coordToKey(points[tid].round()); + } + }, mDeviceData, points); + cudaCheckError(); + if (mVerbose==2) mTimer.restart("DeviceRadixSort of "+std::to_string(pointCount)+" tile keys"); + CALL_CUBS(DeviceRadixSort::SortPairs, d_keys, mData.d_keys, d_indx, mData.d_indx, pointCount, 0, 62);// 21 bits per coord + std::swap(d_indx, mData.d_indx);// sorted indices are now in d_indx + + if (mVerbose==2) mTimer.restart("Allocate runs"); + auto *d_points_per_tile = mMemPool.template alloc(pointCount, mStream); + uint32_t *d_node_count = mMemPool.template alloc(3, mStream); + + if (mVerbose==2) mTimer.restart("DeviceRunLengthEncode tile keys"); + CALL_CUBS(DeviceRunLengthEncode::Encode, mData.d_keys, d_keys, d_points_per_tile, d_node_count+2, pointCount); + cudaCheck(cudaMemcpyAsync(mData.nodeCount+2, d_node_count+2, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mData.d_tile_keys = mMemPool.template alloc(mData.nodeCount[2], mStream); + cudaCheck(cudaMemcpyAsync(mData.d_tile_keys, d_keys, mData.nodeCount[2]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); + + if (mVerbose) mTimer.restart("DeviceRadixSort of " + std::to_string(pointCount) + " voxel keys in " + std::to_string(mData.nodeCount[2]) + " tiles"); + uint32_t *points_per_tile = new uint32_t[mData.nodeCount[2]]; + cudaCheck(cudaMemcpyAsync(points_per_tile, d_points_per_tile, mData.nodeCount[2]*sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mMemPool.free(d_points_per_tile); + + for (uint32_t id = 0, offset = 0; id < mData.nodeCount[2]; ++id) { + const uint32_t count = points_per_tile[id]; + util::cuda::lambdaKernel<<>>(count, [=] __device__(size_t tid, const Data *d_data) { + auto voxelKey = [] __device__ (uint64_t tileID, const Coord &ijk){ + return tileID << 36 | // upper 
offset: 64-15-12-9=28, i.e. last 28 bits + uint64_t(NanoUpper::CoordToOffset(ijk)) << 21 | // lower offset: 32^3 = 2^15, i.e. next 15 bits + uint64_t(NanoLower::CoordToOffset(ijk)) << 9 | // leaf offset: 16^3 = 2^12, i.e. next 12 bits + uint64_t(NanoLeaf< BuildT>::CoordToOffset(ijk)); // voxel offset: 8^3 = 2^9, i.e. first 9 bits + };// voxelKey lambda functor + tid += offset; + Vec3T p = points[d_indx[tid]]; + if constexpr(util::is_same::value) p = util::is_same::value ? d_data->map.applyInverseMapF(p) : d_data->map.applyInverseMap(p); + d_keys[tid] = voxelKey(id, p.round()); + }, mDeviceData); cudaCheckError(); + CALL_CUBS(DeviceRadixSort::SortPairs, d_keys + offset, mData.d_keys + offset, d_indx + offset, mData.d_indx + offset, count, 0, 36);// 9+12+15=36 + offset += count; + } + mMemPool.free(d_indx); + delete [] points_per_tile; + + if (mVerbose==2) mTimer.restart("Count points per voxel"); + + mData.pointsPerVoxel = mMemPool.template alloc(pointCount, mStream); + uint32_t *d_voxel_count = mMemPool.template alloc(mStream); + CALL_CUBS(DeviceRunLengthEncode::Encode, mData.d_keys, d_keys, mData.pointsPerVoxel, d_voxel_count, pointCount); + cudaCheck(cudaMemcpyAsync(&mData.voxelCount, d_voxel_count, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mMemPool.free(d_voxel_count); + + if (util::is_same::value) { + if (mVerbose==2) mTimer.restart("Count max points per voxel"); + uint32_t *d_maxPointsPerVoxel = mMemPool.template alloc(mStream), maxPointsPerVoxel; + CALL_CUBS(DeviceReduce::Max, mData.pointsPerVoxel, d_maxPointsPerVoxel, mData.voxelCount); + cudaCheck(cudaMemcpyAsync(&maxPointsPerVoxel, d_maxPointsPerVoxel, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mMemPool.free(d_maxPointsPerVoxel); + double dx = mData.map.getVoxelSize()[0]; + if (++iterCounter >= mMaxIterations || pointCount == 1u || math::Abs((int)maxPointsPerVoxel - (int)mMaxPointsPerVoxel) <= mTolerance) { + mMaxPointsPerVoxel = maxPointsPerVoxel; + } else { + const Foo tmp{dx, maxPointsPerVoxel}; + if (maxPointsPerVoxel < mMaxPointsPerVoxel) { + if (min < tmp) min = tmp; + } else if (max.density == 0 || tmp < max) { + max = tmp; + } + if (max.density) { + dx = (min.dx*(max.density - mMaxPointsPerVoxel) + max.dx*(mMaxPointsPerVoxel-min.density))/double(max.density-min.density); + } else if (maxPointsPerVoxel > 1u) { + dx *= (mMaxPointsPerVoxel-1.0)/(maxPointsPerVoxel-1.0); + } else {// maxPointsPerVoxel = 1 so increase dx significantly + dx *= 10.0; + } + if (mVerbose==2) printf("\ntarget density = %u, current density = %u current dx = %f, next dx = %f\n", mMaxPointsPerVoxel, maxPointsPerVoxel, tmp.dx, dx); + mData.map = Map(dx); + mMemPool.free(mData.d_keys, mData.d_indx, d_keys, mData.d_tile_keys, d_node_count, mData.pointsPerVoxel); + goto jump; + } + } + if (iterCounter>1 && mVerbose) std::cerr << "Used " << iterCounter << " attempts to determine dx that produces a target dpoint denisty\n\n"; + + if (mVerbose==2) mTimer.restart("Compute prefix sum of points per voxel"); + mData.pointsPerVoxelPrefix = mMemPool.template alloc(mData.voxelCount, mStream); + CALL_CUBS(DeviceScan::ExclusiveSum, mData.pointsPerVoxel, mData.pointsPerVoxelPrefix, mData.voxelCount); + + mData.pointsPerLeaf = mMemPool.template alloc(pointCount, mStream); + CALL_CUBS(DeviceRunLengthEncode::Encode, ShiftRightIterator<9>(mData.d_keys), d_keys, mData.pointsPerLeaf, d_node_count, pointCount); + cudaCheck(cudaMemcpyAsync(mData.nodeCount, d_node_count, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + + if 
constexpr(util::is_same::value) { + uint32_t *d_maxPointsPerLeaf = mMemPool.template alloc(mStream); + CALL_CUBS(DeviceReduce::Max, mData.pointsPerLeaf, d_maxPointsPerLeaf, mData.nodeCount[0]); + cudaCheck(cudaMemcpyAsync(&mMaxPointsPerLeaf, d_maxPointsPerLeaf, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + //printf("\n Leaf count = %u, max points per leaf = %u\n", mData.nodeCount[0], mMaxPointsPerLeaf); + if (mMaxPointsPerLeaf > std::numeric_limits::max()) { + throw std::runtime_error("Too many points per leaf: "+std::to_string(mMaxPointsPerLeaf)); + } + mMemPool.free(d_maxPointsPerLeaf); + } + + mData.pointsPerLeafPrefix = mMemPool.template alloc(mData.nodeCount[0], mStream); + CALL_CUBS(DeviceScan::ExclusiveSum, mData.pointsPerLeaf, mData.pointsPerLeafPrefix, mData.nodeCount[0]); + + mData.d_leaf_keys = mMemPool.template alloc(mData.nodeCount[0], mStream); + cudaCheck(cudaMemcpyAsync(mData.d_leaf_keys, d_keys, mData.nodeCount[0]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); + + CALL_CUBS(DeviceSelect::Unique, ShiftRightIterator<12>(mData.d_leaf_keys), d_keys, d_node_count+1, mData.nodeCount[0]);// count lower nodes + cudaCheck(cudaMemcpyAsync(mData.nodeCount+1, d_node_count+1, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mData.d_lower_keys = mMemPool.template alloc(mData.nodeCount[1], mStream); + cudaCheck(cudaMemcpyAsync(mData.d_lower_keys, d_keys, mData.nodeCount[1]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); + + mMemPool.free(d_keys, d_node_count); + if (mVerbose==2) mTimer.stop(); + + //printf("Leaf count = %u, lower count = %u, upper count = %u\n", mData.nodeCount[0], mData.nodeCount[1], mData.nodeCount[2]); +}// PointsToGrid::countNodes + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline BufferT PointsToGrid::getBuffer(const PtrT, size_t pointCount, const BufferT &pool) +{ + auto sizeofPoint = [&]()->size_t{ + switch (mPointType){ + case PointType::PointID: return sizeof(uint32_t); + case PointType::World64: return sizeof(Vec3d); + case PointType::World32: return sizeof(Vec3f); + case PointType::Grid64: return sizeof(Vec3d); + case PointType::Grid32: return sizeof(Vec3f); + case PointType::Voxel32: return sizeof(Vec3f); + case PointType::Voxel16: return sizeof(Vec3u16); + case PointType::Voxel8: return sizeof(Vec3u8); + case PointType::Default: return pointer_traits::element_size; + default: return size_t(0);// PointType::Disable + } + }; + + mData.grid = 0;// grid is always stored at the start of the buffer! 
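+ // the remaining byte offsets define the breadth-first layout of the destination buffer:
+ // Grid | Tree | Root | upper nodes | lower nodes | leaf nodes | blind meta data | blind (point) data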
+ mData.tree = NanoGrid::memUsage(); // grid ends and tree begins + mData.root = mData.tree + NanoTree::memUsage(); // tree ends and root node begins + mData.upper = mData.root + NanoRoot::memUsage(mData.nodeCount[2]); // root node ends and upper internal nodes begin + mData.lower = mData.upper + NanoUpper::memUsage()*mData.nodeCount[2]; // upper internal nodes ends and lower internal nodes begin + mData.leaf = mData.lower + NanoLower::memUsage()*mData.nodeCount[1]; // lower internal nodes ends and leaf nodes begin + mData.meta = mData.leaf + NanoLeaf::DataType::memUsage()*mData.nodeCount[0];// leaf nodes end and blind meta data begins + mData.blind = mData.meta + sizeof(GridBlindMetaData)*int( mPointType!=PointType::Disable ); // meta data ends and blind data begins + mData.size = mData.blind + pointCount*sizeofPoint();// end of buffer + + auto buffer = BufferT::create(mData.size, &pool, false);// only allocate buffer on the device + mData.d_bufferPtr = buffer.deviceData(); + if (mData.d_bufferPtr == nullptr) throw std::runtime_error("Failed to allocate grid buffer on the device"); + cudaCheck(cudaMemcpyAsync(mDeviceData, &mData, sizeof(Data), cudaMemcpyHostToDevice, mStream));// copy Data CPU -> GPU + return buffer; +}// PointsToGrid::getBuffer + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline void PointsToGrid::processGridTreeRoot(const PtrT points, size_t pointCount) +{ + using Vec3T = typename util::remove_const::element_type>::type; + util::cuda::lambdaKernel<<<1, 1, 0, mStream>>>(1, [=] __device__(size_t, Data *d_data, PointType pointType) { + // process Root + auto &root = d_data->getRoot(); + root.mBBox = CoordBBox(); // init to empty + root.mTableSize = d_data->nodeCount[2]; + root.mBackground = NanoRoot::ValueType(0);// background_value + root.mMinimum = root.mMaximum = NanoRoot::ValueType(0); + root.mAverage = root.mStdDevi = NanoRoot::FloatType(0); + + // process Tree + auto &tree = d_data->getTree(); + tree.setRoot(&root); + tree.setFirstNode(&d_data->getUpper(0)); + tree.setFirstNode(&d_data->getLower(0)); + tree.setFirstNode(&d_data->getLeaf(0)); + tree.mNodeCount[2] = tree.mTileCount[2] = d_data->nodeCount[2]; + tree.mNodeCount[1] = tree.mTileCount[1] = d_data->nodeCount[1]; + tree.mNodeCount[0] = tree.mTileCount[0] = d_data->nodeCount[0]; + tree.mVoxelCount = d_data->voxelCount; + + // process Grid + auto &grid = d_data->getGrid(); + grid.init({GridFlags::HasBBox, GridFlags::IsBreadthFirst}, d_data->size, d_data->map, toGridType()); + grid.mChecksum = ~uint64_t(0);// set all bits on which means it's disabled + grid.mBlindMetadataCount = util::is_same::value;// ? 
1u : 0u; + grid.mBlindMetadataOffset = d_data->meta; + if (pointType != PointType::Disable) { + const auto lastLeaf = tree.mNodeCount[0] - 1; + grid.mData1 = d_data->pointsPerLeafPrefix[lastLeaf] + d_data->pointsPerLeaf[lastLeaf]; + auto &meta = d_data->getMeta(); + meta.mDataOffset = sizeof(GridBlindMetaData);// blind data is placed right after this meta data + meta.mValueCount = pointCount; + // Blind meta data + switch (pointType){ + case PointType::PointID: + grid.mGridClass = GridClass::PointIndex; + meta.mSemantic = GridBlindDataSemantic::PointId; + meta.mDataClass = GridBlindDataClass::IndexArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(uint32_t); + util::strcpy(meta.mName, "PointID: uint32_t indices to points"); + break; + case PointType::World64: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::WorldCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3d); + util::strcpy(meta.mName, "World64: Vec3 point coordinates in world space"); + break; + case PointType::World32: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::WorldCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3f); + util::strcpy(meta.mName, "World32: Vec3 point coordinates in world space"); + break; + case PointType::Grid64: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::GridCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3d); + util::strcpy(meta.mName, "Grid64: Vec3 point coordinates in grid space"); + break; + case PointType::Grid32: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::GridCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3f); + util::strcpy(meta.mName, "Grid32: Vec3 point coordinates in grid space"); + break; + case PointType::Voxel32: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::VoxelCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3f); + util::strcpy(meta.mName, "Voxel32: Vec3 point coordinates in voxel space"); + break; + case PointType::Voxel16: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::VoxelCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3u16); + util::strcpy(meta.mName, "Voxel16: Vec3 point coordinates in voxel space"); + break; + case PointType::Voxel8: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::VoxelCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3u8); + util::strcpy(meta.mName, "Voxel8: Vec3 point coordinates in voxel space"); + break; + case PointType::Default: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::WorldCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3T); + if constexpr(util::is_same::value) { + util::strcpy(meta.mName, "World32: Vec3 point coordinates in world space"); + } else if constexpr(util::is_same::value){ + util::strcpy(meta.mName, "World64: Vec3 point 
coordinates in world space"); + } else { + printf("Error in PointsToGrid::processGridTreeRoot: expected Vec3T = Vec3f or Vec3d\n"); + } + break; + default: + printf("Error in PointsToGrid::processGridTreeRoot: invalid pointType\n"); + } + } else if constexpr(BuildTraits::is_offindex) { + grid.mData1 = 1u + 512u*d_data->nodeCount[0]; + grid.mGridClass = GridClass::IndexGrid; + } + }, mDeviceData, mPointType);// lambdaKernel + cudaCheckError(); + + char *dst = mData.getGrid().mGridName; + if (const char *src = mGridName.data()) { + cudaCheck(cudaMemcpyAsync(dst, src, GridData::MaxNameSize, cudaMemcpyHostToDevice, mStream)); + } else { + cudaCheck(cudaMemsetAsync(dst, 0, GridData::MaxNameSize, mStream)); + } +}// PointsToGrid::processGridTreeRoot + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +inline void PointsToGrid::processUpperNodes() +{ + util::cuda::lambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { + auto &root = d_data->getRoot(); + auto &upper = d_data->getUpper(tid); +#if 1 + auto keyToCoord = [](uint64_t key)->nanovdb::Coord{ + static constexpr int64_t offset = 1 << 31;// max values of uint32_t is 2^31 - 1 + static constexpr uint64_t MASK = (1u << 21) - 1; // used to mask out 21 lower bits + return nanovdb::Coord(int(int64_t(((key >> 42) & MASK) << 12) - offset), // x are the upper 21 bits + int(int64_t(((key >> 21) & MASK) << 12) - offset), // y are the middle 21 bits + int(int64_t(( key & MASK) << 12) - offset)); // z are the lower 21 bits + }; + const Coord ijk = keyToCoord(d_data->d_tile_keys[tid]); +#else + const Coord ijk = NanoRoot::KeyToCoord(d_data->d_tile_keys[tid]); +#endif + root.tile(tid)->setChild(ijk, &upper, &root); + upper.mBBox[0] = ijk; + upper.mFlags = 0; + upper.mValueMask.setOff(); + upper.mChildMask.setOff(); + upper.mMinimum = upper.mMaximum = NanoLower::ValueType(0); + upper.mAverage = upper.mStdDevi = NanoLower::FloatType(0); + }, mDeviceData); + cudaCheckError(); + + mMemPool.free(mData.d_tile_keys); + + const uint64_t valueCount = mData.nodeCount[2] << 15; + util::cuda::lambdaKernel<<>>(valueCount, [=] __device__(size_t tid, Data *d_data) { + auto &upper = d_data->getUpper(tid >> 15); + upper.mTable[tid & 32767u].value = NanoUpper::ValueType(0);// background + }, mDeviceData); + cudaCheckError(); +}// PointsToGrid::processUpperNodes + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +inline void PointsToGrid::processLowerNodes() +{ + util::cuda::lambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { + auto &root = d_data->getRoot(); + const uint64_t lowerKey = d_data->d_lower_keys[tid]; + auto &upper = d_data->getUpper(lowerKey >> 15); + const uint32_t upperOffset = lowerKey & 32767u;// (1 << 15) - 1 = 32767 + upper.mChildMask.setOnAtomic(upperOffset); + auto &lower = d_data->getLower(tid); + upper.setChild(upperOffset, &lower); + lower.mBBox[0] = upper.offsetToGlobalCoord(upperOffset); + lower.mFlags = 0; + lower.mValueMask.setOff(); + lower.mChildMask.setOff(); + lower.mMinimum = lower.mMaximum = NanoLower::ValueType(0);// background; + lower.mAverage = lower.mStdDevi = NanoLower::FloatType(0); + }, mDeviceData); + cudaCheckError(); + + const uint64_t valueCount = mData.nodeCount[1] << 12; + 
util::cuda::lambdaKernel<<>>(valueCount, [=] __device__(size_t tid, Data *d_data) { + auto &lower = d_data->getLower(tid >> 12); + lower.mTable[tid & 4095u].value = NanoLower::ValueType(0);// background + }, mDeviceData); + cudaCheckError(); +}// PointsToGrid::processLowerNodes + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline void PointsToGrid::processLeafNodes(const PtrT points) +{ + const uint8_t flags = static_cast(mData.flags.data());// mIncludeStats ? 16u : 0u;// 4th bit indicates stats + + if (mVerbose==2) mTimer.start("process leaf meta data"); + // loop over leaf nodes and add it to its parent node + util::cuda::lambdaKernel<<>>(mData.nodeCount[0], [=] __device__(size_t tid, Data *d_data) { + const uint64_t leafKey = d_data->d_leaf_keys[tid], tile_id = leafKey >> 27; + auto &upper = d_data->getUpper(tile_id); + const uint32_t lowerOffset = leafKey & 4095u, upperOffset = (leafKey >> 12) & 32767u; + auto &lower = *upper.getChild(upperOffset); + lower.mChildMask.setOnAtomic(lowerOffset); + auto &leaf = d_data->getLeaf(tid); + lower.setChild(lowerOffset, &leaf); + leaf.mBBoxMin = lower.offsetToGlobalCoord(lowerOffset); + leaf.mFlags = flags; + auto &valueMask = leaf.mValueMask; + valueMask.setOff();// initiate all bits to off + + if constexpr(util::is_same::value) { + leaf.mOffset = d_data->pointsPerLeafPrefix[tid]; + leaf.mPointCount = d_data->pointsPerLeaf[tid]; + } else if constexpr(BuildTraits::is_offindex) { + leaf.mOffset = tid*512u + 1u;// background is index 0 + leaf.mPrefixSum = 0u; + } else if constexpr(!BuildTraits::is_special) { + leaf.mAverage = leaf.mStdDevi = NanoLeaf::FloatType(0); + leaf.mMinimum = leaf.mMaximum = NanoLeaf::ValueType(0); + } + }, mDeviceData); cudaCheckError(); + + if (mVerbose==2) mTimer.restart("set active voxel state and values"); + // loop over all active voxels and set LeafNode::mValueMask and LeafNode::mValues + util::cuda::lambdaKernel<<>>(mData.voxelCount, [=] __device__(size_t tid, Data *d_data) { + const uint32_t pointID = d_data->pointsPerVoxelPrefix[tid]; + const uint64_t voxelKey = d_data->d_keys[pointID]; + auto &upper = d_data->getUpper(voxelKey >> 36); + auto &lower = *upper.getChild((voxelKey >> 21) & 32767u); + auto &leaf = *lower.getChild((voxelKey >> 9) & 4095u); + const uint32_t n = voxelKey & 511u; + leaf.mValueMask.setOnAtomic(n);// <--- slow! + if constexpr(util::is_same::value) { + leaf.mValues[n] = uint16_t(pointID + d_data->pointsPerVoxel[tid] - leaf.offset()); + } else if constexpr(!BuildTraits::is_special) { + leaf.mValues[n] = NanoLeaf::ValueType(1);// set value of active voxels that are not points (or index) + } + }, mDeviceData); cudaCheckError(); + + mMemPool.free(mData.d_keys, mData.pointsPerVoxel, mData.pointsPerVoxelPrefix, mData.pointsPerLeafPrefix, mData.pointsPerLeaf); + + if (mVerbose==2) mTimer.restart("set inactive voxel values"); + const uint64_t denseVoxelCount = mData.nodeCount[0] << 9; + util::cuda::lambdaKernel<<>>(denseVoxelCount, [=] __device__(size_t tid, Data *d_data) { + auto &leaf = d_data->getLeaf(tid >> 9u); + const uint32_t n = tid & 511u; + if (leaf.mValueMask.isOn(n)) return; + if constexpr(util::is_same::value) { + const uint32_t m = leaf.mValueMask.findPrev(n - 1); + leaf.mValues[n] = m < 512u ? 
leaf.mValues[m] : 0u; + } else if constexpr(!BuildTraits::is_special) { + leaf.mValues[n] = NanoLeaf::ValueType(0);// value of inactive voxels + } + }, mDeviceData); cudaCheckError(); + + if constexpr(BuildTraits::is_onindex) { + if (mVerbose==2) mTimer.restart("prefix-sum for index grid"); + uint64_t *devValueIndex = mMemPool.template alloc(mData.nodeCount[0], mStream); + auto devValueIndexPrefix = mMemPool.template alloc(mData.nodeCount[0], mStream); + kernels::fillValueIndexKernel<<>>(mData.nodeCount[0], devValueIndex, mDeviceData); + cudaCheckError(); + CALL_CUBS(DeviceScan::InclusiveSum, devValueIndex, devValueIndexPrefix, mData.nodeCount[0]); + mMemPool.free(devValueIndex); + kernels::leafPrefixSumKernel<<>>(mData.nodeCount[0], devValueIndexPrefix, mDeviceData); + cudaCheckError(); + mMemPool.free(devValueIndexPrefix); + } + + if constexpr(BuildTraits::is_indexmask) { + if (mVerbose==2) mTimer.restart("leaf.mMask = leaf.mValueMask"); + kernels::setMaskEqValMaskKernel<<>>(mData.nodeCount[0], mDeviceData); + cudaCheckError(); + } + if (mVerbose==2) mTimer.stop(); +}// PointsToGrid::processLeafNodes + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline void PointsToGrid::processPoints(const PtrT, size_t) +{ + mMemPool.free(mData.d_indx); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +// Template specialization with BuildT = Point +template <> +template +inline void PointsToGrid::processPoints(const PtrT points, size_t pointCount) +{ + switch (mPointType){ + case PointType::Disable: + throw std::runtime_error("PointsToGrid::processPoints: mPointType == PointType::Disable\n"); + case PointType::PointID: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = d_data->d_indx[tid]; + }, mDeviceData); cudaCheckError(); + break; + case PointType::World64: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = points[d_data->d_indx[tid]]; + }, mDeviceData); cudaCheckError(); + break; + case PointType::World32: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = points[d_data->d_indx[tid]]; + }, mDeviceData); cudaCheckError(); + break; + case PointType::Grid64: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = d_data->map.applyInverseMap(points[d_data->d_indx[tid]]); + }, mDeviceData); cudaCheckError(); + break; + case PointType::Grid32: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = d_data->map.applyInverseMapF(points[d_data->d_indx[tid]]); + }, mDeviceData); cudaCheckError(); + break; + case PointType::Voxel32: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + worldToVoxel(d_data->template getPoint(tid), points[d_data->d_indx[tid]], d_data->map); + }, mDeviceData); cudaCheckError(); + break; + case PointType::Voxel16: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + worldToVoxel(d_data->template getPoint(tid), points[d_data->d_indx[tid]], d_data->map); + }, mDeviceData); 
cudaCheckError(); + break; + case PointType::Voxel8: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + worldToVoxel(d_data->template getPoint(tid), points[d_data->d_indx[tid]], d_data->map); + }, mDeviceData); cudaCheckError(); + break; + case PointType::Default: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint::element_type>(tid) = points[d_data->d_indx[tid]]; + }, mDeviceData); cudaCheckError(); + break; + default: + printf("Internal error in PointsToGrid::processPoints\n"); + } + mMemPool.free(mData.d_indx); +}// PointsToGrid::processPoints + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +inline void PointsToGrid::processBBox() +{ + if (mData.flags.isMaskOff(GridFlags::HasBBox)) { + mMemPool.free(mData.d_leaf_keys, mData.d_lower_keys); + return; + } + + // reset bbox in lower nodes + util::cuda::lambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { + d_data->getLower(tid).mBBox = CoordBBox(); + }, mDeviceData); + cudaCheckError(); + + // update and propagate bbox from leaf -> lower/parent nodes + util::cuda::lambdaKernel<<>>(mData.nodeCount[0], [=] __device__(size_t tid, Data *d_data) { + const uint64_t leafKey = d_data->d_leaf_keys[tid]; + auto &upper = d_data->getUpper(leafKey >> 27); + auto &lower = *upper.getChild((leafKey >> 12) & 32767u); + auto &leaf = d_data->getLeaf(tid); + leaf.updateBBox(); + lower.mBBox.expandAtomic(leaf.bbox()); + }, mDeviceData); + mMemPool.free(mData.d_leaf_keys); + cudaCheckError(); + + // reset bbox in upper nodes + util::cuda::lambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { + d_data->getUpper(tid).mBBox = CoordBBox(); + }, mDeviceData); + cudaCheckError(); + + // propagate bbox from lower -> upper/parent node + util::cuda::lambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { + const uint64_t lowerKey = d_data->d_lower_keys[tid]; + auto &upper = d_data->getUpper(lowerKey >> 15); + auto &lower = d_data->getLower(tid); + upper.mBBox.expandAtomic(lower.bbox()); + }, mDeviceData); + mMemPool.free(mData.d_lower_keys); + cudaCheckError() + + // propagate bbox from upper -> root/parent node + util::cuda::lambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { + d_data->getRoot().mBBox.expandAtomic(d_data->getUpper(tid).bbox()); + }, mDeviceData); + cudaCheckError(); + + // update the world-bbox in the root node + util::cuda::lambdaKernel<<<1, 1, 0, mStream>>>(1, [=] __device__(size_t, Data *d_data) { + d_data->getGrid().mWorldBBox = d_data->getRoot().mBBox.transform(d_data->map); + }, mDeviceData); + cudaCheckError(); +}// PointsToGrid::processBBox + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +GridHandle// Grid +voxelsToGrid(const PtrT d_ijk, size_t voxelCount, double voxelSize, const BufferT &buffer, cudaStream_t stream) +{ + PointsToGrid converter(voxelSize, Vec3d(0.0), stream); + return converter.getHandle(d_ijk, voxelCount, buffer); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +GridHandle// Grid 
with PointType coordinates as blind data +pointsToGrid(const PtrT d_xyz, int pointCount, int maxPointsPerVoxel, int tolerance, int maxIterations, PointType type, const BufferT &buffer, cudaStream_t stream) +{ + PointsToGrid converter(maxPointsPerVoxel, tolerance, maxIterations, Vec3d(0.0), stream); + converter.setPointType(type); + return converter.getHandle(d_xyz, pointCount, buffer); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +GridHandle +pointsToGrid(std::vector> vec, const BufferT &buffer, cudaStream_t stream) +{ + std::vector> handles; + for (auto &p : vec) handles.push_back(pointsToGrid(std::get<0>(p), std::get<1>(p), std::get<2>(p), std::get<3>(p), buffer, stream)); + return mergeDeviceGrids(handles, stream); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +GridHandle +voxelsToGrid(std::vector> vec, const BufferT &buffer, cudaStream_t stream) +{ + std::vector> handles; + for (auto &p : vec) handles.push_back(voxelsToGrid(std::get<0>(p), std::get<1>(p), std::get<2>(p), buffer, stream)); + return mergeDeviceGrids(handles, stream); +} + +}// namespace tools::cuda ====================================================================================================================================== + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +[[deprecated("Use cuda::pointsToGrid instead")]] +GridHandle +cudaPointsToGrid(const PtrT dWorldPoints, + int pointCount, + double voxelSize = 1.0, + PointType type = PointType::Default, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0) +{ + return tools::cuda::pointsToGrid(dWorldPoints, pointCount, voxelSize, type, buffer, stream); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +[[deprecated("Use cuda::pointsToGrid instead")]] +GridHandle +cudaPointsToGrid(std::vector> pointSet, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0) +{ + return tools::cuda::pointsToGrid(pointSet, buffer, stream); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +[[deprecated("Use cuda::voxelsToGrid instead")]] +GridHandle +cudaVoxelsToGrid(const PtrT dGridVoxels, + size_t voxelCount, + double voxelSize = 1.0, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0) +{ + return tools::cuda::voxelsToGrid(dGridVoxels, voxelCount, voxelSize, buffer, stream); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +[[deprecated("Use cuda::voxelsToGrid instead")]] +GridHandle +cudaVoxelsToGrid(std::vector> pointSet, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0) +{ + return tools::cuda::voxelsToGrid(pointSet, buffer, stream); +} + +}// namespace nanovdb + +#endif // NVIDIA_TOOLS_CUDA_POINTSTOGRID_CUH_HAS_BEEN_INCLUDED 
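Note: a minimal usage sketch of the device-side factory functions defined above, combined with the signed flood-fill tool added in the next file. Illustrative only: the explicit <float> build-type argument, the nanovdb::cuda::DeviceBuffer buffer type (and its header path), and GridHandle::deviceGrid<float>() are assumptions on my part, since the template parameter lists are not fully legible in this patch.

    #include <nanovdb/NanoVDB.h>
    #include <nanovdb/cuda/DeviceBuffer.h>               // assumed default device buffer type
    #include <nanovdb/tools/cuda/PointsToGrid.cuh>
    #include <nanovdb/tools/cuda/SignedFloodFill.cuh>

    // Build a float grid from a device array of active-voxel coordinates,
    // then run the signed flood fill over its inactive values.
    // d_ijk must already reside in device memory.
    void buildAndFloodFill(const nanovdb::Coord* d_ijk, size_t voxelCount, cudaStream_t stream)
    {
        using BufferT = nanovdb::cuda::DeviceBuffer;
        auto handle = nanovdb::tools::cuda::voxelsToGrid<float>(
            d_ijk, voxelCount, /*voxelSize=*/1.0, BufferT(), stream);
        nanovdb::NanoGrid<float>* d_grid = handle.deviceGrid<float>(); // grid lives on the device
        nanovdb::tools::cuda::signedFloodFill(d_grid, /*verbose=*/false, stream);
    }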
diff --git a/external/nanovdb/tools/cuda/SignedFloodFill.cuh b/external/nanovdb/tools/cuda/SignedFloodFill.cuh new file mode 100644 index 00000000..f214247a --- /dev/null +++ b/external/nanovdb/tools/cuda/SignedFloodFill.cuh @@ -0,0 +1,213 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/tools/cuda/SignedFloodFill.cuh + + \author Ken Museth + + \date May 3, 2023 + + \brief Performs signed flood-fill operation on the hierarchical tree structure on the device + + \todo This tools needs to handle the (extremely) rare case when root node + needs to be modified during the signed flood fill operation. This happens + when the root-table needs to be expanded with tile values (of size 4096^3) + that are completely inside the implicit surface. + + \warning The header file contains cuda device code so be sure + to only include it in .cu files (or other .cuh files) +*/ + +#ifndef NANOVDB_TOOLS_CUDA_SIGNEDFLOODFILL_CUH_HAS_BEEN_INCLUDED +#define NANOVDB_TOOLS_CUDA_SIGNEDFLOODFILL_CUH_HAS_BEEN_INCLUDED + +#include +#include +#include +#include +#include + +namespace nanovdb { + +namespace tools::cuda { + +/// @brief Performs signed flood-fill operation on the hierarchical tree structure on the device +/// @tparam BuildT Build type of the grid to be flood-filled +/// @param d_grid Non-const device pointer to the grid that will be flood-filled +/// @param verbose If true timing information will be printed to the terminal +/// @param stream optional cuda stream +template +typename util::enable_if::is_float, void>::type +signedFloodFill(NanoGrid *d_grid, bool verbose = false, cudaStream_t stream = 0); + +namespace {// anonymous namespace + +template +class SignedFloodFill +{ +public: + SignedFloodFill(bool verbose = false, cudaStream_t stream = 0) + : mStream(stream), mVerbose(verbose) {} + + /// @brief Toggle on and off verbose mode + /// @param on if true verbose is turned on + void setVerbose(bool on = true) {mVerbose = on;} + + void operator()(NanoGrid *d_grid); + +private: + cudaStream_t mStream{0}; + util::cuda::Timer mTimer; + bool mVerbose{false}; + +};// SignedFloodFill + +//================================================================================================ + +template +__global__ void processRootKernel(NanoTree *tree) +{ + // auto &root = tree->root(); + /* + using ChildT = typename RootT::ChildNodeType; + // Insert the child nodes into a map sorted according to their origin + std::map nodeKeys; + typename RootT::ChildOnIter it = root.beginChildOn(); + for (; it; ++it) nodeKeys.insert(std::pair(it.getCoord(), &(*it))); + static const Index DIM = RootT::ChildNodeType::DIM; + + // We employ a simple z-scanline algorithm that inserts inactive tiles with + // the inside value if they are sandwiched between inside child nodes only! 
+ typename std::map::const_iterator b = nodeKeys.begin(), e = nodeKeys.end(); + if ( b == e ) return; + for (typename std::map::const_iterator a = b++; b != e; ++a, ++b) { + Coord d = b->first - a->first; // delta of neighboring coordinates + if (d[0]!=0 || d[1]!=0 || d[2]==Int32(DIM)) continue;// not same z-scanline or neighbors + const ValueT fill[] = { a->second->getLastValue(), b->second->getFirstValue() }; + if (!(fill[0] < 0) || !(fill[1] < 0)) continue; // scanline isn't inside + Coord c = a->first + Coord(0u, 0u, DIM); + for (; c[2] != b->first[2]; c[2] += DIM) root.addTile(c, mInside, false); + } + */ + //root.setBackground(mOutside, /*updateChildNodes=*/false); +}// processRootKernel + +//================================================================================================ + +template +__global__ void processNodeKernel(NanoTree *tree, size_t count) +{ + using NodeT = typename NanoNode::type; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= count) return; + const uint32_t nValue = tid & (NodeT::SIZE - 1u); + auto &node = *(tree->template getFirstNode() + (tid >> (3*NodeT::LOG2DIM))); + const auto &mask = node.childMask(); + if (mask.isOn(nValue)) return;// ignore if child + auto value = tree->background();// initiate to outside value + auto n = mask.template findNext(nValue); + if (n < NodeT::SIZE) { + if (node.getChild(n)->getFirstValue() < 0) value = -value; + } else if ((n = mask.template findPrev(nValue)) < NodeT::SIZE) { + if (node.getChild(n)->getLastValue() < 0) value = -value; + } else if (node.getValue(0)<0) { + value = -value; + } + node.setValue(nValue, value); +}// processNodeKernel + +//================================================================================================ + +template +__global__ void processLeafKernel(NanoTree *tree, size_t count) +{ + using LeafT = NanoLeaf; + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= count) return; + const uint32_t nVoxel = tid & (LeafT::SIZE - 1u); + auto *leaf = tree->getFirstLeaf() + (tid >> (3*LeafT::LOG2DIM)); + const auto &mask = leaf->valueMask(); + if (mask.isOn(nVoxel)) return; + auto *buffer = leaf->mValues; + auto n = mask.template findNext(nVoxel); + if (n == LeafT::SIZE && (n = mask.template findPrev(nVoxel)) == LeafT::SIZE) n = 0u; + buffer[nVoxel] = buffer[n]<0 ? 
-tree->background() : tree->background(); +}// processLeafKernel + +//================================================================================================ + +template +__global__ void cpyNodeCountKernel(NanoGrid *d_grid, uint64_t *d_count) +{ + NANOVDB_ASSERT(d_grid->isSequential()); + for (int i=0; i<3; ++i) *d_count++ = d_grid->tree().nodeCount(i); + *d_count = d_grid->tree().root().tileCount(); +} + +}// anonymous namespace + +//================================================================================================ + +template +void SignedFloodFill::operator()(NanoGrid *d_grid) +{ + static_assert(BuildTraits::is_float, "cuda::SignedFloodFill only works on float grids"); + NANOVDB_ASSERT(d_grid); + uint64_t count[4], *d_count = nullptr; + cudaCheck(util::cuda::mallocAsync((void**)&d_count, 4*sizeof(uint64_t), mStream)); + cpyNodeCountKernel<<<1, 1, 0, mStream>>>(d_grid, d_count); + cudaCheckError(); + cudaCheck(cudaMemcpyAsync(&count, d_count, 4*sizeof(uint64_t), cudaMemcpyDeviceToHost, mStream)); + cudaCheck(util::cuda::freeAsync(d_count, mStream)); + + static const int threadsPerBlock = 128; + auto blocksPerGrid = [&](size_t count)->uint32_t{return (count + (threadsPerBlock - 1)) / threadsPerBlock;}; + auto *tree = reinterpret_cast*>(d_grid + 1); + + if (mVerbose) mTimer.start("\nProcess leaf nodes"); + processLeafKernel<<>>(tree, count[0]<<9); + cudaCheckError(); + + if (mVerbose) mTimer.restart("Process lower internal nodes"); + processNodeKernel<<>>(tree, count[1]<<12); + cudaCheckError(); + + if (mVerbose) mTimer.restart("Process upper internal nodes"); + processNodeKernel<<>>(tree, count[2]<<15); + cudaCheckError(); + + //if (mVerbose) mTimer.restart("Process root node"); + //processRootKernel<<<1, 1, 0, mStream>>>(tree); + if (mVerbose) mTimer.stop(); + cudaCheckError(); +}// SignedFloodFill::operator() + +//================================================================================================ + +template +typename util::enable_if::is_float, void>::type +signedFloodFill(NanoGrid *d_grid, bool verbose, cudaStream_t stream) +{ + SignedFloodFill sff(verbose, stream); + sff(d_grid); + auto *d_gridData = d_grid->data(); + Checksum cs = getChecksum(d_gridData, stream); + if (cs.isFull()) {// CheckMode::Partial checksum is unaffected + updateChecksum(d_gridData, CheckMode::Full, stream); + } +} + +}// namespace tools::cuda + +template +[[deprecated("Use nanovdb::tools::cuda::signedFloodFill instead.")]] +typename util::enable_if::is_float, void>::type +cudaSignedFloodFill(NanoGrid *d_grid, bool verbose = false, cudaStream_t stream = 0) +{ + return tools::cuda::signedFloodFill(d_grid, verbose, stream); +} + +}// namespace nanovdb + +#endif // NANOVDB_TOOLS_CUDA_SIGNEDFLOODFILL_CUH_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/util/CpuTimer.h b/external/nanovdb/util/CpuTimer.h new file mode 100644 index 00000000..4c22f01d --- /dev/null +++ b/external/nanovdb/util/CpuTimer.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/util/Timer.h instead.") diff --git a/external/nanovdb/util/CreateNanoGrid.h b/external/nanovdb/util/CreateNanoGrid.h new file mode 100644 index 00000000..60fa3fd5 --- /dev/null +++ b/external/nanovdb/util/CreateNanoGrid.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include 
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/CreateNanoGrid.h instead.") diff --git a/external/nanovdb/util/DitherLUT.h b/external/nanovdb/util/DitherLUT.h new file mode 100644 index 00000000..4d6ff166 --- /dev/null +++ b/external/nanovdb/util/DitherLUT.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/math/DitherLUT.h instead.") diff --git a/external/nanovdb/util/ForEach.h b/external/nanovdb/util/ForEach.h new file mode 100644 index 00000000..d71769c5 --- /dev/null +++ b/external/nanovdb/util/ForEach.h @@ -0,0 +1,116 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/util/ForEach.h + + \author Ken Museth + + \date August 24, 2020 + + \brief A unified wrapper for tbb::parallel_for and a naive std::thread fallback +*/ + +#ifndef NANOVDB_UTIL_FOREACH_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_FOREACH_H_HAS_BEEN_INCLUDED + +#include // for Range1D + +#ifdef NANOVDB_USE_TBB +#include +#else +#include +#include +#include +#endif + +namespace nanovdb { + +namespace util { + +/// @brief simple wrapper for tbb::parallel_for with a naive std fallback +/// +/// @param range Range, CoordBBox, tbb::blocked_range, blocked_range2D, or blocked_range3D. +/// @param func functor with the signature [](const RangeT&){...}, +/// +/// @code +/// std::vector array(100); +/// auto func = [&array](auto &r){for (auto i=r.begin(); i!=r.end(); ++i) array[i]=i;}; +/// forEach(array, func); +/// @endcode +template +inline void forEach(RangeT range, const FuncT &func) +{ + if (range.empty()) return; +#ifdef NANOVDB_USE_TBB + tbb::parallel_for(range, func); +#else// naive and likely slow alternative based on std::thread + if (const size_t threadCount = std::thread::hardware_concurrency()>>1) { + std::vector rangePool{ range }; + while(rangePool.size() < threadCount) { + const size_t oldSize = rangePool.size(); + for (size_t i = 0; i < oldSize && rangePool.size() < threadCount; ++i) { + auto &r = rangePool[i]; + if (r.is_divisible()) rangePool.push_back(RangeT(r, Split())); + } + if (rangePool.size() == oldSize) break;// none of the ranges were divided so stop + } + std::vector threadPool; + for (auto &r : rangePool) threadPool.emplace_back(func, r);// launch threads + for (auto &t : threadPool) t.join();// synchronize threads + } else {//serial + func(range); + } +#endif +} + +/// @brief Simple wrapper for the function defined above +template +inline void forEach(size_t begin, size_t end, size_t grainSize, const FuncT& func) +{ + forEach(Range1D(begin, end, grainSize), func); +} + +/// @brief Simple wrapper for the function defined above, which works with std::containers +template class ContainerT, typename... T, typename FuncT> +inline void forEach(const ContainerT &c, const FuncT& func) +{ + forEach(Range1D(0, c.size(), 1), func); +} + +/// @brief Simple wrapper for the function defined above, which works with std::containers +template class ContainerT, typename... 
T, typename FuncT> +inline void forEach(const ContainerT &c, size_t grainSize, const FuncT& func) +{ + forEach(Range1D(0, c.size(), grainSize), func); +} + +}// namespace util + +/// @brief Simple wrapper for the function defined above +template +[[deprecated("Use nanovdb::util::forEach instead")]] +inline void forEach(size_t begin, size_t end, size_t grainSize, const FuncT& func) +{ + util::forEach(util::Range1D(begin, end, grainSize), func); +} + +/// @brief Simple wrapper for the function defined above, which works with std::containers +template class ContainerT, typename... T, typename FuncT> +[[deprecated("Use nanovdb::util::forEach instead")]] +inline void forEach(const ContainerT &c, const FuncT& func) +{ + util::forEach(util::Range1D(0, c.size(), 1), func); +} + +/// @brief Simple wrapper for the function defined above, which works with std::containers +template class ContainerT, typename... T, typename FuncT> +[[deprecated("Use nanovdb::util::forEach instead")]] +inline void forEach(const ContainerT &c, size_t grainSize, const FuncT& func) +{ + util::forEach(util::Range1D(0, c.size(), grainSize), func); +} + +}// namespace nanovdb + +#endif // NANOVDB_UTIL_FOREACH_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/util/GridBuilder.h b/external/nanovdb/util/GridBuilder.h new file mode 100644 index 00000000..681da5ff --- /dev/null +++ b/external/nanovdb/util/GridBuilder.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/GridBuilder.h instead.") diff --git a/external/nanovdb/util/GridChecksum.h b/external/nanovdb/util/GridChecksum.h new file mode 100644 index 00000000..1c70c7b3 --- /dev/null +++ b/external/nanovdb/util/GridChecksum.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/GridChecksum.h instead.") diff --git a/external/nanovdb/util/GridStats.h b/external/nanovdb/util/GridStats.h new file mode 100644 index 00000000..61de3b0d --- /dev/null +++ b/external/nanovdb/util/GridStats.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/GridStats.h instead.") diff --git a/external/nanovdb/util/GridValidator.h b/external/nanovdb/util/GridValidator.h new file mode 100644 index 00000000..8dc1465c --- /dev/null +++ b/external/nanovdb/util/GridValidator.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/GridValidator.h instead.") diff --git a/external/nanovdb/util/HDDA.h b/external/nanovdb/util/HDDA.h new file mode 100644 index 00000000..9944833b --- /dev/null +++ b/external/nanovdb/util/HDDA.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/math/HDDA.h instead.") diff --git a/external/nanovdb/util/HostBuffer.h b/external/nanovdb/util/HostBuffer.h new file mode 100644 index 00000000..a893d494 --- /dev/null +++ b/external/nanovdb/util/HostBuffer.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the 
OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/HostBuffer.h instead.") diff --git a/external/nanovdb/util/IO.h b/external/nanovdb/util/IO.h new file mode 100644 index 00000000..385d4251 --- /dev/null +++ b/external/nanovdb/util/IO.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/io/IO.h instead.") diff --git a/external/nanovdb/util/Invoke.h b/external/nanovdb/util/Invoke.h new file mode 100644 index 00000000..677e033c --- /dev/null +++ b/external/nanovdb/util/Invoke.h @@ -0,0 +1,97 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/util/Invoke.h + + \author Ken Museth + + \date August 24, 2020 + + \brief A unified wrapper for tbb::parallel_invoke and a naive std::thread analog + + @code + template + int invoke(const Func0& f0, const Func1& f1, ..., const FuncN& fN); + @endcode +*/ + +#ifndef NANOVDB_UTIL_INVOKE_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_INVOKE_H_HAS_BEEN_INCLUDED + +#include // for nanovdb::CoordBBox + +#ifdef NANOVDB_USE_TBB +#include +#endif + +#include +#include +#include + +namespace nanovdb { + +namespace util { + +namespace { +#ifndef NANOVDB_USE_TBB +// Base case +template +void parallel_invoke(std::vector &threadPool, const Func &taskFunc) { + threadPool.emplace_back(taskFunc); +} + +// Iterative call +template +void parallel_invoke(std::vector &threadPool, const Func &taskFunc1, Rest... taskFuncN) { + threadPool.emplace_back(taskFunc1); + parallel_invoke(threadPool, taskFuncN...); +} + +// Base case +template +void serial_invoke(const Func &taskFunc) {taskFunc();} + +// Iterative call +template +void serial_invoke(const Func &taskFunc1, Rest... taskFuncN) { + taskFunc1(); + serial_invoke(taskFuncN...); +} +#endif +}// unnamed namespace + +/// @return 1 for serial, 2 for tbb multi-threading, and 3 for std multi-threading +template +int invoke(const Func &taskFunc1, Rest... taskFuncN) { +#ifdef NANOVDB_USE_TBB + tbb::parallel_invoke(taskFunc1, taskFuncN...); + return 2; +#else + const auto threadCount = std::thread::hardware_concurrency()>>1; + if (1 + sizeof...(Rest) <= threadCount) { + std::vector threadPool; + threadPool.emplace_back(taskFunc1); + parallel_invoke(threadPool, taskFuncN...); + for (auto &t : threadPool) t.join(); + return 3;// std multi-threading + } else { + taskFunc1(); + serial_invoke(taskFuncN...); + return 1;// serial + } +#endif + return -1;// should never happen +} + +}// namespace util + +template +[[deprecated("Use nanovdb::util::invoke instead")]] +int invoke(const Func &taskFunc1, Rest... 
taskFuncN) { + return util::invoke(taskFunc1, taskFuncN...); +} + +}// namespace nanovdb + +#endif // NANOVDB_UTIL_INVOKE_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/util/NanoToOpenVDB.h b/external/nanovdb/util/NanoToOpenVDB.h new file mode 100644 index 00000000..a6c21682 --- /dev/null +++ b/external/nanovdb/util/NanoToOpenVDB.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/NanoToOpenVDB.h instead.") diff --git a/external/nanovdb/util/NodeManager.h b/external/nanovdb/util/NodeManager.h new file mode 100644 index 00000000..076a18eb --- /dev/null +++ b/external/nanovdb/util/NodeManager.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/NodeManager.h instead.") diff --git a/external/nanovdb/util/OpenToNanoVDB.h b/external/nanovdb/util/OpenToNanoVDB.h new file mode 100644 index 00000000..c7dcce33 --- /dev/null +++ b/external/nanovdb/util/OpenToNanoVDB.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Use nanovdb/tools/CreateNanoGrid.h instead.") \ No newline at end of file diff --git a/external/nanovdb/util/PrefixSum.h b/external/nanovdb/util/PrefixSum.h new file mode 100644 index 00000000..11001087 --- /dev/null +++ b/external/nanovdb/util/PrefixSum.h @@ -0,0 +1,90 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/util/PrefixSum.h + + \author Ken Museth + + \date March 12, 2023 + + \brief Multi-threaded implementations of inclusive prefix sum + + \note An exclusive prefix sum is simply an array starting with zero + followed by the elements in the inclusive prefix sum, minus its + last entry which is the sum of all the input elements. 
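+
+  For example, for the input {1, 2, 3, 4} the inclusive prefix sum is
+  {1, 3, 6, 10} and the corresponding exclusive prefix sum is {0, 1, 3, 6}.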
+*/ + +#ifndef NANOVDB_UTIL_PREFIX_SUM_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_PREFIX_SUM_H_HAS_BEEN_INCLUDED + +#include // for Range1D +#include +#include // for std::plus + +#ifdef NANOVDB_USE_TBB +#include +#endif + +namespace nanovdb { + +namespace util { + +/// @brief Computes inclusive prefix sum of a vector +/// @tparam T Type of the elements in the input/out vector +/// @tparam OpT Type of operation performed on each element (defaults to sum) +/// @param vec input and output vector +/// @param threaded if true multi-threading is used +/// @note Inclusive prefix sum: for (i=1; i> +T prefixSum(std::vector &vec, bool threaded = true, OpT op = OpT()); + +/// @brief An inclusive scan includes in[i] when computing out[i] +/// @note Inclusive prefix operation: for (i=1; i +void inclusiveScan(T *array, size_t size, const T &identity, bool threaded, Op op) +{ +#ifndef NANOVDB_USE_TBB + threaded = false; + (void)identity;// avoids compiler warning +#endif + + if (threaded) { +#ifdef NANOVDB_USE_TBB + using RangeT = tbb::blocked_range; + tbb::parallel_scan(RangeT(0, size), identity, + [&](const RangeT &r, T sum, bool is_final_scan)->T { + T tmp = sum; + for (size_t i = r.begin(); i < r.end(); ++i) { + tmp = op(tmp, array[i]); + if (is_final_scan) array[i] = tmp; + } + return tmp; + },[&](const T &a, const T &b) {return op(a, b);} + ); +#endif + } else { // serial inclusive prefix operation + for (size_t i=1; i +T prefixSum(std::vector &vec, bool threaded, OpT op) +{ + inclusiveScan(vec.data(), vec.size(), T(0), threaded, op); + return vec.back();// sum of all input elements +}// prefixSum + +}// namespace util + +template> +[[deprecated("Use nanovdb::util::prefixSum instead")]] +T prefixSum(std::vector &vec, bool threaded = true, OpT op = OpT()) +{ + return util::prefixSum(vec, threaded, op); +}// prefixSum + +}// namespace nanovdb + +#endif // NANOVDB_UTIL_PREFIX_SUM_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/util/Primitives.h b/external/nanovdb/util/Primitives.h new file mode 100644 index 00000000..79cfe615 --- /dev/null +++ b/external/nanovdb/util/Primitives.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/CreatePrimitives.h instead.") diff --git a/external/nanovdb/util/Range.h b/external/nanovdb/util/Range.h new file mode 100644 index 00000000..e9ff766e --- /dev/null +++ b/external/nanovdb/util/Range.h @@ -0,0 +1,158 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! 
+ \file nanovdb/util/Range.h + + \author Ken Museth + + \date August 28, 2020 + + \brief Custom Range class that is compatible with the tbb::blocked_range classes +*/ + +#ifndef NANOVDB_UTIL_RANGE_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_RANGE_H_HAS_BEEN_INCLUDED + +#include +#include // for size_t + +#ifdef NANOVDB_USE_TBB +#include // for tbb::split +#endif + +namespace nanovdb { + +namespace util { + +class Split {};// Dummy class used by split constructors + +template +class Range; + +using Range1D = Range<1, size_t>; +using Range2D = Range<2, size_t>; +using Range3D = Range<3, size_t>; + +// template specialization for Rank = 1 +template +class Range<1, T> +{ + T mBegin, mEnd; + size_t mGrainsize; + template + friend class Range; +public: + using const_iterator = T; + using size_type = size_t; + Range(const Range&) = default; + Range(T begin, T end, size_type grainsize = size_type(1)) + : mBegin(begin), mEnd(end), mGrainsize(grainsize) + { + assert(grainsize > size_type(0)); + } + /// @brief Split constructor: r[a,b[ -> r[a,b/2[ & this[b/2,b[ + Range(Range &r, Split) : mBegin(r.mBegin), mEnd(r.mEnd), mGrainsize(r.mGrainsize) { + assert(r.is_divisible()); + r.mEnd = mBegin = this->middle(); + } +#ifdef NANOVDB_USE_TBB + Range(Range &r, tbb::split) : Range(r, Split()) {} +#endif + bool operator==(const Range& rhs) const { return mBegin == rhs.mBegin && mEnd == rhs.mEnd && mGrainsize == rhs.mGrainsize; } + T middle() const {return mBegin + (mEnd - mBegin) / T(2);} + size_type size() const { assert(!this->empty()); return size_type(mEnd - mBegin); } + bool empty() const { return !(mBegin < mEnd); } + size_type grainsize() const {return mGrainsize;} + bool is_divisible() const {return mGrainsize < this->size();} + const_iterator begin() const { return mBegin; } + const_iterator end() const { return mEnd; } +};// Range<1, T> + +// template specialization for Rank = 2 +template +class Range<2, T> +{ + Range<1, T> mRange[2]; +public: + using size_type = typename Range<1, T>::size_type; + Range(const Range<1, T> &rangeRow, const Range<1, T> &rangeCol) : mRange{ rangeRow, rangeCol } {} + Range(T beginRow, T endRow, size_type grainsizeRow, T beginCol, T endCol, size_type grainsizeCol) + : Range( Range<1,T>(beginRow, endRow, grainsizeRow), Range<1,T>(beginCol, endCol, grainsizeCol) ) + { + } + Range(T beginRow, T endRow, T beginCol, T endCol) : Range(Range<1,T>(beginRow, endRow), Range<1,T>(beginCol, endCol) ) + { + } + Range(Range &r, Split) : Range(r.mRange[0], r.mRange[1]) { + assert( r.is_divisible() );// at least one of the two dimensions must be divisible! 
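+        // bisect the dimension with the larger size-to-grainsize ratio:
+        // the column range if size0/grain0 < size1/grain1, otherwise the row range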
+ if( mRange[0].size()*double(mRange[1].grainsize()) < mRange[1].size()*double(mRange[0].grainsize()) ) { + r.mRange[1].mEnd = mRange[1].mBegin = mRange[1].middle(); + } else { + r.mRange[0].mEnd = mRange[0].mBegin = mRange[0].middle(); + } + } +#ifdef NANOVDB_USE_TBB + Range(Range &r, tbb::split) : Range(r, Split()) {} +#endif + bool operator==(const Range& rhs) const {return mRange[0] == rhs[0] && mRange[1] == rhs[1]; } + bool empty() const { return mRange[0].empty() || mRange[1].empty(); } + bool is_divisible() const {return mRange[0].is_divisible() || mRange[1].is_divisible();} + const Range<1, T>& operator[](int i) const { assert(i==0 || i==1); return mRange[i]; } +};// Range<2, T> + +// template specialization for Rank = 3 +template +class Range<3, T> +{ + Range<1, T> mRange[3]; +public: + using size_type = typename Range<1, T>::size_type; + Range(const Range<1, T> &rangeX, const Range<1, T> &rangeY, const Range<1, T> &rangeZ) : mRange{ rangeX, rangeY, rangeZ } {} + Range(T beginX, T endX, size_type grainsizeX, + T beginY, T endY, size_type grainsizeY, + T beginZ, T endZ, size_type grainsizeZ) + : Range( Range<1,T>(beginX, endX, grainsizeX), + Range<1,T>(beginY, endY, grainsizeY), + Range<1,T>(beginZ, endZ, grainsizeZ) ) + { + } + Range(T beginX, T endX, T beginY, T endY, T beginZ, T endZ) + : Range( Range<1,T>(beginX, endX), Range<1,T>(beginY, endY), Range<1,T>(beginZ, endZ) ) + { + } + Range(Range &r, Split) : Range(r.mRange[0], r.mRange[1], r.mRange[2]) + { + assert( r.is_divisible() );// at least one of the three dimensions must be divisible! + if ( mRange[2].size()*double(mRange[0].grainsize()) < mRange[0].size()*double(mRange[2].grainsize()) ) { + if ( mRange[0].size()*double(mRange[1].grainsize()) < mRange[1].size()*double(mRange[0].grainsize()) ) { + r.mRange[1].mEnd = mRange[1].mBegin = mRange[1].middle(); + } else { + r.mRange[0].mEnd = mRange[0].mBegin = mRange[0].middle(); + } + } else { + if ( mRange[2].size()*double(mRange[1].grainsize()) < mRange[1].size()*double(mRange[2].grainsize()) ) { + r.mRange[1].mEnd = mRange[1].mBegin = mRange[1].middle(); + } else { + r.mRange[2].mEnd = mRange[2].mBegin = mRange[2].middle(); + } + } + } +#ifdef NANOVDB_USE_TBB + Range(Range &r, tbb::split) : Range(r, Split()) {} +#endif + bool operator==(const Range& rhs) const {return mRange[0] == rhs[0] && mRange[1] == rhs[1] && mRange[2] == rhs[2]; } + bool empty() const { return mRange[0].empty() || mRange[1].empty() || mRange[2].empty(); } + bool is_divisible() const {return mRange[0].is_divisible() || mRange[1].is_divisible() || mRange[2].is_divisible();} + const Range<1, T>& operator[](int i) const { assert(i==0 || i==1 || i==2); return mRange[i]; } +};// Range<3, T> + +}// namespace util + +using Range1D [[deprecated("Use nanovdb::util::Range1D instead")]] = util::Range<1, size_t>; +using Range2D [[deprecated("Use nanovdb::util::Range2D instead")]] = util::Range<2, size_t>; +using Range3D [[deprecated("Use nanovdb::util::Range3D instead")]] = util::Range<3, size_t>; + +}// namespace nanovdb + +#endif // NANOVDB_UTIL_RANGE_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/util/Ray.h b/external/nanovdb/util/Ray.h new file mode 100644 index 00000000..90384909 --- /dev/null +++ b/external/nanovdb/util/Ray.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/math/Ray.h instead.") diff --git a/external/nanovdb/util/Reduce.h 
b/external/nanovdb/util/Reduce.h new file mode 100644 index 00000000..f171b252 --- /dev/null +++ b/external/nanovdb/util/Reduce.h @@ -0,0 +1,133 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/util/Reduce.h + + \author Ken Museth + + \date March 4, 2021 + + \brief A unified wrapper for tbb::parallel_reduce and a naive std::future analog +*/ + +#ifndef NANOVDB_UTIL_REDUCE_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_REDUCE_H_HAS_BEEN_INCLUDED + +#include // for util::Range1D + +#ifdef NANOVDB_USE_TBB +#include +#else +#include +#include +#include +#endif + +namespace nanovdb { + +namespace util { + +/// @return reduction +/// +/// @param range RangeT can be Range, CoordBBox, tbb::blocked_range, blocked_range2D, or blocked_range3D. +/// @param identity initial value +/// @param func functor with signature T FuncT::operator()(const RangeT& range, const T& a) const +/// @param join functor with the signature T JoinT::operator()(const T& a, const T& b) const +/// @code +/// std::vector array(100, 1); +/// auto func = [&array](auto &r, int a){for (auto i=r.begin(); i!=r.end(); ++i) a+=array[i]; return a;}; +/// int sum = reduce(array, 0, func, [](int a, int b){return a + b;}); +/// @endcode +template +inline T reduce(RangeT range, const T& identity, const FuncT &func, const JoinT &join) +{ + if (range.empty()) return identity; +#ifdef NANOVDB_USE_TBB + return tbb::parallel_reduce(range, identity, func, join); +#else// naive and likely slow alternative based on std::future + if (const size_t threadCount = std::thread::hardware_concurrency()>>1) { + std::vector rangePool{ range }; + while(rangePool.size() < threadCount) { + const size_t oldSize = rangePool.size(); + for (size_t i = 0; i < oldSize && rangePool.size() < threadCount; ++i) { + auto &r = rangePool[i]; + if (r.is_divisible()) rangePool.push_back(RangeT(r, Split())); + } + if (rangePool.size() == oldSize) break;// none of the ranges were divided so stop + } + std::vector< std::future > futurePool; + for (auto &r : rangePool) { + auto task = std::async(std::launch::async, [&](){return func(r, identity);}); + futurePool.push_back( std::move(task) );// launch tasks + } + T result = identity; + for (auto &f : futurePool) { + result = join(result, f.get());// join results + } + return result; + } else {// serial + return static_cast(func(range, identity)); + } +#endif + return identity;// should never happen +} + +/// @brief Simple wrapper to the function defined above +template +inline T reduce(size_t begin, size_t end, size_t grainSize, const T& identity, const FuncT& func, const JoinT& join) +{ + Range1D range(begin, end, grainSize); + return reduce( range, identity, func, join ); +} + +/// @brief Simple wrapper that works with std::containers +template class ContainerT, typename... ArgT, typename T, typename FuncT, typename JoinT > +inline T reduce(const ContainerT &c, const T& identity, const FuncT& func, const JoinT& join) +{ + Range1D range(0, c.size(), 1); + return reduce( range, identity, func, join ); + +} + +/// @brief Simple wrapper that works with std::containers +template class ContainerT, typename... 
ArgT, typename T, typename FuncT, typename JoinT > +inline T reduce(const ContainerT &c, size_t grainSize, const T& identity, const FuncT& func, const JoinT& join) +{ + Range1D range(0, c.size(), grainSize); + return reduce( range, identity, func, join ); +} + +}// namespace util + +/// @brief Simple wrapper to the function defined above +template +[[deprecated("Use nanovdb::util::reduce instead")]] +inline T reduce(size_t begin, size_t end, size_t grainSize, const T& identity, const FuncT& func, const JoinT& join) +{ + util::Range1D range(begin, end, grainSize); + return util::reduce( range, identity, func, join ); +} + +/// @brief Simple wrapper that works with std::containers +template class ContainerT, typename... ArgT, typename T, typename FuncT, typename JoinT > +[[deprecated("Use nanovdb::util::reduce instead")]] +inline T reduce(const ContainerT &c, const T& identity, const FuncT& func, const JoinT& join) +{ + util::Range1D range(0, c.size(), 1); + return util::reduce( range, identity, func, join ); + +} + +/// @brief Simple wrapper that works with std::containers +template class ContainerT, typename... ArgT, typename T, typename FuncT, typename JoinT > +[[deprecated("Use nanovdb::util::reduce instead")]] +T reduce(const ContainerT &c, size_t grainSize, const T& identity, const FuncT& func, const JoinT& join) +{ + util::Range1D range(0, c.size(), grainSize); + return util::reduce( range, identity, func, join ); +} + +}// namespace nanovdb + +#endif // NANOVDB_UTIL_REDUCE_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/util/SampleFromVoxels.h b/external/nanovdb/util/SampleFromVoxels.h new file mode 100644 index 00000000..02802444 --- /dev/null +++ b/external/nanovdb/util/SampleFromVoxels.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/math/SampleFromVoxels.h instead.") diff --git a/external/nanovdb/util/Stencils.h b/external/nanovdb/util/Stencils.h new file mode 100644 index 00000000..2de91c52 --- /dev/null +++ b/external/nanovdb/util/Stencils.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/math/Stencils.h instead.") diff --git a/external/nanovdb/util/Timer.h b/external/nanovdb/util/Timer.h new file mode 100644 index 00000000..b2d8dc9e --- /dev/null +++ b/external/nanovdb/util/Timer.h @@ -0,0 +1,87 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/// @file nanovdb/util/Timer.h +/// +/// @author Ken Museth +/// +/// @brief A simple timing class (in case openvdb::util::CpuTimer is unavailable) + +#ifndef NANOVDB_UTIL_TIMER_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_TIMER_H_HAS_BEEN_INCLUDED + +#include +#include + +namespace nanovdb { + +namespace util { + +class Timer +{ + std::chrono::high_resolution_clock::time_point mStart; +public: + /// @brief Default constructor + Timer() {} + + /// @brief Constructor that starts the timer + /// @param msg string message to be printed when timer is started + /// @param os output stream for the message above + Timer(const std::string &msg, std::ostream& os = std::cerr) {this->start(msg, os);} + + /// @brief Start the timer + /// @param msg string message to be printed when timer is started + /// @param os output stream for the message above + void start(const std::string &msg, 
std::ostream& os = std::cerr)
+    {
+        os << msg << " ... " << std::flush;
+        mStart = std::chrono::high_resolution_clock::now();
+    }
+
+    /// @brief elapsed time (since start) in milliseconds
+    template <typename AccuracyT = std::chrono::milliseconds>
+    auto elapsed()
+    {
+        auto end = std::chrono::high_resolution_clock::now();
+        return std::chrono::duration_cast<AccuracyT>(end - mStart).count();
+    }
+
+    /// @brief stop the timer
+    /// @tparam AccuracyT Template parameter defining the accuracy of the reported times
+    /// @param os output stream for the message above
+    template <typename AccuracyT = std::chrono::milliseconds>
+    void stop(std::ostream& os = std::cerr)
+    {
+        auto end = std::chrono::high_resolution_clock::now();
+        auto diff = std::chrono::duration_cast<AccuracyT>(end - mStart).count();
+        os << "completed in " << diff;
+        if (std::is_same<AccuracyT, std::chrono::microseconds>::value) {// resolved at compile-time
+            os << " microseconds" << std::endl;
+        } else if (std::is_same<AccuracyT, std::chrono::milliseconds>::value) {
+            os << " milliseconds" << std::endl;
+        } else if (std::is_same<AccuracyT, std::chrono::seconds>::value) {
+            os << " seconds" << std::endl;
+        } else {
+            os << " unknown time unit" << std::endl;
+        }
+    }
+
+    /// @brief stop and start the timer
+    /// @tparam AccuracyT Template parameter defining the accuracy of the reported times
+    /// @param msg string message to be printed when timer is started
+    /// @param os output stream for the message above
+    template <typename AccuracyT = std::chrono::milliseconds>
+    void restart(const std::string &msg, std::ostream& os = std::cerr)
+    {
+        this->stop();
+        this->start(msg, os);
+    }
+};// Timer
+
+}// namespace util
+
+using CpuTimer [[deprecated("Use nanovdb::util::Timer instead")]] = util::Timer;
+
+} // namespace nanovdb
+
+#endif // NANOVDB_UTIL_TIMER_H_HAS_BEEN_INCLUDED
diff --git a/external/nanovdb/util/Util.h b/external/nanovdb/util/Util.h
new file mode 100644
index 00000000..7dcdfbdb
--- /dev/null
+++ b/external/nanovdb/util/Util.h
@@ -0,0 +1,657 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+/*!
+ \file nanovdb/util/Util.h + + \author Ken Museth + + \date January 8, 2020 + + \brief Utility functions +*/ + +#ifndef NANOVDB_UTIL_UTIL_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_UTIL_H_HAS_BEEN_INCLUDED + +#ifdef __CUDACC_RTC__ + +typedef signed char int8_t; +typedef short int16_t; +typedef int int32_t; +typedef long long int64_t; +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned short uint16_t; +typedef unsigned long long uint64_t; + +#define NANOVDB_ASSERT(x) + +#ifndef UINT64_C +#define UINT64_C(x) (x ## ULL) +#endif + +#else // !__CUDACC_RTC__ + +#include // for abs in clang7 +#include // for types like int32_t etc +#include // for size_t type +#include // for assert +#include // for stderr and snprintf +#include // for sqrt and fma +#include // for numeric_limits +#include // for std::move +#ifdef NANOVDB_USE_IOSTREAMS +#include // for read/writeUncompressedGrids +#endif// ifdef NANOVDB_USE_IOSTREAMS + +// All asserts can be disabled here, even for debug builds +#if 1 +#define NANOVDB_ASSERT(x) assert(x) +#else +#define NANOVDB_ASSERT(x) +#endif + +#if defined(NANOVDB_USE_INTRINSICS) && defined(_MSC_VER) +#include +#pragma intrinsic(_BitScanReverse) +#pragma intrinsic(_BitScanForward) +#pragma intrinsic(_BitScanReverse64) +#pragma intrinsic(_BitScanForward64) +#endif + +#endif // __CUDACC_RTC__ + +#if defined(__CUDACC__) || defined(__HIP__) +// Only define __hostdev__ qualifier when using NVIDIA CUDA or HIP compilers +#ifndef __hostdev__ +#define __hostdev__ __host__ __device__ // Runs on the CPU and GPU, called from the CPU or the GPU +#endif +#else +// Dummy definitions of macros only defined by CUDA and HIP compilers +#ifndef __hostdev__ +#define __hostdev__ // Runs on the CPU and GPU, called from the CPU or the GPU +#endif +#ifndef __global__ +#define __global__ // Runs on the GPU, called from the CPU or the GPU +#endif +#ifndef __device__ +#define __device__ // Runs on the GPU, called from the GPU +#endif +#ifndef __host__ +#define __host__ // Runs on the CPU, called from the CPU +#endif + +#endif // if defined(__CUDACC__) || defined(__HIP__) + +// The following macro will suppress annoying warnings when nvcc +// compiles functions that call (host) intrinsics (which is perfectly valid) +#if defined(_MSC_VER) && defined(__CUDACC__) +#define NANOVDB_HOSTDEV_DISABLE_WARNING __pragma("hd_warning_disable") +#elif defined(__GNUC__) && defined(__CUDACC__) +#define NANOVDB_HOSTDEV_DISABLE_WARNING _Pragma("hd_warning_disable") +#else +#define NANOVDB_HOSTDEV_DISABLE_WARNING +#endif + +// Define compiler warnings that work with all compilers +//#if defined(_MSC_VER) +//#define NANO_WARNING(msg) _pragma("message" #msg) +//#else +//#define NANO_WARNING(msg) _Pragma("message" #msg) +//#endif + +//============================================== +/// @brief Defines macros that issues warnings for deprecated header files +/// @details Example: +/// @code +/// #include // for NANOVDB_DEPRECATED_HEADER +/// #include +/// NANOVDB_DEPRECATED_HEADER("This header file is deprecated, please use instead") +/// @endcode +#ifdef __GNUC__ +#define NANOVDB_PRAGMA(X) _Pragma(#X) +#define NANOVDB_DEPRECATED_HEADER(MSG) NANOVDB_PRAGMA(GCC warning MSG) +#elif defined(_MSC_VER) +#define NANOVDB_STRINGIZE_(MSG) #MSG +#define NANOVDB_STRINGIZE(MSG) NANOVDB_STRINGIZE_(MSG) +#define NANOVDB_DEPRECATED_HEADER(MSG) \ + __pragma(message(__FILE__ "(" NANOVDB_STRINGIZE(__LINE__) ") : Warning: " MSG)) +#endif + +// A portable implementation of offsetof - unfortunately it doesn't work with 
static_assert +#define NANOVDB_OFFSETOF(CLASS, MEMBER) ((int)(size_t)((char*)&((CLASS*)0)->MEMBER - (char*)0)) + +namespace nanovdb {// ================================================================= + +namespace util {// ==================================================================== + +/// @brief Minimal implementation of std::declval, which converts any type @c T to +//// a reference type, making it possible to use member functions in the operand +/// of the decltype specifier without the need to go through constructors. +/// @tparam T Template type to be converted to T&& +/// @return T&& +/// @warning Unlike std::declval, this version does not work when T = void! However, +/// NVRTC does not like std::declval, so we provide our own implementation. +template +T&& declval() noexcept; + +// --------------------------> string utility functions <------------------------------------ + +/// @brief tests if a c-string @c str is empty, that is its first value is '\0' +/// @param str c-string to be tested for null termination +/// @return true if str[0] = '\0' +__hostdev__ inline bool empty(const char* str) +{ + NANOVDB_ASSERT(str != nullptr); + return *str == '\0'; +}// util::empty + +/// @brief length of a c-sting, excluding '\0'. +/// @param str c-string +/// @return the number of characters that precede the terminating null character. +__hostdev__ inline size_t strlen(const char *str) +{ + NANOVDB_ASSERT(str != nullptr); + const char *s = str; + while(*s) ++s; ; + return (s - str); +}// util::strlen + +/// @brief Copy characters from @c src to @c dst. +/// @param dst pointer to the destination string. +/// @param src pointer to the null-terminated source string. +/// @return destination string @c dst. +/// @note Emulates the behaviour of std::strcpy, except this version also runs on the GPU. +__hostdev__ inline char* strcpy(char *dst, const char *src) +{ + NANOVDB_ASSERT(dst != nullptr && src != nullptr); + for (char *p = dst; (*p++ = *src) != '\0'; ++src); + return dst; +}// util::strcpy(char*, const char*) + +/// @brief Copies the first num characters of @c src to @c dst. +/// If the end of the source C string (which is signaled by a +/// null-character) is found before @c max characters have been +/// copied, @c dst is padded with zeros until a total of @c max +/// characters have been written to it. +/// @param dst destination string +/// @param src source string +/// @param max maximum number of character in destination string +/// @return destination string @c dst +/// @warning if strncpy(dst, src, max)[max-1]!='\0' then @c src has more +/// characters than @c max and the return string needs to be +/// manually null-terminated, i.e. strncpy(dst, src, max)[max-1]='\0' +__hostdev__ inline char* strncpy(char *dst, const char *src, size_t max) +{ + NANOVDB_ASSERT(dst != nullptr && src != nullptr); + size_t i = 0; + for (; i < max && src[i] != '\0'; ++i) dst[i] = src[i]; + for (; i < max; ++i) dst[i] = '\0'; + return dst; +}// util::strncpy(char *dst, const char *src, size_t max) + +/// @brief converts a number to a string using a specific base +/// @param dst destination string +/// @param num signed number to be concatenated after @c dst +/// @param bas base used when converting @c num to a string +/// @return destination string @c dst +/// @note Emulates the behaviour of itoa, except this verion also works on the GPU. 
+__hostdev__ inline char* strcpy(char* dst, int num, int bas = 10) +{ + NANOVDB_ASSERT(dst != nullptr && bas > 0); + int len = 0;// length of number once converted to a string + if (num == 0) dst[len++] = '0'; + for (int abs = num < 0 && bas == 10 ? -num : num; abs; abs /= bas) { + const int rem = abs % bas; + dst[len++] = rem > 9 ? rem - 10 + 'a' : rem + '0'; + } + if (num < 0) dst[len++] = '-';// append '-' if negative + for (char *a = dst, *b = a + len - 1; a < b; ++a, --b) {// reverse dst + dst[len] = *a;// use end of string as temp + *a = *b; + *b = dst[len]; + } + dst[len] = '\0';// explicitly terminate end of string + return dst; +}// util::strcpy(char*, int, int) + +/// @brief Appends a copy of the character string pointed to by @c src to +/// the end of the character string pointed to by @c dst on the device. +/// @param dst pointer to the null-terminated byte string to append to. +/// @param src pointer to the null-terminated byte string to copy from. +/// @return pointer to the character array being appended to. +/// @note Emulates the behaviour of std::strcat, except this version also runs on the GPU. +__hostdev__ inline char* strcat(char *dst, const char *src) +{ + NANOVDB_ASSERT(dst != nullptr && src != nullptr); + char *p = dst; + while (*p != '\0') ++p;// advance till end of dst + strcpy(p, src);// append src + return dst; +}// util::strcat(char*, const char*) + +/// @brief concatenates a number after a string using a specific base +/// @param dst null terminated destination string +/// @param num signed number to be concatenated after @c dst +/// @param bas base used when converting @c num to a string +/// @return destination string @c dst +__hostdev__ inline char* strcat(char* dst, int num, int bas = 10) +{ + NANOVDB_ASSERT(dst != nullptr); + char *p = dst; + while (*p != '\0') ++p; + strcpy(p, num, bas); + return dst; +}// util::strcat(char*, int, int) + +/// @brief Compares two null-terminated byte strings lexicographically. +/// @param lhs pointer to the null-terminated byte strings to compare +/// @param rhs pointer to the null-terminated byte strings to compare +/// @return Negative value if @c lhs appears before @c rhs in lexicographical order. +/// Zero if @c lhs and @c rhs compare equal. Positive value if @c lhs appears +/// after @c rhs in lexicographical order. +/// @note Emulates the behaviour of std::strcmp, except this version also runs on the GPU. +__hostdev__ inline int strcmp(const char *lhs, const char *rhs) +{ + while(*lhs != '\0' && (*lhs == *rhs)){ + lhs++; + rhs++; + } + return *(const unsigned char*)lhs - *(const unsigned char*)rhs;// zero if lhs == rhs +}// util::strcmp(const char*, const char*) + +/// @brief Test if two null-terminated byte strings are the same +/// @param lhs pointer to the null-terminated byte strings to compare +/// @param rhs pointer to the null-terminated byte strings to compare +/// @return true if the two c-strings are identical +__hostdev__ inline bool streq(const char *lhs, const char *rhs) +{ + return strcmp(lhs, rhs) == 0; +}// util::streq + +namespace impl {// ======================================================= +// Base-case implementation of Variadic Template function impl::sprint +__hostdev__ inline char* sprint(char *dst){return dst;} +// Variadic Template function impl::sprint +template +__hostdev__ inline char* sprint(char *dst, T var1, Types... 
var2) +{ + return impl::sprint(strcat(dst, var1), var2...); +} +}// namespace impl ========================================================= + +/// @brief prints a variable number of string and/or numbers to a destination string +template +__hostdev__ inline char* sprint(char *dst, T var1, Types... var2) +{ + return impl::sprint(strcpy(dst, var1), var2...); +}// util::sprint + +// --------------------------> memzero <------------------------------------ + +/// @brief Zero initialization of memory +/// @param dst pointer to destination +/// @param byteCount number of bytes to be initialized to zero +/// @return destination pointer @c dst +__hostdev__ inline static void* memzero(void *dst, size_t byteCount) +{ + NANOVDB_ASSERT(dst); + const size_t wordCount = byteCount >> 3; + if (wordCount << 3 == byteCount) { + for (auto *d = (uint64_t*)dst, *e = d + wordCount; d != e; ++d) *d = 0ULL; + } else { + for (auto *d = (char*)dst, *e = d + byteCount; d != e; ++d) *d = '\0'; + } + return dst; +}// util::memzero + +// --------------------------> util::is_same <------------------------------------ + +/// @brief C++11 implementation of std::is_same +/// @note When more than two arguments are provided value = T0==T1 || T0==T2 || ... +template +struct is_same +{ + static constexpr bool value = is_same::value || is_same::value; +}; + +template +struct is_same {static constexpr bool value = false;}; + +template +struct is_same {static constexpr bool value = true;}; + +// --------------------------> util::is_floating_point <------------------------------------ + +/// @brief C++11 implementation of std::is_floating_point +template +struct is_floating_point {static constexpr bool value = is_same::value;}; + +// --------------------------> util::enable_if <------------------------------------ + +/// @brief C++11 implementation of std::enable_if +template +struct enable_if {}; + +template +struct enable_if {using type = T;}; + +// --------------------------> util::disable_if <------------------------------------ + +template +struct disable_if {using type = T;}; + +template +struct disable_if {}; + +// --------------------------> util::is_const <------------------------------------ + +template +struct is_const {static constexpr bool value = false;}; + +template +struct is_const {static constexpr bool value = true;}; + +// --------------------------> util::is_pointer <------------------------------------ + +/// @brief Trait used to identify template parameter that are pointers +/// @tparam T Template parameter to be tested +template +struct is_pointer {static constexpr bool value = false;}; + +/// @brief Template specialization of pointers +/// @tparam T Template parameter to be tested +/// @note T can be both a non-const and const type +template +struct is_pointer {static constexpr bool value = true;}; + +// --------------------------> util::conditional <------------------------------------ + +/// @brief C++11 implementation of std::conditional +template +struct conditional { using type = TrueT; }; + +/// @brief Template specialization of conditional +/// @tparam FalseT Type used when boolean is false +/// @tparam TrueT Type used when boolean is true +template +struct conditional { using type = FalseT; }; + +// --------------------------> util::remove_const <------------------------------------ + +/// @brief Trait use to const from type. 
Default implementation is just a pass-through +/// @tparam T Type +/// @details remove_pointer::type = float +template +struct remove_const {using type = T;}; + +/// @brief Template specialization of trait class use to remove const qualifier type from a type +/// @tparam T Type of the const type +/// @details remove_pointer::type = float +template +struct remove_const {using type = T;}; + +// --------------------------> util::remove_reference <------------------------------------ + +/// @brief Trait use to remove reference, i.e. "&", qualifier from a type. Default implementation is just a pass-through +/// @tparam T Type +/// @details remove_pointer::type = float +template +struct remove_reference {using type = T;}; + +/// @brief Template specialization of trait class use to remove reference, i.e. "&", qualifier from a type +/// @tparam T Type of the reference +/// @details remove_pointer::type = float +template +struct remove_reference {using type = T;}; + +// --------------------------> util::remove_pointer <------------------------------------ + +/// @brief Trait use to remove pointer, i.e. "*", qualifier from a type. Default implementation is just a pass-through +/// @tparam T Type +/// @details remove_pointer::type = float +template +struct remove_pointer {using type = T;}; + +/// @brief Template specialization of trait class use to to remove pointer, i.e. "*", qualifier from a type +/// @tparam T Type of the pointer +/// @details remove_pointer::type = float +template +struct remove_pointer {using type = T;}; + +// --------------------------> util::match_const <------------------------------------ + +/// @brief Trait used to transfer the const-ness of a reference type to another type +/// @tparam T Type whose const-ness needs to match the reference type +/// @tparam ReferenceT Reference type that is not const +/// @details match_const::type = int +/// match_const::type = int +template +struct match_const {using type = typename remove_const::type;}; + +/// @brief Template specialization used to transfer the const-ness of a reference type to another type +/// @tparam T Type that will adopt the const-ness of the reference type +/// @tparam ReferenceT Reference type that is const +/// @details match_const::type = const int +/// match_const::type = const int +template +struct match_const {using type = const typename remove_const::type;}; + +// --------------------------> util::is_specialization <------------------------------------ + +/// @brief Metafunction used to determine if the first template +/// parameter is a specialization of the class template +/// given in the second template parameter. 
+/// +/// @details is_specialization, Vec3>::value == true; +/// is_specialization::value == true; +/// is_specialization, std::vector>::value == true; +template class TemplateType> +struct is_specialization {static const bool value = false;}; +template class TemplateType> +struct is_specialization, TemplateType> +{ + static const bool value = true; +};// util::is_specialization + +// --------------------------> util::PtrDiff <------------------------------------ + +/// @brief Compute the distance, in bytes, between two pointers, dist = p - q +/// @param p fist pointer, assumed to NOT be NULL +/// @param q second pointer, assumed to NOT be NULL +/// @return signed distance between pointer, p - q, addresses in units of bytes +__hostdev__ inline static int64_t PtrDiff(const void* p, const void* q) +{ + NANOVDB_ASSERT(p && q); + return reinterpret_cast(p) - reinterpret_cast(q); +}// util::PtrDiff + +// --------------------------> util::PtrAdd <------------------------------------ + +/// @brief Adds a byte offset to a non-const pointer to produce another non-const pointer +/// @tparam DstT Type of the return pointer (defaults to void) +/// @param p non-const input pointer, assumed to NOT be NULL +/// @param offset signed byte offset +/// @return a non-const pointer defined as the offset of an input pointer +template +__hostdev__ inline static DstT* PtrAdd(void* p, int64_t offset) +{ + NANOVDB_ASSERT(p); + return reinterpret_cast(reinterpret_cast(p) + offset); +}// util::PtrAdd + +/// @brief Adds a byte offset to a const pointer to produce another const pointer +/// @tparam DstT Type of the return pointer (defaults to void) +/// @param p const input pointer, assumed to NOT be NULL +/// @param offset signed byte offset +/// @return a const pointer defined as the offset of a const input pointer +template +__hostdev__ inline static const DstT* PtrAdd(const void* p, int64_t offset) +{ + NANOVDB_ASSERT(p); + return reinterpret_cast(reinterpret_cast(p) + offset); +}// util::PtrAdd + +// -------------------> findLowestOn <---------------------------- + +/// @brief Returns the index of the lowest, i.e. least significant, on bit in the specified 32 bit word +/// +/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! +NANOVDB_HOSTDEV_DISABLE_WARNING +__hostdev__ inline uint32_t findLowestOn(uint32_t v) +{ + NANOVDB_ASSERT(v); +#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) + return __ffs(v) - 1; // one based indexing +#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) + unsigned long index; + _BitScanForward(&index, v); + return static_cast(index); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) + return static_cast(__builtin_ctzl(v)); +#else + //NANO_WARNING("Using software implementation for findLowestOn(uint32_t v)") + static const unsigned char DeBruijn[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; +// disable unary minus on unsigned warning +#if defined(_MSC_VER) && !defined(__NVCC__) +#pragma warning(push) +#pragma warning(disable : 4146) +#endif + return DeBruijn[uint32_t((v & -v) * 0x077CB531U) >> 27]; +#if defined(_MSC_VER) && !defined(__NVCC__) +#pragma warning(pop) +#endif + +#endif +}// util::findLowestOn(uint32_t) + +/// @brief Returns the index of the lowest, i.e. least significant, on bit in the specified 64 bit word +/// +/// @warning Assumes that at least one bit is set in the word, i.e. 
@a v != uint32_t(0)! +NANOVDB_HOSTDEV_DISABLE_WARNING +__hostdev__ inline uint32_t findLowestOn(uint64_t v) +{ + NANOVDB_ASSERT(v); +#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) + return __ffsll(static_cast(v)) - 1; // one based indexing +#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) + unsigned long index; + _BitScanForward64(&index, v); + return static_cast(index); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) + return static_cast(__builtin_ctzll(v)); +#else + //NANO_WARNING("Using software implementation for util::findLowestOn(uint64_t)") + static const unsigned char DeBruijn[64] = { + 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, + 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, + 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, + 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12, + }; +// disable unary minus on unsigned warning +#if defined(_MSC_VER) && !defined(__NVCC__) +#pragma warning(push) +#pragma warning(disable : 4146) +#endif + return DeBruijn[uint64_t((v & -v) * UINT64_C(0x022FDD63CC95386D)) >> 58]; +#if defined(_MSC_VER) && !defined(__NVCC__) +#pragma warning(pop) +#endif + +#endif +}// util::findLowestOn(uint64_t) + +// -------------------> findHighestOn <---------------------------- + +/// @brief Returns the index of the highest, i.e. most significant, on bit in the specified 32 bit word +/// +/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! +NANOVDB_HOSTDEV_DISABLE_WARNING +__hostdev__ inline uint32_t findHighestOn(uint32_t v) +{ + NANOVDB_ASSERT(v); +#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) + return sizeof(uint32_t) * 8 - 1 - __clz(v); // Return the number of consecutive high-order zero bits in a 32-bit integer. +#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) + unsigned long index; + _BitScanReverse(&index, v); + return static_cast(index); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) + return sizeof(unsigned long) * 8 - 1 - __builtin_clzl(v); +#else + //NANO_WARNING("Using software implementation for util::findHighestOn(uint32_t)") + static const unsigned char DeBruijn[32] = { + 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, + 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31}; + v |= v >> 1; // first round down to one less than a power of 2 + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return DeBruijn[uint32_t(v * 0x07C4ACDDU) >> 27]; +#endif +}// util::findHighestOn + +/// @brief Returns the index of the highest, i.e. most significant, on bit in the specified 64 bit word +/// +/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! +NANOVDB_HOSTDEV_DISABLE_WARNING +__hostdev__ inline uint32_t findHighestOn(uint64_t v) +{ + NANOVDB_ASSERT(v); +#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) + return sizeof(unsigned long) * 8 - 1 - __clzll(static_cast(v)); +#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) + unsigned long index; + _BitScanReverse64(&index, v); + return static_cast(index); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) + return sizeof(unsigned long) * 8 - 1 - __builtin_clzll(v); +#else + const uint32_t* p = reinterpret_cast(&v); + return p[1] ? 
32u + findHighestOn(p[1]) : findHighestOn(p[0]); +#endif +}// util::findHighestOn + +// ----------------------------> util::countOn <-------------------------------------- + +/// @return Number of bits that are on in the specified 64-bit word +NANOVDB_HOSTDEV_DISABLE_WARNING +__hostdev__ inline uint32_t countOn(uint64_t v) +{ +#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) + //#warning Using popcll for util::countOn + return __popcll(v); +// __popcnt64 intrinsic support was added in VS 2019 16.8 +#elif defined(_MSC_VER) && defined(_M_X64) && (_MSC_VER >= 1928) && defined(NANOVDB_USE_INTRINSICS) + //#warning Using popcnt64 for util::countOn + return uint32_t(__popcnt64(v)); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) + //#warning Using builtin_popcountll for util::countOn + return __builtin_popcountll(v); +#else // use software implementation + //NANO_WARNING("Using software implementation for util::countOn") + v = v - ((v >> 1) & uint64_t(0x5555555555555555)); + v = (v & uint64_t(0x3333333333333333)) + ((v >> 2) & uint64_t(0x3333333333333333)); + return (((v + (v >> 4)) & uint64_t(0xF0F0F0F0F0F0F0F)) * uint64_t(0x101010101010101)) >> 56; +#endif +}// util::countOn(uint64_t) + +}// namespace util ================================================================== + +[[deprecated("Use nanovdb::util::findLowestOn instead")]] +__hostdev__ inline uint32_t FindLowestOn(uint32_t v){return util::findLowestOn(v);} +[[deprecated("Use nanovdb::util::findLowestOn instead")]] +__hostdev__ inline uint32_t FindLowestOn(uint64_t v){return util::findLowestOn(v);} +[[deprecated("Use nanovdb::util::findHighestOn instead")]] +__hostdev__ inline uint32_t FindHighestOn(uint32_t v){return util::findHighestOn(v);} +[[deprecated("Use nanovdb::util::findHighestOn instead")]] +__hostdev__ inline uint32_t FindHighestOn(uint64_t v){return util::findHighestOn(v);} +[[deprecated("Use nanovdb::util::countOn instead")]] +__hostdev__ inline uint32_t CountOn(uint64_t v){return util::countOn(v);} + +} // namespace nanovdb =================================================================== + +#endif // end of NANOVDB_UTIL_UTIL_H_HAS_BEEN_INCLUDED diff --git a/external/nanovdb/util/cuda/CudaAddBlindData.cuh b/external/nanovdb/util/cuda/CudaAddBlindData.cuh new file mode 100644 index 00000000..39ece43d --- /dev/null +++ b/external/nanovdb/util/cuda/CudaAddBlindData.cuh @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/AddBlindData.cuh instead.") diff --git a/external/nanovdb/util/cuda/CudaDeviceBuffer.h b/external/nanovdb/util/cuda/CudaDeviceBuffer.h new file mode 100644 index 00000000..65371c6c --- /dev/null +++ b/external/nanovdb/util/cuda/CudaDeviceBuffer.h @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER +#include +NANOVDB_DEPRECATED_HEADER("Include nanovdb/cuda/DeviceBuffer.h instead.") \ No newline at end of file diff --git a/external/nanovdb/util/cuda/CudaGridChecksum.cuh b/external/nanovdb/util/cuda/CudaGridChecksum.cuh new file mode 100644 index 00000000..fe897d45 --- /dev/null +++ b/external/nanovdb/util/cuda/CudaGridChecksum.cuh @@ -0,0 +1,6 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +#include // for NANOVDB_DEPRECATED_HEADER 
+#include <nanovdb/tools/cuda/GridChecksum.cuh>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/GridChecksum.cuh instead.")
diff --git a/external/nanovdb/util/cuda/CudaGridHandle.cuh b/external/nanovdb/util/cuda/CudaGridHandle.cuh
new file mode 100644
index 00000000..db68f238
--- /dev/null
+++ b/external/nanovdb/util/cuda/CudaGridHandle.cuh
@@ -0,0 +1,6 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/cuda/GridHandle.cuh>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/cuda/GridHandle.cuh instead.")
\ No newline at end of file
diff --git a/external/nanovdb/util/cuda/CudaGridStats.cuh b/external/nanovdb/util/cuda/CudaGridStats.cuh
new file mode 100644
index 00000000..acc62af5
--- /dev/null
+++ b/external/nanovdb/util/cuda/CudaGridStats.cuh
@@ -0,0 +1,6 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/tools/cuda/GridStats.cuh>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/GridStats.cuh instead.")
diff --git a/external/nanovdb/util/cuda/CudaGridValidator.cuh b/external/nanovdb/util/cuda/CudaGridValidator.cuh
new file mode 100644
index 00000000..a89c8cae
--- /dev/null
+++ b/external/nanovdb/util/cuda/CudaGridValidator.cuh
@@ -0,0 +1,6 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/tools/cuda/GridValidator.cuh>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/GridValidator.cuh instead.")
diff --git a/external/nanovdb/util/cuda/CudaIndexToGrid.cuh b/external/nanovdb/util/cuda/CudaIndexToGrid.cuh
new file mode 100644
index 00000000..4a15b523
--- /dev/null
+++ b/external/nanovdb/util/cuda/CudaIndexToGrid.cuh
@@ -0,0 +1,6 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/tools/cuda/IndexToGrid.cuh>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/IndexToGrid.cuh instead.")
diff --git a/external/nanovdb/util/cuda/CudaNodeManager.cuh b/external/nanovdb/util/cuda/CudaNodeManager.cuh
new file mode 100644
index 00000000..ca287266
--- /dev/null
+++ b/external/nanovdb/util/cuda/CudaNodeManager.cuh
@@ -0,0 +1,6 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/cuda/NodeManager.cuh>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/cuda/NodeManager.cuh instead.")
\ No newline at end of file
diff --git a/external/nanovdb/util/cuda/CudaPointsToGrid.cuh b/external/nanovdb/util/cuda/CudaPointsToGrid.cuh
new file mode 100644
index 00000000..7494b607
--- /dev/null
+++ b/external/nanovdb/util/cuda/CudaPointsToGrid.cuh
@@ -0,0 +1,6 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/tools/cuda/PointsToGrid.cuh>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/PointsToGrid.cuh instead.")
diff --git a/external/nanovdb/util/cuda/CudaSignedFloodFill.cuh b/external/nanovdb/util/cuda/CudaSignedFloodFill.cuh
new file mode 100644
index 00000000..7f0d9ce0
--- /dev/null
+++ b/external/nanovdb/util/cuda/CudaSignedFloodFill.cuh
@@ -0,0 +1,6 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/tools/cuda/SignedFloodFill.cuh>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/tools/cuda/SignedFloodFill.cuh instead.")
diff --git a/external/nanovdb/util/cuda/CudaUtils.h b/external/nanovdb/util/cuda/CudaUtils.h
new file mode 100644
index 00000000..38f7c94a
--- /dev/null
+++ b/external/nanovdb/util/cuda/CudaUtils.h
@@ -0,0 +1,6 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/util/cuda/Util.h>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/util/cuda/Util.h instead.")
\ No newline at end of file
diff --git a/external/nanovdb/util/cuda/GpuTimer.h b/external/nanovdb/util/cuda/GpuTimer.h
new file mode 100644
index 00000000..ee0b0c71
--- /dev/null
+++ b/external/nanovdb/util/cuda/GpuTimer.h
@@ -0,0 +1,6 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+#include <nanovdb/util/Util.h> // for NANOVDB_DEPRECATED_HEADER
+#include <nanovdb/util/cuda/Timer.h>
+NANOVDB_DEPRECATED_HEADER("Include nanovdb/util/cuda/Timer.h instead.")
\ No newline at end of file
diff --git a/external/nanovdb/util/cuda/Timer.h b/external/nanovdb/util/cuda/Timer.h
new file mode 100644
index 00000000..07c9366a
--- /dev/null
+++ b/external/nanovdb/util/cuda/Timer.h
@@ -0,0 +1,116 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+/// @file nanovdb/util/cuda/Timer.h
+///
+/// @author Ken Museth
+///
+/// @brief A simple GPU timing class
+
+#ifndef NANOVDB_UTIL_CUDA_TIMER_H_HAS_BEEN_INCLUDED
+#define NANOVDB_UTIL_CUDA_TIMER_H_HAS_BEEN_INCLUDED
+
+#include <iostream> // for std::cerr
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+
+namespace nanovdb {
+
+namespace util::cuda {
+
+class Timer
+{
+    cudaStream_t mStream{0};
+    cudaEvent_t  mStart, mStop;
+
+public:
+    /// @brief Default constructor
+    /// @param stream CUDA stream to be timed (defaults to stream 0)
+    /// @note Starts the timer
+    Timer(cudaStream_t stream = 0) : mStream(stream)
+    {
+        cudaEventCreate(&mStart);
+        cudaEventCreate(&mStop);
+        cudaEventRecord(mStart, mStream);
+    }
+
+    /// @brief Construct and start the timer
+    /// @param msg string message to be printed when timer is started
+    /// @param stream CUDA stream to be timed (defaults to stream 0)
+    /// @param os output stream for the message above
+    Timer(const std::string &msg, cudaStream_t stream = 0, std::ostream& os = std::cerr)
+        : mStream(stream)
+    {
+        os << msg << " ... " << std::flush;
+        cudaEventCreate(&mStart);
+        cudaEventCreate(&mStop);
+        cudaEventRecord(mStart, mStream);
+    }
+
+    /// @brief Destructor
+    ~Timer()
+    {
+        cudaEventDestroy(mStart);
+        cudaEventDestroy(mStop);
+    }
+
+    /// @brief Start the timer
+    /// @param stream CUDA stream to be timed (defaults to stream 0)
+    /// @param os output stream for the message above
+    void start() {cudaEventRecord(mStart, mStream);}
+
+    /// @brief Start the timer
+    /// @param msg string message to be printed when timer is started
+    /// @param os output stream for the message above
+    void start(const std::string &msg, std::ostream& os = std::cerr)
+    {
+        os << msg << " ... " << std::flush;
+        this->start();
+    }
+
+    /// @brief Start the timer
+    /// @param msg string message to be printed when timer is started
+    /// @param os output stream for the message above
+    void start(const char* msg, std::ostream& os = std::cerr)
+    {
+        os << msg << " ... " << std::flush;
+        this->start();
+    }
+
+    /// @brief elapsed time (since start) in milliseconds
+    /// @return elapsed time (since start) in milliseconds
+    float elapsed()
+    {
+        cudaEventRecord(mStop, mStream);
+        cudaEventSynchronize(mStop);
+        float diff = 0.0f;
+        cudaEventElapsedTime(&diff, mStart, mStop);
+        return diff;
+    }
+
+    /// @brief stop the timer
+    /// @param os output stream for the message above
+    void stop(std::ostream& os = std::cerr)
+    {
+        float diff = this->elapsed();
+        os << "completed in " << diff << " milliseconds" << std::endl;
+    }
+
+    /// @brief stop and start the timer
+    /// @param msg string message to be printed when timer is started
+    /// @warning Remember to call start before restart
+    void restart(const std::string &msg, std::ostream& os = std::cerr)
+    {
+        this->stop();
+        this->start(msg, os);
+    }
+};// Timer
+
+}// namespace util::cuda
+
+using GpuTimer [[deprecated("Use nanovdb::util::cuda::Timer instead")]] = util::cuda::Timer;
+
+} // namespace nanovdb
+
+#endif // NANOVDB_UTIL_CUDA_TIMER_H_HAS_BEEN_INCLUDED
diff --git a/external/nanovdb/util/cuda/Util.h b/external/nanovdb/util/cuda/Util.h
new file mode 100644
index 00000000..8d1711b3
--- /dev/null
+++ b/external/nanovdb/util/cuda/Util.h
@@ -0,0 +1,193 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+/*!
+    \file nanovdb/util/cuda/Util.h
+
+    \author Ken Museth
+
+    \date December 20, 2023
+
+    \brief Cuda specific utility functions
+*/
+
+#ifndef NANOVDB_UTIL_CUDA_UTIL_H_HAS_BEEN_INCLUDED
+#define NANOVDB_UTIL_CUDA_UTIL_H_HAS_BEEN_INCLUDED
+
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <nanovdb/util/Util.h> // for stderr and NANOVDB_ASSERT
+
+// change 1 -> 0 to only perform asserts during debug builds
+#if 1 || defined(DEBUG) || defined(_DEBUG)
+    static inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
+    {
+        if (code != cudaSuccess) {
+            fprintf(stderr, "CUDA error %u: %s (%s:%d)\n", unsigned(code), cudaGetErrorString(code), file, line);
+            //fprintf(stderr, "CUDA Runtime Error: %s %s %d\n", cudaGetErrorString(code), file, line);
+            if (abort) exit(code);
+        }
+    }
+    static inline void ptrAssert(const void* ptr, const char* msg, const char* file, int line, bool abort = true)
+    {
+        if (ptr == nullptr) {
+            fprintf(stderr, "NULL pointer error: %s %s %d\n", msg, file, line);
+            if (abort) exit(1);
+        } else if (uint64_t(ptr) % 32) {
+            fprintf(stderr, "Pointer misalignment error: %s %s %d\n", msg, file, line);
+            if (abort) exit(1);
+        }
+    }
+#else
+    static inline void gpuAssert(cudaError_t, const char*, int, bool = true){}
+    static inline void ptrAssert(void*, const char*, const char*, int, bool = true){}
+#endif
+
+// Convenience function for checking CUDA runtime API results
+// can be wrapped around any runtime API call. No-op in release builds.
+#define cudaCheck(ans) \
+    { \
+        gpuAssert((ans), __FILE__, __LINE__); \
+    }
+
+#define checkPtr(ptr, msg) \
+    { \
+        ptrAssert((ptr), (msg), __FILE__, __LINE__); \
+    }
+
+#define cudaSync() \
+    { \
+        cudaCheck(cudaDeviceSynchronize()); \
+    }
+
+#define cudaCheckError() \
+    { \
+        cudaCheck(cudaGetLastError()); \
+    }
+
+namespace nanovdb {// =========================================================
+
+namespace util::cuda {// ======================================================
+
+//#define NANOVDB_USE_SYNC_CUDA_MALLOC
+// cudaMallocAsync and cudaFreeAsync were introduced in CUDA 11.2 so we introduce
+// custom implementations that map to cudaMalloc and cudaFree below. If NANOVDB_USE_SYNC_CUDA_MALLOC
+// is defined these implementations will also be defined, which is useful in virtualized environments
+// that slice up the GPU and share it between instances as vGPU's. GPU unified memory is usually disabled
+// out of security considerations. Asynchronous CUDA malloc/free depends on GPU unified memory, so it
+// is not possible to use cudaMallocAsync and cudaFreeAsync in such environments.
+
+#if (CUDART_VERSION < 11020) || defined(NANOVDB_USE_SYNC_CUDA_MALLOC) // 11.2 introduced cudaMallocAsync and cudaFreeAsync
+
+/// @brief Simple wrapper that calls cudaMalloc
+/// @param d_ptr Device pointer to allocated device memory
+/// @param size Number of bytes to allocate
+/// @param dummy The stream establishing the stream ordering contract and the memory pool to allocate from (ignored)
+/// @return Cuda error code
+inline cudaError_t mallocAsync(void** d_ptr, size_t size, cudaStream_t){return cudaMalloc(d_ptr, size);}
+
+/// @brief Simple wrapper that calls cudaFree
+/// @param d_ptr Device pointer that will be freed
+/// @param dummy The stream establishing the stream ordering promise (ignored)
+/// @return Cuda error code
+inline cudaError_t freeAsync(void* d_ptr, cudaStream_t){return cudaFree(d_ptr);}
+
+#else
+
+/// @brief Simple wrapper that calls cudaMallocAsync
+/// @param d_ptr Device pointer to allocated device memory
+/// @param size Number of bytes to allocate
+/// @param stream The stream establishing the stream ordering contract and the memory pool to allocate from
+/// @return Cuda error code
+inline cudaError_t mallocAsync(void** d_ptr, size_t size, cudaStream_t stream){return cudaMallocAsync(d_ptr, size, stream);}
+
+/// @brief Simple wrapper that calls cudaFreeAsync
+/// @param d_ptr Device pointer that will be freed
+/// @param stream The stream establishing the stream ordering promise
+/// @return Cuda error code
+inline cudaError_t freeAsync(void* d_ptr, cudaStream_t stream){return cudaFreeAsync(d_ptr, stream);}
+
+#endif
+
+/// @brief Simple (naive) implementation of a unique device pointer
+/// using stream ordered memory allocation and deallocation.
+/// @tparam T Type of the device pointer
+template <typename T>
+class unique_ptr
+{
+    T           *mPtr;// pointer to stream ordered memory allocation
+    cudaStream_t mStream;
+public:
+    unique_ptr(size_t count = 0, cudaStream_t stream = 0) : mPtr(nullptr), mStream(stream)
+    {
+        if (count>0) cudaCheck(mallocAsync((void**)&mPtr, count*sizeof(T), stream));
+    }
+    unique_ptr(const unique_ptr&) = delete;
+    unique_ptr(unique_ptr&& other) : mPtr(other.mPtr), mStream(other.mStream)
+    {
+        other.mPtr = nullptr;
+    }
+    ~unique_ptr()
+    {
+        if (mPtr) cudaCheck(freeAsync(mPtr, mStream));
+    }
+    unique_ptr& operator=(const unique_ptr&) = delete;
+    unique_ptr& operator=(unique_ptr&& rhs) noexcept
+    {
+        mPtr = rhs.mPtr;
+        mStream = rhs.mStream;
+        rhs.mPtr = nullptr;
+        return *this;
+    }
+    void reset() {
+        if (mPtr) {
+            cudaCheck(freeAsync(mPtr, mStream));
+            mPtr = nullptr;
+        }
+    }
+    T* get() const {return mPtr;}
+    explicit operator bool() const {return mPtr != nullptr;}
+};// util::cuda::unique_ptr
+
+/// @brief Computes the number of blocks per grid given the problem size and number of threads per block
+/// @param numItems Problem size
+/// @param threadsPerBlock Number of threads per block (second CUDA launch parameter)
+/// @return number of blocks per grid (first CUDA launch parameter)
+/// @note CUDA launch parameters: kernel<<< blocksPerGrid, threadsPerBlock, sharedMemSize, streamID>>>
+inline size_t blocksPerGrid(size_t numItems, size_t threadsPerBlock)
+{
+    NANOVDB_ASSERT(numItems > 0 && threadsPerBlock >= 32 && threadsPerBlock % 32 == 0);
+    return (numItems + threadsPerBlock - 1) / threadsPerBlock;
+}
+
+
+#if defined(__CUDACC__)// the following functions only run on the GPU!
+
+/// @brief Cuda kernel that launches device lambda functions
+/// @param numItems Problem size
+template <typename Func, typename... Args>
+__global__ void lambdaKernel(const size_t numItems, Func func, Args... args)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= numItems) return;
+    func(tid, args...);
+}// util::cuda::lambdaKernel
+
+#endif// __CUDACC__
+
+}// namespace util::cuda ============================================================
+
+}// namespace nanovdb ===============================================================
+
+#if defined(__CUDACC__)// the following functions only run on the GPU!
+template <typename Func, typename... Args>
+[[deprecated("Use nanovdb::cuda::lambdaKernel instead")]]
+__global__ void cudaLambdaKernel(const size_t numItems, Func func, Args... args)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= numItems) return;
+    func(tid, args...);
+}
#endif// __CUDACC__
+
+#endif// NANOVDB_UTIL_CUDA_UTIL_H_HAS_BEEN_INCLUDED
\ No newline at end of file
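
Note (not part of the diff above): a minimal usage sketch of the vendored helpers, assuming a CUDA translation unit compiled by nvcc with extended lambdas enabled. The function name scaleOnDevice, the thread count, and the kernel body are illustrative only; the sketch simply combines util::cuda::unique_ptr, blocksPerGrid, lambdaKernel, Timer, and the cudaCheck/cudaCheckError macros defined in nanovdb/util/cuda/Util.h and nanovdb/util/cuda/Timer.h.

    // Illustrative sketch only -- not part of the NanoVDB sources above.
    #include <nanovdb/util/cuda/Util.h>  // unique_ptr, blocksPerGrid, lambdaKernel, cudaCheck
    #include <nanovdb/util/cuda/Timer.h> // util::cuda::Timer

    // Scales n floats on the device and reports the kernel time to stderr.
    void scaleOnDevice(const float* h_in, float* h_out, size_t n, float s, cudaStream_t stream = 0)
    {
        using namespace nanovdb::util::cuda;

        unique_ptr<float> d_buf(n, stream); // stream-ordered device allocation, released via freeAsync
        cudaCheck(cudaMemcpyAsync(d_buf.get(), h_in, n * sizeof(float), cudaMemcpyHostToDevice, stream));

        Timer timer("scaling on device", stream);   // prints "scaling on device ... "
        constexpr size_t threads = 128;             // threadsPerBlock must be a positive multiple of 32
        float* buf = d_buf.get();
        lambdaKernel<<<blocksPerGrid(n, threads), threads, 0, stream>>>(
            n, [=] __device__ (size_t i) { buf[i] *= s; });
        cudaCheckError();                           // surface kernel-launch errors
        timer.stop();                               // prints "completed in X milliseconds"

        cudaCheck(cudaMemcpyAsync(h_out, d_buf.get(), n * sizeof(float), cudaMemcpyDeviceToHost, stream));
        cudaCheck(cudaStreamSynchronize(stream));
    }   // d_buf is freed here on the same stream

The pattern above (stream-ordered allocation, a blocksPerGrid/lambdaKernel launch, and cudaCheck around every runtime call) is what these headers are built around; the Timer is optional and writes to std::cerr by default. The device lambda requires nvcc's --extended-lambda flag.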