From df53a72d4b8cc298d7d2b5931c5a47eacb5a7991 Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Mon, 11 Mar 2024 14:23:47 -0400
Subject: [PATCH 01/19] CPU gsplat implementation scaffold

---
 CMakeLists.txt                   |  45 ++++++---
 gsplat.hpp                       |   5 +
 model.cpp                        |   2 +-
 project_gaussians.cpp            |   2 +-
 rasterize_gaussians.cpp          |   2 +-
 simple_trainer.cpp               |  27 -----
 spherical_harmonics.cpp          |   2 +-
 vendor/gsplat-cpu/bindings.h     | 130 ++++++++++++++++++++++++
 vendor/gsplat-cpu/gsplat_cpu.cpp | 163 +++++++++++++++++++++++++++++++
 9 files changed, 333 insertions(+), 45 deletions(-)
 create mode 100644 gsplat.hpp
 create mode 100644 vendor/gsplat-cpu/bindings.h
 create mode 100644 vendor/gsplat-cpu/gsplat_cpu.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e298ada..5e4d3dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,14 +9,14 @@ if(NOT CMAKE_BUILD_TYPE)
 set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." FORCE)
 endif()
 
-enable_language(${GPU_RUNTIME})
-set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
-set(${GPU_RUNTIME}_STANDARD 17)
-
 if(GPU_RUNTIME STREQUAL "CUDA")
     set(CMAKE_CUDA_ARCHITECTURES 70 75)
-    find_package(CUDAToolkit REQUIRED)
-else()
+    find_package(CUDAToolkit)  # not REQUIRED: a missing toolkit is handled by the CPU fallback below
+    if (NOT CUDAToolkit_FOUND)  # the literal "CUDAToolkit-NOTFOUND" ends in -NOTFOUND and is always false
+        message(WARNING "CUDA toolkit not found, building with CPU support only")
+        set(GPU_RUNTIME "CPU")
+    endif()
+elseif(GPU_RUNTIME STREQUAL "HIP")
     set(USE_HIP ON CACHE BOOL "Use HIP for GPU acceleration")
 
     if(NOT DEFINED HIP_PATH)
@@ -40,6 +40,15 @@ else()
     list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
 endif()
 
+if((GPU_RUNTIME STREQUAL "CUDA") OR (GPU_RUNTIME STREQUAL "HIP"))
+    enable_language(${GPU_RUNTIME})
+    set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
+    set(${GPU_RUNTIME}_STANDARD 17)
+else()
+    # CPU
+    set(CMAKE_CXX_STANDARD 17)
+endif()
+
 if (NOT WIN32 AND NOT APPLE)
     set(STDPPFS_LIBRARY stdc++fs)
 endif()
@@ -52,13 +61,18 @@ if (NOT WIN32 AND NOT APPLE)
 endif()
 set(OpenCV_LIBS opencv_core opencv_imgproc opencv_highgui opencv_calib3d)
 
-add_library(gsplat vendor/gsplat/forward.cu vendor/gsplat/backward.cu vendor/gsplat/bindings.cu vendor/gsplat/helpers.cuh)
-if(GPU_RUNTIME STREQUAL "CUDA")
-    set(GPU_LIBRARIES "cuda")
-    target_link_libraries(gsplat PUBLIC cuda)
-else(GPU_RUNTIME STREQUAL "HIP")
-    set(GPU_INCLUDE_DIRS "${ROCM_ROOT}/include")
-    target_compile_definitions(gsplat PRIVATE USE_HIP __HIP_PLATFORM_AMD__)
+if((GPU_RUNTIME STREQUAL "CUDA") OR (GPU_RUNTIME STREQUAL "HIP"))
+    add_library(gsplat vendor/gsplat/forward.cu vendor/gsplat/backward.cu vendor/gsplat/bindings.cu vendor/gsplat/helpers.cuh)
+    if(GPU_RUNTIME STREQUAL "CUDA")
+        set(GPU_LIBRARIES "cuda")
+        target_link_libraries(gsplat PUBLIC cuda)
+        set_target_properties(gsplat PROPERTIES CUDA_ARCHITECTURES "70;75")
+    else(GPU_RUNTIME STREQUAL "HIP")
+        set(GPU_INCLUDE_DIRS "${ROCM_ROOT}/include")
+        target_compile_definitions(gsplat PRIVATE USE_HIP __HIP_PLATFORM_AMD__)
+    endif()
+else()
+    add_library(gsplat vendor/gsplat-cpu/gsplat_cpu.cpp)
 endif()
 
 target_include_directories(gsplat PRIVATE
@@ -67,7 +81,6 @@ target_include_directories(gsplat PRIVATE
     ${TORCH_INCLUDE_DIRS}
 )
 set_target_properties(gsplat PROPERTIES LINKER_LANGUAGE CXX)
-set_target_properties(gsplat PROPERTIES CUDA_ARCHITECTURES "70;75")
 
 add_executable(opensplat opensplat.cpp point_io.cpp nerfstudio.cpp model.cpp kdtree_tensor.cpp spherical_harmonics.cpp cv_utils.cpp utils.cpp project_gaussians.cpp rasterize_gaussians.cpp ssim.cpp optim_scheduler.cpp colmap.cpp input_data.cpp tensor_math.cpp)
 set_property(TARGET opensplat PROPERTY CXX_STANDARD 17)
@@ -75,6 +88,8 @@ target_include_directories(opensplat PRIVATE ${PROJECT_SOURCE_DIR}/vendor/glm ${
 target_link_libraries(opensplat PUBLIC ${STDPPFS_LIBRARY} ${GPU_LIBRARIES} gsplat ${TORCH_LIBRARIES} ${OpenCV_LIBS})
 if(GPU_RUNTIME STREQUAL "HIP")
     target_compile_definitions(opensplat PRIVATE USE_HIP __HIP_PLATFORM_AMD__)
+elseif(GPU_RUNTIME STREQUAL "CUDA")
+    target_compile_definitions(opensplat PRIVATE USE_CUDA)
 endif()
 
 if(OPENSPLAT_BUILD_SIMPLE_TRAINER)
@@ -84,6 +99,8 @@ if(OPENSPLAT_BUILD_SIMPLE_TRAINER)
     set_property(TARGET simple_trainer PROPERTY CXX_STANDARD 17)
     if(GPU_RUNTIME STREQUAL "HIP")
         target_compile_definitions(simple_trainer PRIVATE USE_HIP __HIP_PLATFORM_AMD__)
+    elseif(GPU_RUNTIME STREQUAL "CUDA")
+        target_compile_definitions(simple_trainer PRIVATE USE_CUDA)
     endif()
 endif()
 
diff --git a/gsplat.hpp b/gsplat.hpp
new file mode 100644
index 0000000..c0b2408
--- /dev/null
+++ b/gsplat.hpp
@@ -0,0 +1,5 @@
+#if defined(USE_HIP) || defined(USE_CUDA)
+#include "vendor/gsplat/bindings.h"
+#else
+#include "vendor/gsplat-cpu/bindings.h"
+#endif
\ No newline at end of file
diff --git a/model.cpp b/model.cpp
index 915c334..401be84 100644
--- a/model.cpp
+++ b/model.cpp
@@ -4,7 +4,7 @@
 #include "project_gaussians.hpp"
 #include "rasterize_gaussians.hpp"
 #include "tensor_math.hpp"
-#include "vendor/gsplat/config.h"
+#include "gsplat.hpp"
 
 torch::Tensor randomQuatTensor(long long n){
     torch::Tensor u = torch::rand(n);
diff --git a/project_gaussians.cpp b/project_gaussians.cpp
index f479919..d899362 100644
--- a/project_gaussians.cpp
+++ b/project_gaussians.cpp
@@ -1,5 +1,5 @@
 #include "project_gaussians.hpp"
-#include "vendor/gsplat/bindings.h"
+#include "gsplat.hpp"
 
 variable_list ProjectGaussians::forward(AutogradContext *ctx, 
                 torch::Tensor means,
diff --git a/rasterize_gaussians.cpp b/rasterize_gaussians.cpp
index f7cb655..42d3bdc 100644
--- a/rasterize_gaussians.cpp
+++ b/rasterize_gaussians.cpp
@@ -1,5 +1,5 @@
 #include "rasterize_gaussians.hpp"
-#include "vendor/gsplat/bindings.h"
+#include "gsplat.hpp"
 #include "vendor/gsplat/config.h"
 
 std::tuple<torch::Tensor,
diff --git a/simple_trainer.cpp b/simple_trainer.cpp
index b40e6d6..7e54dea 100644
--- a/simple_trainer.cpp
+++ b/simple_trainer.cpp
@@ -117,15 +117,6 @@ int main(int argc, char **argv){
                                 width,
                                 tileBounds);
 
-#ifdef USE_HIP
-        hipError_t err = hipDeviceSynchronize();
-        if (err != hipSuccess) {
-            std::cerr << "hipDeviceSynchronize failed with error: " << hipGetErrorString(err) << std::endl;
-        }
-#else   
-        torch::cuda::synchronize();
-#endif
-        
         torch::Tensor outImg = RasterizeGaussians::apply(
             p[0], // xys
             p[1], // depths
@@ -138,28 +129,10 @@ int main(int argc, char **argv){
             width,
             background);
         
-#ifdef USE_HIP
-        err = hipDeviceSynchronize();
-        if (err != hipSuccess) {
-            std::cerr << "hipDeviceSynchronize failed with error: " << hipGetErrorString(err) << std::endl;
-        }
-#else   
-        torch::cuda::synchronize();
-#endif
-
         outImg.requires_grad_();
         torch::Tensor loss = mseLoss(outImg, gtImage);
         optimizer.zero_grad();
         loss.backward();
-
-#ifdef USE_HIP
-        err = hipDeviceSynchronize();
-        if (err != hipSuccess) {
-            std::cerr << "hipDeviceSynchronize failed with error: " << hipGetErrorString(err) << std::endl;
-        }
-#else   
-        torch::cuda::synchronize();
-#endif
         optimizer.step();
 
         std::cout << "Iteration " << std::to_string(i + 1) << "/" << std::to_string(iterations) << " Loss: " << loss.item<float>() << std::endl; 
diff --git a/spherical_harmonics.cpp b/spherical_harmonics.cpp
index c88b462..c6d4036 100644
--- a/spherical_harmonics.cpp
+++ b/spherical_harmonics.cpp
@@ -1,5 +1,5 @@
 #include "spherical_harmonics.hpp"
-#include "vendor/gsplat/bindings.h"
+#include "gsplat.hpp"
 
 int numShBases(int degree){
     switch(degree){
diff --git a/vendor/gsplat-cpu/bindings.h b/vendor/gsplat-cpu/bindings.h
new file mode 100644
index 0000000..12daf87
--- /dev/null
+++ b/vendor/gsplat-cpu/bindings.h
@@ -0,0 +1,130 @@
+#include <cstdio>
+#include <iostream>
+#include <math.h>
+#include <tuple>
+#include <torch/all.h>
+
+torch::Tensor compute_sh_forward_tensor(
+    unsigned num_points,
+    unsigned degree,
+    unsigned degrees_to_use,
+    torch::Tensor &viewdirs,
+    torch::Tensor &coeffs
+);
+
+torch::Tensor compute_sh_backward_tensor(
+    unsigned num_points,
+    unsigned degree,
+    unsigned degrees_to_use,
+    torch::Tensor &viewdirs,
+    torch::Tensor &v_colors
+);
+
+std::tuple<
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor>
+project_gaussians_forward_tensor(
+    const int num_points,
+    torch::Tensor &means3d,
+    torch::Tensor &scales,
+    const float glob_scale,
+    torch::Tensor &quats,
+    torch::Tensor &viewmat,
+    torch::Tensor &projmat,
+    const float fx,
+    const float fy,
+    const float cx,
+    const float cy,
+    const unsigned img_height,
+    const unsigned img_width,
+    const std::tuple<int, int, int> tile_bounds,
+    const float clip_thresh
+);
+
+std::tuple<
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor>
+project_gaussians_backward_tensor(
+    const int num_points,
+    torch::Tensor &means3d,
+    torch::Tensor &scales,
+    const float glob_scale,
+    torch::Tensor &quats,
+    torch::Tensor &viewmat,
+    torch::Tensor &projmat,
+    const float fx,
+    const float fy,
+    const float cx,
+    const float cy,
+    const unsigned img_height,
+    const unsigned img_width,
+    torch::Tensor &cov3d,
+    torch::Tensor &radii,
+    torch::Tensor &conics,
+    torch::Tensor &v_xy,
+    torch::Tensor &v_depth,
+    torch::Tensor &v_conic
+);
+
+
+std::tuple<torch::Tensor, torch::Tensor> map_gaussian_to_intersects_tensor(
+    const int num_points,
+    const int num_intersects,
+    const torch::Tensor &xys,
+    const torch::Tensor &depths,
+    const torch::Tensor &radii,
+    const torch::Tensor &cum_tiles_hit,
+    const std::tuple<int, int, int> tile_bounds
+);
+
+torch::Tensor get_tile_bin_edges_tensor(
+    int num_intersects,
+    const torch::Tensor &isect_ids_sorted
+);
+
+std::tuple<
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor
+> rasterize_forward_tensor(
+    const std::tuple<int, int, int> tile_bounds,
+    const std::tuple<int, int, int> block,
+    const std::tuple<int, int, int> img_size,
+    const torch::Tensor &gaussian_ids_sorted,
+    const torch::Tensor &tile_bins,
+    const torch::Tensor &xys,
+    const torch::Tensor &conics,
+    const torch::Tensor &colors,
+    const torch::Tensor &opacities,
+    const torch::Tensor &background
+);
+
+std::
+    tuple<
+        torch::Tensor, // dL_dxy
+        torch::Tensor, // dL_dconic
+        torch::Tensor, // dL_dcolors
+        torch::Tensor  // dL_dopacity
+        >
+    rasterize_backward_tensor(
+        const unsigned img_height,
+        const unsigned img_width,
+        const torch::Tensor &gaussians_ids_sorted,
+        const torch::Tensor &tile_bins,
+        const torch::Tensor &xys,
+        const torch::Tensor &conics,
+        const torch::Tensor &colors,
+        const torch::Tensor &opacities,
+        const torch::Tensor &background,
+        const torch::Tensor &final_Ts,
+        const torch::Tensor &final_idx,
+        const torch::Tensor &v_output, // dL_dout_color
+        const torch::Tensor &v_output_alpha
+    );
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
new file mode 100644
index 0000000..9d557d1
--- /dev/null
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -0,0 +1,163 @@
+#include "bindings.h"
+
+#include <cstdio>
+#include <iostream>
+#include <math.h>
+#include <tuple>
+#include <torch/torch.h>
+
+using namespace torch::indexing;
+
+torch::Tensor compute_sh_forward_tensor(
+    unsigned num_points,
+    unsigned degree,
+    unsigned degrees_to_use,
+    torch::Tensor &viewdirs,
+    torch::Tensor &coeffs
+){
+    return torch::Tensor();
+}
+
+torch::Tensor compute_sh_backward_tensor(
+    unsigned num_points,
+    unsigned degree,
+    unsigned degrees_to_use,
+    torch::Tensor &viewdirs,
+    torch::Tensor &v_colors
+){
+    return torch::Tensor();
+}
+
+std::tuple<
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor>
+project_gaussians_forward_tensor(
+    const int num_points,
+    torch::Tensor &means3d,
+    torch::Tensor &scales,
+    const float glob_scale,
+    torch::Tensor &quats,
+    torch::Tensor &viewmat,
+    torch::Tensor &projmat,
+    const float fx,
+    const float fy,
+    const float cx,
+    const float cy,
+    const unsigned img_height,
+    const unsigned img_width,
+    const std::tuple<int, int, int> tile_bounds,
+    const float clip_thresh
+){
+    float fovx = 0.5f * static_cast<float>(img_width) / fx;
+    float fovy = 0.5f * static_cast<float>(img_height) / fy;
+
+    // clip_near_plane
+    torch::Tensor R = viewmat.index({"...", Slice(None, 3), Slice(None, 3)}); 
+    torch::Tensor T = viewmat.index({"...", Slice(None, 3), 3});
+    torch::Tensor pView = torch::matmul(R, means3d.index({"...", None})).index({"...", 0}) + T;
+    torch::Tensor isClose = pView.index({"...", 2}) < clip_thresh;
+
+    
+
+
+    return std::make_tuple(torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor());
+}
+
+std::tuple<
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor>
+project_gaussians_backward_tensor(
+    const int num_points,
+    torch::Tensor &means3d,
+    torch::Tensor &scales,
+    const float glob_scale,
+    torch::Tensor &quats,
+    torch::Tensor &viewmat,
+    torch::Tensor &projmat,
+    const float fx,
+    const float fy,
+    const float cx,
+    const float cy,
+    const unsigned img_height,
+    const unsigned img_width,
+    torch::Tensor &cov3d,
+    torch::Tensor &radii,
+    torch::Tensor &conics,
+    torch::Tensor &v_xy,
+    torch::Tensor &v_depth,
+    torch::Tensor &v_conic
+){
+    return std::make_tuple(torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor());
+}
+
+
+std::tuple<torch::Tensor, torch::Tensor> map_gaussian_to_intersects_tensor(
+    const int num_points,
+    const int num_intersects,
+    const torch::Tensor &xys,
+    const torch::Tensor &depths,
+    const torch::Tensor &radii,
+    const torch::Tensor &cum_tiles_hit,
+    const std::tuple<int, int, int> tile_bounds
+){
+    return std::make_tuple(torch::Tensor(), torch::Tensor());
+}
+
+torch::Tensor get_tile_bin_edges_tensor(
+    int num_intersects,
+    const torch::Tensor &isect_ids_sorted
+){
+    return torch::Tensor();
+}
+
+std::tuple<
+    torch::Tensor,
+    torch::Tensor,
+    torch::Tensor
+> rasterize_forward_tensor(
+    const std::tuple<int, int, int> tile_bounds,
+    const std::tuple<int, int, int> block,
+    const std::tuple<int, int, int> img_size,
+    const torch::Tensor &gaussian_ids_sorted,
+    const torch::Tensor &tile_bins,
+    const torch::Tensor &xys,
+    const torch::Tensor &conics,
+    const torch::Tensor &colors,
+    const torch::Tensor &opacities,
+    const torch::Tensor &background
+){
+    return std::make_tuple(torch::Tensor(), torch::Tensor(), torch::Tensor());
+}
+
+
+std::
+    tuple<
+        torch::Tensor, // dL_dxy
+        torch::Tensor, // dL_dconic
+        torch::Tensor, // dL_dcolors
+        torch::Tensor  // dL_dopacity
+        >
+    rasterize_backward_tensor(
+        const unsigned img_height,
+        const unsigned img_width,
+        const torch::Tensor &gaussians_ids_sorted,
+        const torch::Tensor &tile_bins,
+        const torch::Tensor &xys,
+        const torch::Tensor &conics,
+        const torch::Tensor &colors,
+        const torch::Tensor &opacities,
+        const torch::Tensor &background,
+        const torch::Tensor &final_Ts,
+        const torch::Tensor &final_idx,
+        const torch::Tensor &v_output, // dL_dout_color
+        const torch::Tensor &v_output_alpha
+    ){
+        return std::make_tuple(torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor());
+}
\ No newline at end of file

From 0fc488a95eb9376464a4f107ccc6a22f66205554 Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Mon, 11 Mar 2024 16:28:06 -0400
Subject: [PATCH 02/19] Add compute_cov2d_bounds

---
 vendor/gsplat-cpu/bindings.h     |  2 +
 vendor/gsplat-cpu/gsplat_cpu.cpp | 82 ++++++++++++++++++++++++++++++--
 2 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/vendor/gsplat-cpu/bindings.h b/vendor/gsplat-cpu/bindings.h
index 12daf87..d831b4b 100644
--- a/vendor/gsplat-cpu/bindings.h
+++ b/vendor/gsplat-cpu/bindings.h
@@ -1,3 +1,5 @@
+// Originally based on https://github.dev/nerfstudio-project/gsplat
+
 #include <cstdio>
 #include <iostream>
 #include <math.h>
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index 9d557d1..7b2f31b 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -1,3 +1,6 @@
+// Originally based on https://github.dev/nerfstudio-project/gsplat
+// This implementation is licensed under the AGPLv3
+
 #include "bindings.h"
 
 #include <cstdio>
@@ -8,6 +11,32 @@
 
 using namespace torch::indexing;
 
+torch::Tensor quatToRotMat(const torch::Tensor &quat){
+    auto u = torch::unbind(torch::nn::functional::normalize(quat, torch::nn::functional::NormalizeFuncOptions().dim(-1)), -1);
+    torch::Tensor w = u[0];
+    torch::Tensor x = u[1];
+    torch::Tensor y = u[2];
+    torch::Tensor z = u[3];
+    return torch::stack({
+        torch::stack({
+            1.0 - 2.0 * (y.pow(2) + z.pow(2)),
+            2.0 * (x * y - w * z),
+            2.0 * (x * z + w * y)
+        }, -1),
+        torch::stack({
+            2.0 * (x * y + w * z),
+            1.0 - 2.0 * (x.pow(2) + z.pow(2)),
+            2.0 * (y * z - w * x)
+        }, -1),
+        torch::stack({
+            2.0 * (x * z - w * y),
+            2.0 * (y * z + w * x),
+            1.0 - 2.0 * (x.pow(2) + y.pow(2))
+        }, -1)
+    }, -2);
+    
+}
+
 torch::Tensor compute_sh_forward_tensor(
     unsigned num_points,
     unsigned degree,
@@ -56,12 +85,59 @@ project_gaussians_forward_tensor(
     float fovy = 0.5f * static_cast<float>(img_height) / fy;
 
     // clip_near_plane
-    torch::Tensor R = viewmat.index({"...", Slice(None, 3), Slice(None, 3)}); 
-    torch::Tensor T = viewmat.index({"...", Slice(None, 3), 3});
-    torch::Tensor pView = torch::matmul(R, means3d.index({"...", None})).index({"...", 0}) + T;
+    torch::Tensor Rclip = viewmat.index({"...", Slice(None, 3), Slice(None, 3)}); 
+    torch::Tensor Tclip = viewmat.index({"...", Slice(None, 3), 3});
+    torch::Tensor pView = torch::matmul(Rclip, means3d.index({"...", None})).index({"...", 0}) + Tclip;
     torch::Tensor isClose = pView.index({"...", 2}) < clip_thresh;
 
+    // scale_rot_to_cov3d
+    torch::Tensor R = quatToRotMat(quats);
+    torch::Tensor M = R * glob_scale * scales.index({"...", None, Slice()});
+    torch::Tensor cov3d = torch::matmul(M, M.transpose(-1, -2));
+
+    // project_cov3d_ewa
+    torch::Tensor W = viewmat.index({"...", Slice(None, 3), Slice(None, 3)});
+    torch::Tensor p = viewmat.index({"...", Slice(None, 3), 3});
+    torch::Tensor t = torch::matmul(W, means3d.index({"...", None})).index({"...", 0}) + p;
+
+    torch::Tensor limX = 1.3f * torch::tensor({fovx}, means3d.device());
+    torch::Tensor limY = 1.3f * torch::tensor({fovy}, means3d.device());
+    
+    torch::Tensor minLimX = t.index({"...", 2}) * torch::min(limX, torch::max(-limX, t.index({"...", 0}) / t.index({"...", 2})));
+    torch::Tensor minLimY = t.index({"...", 2}) * torch::min(limY, torch::max(-limY, t.index({"...", 1}) / t.index({"...", 2})));
     
+    t = torch::cat({minLimX.index({"...", None}), minLimY.index({"...", None}), t.index({"...", 2, None})}, -1);
+    torch::Tensor rz = 1.0f / t.index({"...", 2});
+    torch::Tensor rz2 = rz.pow(2);
+
+    torch::Tensor J = torch::stack({
+        torch::stack({fx * rz, torch::zeros_like(rz), -fx * t.index({"...", 0}) * rz2}, -1),
+        torch::stack({torch::zeros_like(rz), fy * rz, -fy * t.index({"...", 1}) * rz2}, -1)
+    }, -2);
+
+    torch::Tensor T = torch::matmul(J, W);
+    torch::Tensor cov2d = torch::matmul(T, torch::matmul(cov3d, T.transpose(-1, -2)));
+
+    // Add blur along axes
+    cov2d.index_put_({"...", 0, 0}, cov2d.index({"...", 0, 0}) + 0.3f);
+    cov2d.index_put_({"...", 1, 1}, cov2d.index({"...", 1, 1}) + 0.3f);
+     
+    // compute_cov2d_bounds
+    float eps = 1e-6f;
+    torch::Tensor det = cov2d.index({"...", 0, 0}) * cov2d.index({"...", 1, 1}) - cov2d.index({"...", 0, 1}).pow(2);
+    det = torch::clamp_min(det, eps);
+    torch::Tensor conic = torch::stack({
+            cov2d.index({"...", 1, 1}) / det,
+            -cov2d.index({"...", 0, 1}) / det,
+            cov2d.index({"...", 0, 0}) / det
+        }, -1);
+
+    torch::Tensor b = (cov2d.index({"...", 0, 0}) + cov2d.index({"...", 1, 1})) / 2.0f;
+    torch::Tensor sq = torch::sqrt(torch::clamp_min(b.pow(2) - det, 0.1f));
+    torch::Tensor v1 = b + sq;
+    torch::Tensor v2 = b - sq;
+    torch::Tensor radius = torch::ceil(3.0f * torch::sqrt(torch::max(v1, v2)));
+    torch::Tensor detValid = det > eps;
 
 
     return std::make_tuple(torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor());

From 9bc7d587888cf458011ae047e1f6bc26bc9147fd Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Tue, 12 Mar 2024 11:07:48 -0400
Subject: [PATCH 03/19] project_gaussians_forward_tensor implemented

---
 CMakeLists.txt                   |  4 +--
 simple_trainer.cpp               |  4 +--
 vendor/gsplat-cpu/gsplat_cpu.cpp | 43 +++++++++++++++++++++++++++++---
 3 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e4d3dd..6c584f9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,10 +3,10 @@ project(opensplat)
 
 set(OPENSPLAT_BUILD_SIMPLE_TRAINER OFF CACHE BOOL "Build simple trainer applications")
 set(GPU_RUNTIME "CUDA" CACHE STRING "HIP or CUDA")
-set(OPENCV_DIR "OPENCV_DIR-NOTFOUND" CACHE "OPENCV_DIR" "Path to the OPENCV installation directory")
+set(OPENCV_DIR "OPENCV_DIR-NOTFOUND" CACHE PATH "Path to the OPENCV installation directory")
 
 if(NOT CMAKE_BUILD_TYPE)
-set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." FORCE)
+    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." FORCE)
 endif()
 
 if(GPU_RUNTIME STREQUAL "CUDA")
diff --git a/simple_trainer.cpp b/simple_trainer.cpp
index 7e54dea..d09f4fc 100644
--- a/simple_trainer.cpp
+++ b/simple_trainer.cpp
@@ -64,10 +64,10 @@ int main(int argc, char **argv){
     // torch::Tensor block = torch::tensor({BLOCK_X, BLOCK_Y, 1}, device);
     
     // Init gaussians
-#ifdef USE_HIP
-#else
+#ifdef USE_CUDA
     torch::cuda::manual_seed_all(0);
 #endif
+    torch::manual_seed(0);
 
     // Random points, scales and colors
     torch::Tensor means = 2.0 * (torch::rand({numPoints, 3}, device) - 0.5); // Positions [-1, 1]
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index 7b2f31b..7432610 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -2,12 +2,12 @@
 // This implementation is licensed under the AGPLv3
 
 #include "bindings.h"
+#include "../gsplat/config.h"
 
 #include <cstdio>
 #include <iostream>
 #include <math.h>
 #include <tuple>
-#include <torch/torch.h>
 
 using namespace torch::indexing;
 
@@ -37,6 +37,24 @@ torch::Tensor quatToRotMat(const torch::Tensor &quat){
     
 }
 
+std::tuple<torch::Tensor, torch::Tensor> getTileBbox(const torch::Tensor &pixCenter, const torch::Tensor &pixRadius, const std::tuple<int, int, int> &tileBounds){ // returns (tileMin, tileMax); inputs are read-only so temporaries may be passed
+    torch::Tensor tileSize = torch::tensor({BLOCK_X, BLOCK_Y}, torch::TensorOptions().dtype(torch::kFloat32).device(pixCenter.device())); // BLOCK_X/BLOCK_Y presumably come from ../gsplat/config.h
+    torch::Tensor tileCenter = pixCenter / tileSize; // pixel units -> tile units
+    torch::Tensor tileRadius = pixRadius.index({"...", None}) / tileSize;
+    torch::Tensor topLeft = (tileCenter - tileRadius).to(torch::kInt32);
+    torch::Tensor bottomRight = (tileCenter + tileRadius).to(torch::kInt32) + 1;
+    torch::Tensor tileMin = torch::stack({ // clamp each axis to [0, tileBounds]
+        torch::clamp(topLeft.index({"...", 0}), 0, std::get<0>(tileBounds)),
+        torch::clamp(topLeft.index({"...", 1}), 0, std::get<1>(tileBounds))
+    }, -1);
+    torch::Tensor tileMax = torch::stack({
+        torch::clamp(bottomRight.index({"...", 0}), 0, std::get<0>(tileBounds)),
+        torch::clamp(bottomRight.index({"...", 1}), 0, std::get<1>(tileBounds))
+    }, -1);
+
+    return std::make_tuple(tileMin, tileMax);    
+}
+
 torch::Tensor compute_sh_forward_tensor(
     unsigned num_points,
     unsigned degree,
@@ -88,7 +106,7 @@ project_gaussians_forward_tensor(
     torch::Tensor Rclip = viewmat.index({"...", Slice(None, 3), Slice(None, 3)}); 
     torch::Tensor Tclip = viewmat.index({"...", Slice(None, 3), 3});
     torch::Tensor pView = torch::matmul(Rclip, means3d.index({"...", None})).index({"...", 0}) + Tclip;
-    torch::Tensor isClose = pView.index({"...", 2}) < clip_thresh;
+    // torch::Tensor isClose = pView.index({"...", 2}) < clip_thresh;
 
     // scale_rot_to_cov3d
     torch::Tensor R = quatToRotMat(quats);
@@ -137,10 +155,27 @@ project_gaussians_forward_tensor(
     torch::Tensor v1 = b + sq;
     torch::Tensor v2 = b - sq;
     torch::Tensor radius = torch::ceil(3.0f * torch::sqrt(torch::max(v1, v2)));
-    torch::Tensor detValid = det > eps;
+    // torch::Tensor detValid = det > eps;
+
+    // project_pix: component 0 of xys is later tiled with BLOCK_X, so it must scale with the image width
+    torch::Tensor pHom = torch::nn::functional::pad(means3d, torch::nn::functional::PadFuncOptions({0, 1}).mode(torch::kConstant).value(1.0f));
+    pHom = torch::einsum("...ij,...j->...i", {projmat, pHom});
+    torch::Tensor rw = 1.0f / torch::clamp_min(pHom.index({"...", 3}), eps);
+    torch::Tensor pProj = pHom.index({"...", Slice(None, 3)}) * rw.index({"...", None});
+    torch::Tensor u = 0.5f * ((pProj.index({"...", 0}) + 1.0f) * static_cast<float>(img_width) - 1.0f); // x uses width (was img_height)
+    torch::Tensor v = 0.5f * ((pProj.index({"...", 1}) + 1.0f) * static_cast<float>(img_height) - 1.0f); // y uses height (was img_width)
+    torch::Tensor xys = torch::stack({u, v}, -1); // center
+
+    auto bbox = getTileBbox(xys, radius, tile_bounds);
+    torch::Tensor tileMin = std::get<0>(bbox);
+    torch::Tensor tileMax = std::get<1>(bbox);
+    torch::Tensor numTilesHit = (tileMax.index({"...", 0}) - tileMin.index({"...", 0})) * 
+                   (tileMax.index({"...", 1}) - tileMin.index({"...", 1}));
 
+    torch::Tensor depths = pView.index({"...", 2});
+    torch::Tensor radii = radius.to(torch::kInt32);
 
-    return std::make_tuple(torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor());
+    return std::make_tuple(cov3d, xys, depths, radii, conic, numTilesHit );
 }
 
 std::tuple<

From 5781d52e444c89a09d79ec003706a98bc1861db6 Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Tue, 12 Mar 2024 17:22:17 +0000
Subject: [PATCH 04/19] Implemented rasterize_forward

---
 vendor/gsplat-cpu/gsplat_cpu.cpp | 83 +++++++++++++++++++++++++++++++-
 1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index 7432610..ef0feb9 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -218,6 +218,33 @@ std::tuple<torch::Tensor, torch::Tensor> map_gaussian_to_intersects_tensor(
     const torch::Tensor &cum_tiles_hit,
     const std::tuple<int, int, int> tile_bounds
 ){
+    num_intersects = cum_tiles_hit[-1]
+    isect_ids = torch.zeros(num_intersects, dtype=torch.int64, device=xys.device)
+    gaussian_ids = torch.zeros(num_intersects, dtype=torch.int32, device=xys.device)
+
+    for idx in range(num_points):
+        if radii[idx] <= 0:
+            break
+
+        tile_min, tile_max = get_tile_bbox(xys[idx], radii[idx], tile_bounds)
+
+        cur_idx = 0 if idx == 0 else cum_tiles_hit[idx - 1].item()
+
+        # Get raw byte representation of the float value at the given index
+        raw_bytes = struct.pack("f", depths[idx])
+
+        # Interpret those bytes as an int32_t
+        depth_id_n = struct.unpack("i", raw_bytes)[0]
+
+        for i in range(tile_min[1], tile_max[1]):
+            for j in range(tile_min[0], tile_max[0]):
+                tile_id = i * tile_bounds[0] + j
+                isect_ids[cur_idx] = (tile_id << 32) | depth_id_n
+                gaussian_ids[cur_idx] = idx
+                cur_idx += 1
+
+    return isect_ids, gaussian_ids
+
     return std::make_tuple(torch::Tensor(), torch::Tensor());
 }
 
@@ -244,7 +271,61 @@ std::tuple<
     const torch::Tensor &opacities,
     const torch::Tensor &background
 ){
-    return std::make_tuple(torch::Tensor(), torch::Tensor(), torch::Tensor());
+    int channels = colors.size(1);
+    int width = std::get<1>(img_size);
+    int height = std::get<0>(img_size);
+    torch::Device device = xys.device();
+
+    torch::Tensor outImg = torch::zeros({width, height, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
+    torch::Tensor finalTs = torch::zeros({width, height, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
+    torch::Tensor finalIdx = torch::zeros({width, height, channels}, torch::TensorOptions().dtype(torch::kInt32).device(device));   
+
+
+    for (int i = 0; i < width; i++){
+        for (int j = 0; j < height; j++){
+            int tileId = (i / std::get<0>(block)) * std::get<0>(tile_bounds) + (j / std::get<1>(block));
+            int tileBinStart = tile_bins[tileId][0].item<int>();
+            int tileBinEnd = tile_bins[tileId][1].item<int>();
+            float T = 1.0f;
+
+            int idx = tileBinStart;
+            for (; idx < tileBinEnd; idx++){
+                torch::Tensor gaussianId = gaussian_ids_sorted[idx];
+                torch::Tensor conic = conics[gaussianId];
+                torch::Tensor center = xys[gaussianId];
+                torch::Tensor delta = center - torch::tensor({j, i}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
+
+                torch::Tensor sigma = (
+                    0.5f
+                    * (conic[0] * delta[0] * delta[0] + conic[2] * delta[1] * delta[1])
+                    + conic[1] * delta[0] * delta[1]
+                );
+
+                if (sigma.item<float>() < 0.0f) continue;
+
+                float alpha = (std::min)(0.999f, (opacities[gaussianId] * torch::exp(-sigma)).item<float>());
+
+                if (alpha < 1.0f / 255.0f) continue;
+
+                float nextT = T * (1.0f - alpha);
+
+                if (nextT <= 1e-4f){
+                    idx -= 1;
+                    break;
+                }
+
+                float vis = alpha * T;
+                outImg[i][j] += vis * colors[gaussianId];
+                T = nextT;
+            }
+
+            finalTs[i][j] = T;
+            finalIdx[i][j] = idx;
+            outImg[i][j] += T * background;
+        }
+    }
+
+    return std::make_tuple(outImg, finalTs, finalIdx);
 }
 
 

From 6499b5d1f9dfa58f6a8916b0a85d537ae7e71c43 Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Tue, 12 Mar 2024 19:15:32 +0000
Subject: [PATCH 05/19] Work on map_gaussian_to_intersects_tensor

---
 vendor/gsplat-cpu/gsplat_cpu.cpp | 64 +++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index ef0feb9..2509d33 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -218,32 +218,46 @@ std::tuple<torch::Tensor, torch::Tensor> map_gaussian_to_intersects_tensor(
     const torch::Tensor &cum_tiles_hit,
     const std::tuple<int, int, int> tile_bounds
 ){
-    num_intersects = cum_tiles_hit[-1]
-    isect_ids = torch.zeros(num_intersects, dtype=torch.int64, device=xys.device)
-    gaussian_ids = torch.zeros(num_intersects, dtype=torch.int32, device=xys.device)
-
-    for idx in range(num_points):
-        if radii[idx] <= 0:
-            break
-
-        tile_min, tile_max = get_tile_bbox(xys[idx], radii[idx], tile_bounds)
-
-        cur_idx = 0 if idx == 0 else cum_tiles_hit[idx - 1].item()
-
-        # Get raw byte representation of the float value at the given index
-        raw_bytes = struct.pack("f", depths[idx])
-
-        # Interpret those bytes as an int32_t
-        depth_id_n = struct.unpack("i", raw_bytes)[0]
-
-        for i in range(tile_min[1], tile_max[1]):
-            for j in range(tile_min[0], tile_max[0]):
-                tile_id = i * tile_bounds[0] + j
-                isect_ids[cur_idx] = (tile_id << 32) | depth_id_n
-                gaussian_ids[cur_idx] = idx
-                cur_idx += 1
+    torch::Device device = xys.device();
+    int numIntersects = cum_tiles_hit[-1].item<int>();
+    torch::Tensor isectIds = torch::zeros(numIntersects, torch::TensorOptions().dtype(torch::kInt64).device(device));
+    torch::Tensor gaussianIds = torch::zeros(numIntersects, torch::TensorOptions().dtype(torch::kInt32).device(device));
+    for (int idx = 0; idx < num_points; idx++){
+        if (radii[idx].item<float>() <= 0.0f) break;
+
+        auto bbox = getTileBbox(xys[idx], radii[idx], tile_bounds);
+        torch::Tensor tileMin = std::get<0>(bbox);
+        torch::Tensor tileMax = std::get<1>(bbox);
+        int curIdx;
+
+        if (idx == 0){
+            curIdx = 0;
+        }else{
+            curIdx = cum_tiles_hit[idx - 1].item<int>();
+        }
 
-    return isect_ids, gaussian_ids
+        int32_t depthIdN = static_cast<int32_t>(depths[idx].item<float>());
+        int iStart = tileMin[1].item<int>();
+        int iEnd = tileMax[1].item<int>();
+        int jStart = tileMin[0].item<int>();
+        int jEnd = tileMax[0].item<int>();
+        int b = std::get<0>(tile_bounds);
+
+        for (int i = iStart; i < iEnd; i++){
+            for (int j = jStart; j < jEnd; j++){
+                int tileId = i * b + j;
+                isectIds[curIdx]
+            }
+        }
+    }
+    //     for i in range(tile_min[1], tile_max[1]):
+    //         for j in range(tile_min[0], tile_max[0]):
+    //             tile_id = i * tile_bounds[0] + j
+    //             isect_ids[cur_idx] = (tile_id << 32) | depth_id_n
+    //             gaussian_ids[cur_idx] = idx
+    //             cur_idx += 1
+
+    // return isect_ids, gaussian_ids
 
     return std::make_tuple(torch::Tensor(), torch::Tensor());
 }

From 9f8c4d3643270e367a76ef30f3815a42e8cba323 Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Tue, 12 Mar 2024 19:56:22 +0000
Subject: [PATCH 06/19] Forward pass working (very slow)

---
 vendor/gsplat-cpu/gsplat_cpu.cpp | 51 ++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index 2509d33..a53ce12 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -37,7 +37,7 @@ torch::Tensor quatToRotMat(const torch::Tensor &quat){
     
 }
 
-std::tuple<torch::Tensor, torch::Tensor> getTileBbox(torch::Tensor &pixCenter, const torch::Tensor &pixRadius, const std::tuple<int, int, int> &tileBounds){
+std::tuple<torch::Tensor, torch::Tensor> getTileBbox(const torch::Tensor &pixCenter, const torch::Tensor &pixRadius, const std::tuple<int, int, int> &tileBounds){
     torch::Tensor tileSize = torch::tensor({BLOCK_X, BLOCK_Y}, torch::TensorOptions().dtype(torch::kFloat32).device(pixCenter.device()));
     torch::Tensor tileCenter = pixCenter / tileSize;
     torch::Tensor tileRadius = pixRadius.index({"...", None}) / tileSize;
@@ -245,28 +245,45 @@ std::tuple<torch::Tensor, torch::Tensor> map_gaussian_to_intersects_tensor(
 
         for (int i = iStart; i < iEnd; i++){
             for (int j = jStart; j < jEnd; j++){
-                int tileId = i * b + j;
-                isectIds[curIdx]
+                int64_t tileId = i * b + j;
+                isectIds[curIdx] = (tileId << 32) | depthIdN;
+                gaussianIds[curIdx] = idx;
+                curIdx += 1;
             }
         }
     }
-    //     for i in range(tile_min[1], tile_max[1]):
-    //         for j in range(tile_min[0], tile_max[0]):
-    //             tile_id = i * tile_bounds[0] + j
-    //             isect_ids[cur_idx] = (tile_id << 32) | depth_id_n
-    //             gaussian_ids[cur_idx] = idx
-    //             cur_idx += 1
 
-    // return isect_ids, gaussian_ids
-
-    return std::make_tuple(torch::Tensor(), torch::Tensor());
+    return std::make_tuple(isectIds, gaussianIds); 
 }
 
 torch::Tensor get_tile_bin_edges_tensor(
     int num_intersects,
     const torch::Tensor &isect_ids_sorted
 ){
-    return torch::Tensor();
+    torch::Tensor tileBins = torch::zeros({num_intersects, 2}, torch::TensorOptions().dtype(torch::kInt32).device(isect_ids_sorted.device()));
+
+    for (int idx = 0; idx < num_intersects; idx++){
+        int64_t curTileIdx = isect_ids_sorted[idx].item<int64_t>() >> 32;
+
+        if (idx == 0){
+            tileBins[curTileIdx][0] = 0;
+            continue;
+        }
+
+        if (idx == num_intersects - 1){
+            tileBins[curTileIdx][1] = num_intersects;
+            break;
+        }
+
+        int64_t prevTileIdx = isect_ids_sorted[idx - 1].item<int64_t>() >> 32;
+
+        if (curTileIdx != prevTileIdx){
+            tileBins[prevTileIdx][1] = idx;
+            tileBins[curTileIdx][0] = idx;
+        }
+    }
+
+    return tileBins;
 }
 
 std::tuple<
@@ -294,10 +311,14 @@ std::tuple<
     torch::Tensor finalTs = torch::zeros({width, height, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
     torch::Tensor finalIdx = torch::zeros({width, height, channels}, torch::TensorOptions().dtype(torch::kInt32).device(device));   
 
-
+    int blockX = std::get<0>(block);
+    int blockY = std::get<1>(block);
+    int tileBoundsX = std::get<0>(tile_bounds);
+    
     for (int i = 0; i < width; i++){
+        std::cout << i << std::endl;
         for (int j = 0; j < height; j++){
-            int tileId = (i / std::get<0>(block)) * std::get<0>(tile_bounds) + (j / std::get<1>(block));
+            int tileId = (i / blockX) * tileBoundsX + (j / blockY);
             int tileBinStart = tile_bins[tileId][0].item<int>();
             int tileBinEnd = tile_bins[tileId][1].item<int>();
             float T = 1.0f;

From 863e1170990c54901763ea2a1bb9ad3449d61780 Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Wed, 13 Mar 2024 17:31:19 +0000
Subject: [PATCH 07/19] PoC CPU simple_trainer working (slow)

---
 CMakeLists.txt                   |  2 +-
 project_gaussians.cpp            | 20 +++++++++++---------
 project_gaussians.hpp            |  2 ++
 rasterize_gaussians.cpp          |  9 +++++----
 simple_trainer.cpp               | 14 +++++++-------
 vendor/gsplat-cpu/gsplat_cpu.cpp | 15 +++++++++------
 6 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c584f9..9d1eff4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -93,7 +93,7 @@ elseif(GPU_RUNTIME STREQUAL "CUDA")
 endif()
 
 if(OPENSPLAT_BUILD_SIMPLE_TRAINER)
-    add_executable(simple_trainer simple_trainer.cpp project_gaussians.cpp rasterize_gaussians.cpp)
+    add_executable(simple_trainer simple_trainer.cpp project_gaussians.cpp rasterize_gaussians.cpp cv_utils.cpp)
     target_include_directories(simple_trainer PRIVATE ${PROJECT_SOURCE_DIR}/vendor/glm ${GPU_INCLUDE_DIRS})
     target_link_libraries(simple_trainer PUBLIC ${GPU_LIBRARIES} gsplat ${TORCH_LIBRARIES} ${OpenCV_LIBS})
     set_property(TARGET simple_trainer PROPERTY CXX_STANDARD 17)
diff --git a/project_gaussians.cpp b/project_gaussians.cpp
index d899362..029d589 100644
--- a/project_gaussians.cpp
+++ b/project_gaussians.cpp
@@ -30,16 +30,18 @@ variable_list ProjectGaussians::forward(AutogradContext *ctx,
     torch::Tensor conics = std::get<4>(t);
     torch::Tensor numTilesHit = std::get<5>(t);
 
-    ctx->saved_data["imgHeight"] = imgHeight;
-    ctx->saved_data["imgWidth"] = imgWidth;
-    ctx->saved_data["numPoints"] = numPoints;
-    ctx->saved_data["globScale"] = globScale;
-    ctx->saved_data["fx"] = fx;
-    ctx->saved_data["fy"] = fy;
-    ctx->saved_data["cx"] = cx;
-    ctx->saved_data["cy"] = cy;
+    if (ctx != nullptr){
+        ctx->saved_data["imgHeight"] = imgHeight;
+        ctx->saved_data["imgWidth"] = imgWidth;
+        ctx->saved_data["numPoints"] = numPoints;
+        ctx->saved_data["globScale"] = globScale;
+        ctx->saved_data["fx"] = fx;
+        ctx->saved_data["fy"] = fy;
+        ctx->saved_data["cx"] = cx;
+        ctx->saved_data["cy"] = cy;
+        ctx->save_for_backward({ means, scales, quats, viewMat, projMat, cov3d, radii, conics });
+    }
 
-    ctx->save_for_backward({ means, scales, quats, viewMat, projMat, cov3d, radii, conics });
     return { xys, depths, radii, conics, numTilesHit, cov3d };
 }
 
diff --git a/project_gaussians.hpp b/project_gaussians.hpp
index ece9399..d84f53f 100644
--- a/project_gaussians.hpp
+++ b/project_gaussians.hpp
@@ -27,4 +27,6 @@ class ProjectGaussians : public Function<ProjectGaussians>{
 };
 
 
+
+
 #endif
\ No newline at end of file
diff --git a/rasterize_gaussians.cpp b/rasterize_gaussians.cpp
index 42d3bdc..bf664c4 100644
--- a/rasterize_gaussians.cpp
+++ b/rasterize_gaussians.cpp
@@ -82,10 +82,11 @@ torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx,
     // Map of tile bin IDs
     torch::Tensor finalIdx = std::get<2>(t);
 
-    ctx->saved_data["imgWidth"] = imgWidth;
-    ctx->saved_data["imgHeight"] = imgHeight;
-    
-    ctx->save_for_backward({ gaussianIdsSorted, tileBins, xys, conics, colors, opacity, background, finalTs, finalIdx });
+    if (ctx != nullptr){
+        ctx->saved_data["imgWidth"] = imgWidth;
+        ctx->saved_data["imgHeight"] = imgHeight;
+        ctx->save_for_backward({ gaussianIdsSorted, tileBins, xys, conics, colors, opacity, background, finalTs, finalIdx });
+    }
     
     return outImg;
 }
diff --git a/simple_trainer.cpp b/simple_trainer.cpp
index d09f4fc..1dbb849 100644
--- a/simple_trainer.cpp
+++ b/simple_trainer.cpp
@@ -27,8 +27,8 @@ using namespace torch::indexing;
 
 
 int main(int argc, char **argv){
-    int width = 256,
-        height = 256;
+    int width = 16,
+        height = 16;
     int numPoints = 100000;
     int iterations = 1000;
     float learningRate = 0.01;
@@ -108,7 +108,7 @@ int main(int argc, char **argv){
     torch::nn::MSELoss mseLoss;
 
     for (size_t i = 0; i < iterations; i++){
-        auto p = ProjectGaussians::apply(means, scales, 1, 
+        auto p = ProjectGaussians::forward(nullptr, means, scales, 1, 
                                 quats, viewMat, viewMat,
                                 focal, focal,
                                 width / 2,
@@ -117,7 +117,7 @@ int main(int argc, char **argv){
                                 width,
                                 tileBounds);
 
-        torch::Tensor outImg = RasterizeGaussians::apply(
+        torch::Tensor outImg = RasterizeGaussians::forward(nullptr,
             p[0], // xys
             p[1], // depths
             p[2], // radii,
@@ -137,8 +137,8 @@ int main(int argc, char **argv){
 
         std::cout << "Iteration " << std::to_string(i + 1) << "/" << std::to_string(iterations) << " Loss: " << loss.item<float>() << std::endl; 
 
-        // cv::Mat image = tensorToImage(outImg.detach().cpu());
-        // cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
-        // cv::imwrite("render/" + std::to_string(i + 1) + ".png", image);
+        cv::Mat image = tensorToImage(outImg.detach().cpu());
+        cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
+        cv::imwrite("render/" + std::to_string(i + 1) + ".png", image);
     }
 }
\ No newline at end of file
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index a53ce12..52aecff 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -236,7 +236,9 @@ std::tuple<torch::Tensor, torch::Tensor> map_gaussian_to_intersects_tensor(
             curIdx = cum_tiles_hit[idx - 1].item<int>();
         }
 
-        int32_t depthIdN = static_cast<int32_t>(depths[idx].item<float>());
+        float depth = depths[idx].item<float>();
+        int32_t depthIdN = *(reinterpret_cast<int32_t *>(&depth));
+
         int iStart = tileMin[1].item<int>();
         int iEnd = tileMax[1].item<int>();
         int jStart = tileMin[0].item<int>();
@@ -246,7 +248,7 @@ std::tuple<torch::Tensor, torch::Tensor> map_gaussian_to_intersects_tensor(
         for (int i = iStart; i < iEnd; i++){
             for (int j = jStart; j < jEnd; j++){
                 int64_t tileId = i * b + j;
-                isectIds[curIdx] = (tileId << 32) | depthIdN;
+                isectIds[curIdx] = static_cast<int64_t>(tileId << 32) | depthIdN;
                 gaussianIds[curIdx] = idx;
                 curIdx += 1;
             }
@@ -263,7 +265,7 @@ torch::Tensor get_tile_bin_edges_tensor(
     torch::Tensor tileBins = torch::zeros({num_intersects, 2}, torch::TensorOptions().dtype(torch::kInt32).device(isect_ids_sorted.device()));
 
     for (int idx = 0; idx < num_intersects; idx++){
-        int64_t curTileIdx = isect_ids_sorted[idx].item<int64_t>() >> 32;
+        int32_t curTileIdx = static_cast<int32_t>(isect_ids_sorted[idx].item<int64_t>() >> 32);
 
         if (idx == 0){
             tileBins[curTileIdx][0] = 0;
@@ -275,7 +277,7 @@ torch::Tensor get_tile_bin_edges_tensor(
             break;
         }
 
-        int64_t prevTileIdx = isect_ids_sorted[idx - 1].item<int64_t>() >> 32;
+        int32_t prevTileIdx = static_cast<int32_t>(isect_ids_sorted[idx - 1].item<int64_t>() >> 32);
 
         if (curTileIdx != prevTileIdx){
             tileBins[prevTileIdx][1] = idx;
@@ -322,13 +324,14 @@ std::tuple<
             int tileBinStart = tile_bins[tileId][0].item<int>();
             int tileBinEnd = tile_bins[tileId][1].item<int>();
             float T = 1.0f;
-
+            torch::Tensor ji = torch::tensor({j, i}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
+            
             int idx = tileBinStart;
             for (; idx < tileBinEnd; idx++){
                 torch::Tensor gaussianId = gaussian_ids_sorted[idx];
                 torch::Tensor conic = conics[gaussianId];
                 torch::Tensor center = xys[gaussianId];
-                torch::Tensor delta = center - torch::tensor({j, i}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
+                torch::Tensor delta = center - ji;
 
                 torch::Tensor sigma = (
                     0.5f

From cc8166b7e8b886d9eda6a7191037232d9b283b12 Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Fri, 15 Mar 2024 18:14:15 +0000
Subject: [PATCH 08/19] PoC bounding box based rendering

---
 project_gaussians.cpp            |   4 +-
 rasterize_gaussians.cpp          |  13 +++-
 rasterize_gaussians.hpp          |   1 +
 simple_trainer.cpp               |   5 +-
 vendor/gsplat-cpu/bindings.h     |   4 +-
 vendor/gsplat-cpu/gsplat_cpu.cpp | 123 ++++++++++++++++++++++++++++++-
 6 files changed, 141 insertions(+), 9 deletions(-)

diff --git a/project_gaussians.cpp b/project_gaussians.cpp
index 029d589..6781078 100644
--- a/project_gaussians.cpp
+++ b/project_gaussians.cpp
@@ -29,6 +29,8 @@ variable_list ProjectGaussians::forward(AutogradContext *ctx,
     torch::Tensor radii = std::get<3>(t);
     torch::Tensor conics = std::get<4>(t);
     torch::Tensor numTilesHit = std::get<5>(t);
+    torch::Tensor cov2d = std::get<6>(t);
+    
 
     if (ctx != nullptr){
         ctx->saved_data["imgHeight"] = imgHeight;
@@ -42,7 +44,7 @@ variable_list ProjectGaussians::forward(AutogradContext *ctx,
         ctx->save_for_backward({ means, scales, quats, viewMat, projMat, cov3d, radii, conics });
     }
 
-    return { xys, depths, radii, conics, numTilesHit, cov3d };
+    return { xys, depths, radii, conics, numTilesHit, cov3d, cov2d };
 }
 
 tensor_list ProjectGaussians::backward(AutogradContext *ctx, tensor_list grad_outputs) {
diff --git a/rasterize_gaussians.cpp b/rasterize_gaussians.cpp
index bf664c4..e1e01f0 100644
--- a/rasterize_gaussians.cpp
+++ b/rasterize_gaussians.cpp
@@ -2,6 +2,8 @@
 #include "gsplat.hpp"
 #include "vendor/gsplat/config.h"
 
+#include "cv_utils.hpp" // TODO REMOVE
+
 std::tuple<torch::Tensor,
         torch::Tensor,
         torch::Tensor,
@@ -43,6 +45,7 @@ torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx,
             torch::Tensor numTilesHit,
             torch::Tensor colors,
             torch::Tensor opacity,
+            torch::Tensor cov2d,
             int imgHeight,
             int imgWidth,
             torch::Tensor background
@@ -72,10 +75,18 @@ torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx,
                             conics,
                             colors,
                             opacity,
-                            background);
+                            background,
+                            cov2d);
     // Final image
     torch::Tensor outImg = std::get<0>(t);
 
+    cv::Mat image = tensorToImage(outImg.detach().cpu());
+    cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
+    cv::imwrite("test.png", image);
+    std::cout << "WROTE";
+    exit(1);
+
+
     // Map of alpha-inverse (1 - finalTs = alpha)
     torch::Tensor finalTs = std::get<1>(t);
 
diff --git a/rasterize_gaussians.hpp b/rasterize_gaussians.hpp
index f73a85f..a97f69d 100644
--- a/rasterize_gaussians.hpp
+++ b/rasterize_gaussians.hpp
@@ -27,6 +27,7 @@ class RasterizeGaussians : public Function<RasterizeGaussians>{
             torch::Tensor numTilesHit,
             torch::Tensor colors,
             torch::Tensor opacity,
+            torch::Tensor cov2d,
             int imgHeight,
             int imgWidth,
             torch::Tensor background);
diff --git a/simple_trainer.cpp b/simple_trainer.cpp
index 1dbb849..38660b1 100644
--- a/simple_trainer.cpp
+++ b/simple_trainer.cpp
@@ -27,8 +27,8 @@ using namespace torch::indexing;
 
 
 int main(int argc, char **argv){
-    int width = 16,
-        height = 16;
+    int width = 128,
+        height = 128;
     int numPoints = 100000;
     int iterations = 1000;
     float learningRate = 0.01;
@@ -125,6 +125,7 @@ int main(int argc, char **argv){
             p[4], // numTilesHit
             torch::sigmoid(rgbs),
             torch::sigmoid(opacities),
+            p[6], // cov2d
             height,
             width,
             background);
diff --git a/vendor/gsplat-cpu/bindings.h b/vendor/gsplat-cpu/bindings.h
index d831b4b..aa9504e 100644
--- a/vendor/gsplat-cpu/bindings.h
+++ b/vendor/gsplat-cpu/bindings.h
@@ -28,6 +28,7 @@ std::tuple<
     torch::Tensor,
     torch::Tensor,
     torch::Tensor,
+    torch::Tensor,
     torch::Tensor>
 project_gaussians_forward_tensor(
     const int num_points,
@@ -105,7 +106,8 @@ std::tuple<
     const torch::Tensor &conics,
     const torch::Tensor &colors,
     const torch::Tensor &opacities,
-    const torch::Tensor &background
+    const torch::Tensor &background,
+    const torch::Tensor &cov2d
 );
 
 std::
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index 52aecff..7472c26 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -81,6 +81,7 @@ std::tuple<
     torch::Tensor,
     torch::Tensor,
     torch::Tensor,
+    torch::Tensor,
     torch::Tensor>
 project_gaussians_forward_tensor(
     const int num_points,
@@ -101,6 +102,8 @@ project_gaussians_forward_tensor(
 ){
     float fovx = 0.5f * static_cast<float>(img_width) / fx;
     float fovy = 0.5f * static_cast<float>(img_height) / fy;
+    
+    // TODO: no need to recompute W,p,t below (they are the same)
 
     // clip_near_plane
     torch::Tensor Rclip = viewmat.index({"...", Slice(None, 3), Slice(None, 3)}); 
@@ -175,7 +178,7 @@ project_gaussians_forward_tensor(
     torch::Tensor depths = pView.index({"...", 2});
     torch::Tensor radii = radius.to(torch::kInt32);
 
-    return std::make_tuple(cov3d, xys, depths, radii, conic, numTilesHit );
+    return std::make_tuple(cov3d, xys, depths, radii, conic, numTilesHit, cov2d );
 }
 
 std::tuple<
@@ -302,23 +305,134 @@ std::tuple<
     const torch::Tensor &conics,
     const torch::Tensor &colors,
     const torch::Tensor &opacities,
-    const torch::Tensor &background
+    const torch::Tensor &background,
+    const torch::Tensor &cov2d
 ){
     int channels = colors.size(1);
     int width = std::get<1>(img_size);
     int height = std::get<0>(img_size);
+    int numPoints = xys.size(0);
+
     torch::Device device = xys.device();
 
     torch::Tensor outImg = torch::zeros({width, height, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
-    torch::Tensor finalTs = torch::zeros({width, height, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
+    torch::Tensor finalTs = torch::ones({width, height, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
     torch::Tensor finalIdx = torch::zeros({width, height, channels}, torch::TensorOptions().dtype(torch::kInt32).device(device));   
 
+    const float alphaThresh = 1.0f / 255.0f;
+    int idx = 0;
+    for (; idx < numPoints; idx++){
+        torch::Tensor gaussianId = gaussian_ids_sorted[idx];
+        torch::Tensor conic = conics[gaussianId];
+        torch::Tensor center = xys[gaussianId];
+        
+        float sqx = 3.0f * std::sqrt(cov2d[gaussianId][0][0].item<float>());
+        float sqy = 3.0f * std::sqrt(cov2d[gaussianId][1][1].item<float>());
+
+        int minx = (std::max)(0, static_cast<int>(std::floor(center[1].item<float>() - sqy)) - 2);
+        int maxx = (std::min)(width, static_cast<int>(std::ceil(center[1].item<float>() + sqy)) + 2);
+        int miny = (std::max)(0, static_cast<int>(std::floor(center[0].item<float>() - sqx)) - 2);
+        int maxy = (std::min)(height, static_cast<int>(std::ceil(center[0].item<float>() + sqx)) + 2);
+
+        for (int i = minx; i < maxx; i++){
+            for (int j = miny; j < maxy; j++){
+                torch::Tensor ji = torch::tensor({j, i}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
+            
+                torch::Tensor delta = center - ji;
+                torch::Tensor sigma = (
+                    0.5f
+                    * (conic[0] * delta[0] * delta[0] + conic[2] * delta[1] * delta[1])
+                    + conic[1] * delta[0] * delta[1]
+                );
+
+                if (sigma.item<float>() < 0.0f) continue;
+                float alpha = (std::min)(0.999f, (opacities[gaussianId] * torch::exp(-sigma)).item<float>());
+                if (alpha < alphaThresh) continue;
+
+                torch::Tensor T = finalTs[i][j];
+                torch::Tensor nextT = T * (1.0f - alpha);
+
+                // if (nextT.item<float>() <= 1e-4f){
+                //     idx -= 1;
+                //     break;
+                // }
+
+                // outImg[i][j] = torch::tensor({1.0f, 1.0f, 1.0f}); 
+                outImg[i][j] += alpha * T * colors[gaussianId];
+                finalTs[i][j] = nextT;
+            }
+        }
+    }
+
+    return std::make_tuple(outImg, finalTs, finalIdx);
+
+/*
+    int minx = 99999;
+    int miny = 99999;
+    int maxx = 0;
+    int maxy = 0;
+    for (int i = 0; i < width; i++){
+        std::cout << i << std::endl;
+        for (int j = 0; j < height; j++){
+            float T = 1.0f;
+            torch::Tensor ji = torch::tensor({j, i}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
+            
+            int idx = 0;
+            for (; idx < 1; idx++){
+                torch::Tensor gaussianId = gaussian_ids_sorted[idx];
+                torch::Tensor conic = conics[gaussianId];
+                torch::Tensor center = xys[gaussianId];
+                torch::Tensor delta = center - ji;
+
+                torch::Tensor sigma = (
+                    0.5f
+                    * (conic[0] * delta[0] * delta[0] + conic[2] * delta[1] * delta[1])
+                    + conic[1] * delta[0] * delta[1]
+                );
+
+                if (sigma.item<float>() < 0.0f) continue;
+
+                float alpha = (std::min)(0.999f, (opacities[gaussianId] * torch::exp(-sigma)).item<float>());
+
+                if (alpha < 1.0f / 255.0f) continue;
+
+                float nextT = T * (1.0f - alpha);
+
+                if (nextT <= 1e-4f){
+                    idx -= 1;
+                    break;
+                }
+
+                float vis = alpha * T;
+                // outImg[i][j] = torch::tensor({1.0f, 1.0f, 1.0f}); 
+                outImg[i][j] += vis * colors[gaussianId];
+
+                maxx = (std::max)(i, maxx);
+                maxy = (std::max)(j, maxy);
+                minx = (std::min)(i, minx);
+                miny = (std::min)(j, miny);               
+                
+
+                T = nextT;
+            }
+
+            finalTs[i][j] = T;
+            finalIdx[i][j] = idx;
+            // outImg[i][j] += T * background;
+        }
+    }
+
+    std::cout << "[" << minx << ", " << miny << "], [" << maxx << ", " << maxy << "]" << std::endl;
+
+    return std::make_tuple(outImg, finalTs, finalIdx);
+*/
+/*
+
     int blockX = std::get<0>(block);
     int blockY = std::get<1>(block);
     int tileBoundsX = std::get<0>(tile_bounds);
     
     for (int i = 0; i < width; i++){
-        std::cout << i << std::endl;
         for (int j = 0; j < height; j++){
             int tileId = (i / blockX) * tileBoundsX + (j / blockY);
             int tileBinStart = tile_bins[tileId][0].item<int>();
@@ -364,6 +478,7 @@ std::tuple<
     }
 
     return std::make_tuple(outImg, finalTs, finalIdx);
+    */
 }
 
 

From e0f8b537f42b331aad4f22f7e30c0816299df90a Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Fri, 15 Mar 2024 20:22:14 +0000
Subject: [PATCH 09/19] Work on rasterizer

---
 rasterize_gaussians.cpp          |   3 +-
 vendor/gsplat-cpu/bindings.h     |   3 +-
 vendor/gsplat-cpu/gsplat_cpu.cpp | 106 +++++++++++++++++++++----------
 3 files changed, 78 insertions(+), 34 deletions(-)

diff --git a/rasterize_gaussians.cpp b/rasterize_gaussians.cpp
index e1e01f0..deee76c 100644
--- a/rasterize_gaussians.cpp
+++ b/rasterize_gaussians.cpp
@@ -76,7 +76,8 @@ torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx,
                             colors,
                             opacity,
                             background,
-                            cov2d);
+                            cov2d,
+                            depths);
     // Final image
     torch::Tensor outImg = std::get<0>(t);
 
diff --git a/vendor/gsplat-cpu/bindings.h b/vendor/gsplat-cpu/bindings.h
index aa9504e..f9ccd8f 100644
--- a/vendor/gsplat-cpu/bindings.h
+++ b/vendor/gsplat-cpu/bindings.h
@@ -107,7 +107,8 @@ std::tuple<
     const torch::Tensor &colors,
     const torch::Tensor &opacities,
     const torch::Tensor &background,
-    const torch::Tensor &cov2d
+    const torch::Tensor &cov2d,
+    const torch::Tensor &depths
 );
 
 std::
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index 7472c26..8e0a4c0 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -178,6 +178,8 @@ project_gaussians_forward_tensor(
     torch::Tensor depths = pView.index({"...", 2});
     torch::Tensor radii = radius.to(torch::kInt32);
 
+    // TODO: compute camDepths as pProj.index({"...", 2});
+
     return std::make_tuple(cov3d, xys, depths, radii, conic, numTilesHit, cov2d );
 }
 
@@ -306,12 +308,26 @@ std::tuple<
     const torch::Tensor &colors,
     const torch::Tensor &opacities,
     const torch::Tensor &background,
-    const torch::Tensor &cov2d
+    const torch::Tensor &cov2d,
+    const torch::Tensor &depths
 ){
+    // torch::NoGradGuard noGrad;
+
     int channels = colors.size(1);
     int width = std::get<1>(img_size);
     int height = std::get<0>(img_size);
     int numPoints = xys.size(0);
+    float *pDepths = static_cast<float *>(depths.data_ptr());
+
+    std::vector< size_t > gIndices( numPoints );
+    std::iota( gIndices.begin(), gIndices.end(), 0 );
+    std::sort(gIndices.begin(), gIndices.end(), [&pDepths](int a, int b){
+        return pDepths[a] < pDepths[b];
+    });
+
+    std::cout << pDepths[0] << std::endl;
+
+    std::cout << pDepths[100];
 
     torch::Device device = xys.device();
 
@@ -319,53 +335,79 @@ std::tuple<
     torch::Tensor finalTs = torch::ones({width, height, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
     torch::Tensor finalIdx = torch::zeros({width, height, channels}, torch::TensorOptions().dtype(torch::kInt32).device(device));   
 
+    torch::Tensor sqCov2dX = 3.0f * torch::sqrt(cov2d.index({"...", 0, 0}));
+    torch::Tensor sqCov2dY = 3.0f * torch::sqrt(cov2d.index({"...", 1, 1}));
+    
+    int32_t *pGaussianIds = static_cast<int32_t*>(gaussian_ids_sorted.data_ptr());
+    float *pConics = static_cast<float *>(conics.data_ptr());
+    float *pCenters = static_cast<float *>(xys.data_ptr());
+    float *pSqCov2dX = static_cast<float *>(sqCov2dX.data_ptr());
+    float *pSqCov2dY = static_cast<float *>(sqCov2dY.data_ptr());
+    float *pOpacities = static_cast<float *>(opacities.data_ptr());
+
+    float *pOutImg = static_cast<float *>(outImg.data_ptr());
+    float *pFinalTs = static_cast<float *>(finalTs.data_ptr());
+    int32_t *pFinalIdx = static_cast<int32_t *>(finalIdx.data_ptr());
+    float *pColors = static_cast<float *>(colors.data_ptr());
+    
+    float bgX = background[0].item<float>();
+    float bgY = background[1].item<float>();
+    float bgZ = background[2].item<float>();
+
     const float alphaThresh = 1.0f / 255.0f;
+    float T = 1.0f;
     int idx = 0;
     for (; idx < numPoints; idx++){
-        torch::Tensor gaussianId = gaussian_ids_sorted[idx];
-        torch::Tensor conic = conics[gaussianId];
-        torch::Tensor center = xys[gaussianId];
-        
-        float sqx = 3.0f * std::sqrt(cov2d[gaussianId][0][0].item<float>());
-        float sqy = 3.0f * std::sqrt(cov2d[gaussianId][1][1].item<float>());
+        int32_t gaussianId = gIndices[idx];
 
-        int minx = (std::max)(0, static_cast<int>(std::floor(center[1].item<float>() - sqy)) - 2);
-        int maxx = (std::min)(width, static_cast<int>(std::ceil(center[1].item<float>() + sqy)) + 2);
-        int miny = (std::max)(0, static_cast<int>(std::floor(center[0].item<float>() - sqx)) - 2);
-        int maxy = (std::min)(height, static_cast<int>(std::ceil(center[0].item<float>() + sqx)) + 2);
+        float A = pConics[gaussianId * 3 + 0];
+        float B = pConics[gaussianId * 3 + 1];
+        float C = pConics[gaussianId * 3 + 2];
+
+        float gX = pCenters[gaussianId * 2 + 0];
+        float gY = pCenters[gaussianId * 2 + 1];
+
+        float sqx = pSqCov2dX[gaussianId];
+        float sqy = pSqCov2dY[gaussianId];
+        
+        int minx = (std::max)(0, static_cast<int>(std::floor(gY - sqy)) - 2);
+        int maxx = (std::min)(width, static_cast<int>(std::ceil(gY + sqy)) + 2);
+        int miny = (std::max)(0, static_cast<int>(std::floor(gX - sqx)) - 2);
+        int maxy = (std::min)(height, static_cast<int>(std::ceil(gX + sqx)) + 2);
 
         for (int i = minx; i < maxx; i++){
             for (int j = miny; j < maxy; j++){
-                torch::Tensor ji = torch::tensor({j, i}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
-            
-                torch::Tensor delta = center - ji;
-                torch::Tensor sigma = (
+                float xCam = gX - j;
+                float yCam = gY - i;
+                float sigma = (
                     0.5f
-                    * (conic[0] * delta[0] * delta[0] + conic[2] * delta[1] * delta[1])
-                    + conic[1] * delta[0] * delta[1]
+                    * (A * xCam * xCam + C * yCam * yCam)
+                    + B * xCam * yCam
                 );
 
-                if (sigma.item<float>() < 0.0f) continue;
-                float alpha = (std::min)(0.999f, (opacities[gaussianId] * torch::exp(-sigma)).item<float>());
+                if (sigma < 0.0f) continue;
+                float alpha = (std::min)(0.999f, (pOpacities[gaussianId] * std::exp(-sigma)));
                 if (alpha < alphaThresh) continue;
 
-                torch::Tensor T = finalTs[i][j];
-                torch::Tensor nextT = T * (1.0f - alpha);
+                size_t pixIdx = (i * height + j);
+                float T = pFinalTs[pixIdx];
+                float nextT = T * (1.0f - alpha);
 
-                // if (nextT.item<float>() <= 1e-4f){
-                //     idx -= 1;
-                //     break;
-                // }
+                float alphaT = alpha * T;
 
-                // outImg[i][j] = torch::tensor({1.0f, 1.0f, 1.0f}); 
-                outImg[i][j] += alpha * T * colors[gaussianId];
-                finalTs[i][j] = nextT;
+                pOutImg[pixIdx * 3 + 0] += alphaT * (pColors[gaussianId * 3 + 0] + bgX);
+                pOutImg[pixIdx * 3 + 1] += alphaT * (pColors[gaussianId * 3 + 1] + bgY);
+                pOutImg[pixIdx * 3 + 2] += alphaT * (pColors[gaussianId * 3 + 2] + bgZ);
+                
+                pFinalTs[pixIdx] = nextT;
+                pFinalIdx[pixIdx] = idx;
             }
         }
     }
 
     return std::make_tuple(outImg, finalTs, finalIdx);
 
+
 /*
     int minx = 99999;
     int miny = 99999;
@@ -418,16 +460,16 @@ std::tuple<
 
             finalTs[i][j] = T;
             finalIdx[i][j] = idx;
-            // outImg[i][j] += T * background;
+            outImg[i][j] += T * background;
         }
     }
 
     std::cout << "[" << minx << ", " << miny << "], [" << maxx << ", " << maxy << "]" << std::endl;
 
     return std::make_tuple(outImg, finalTs, finalIdx);
+
 */
 /*
-
     int blockX = std::get<0>(block);
     int blockY = std::get<1>(block);
     int tileBoundsX = std::get<0>(tile_bounds);
@@ -446,7 +488,7 @@ std::tuple<
                 torch::Tensor conic = conics[gaussianId];
                 torch::Tensor center = xys[gaussianId];
                 torch::Tensor delta = center - ji;
-
+pGaussianIds
                 torch::Tensor sigma = (
                     0.5f
                     * (conic[0] * delta[0] * delta[0] + conic[2] * delta[1] * delta[1])
@@ -478,7 +520,7 @@ std::tuple<
     }
 
     return std::make_tuple(outImg, finalTs, finalIdx);
-    */
+*/
 }
 
 

From df60f5226e124277014ab190cb8dedfffb11c71a Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Sat, 16 Mar 2024 19:41:31 +0000
Subject: [PATCH 10/19] Build CUDA+CPU working

---
 CMakeLists.txt                   |  26 ++--
 gsplat.hpp                       |   6 +-
 project_gaussians.cpp            | 167 ++++++++++++---------
 project_gaussians.hpp            |  28 +++-
 rasterize_gaussians.cpp          | 244 +++++++++++++++++++++----------
 rasterize_gaussians.hpp          |  21 ++-
 simple_trainer.cpp               |  43 ++++--
 tile_bounds.hpp                  |   5 +
 vendor/gsplat-cpu/bindings.h     |  16 +-
 vendor/gsplat-cpu/gsplat_cpu.cpp |  35 ++---
 10 files changed, 368 insertions(+), 223 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9d1eff4..21687eb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,13 +40,11 @@ elseif(GPU_RUNTIME STREQUAL "HIP")
     list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
 endif()
 
+set(CMAKE_CXX_STANDARD 17)
 if((GPU_RUNTIME STREQUAL "CUDA") OR (GPU_RUNTIME STREQUAL "HIP"))
     enable_language(${GPU_RUNTIME})
     set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
     set(${GPU_RUNTIME}_STANDARD 17)
-else()
-    # CPU
-    set(CMAKE_CXX_STANDARD 17)
 endif()
 
 if (NOT WIN32 AND NOT APPLE)
@@ -61,8 +59,10 @@ if (NOT WIN32 AND NOT APPLE)
 endif()
 set(OpenCV_LIBS opencv_core opencv_imgproc opencv_highgui opencv_calib3d)
 
+set(GSPLAT_LIBS gsplat_cpu)
 if((GPU_RUNTIME STREQUAL "CUDA") OR (GPU_RUNTIME STREQUAL "HIP"))
     add_library(gsplat vendor/gsplat/forward.cu vendor/gsplat/backward.cu vendor/gsplat/bindings.cu vendor/gsplat/helpers.cuh)
+    list(APPEND GSPLAT_LIBS gsplat)
     if(GPU_RUNTIME STREQUAL "CUDA")
         set(GPU_LIBRARIES "cuda")
         target_link_libraries(gsplat PUBLIC cuda)
@@ -71,21 +71,21 @@ if((GPU_RUNTIME STREQUAL "CUDA") OR (GPU_RUNTIME STREQUAL "HIP"))
         set(GPU_INCLUDE_DIRS "${ROCM_ROOT}/include")
         target_compile_definitions(gsplat PRIVATE USE_HIP __HIP_PLATFORM_AMD__)
     endif()
-else()
-    add_library(gsplat vendor/gsplat-cpu/gsplat_cpu.cpp)
+    target_include_directories(gsplat PRIVATE
+        ${PROJECT_SOURCE_DIR}/vendor/glm        
+        ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+        ${TORCH_INCLUDE_DIRS}
+    )
+    set_target_properties(gsplat PROPERTIES LINKER_LANGUAGE CXX)
 endif()
 
-target_include_directories(gsplat PRIVATE
-    ${PROJECT_SOURCE_DIR}/vendor/glm        
-    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
-    ${TORCH_INCLUDE_DIRS}
-)
-set_target_properties(gsplat PROPERTIES LINKER_LANGUAGE CXX)
+add_library(gsplat_cpu vendor/gsplat-cpu/gsplat_cpu.cpp)
+target_include_directories(gsplat_cpu PRIVATE ${TORCH_INCLUDE_DIRS})
 
 add_executable(opensplat opensplat.cpp point_io.cpp nerfstudio.cpp model.cpp kdtree_tensor.cpp spherical_harmonics.cpp cv_utils.cpp utils.cpp project_gaussians.cpp rasterize_gaussians.cpp ssim.cpp optim_scheduler.cpp colmap.cpp input_data.cpp tensor_math.cpp)
 set_property(TARGET opensplat PROPERTY CXX_STANDARD 17)
 target_include_directories(opensplat PRIVATE ${PROJECT_SOURCE_DIR}/vendor/glm ${GPU_INCLUDE_DIRS})
-target_link_libraries(opensplat PUBLIC ${STDPPFS_LIBRARY} ${GPU_LIBRARIES} gsplat ${TORCH_LIBRARIES} ${OpenCV_LIBS})
+target_link_libraries(opensplat PUBLIC ${STDPPFS_LIBRARY} ${GPU_LIBRARIES} ${GSPLAT_LIBS} ${TORCH_LIBRARIES} ${OpenCV_LIBS})
 if(GPU_RUNTIME STREQUAL "HIP")
     target_compile_definitions(opensplat PRIVATE USE_HIP __HIP_PLATFORM_AMD__)
 elseif(GPU_RUNTIME STREQUAL "CUDA")
@@ -95,7 +95,7 @@ endif()
 if(OPENSPLAT_BUILD_SIMPLE_TRAINER)
     add_executable(simple_trainer simple_trainer.cpp project_gaussians.cpp rasterize_gaussians.cpp cv_utils.cpp)
     target_include_directories(simple_trainer PRIVATE ${PROJECT_SOURCE_DIR}/vendor/glm ${GPU_INCLUDE_DIRS})
-    target_link_libraries(simple_trainer PUBLIC ${GPU_LIBRARIES} gsplat ${TORCH_LIBRARIES} ${OpenCV_LIBS})
+    target_link_libraries(simple_trainer PUBLIC ${GPU_LIBRARIES} ${GSPLAT_LIBS} ${TORCH_LIBRARIES} ${OpenCV_LIBS})
     set_property(TARGET simple_trainer PROPERTY CXX_STANDARD 17)
     if(GPU_RUNTIME STREQUAL "HIP")
         target_compile_definitions(simple_trainer PRIVATE USE_HIP __HIP_PLATFORM_AMD__)
diff --git a/gsplat.hpp b/gsplat.hpp
index c0b2408..fab3520 100644
--- a/gsplat.hpp
+++ b/gsplat.hpp
@@ -1,5 +1,5 @@
 #if defined(USE_HIP) || defined(USE_CUDA)
 #include "vendor/gsplat/bindings.h"
-#else
-#include "vendor/gsplat-cpu/bindings.h"
-#endif
\ No newline at end of file
+#endif
+
+#include "vendor/gsplat-cpu/bindings.h"
\ No newline at end of file
diff --git a/project_gaussians.cpp b/project_gaussians.cpp
index 6781078..160ad3f 100644
--- a/project_gaussians.cpp
+++ b/project_gaussians.cpp
@@ -1,7 +1,94 @@
 #include "project_gaussians.hpp"
 #include "gsplat.hpp"
 
-variable_list ProjectGaussians::forward(AutogradContext *ctx, 
+// variable_list ProjectGaussians::forward(AutogradContext *ctx, 
+//                 torch::Tensor means,
+//                 torch::Tensor scales,
+//                 float globScale,
+//                 torch::Tensor quats,
+//                 torch::Tensor viewMat,
+//                 torch::Tensor projMat,
+//                 float fx,
+//                 float fy,
+//                 float cx,
+//                 float cy,
+//                 int imgHeight,
+//                 int imgWidth,
+//                 TileBounds tileBounds,
+//                 float clipThresh
+//             ){
+    
+//     int numPoints = means.size(0);
+
+//     auto t = project_gaussians_forward_tensor(numPoints, means, scales, globScale,
+//                                               quats, viewMat, projMat, fx, fy,
+//                                               cx, cy, imgHeight, imgWidth, tileBounds, clipThresh);
+//     torch::Tensor cov3d = std::get<0>(t);
+//     torch::Tensor xys = std::get<1>(t);
+//     torch::Tensor depths = std::get<2>(t);
+//     torch::Tensor radii = std::get<3>(t);
+//     torch::Tensor conics = std::get<4>(t);
+//     torch::Tensor numTilesHit = std::get<5>(t);
+    
+//     ctx->saved_data["imgHeight"] = imgHeight;
+//     ctx->saved_data["imgWidth"] = imgWidth;
+//     ctx->saved_data["numPoints"] = numPoints;
+//     ctx->saved_data["globScale"] = globScale;
+//     ctx->saved_data["fx"] = fx;
+//     ctx->saved_data["fy"] = fy;
+//     ctx->saved_data["cx"] = cx;
+//     ctx->saved_data["cy"] = cy;
+//     ctx->save_for_backward({ means, scales, quats, viewMat, projMat, cov3d, radii, conics });
+
+//     return { xys, depths, radii, conics, numTilesHit, cov3d };
+// }
+
+// tensor_list ProjectGaussians::backward(AutogradContext *ctx, tensor_list grad_outputs) {
+//     torch::Tensor v_xys = grad_outputs[0];
+//     torch::Tensor v_depths = grad_outputs[1];
+//     torch::Tensor v_radii = grad_outputs[2];
+//     torch::Tensor v_conics = grad_outputs[3];
+//     torch::Tensor v_numTiles = grad_outputs[4];
+//     torch::Tensor v_cov3d = grad_outputs[5];
+
+//     variable_list saved = ctx->get_saved_variables();
+//     torch::Tensor means = saved[0];
+//     torch::Tensor scales = saved[1];
+//     torch::Tensor quats = saved[2];
+//     torch::Tensor viewMat = saved[3];
+//     torch::Tensor projMat = saved[4];
+//     torch::Tensor cov3d = saved[5];
+//     torch::Tensor radii = saved[6];
+//     torch::Tensor conics = saved[7];
+    
+//     auto t = project_gaussians_backward_tensor(ctx->saved_data["numPoints"].toInt(), 
+//                                             means, scales, ctx->saved_data["globScale"].toDouble(),
+//                                             quats, viewMat, projMat, 
+//                                             ctx->saved_data["fx"].toDouble(), ctx->saved_data["fy"].toDouble(),
+//                                             ctx->saved_data["cx"].toDouble(), ctx->saved_data["cy"].toDouble(), 
+//                                             ctx->saved_data["imgHeight"].toInt(), ctx->saved_data["imgWidth"].toInt(), 
+//                                             cov3d, radii,
+//                                             conics, v_xys, v_depths, v_conics);
+//     torch::Tensor none;
+
+//     return {std::get<2>(t), // v_mean
+//             std::get<3>(t), // v_scale
+//             none, // globScale
+//             std::get<4>(t), // v_quat
+//             none, // viewMat
+//             none, // projMat
+//             none, // fx
+//             none, // fy
+//             none, // cx
+//             none, // cy
+//             none, // imgHeight
+//             none, // imgWidth
+//             none, // tileBounds
+//             none // clipThresh
+//         };
+// }
+
+variable_list ProjectGaussiansCPU::Apply(
                 torch::Tensor means,
                 torch::Tensor scales,
                 float globScale,
@@ -14,80 +101,20 @@ variable_list ProjectGaussians::forward(AutogradContext *ctx,
                 float cy,
                 int imgHeight,
                 int imgWidth,
-                TileBounds tileBounds,
                 float clipThresh
             ){
     
     int numPoints = means.size(0);
 
-    auto t = project_gaussians_forward_tensor(numPoints, means, scales, globScale,
+    auto t = project_gaussians_forward_tensor_cpu(numPoints, means, scales, globScale,
                                               quats, viewMat, projMat, fx, fy,
-                                              cx, cy, imgHeight, imgWidth, tileBounds, clipThresh);
-    torch::Tensor cov3d = std::get<0>(t);
-    torch::Tensor xys = std::get<1>(t);
-    torch::Tensor depths = std::get<2>(t);
-    torch::Tensor radii = std::get<3>(t);
-    torch::Tensor conics = std::get<4>(t);
-    torch::Tensor numTilesHit = std::get<5>(t);
-    torch::Tensor cov2d = std::get<6>(t);
+                                              cx, cy, imgHeight, imgWidth, clipThresh);
+                                              
+    torch::Tensor xys = std::get<0>(t);
+    torch::Tensor radii = std::get<1>(t);
+    torch::Tensor conics = std::get<2>(t);
+    torch::Tensor cov2d = std::get<3>(t);
+    torch::Tensor camDepths = std::get<4>(t);
     
-
-    if (ctx != nullptr){
-        ctx->saved_data["imgHeight"] = imgHeight;
-        ctx->saved_data["imgWidth"] = imgWidth;
-        ctx->saved_data["numPoints"] = numPoints;
-        ctx->saved_data["globScale"] = globScale;
-        ctx->saved_data["fx"] = fx;
-        ctx->saved_data["fy"] = fy;
-        ctx->saved_data["cx"] = cx;
-        ctx->saved_data["cy"] = cy;
-        ctx->save_for_backward({ means, scales, quats, viewMat, projMat, cov3d, radii, conics });
-    }
-
-    return { xys, depths, radii, conics, numTilesHit, cov3d, cov2d };
-}
-
-tensor_list ProjectGaussians::backward(AutogradContext *ctx, tensor_list grad_outputs) {
-    torch::Tensor v_xys = grad_outputs[0];
-    torch::Tensor v_depths = grad_outputs[1];
-    torch::Tensor v_radii = grad_outputs[2];
-    torch::Tensor v_conics = grad_outputs[3];
-    torch::Tensor v_numTiles = grad_outputs[4];
-    torch::Tensor v_cov3d = grad_outputs[5];
-
-    variable_list saved = ctx->get_saved_variables();
-    torch::Tensor means = saved[0];
-    torch::Tensor scales = saved[1];
-    torch::Tensor quats = saved[2];
-    torch::Tensor viewMat = saved[3];
-    torch::Tensor projMat = saved[4];
-    torch::Tensor cov3d = saved[5];
-    torch::Tensor radii = saved[6];
-    torch::Tensor conics = saved[7];
-    
-    auto t = project_gaussians_backward_tensor(ctx->saved_data["numPoints"].toInt(), 
-                                            means, scales, ctx->saved_data["globScale"].toDouble(),
-                                            quats, viewMat, projMat, 
-                                            ctx->saved_data["fx"].toDouble(), ctx->saved_data["fy"].toDouble(),
-                                            ctx->saved_data["cx"].toDouble(), ctx->saved_data["cy"].toDouble(), 
-                                            ctx->saved_data["imgHeight"].toInt(), ctx->saved_data["imgWidth"].toInt(), 
-                                            cov3d, radii,
-                                            conics, v_xys, v_depths, v_conics);
-    torch::Tensor none;
-
-    return {std::get<2>(t), // v_mean
-            std::get<3>(t), // v_scale
-            none, // globScale
-            std::get<4>(t), // v_quat
-            none, // viewMat
-            none, // projMat
-            none, // fx
-            none, // fy
-            none, // cx
-            none, // cy
-            none, // imgHeight
-            none, // imgWidth
-            none, // tileBounds
-            none // clipThresh
-        };
+    return { xys, radii, conics, cov2d, camDepths };
 }
\ No newline at end of file
diff --git a/project_gaussians.hpp b/project_gaussians.hpp
index d84f53f..eb41247 100644
--- a/project_gaussians.hpp
+++ b/project_gaussians.hpp
@@ -6,9 +6,29 @@
 
 using namespace torch::autograd;
 
-class ProjectGaussians : public Function<ProjectGaussians>{
+// class ProjectGaussians : public Function<ProjectGaussians>{
+// public:
+//     static variable_list forward(AutogradContext *ctx, 
+//             torch::Tensor means,
+//             torch::Tensor scales,
+//             float globScale,
+//             torch::Tensor quats,
+//             torch::Tensor viewMat,
+//             torch::Tensor projMat,
+//             float fx,
+//             float fy,
+//             float cx,
+//             float cy,
+//             int imgHeight,
+//             int imgWidth,
+//             TileBounds tileBounds,
+//             float clipThresh = 0.01);
+//     static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs);
+// };
+
+class ProjectGaussiansCPU{
 public:
-    static variable_list forward(AutogradContext *ctx, 
+    static variable_list Apply( 
             torch::Tensor means,
             torch::Tensor scales,
             float globScale,
@@ -21,12 +41,8 @@ class ProjectGaussians : public Function<ProjectGaussians>{
             float cy,
             int imgHeight,
             int imgWidth,
-            TileBounds tileBounds,
             float clipThresh = 0.01);
-    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs);
 };
 
 
-
-
 #endif
\ No newline at end of file
diff --git a/rasterize_gaussians.cpp b/rasterize_gaussians.cpp
index deee76c..f1058f1 100644
--- a/rasterize_gaussians.cpp
+++ b/rasterize_gaussians.cpp
@@ -37,15 +37,120 @@ std::tuple<torch::Tensor,
     return std::make_tuple(isectIds, gaussianIds, isectIdsSorted, gaussianIdsSorted, tileBins);
 }
 
-torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx, 
+// torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx, 
+//             torch::Tensor xys,
+//             torch::Tensor depths,
+//             torch::Tensor radii,
+//             torch::Tensor conics,
+//             torch::Tensor numTilesHit,
+//             torch::Tensor colors,
+//             torch::Tensor opacity,
+//             int imgHeight,
+//             int imgWidth,
+//             torch::Tensor background
+//         ){
+    
+//     int numPoints = xys.size(0);
+
+//     TileBounds tileBounds = std::make_tuple(
+//         (imgWidth + BLOCK_X - 1) / BLOCK_X,
+//         (imgHeight + BLOCK_Y - 1) / BLOCK_Y,
+//         1
+//     );
+//     std::tuple<int, int, int> block = std::make_tuple(BLOCK_X, BLOCK_Y, 1);
+//     std::tuple<int, int, int> imgSize = std::make_tuple(imgWidth, imgHeight, 1);
+    
+//     torch::Tensor cumTilesHit = torch::cumsum(numTilesHit, 0, torch::kInt32);
+//     int numIntersects = cumTilesHit[cumTilesHit.size(0) - 1].item<int>();
+
+//     auto b = binAndSortGaussians(numPoints, numIntersects, xys, depths, radii, cumTilesHit, tileBounds);
+//     torch::Tensor gaussianIdsSorted = std::get<3>(b);
+//     torch::Tensor tileBins = std::get<4>(b);
+
+//     auto t = rasterize_forward_tensor(tileBounds, block, imgSize, 
+//                             gaussianIdsSorted,
+//                             tileBins,
+//                             xys,
+//                             conics,
+//                             colors,
+//                             opacity,
+//                             background);
+//     // Final image
+//     torch::Tensor outImg = std::get<0>(t);
+
+//     // Map of alpha-inverse (1 - finalTs = alpha)
+//     torch::Tensor finalTs = std::get<1>(t);
+
+//     // Map of tile bin IDs
+//     torch::Tensor finalIdx = std::get<2>(t);
+
+//     ctx->saved_data["imgWidth"] = imgWidth;
+//     ctx->saved_data["imgHeight"] = imgHeight;
+//     ctx->save_for_backward({ gaussianIdsSorted, tileBins, xys, conics, colors, opacity, background, finalTs, finalIdx });
+    
+//     return outImg;
+// }
+
+// tensor_list RasterizeGaussians::backward(AutogradContext *ctx, tensor_list grad_outputs) {
+//     torch::Tensor v_outImg = grad_outputs[0];
+//     int imgHeight = ctx->saved_data["imgHeight"].toInt();
+//     int imgWidth = ctx->saved_data["imgWidth"].toInt();
+
+//     variable_list saved = ctx->get_saved_variables();
+//     torch::Tensor gaussianIdsSorted = saved[0];
+//     torch::Tensor tileBins = saved[1];
+//     torch::Tensor xys = saved[2];
+//     torch::Tensor conics = saved[3];
+//     torch::Tensor colors = saved[4];
+//     torch::Tensor opacity = saved[5];
+//     torch::Tensor background = saved[6];
+//     torch::Tensor finalTs = saved[7];
+//     torch::Tensor finalIdx = saved[8];
+
+//     // torch::Tensor v_outAlpha = torch::zeros({imgHeight, imgWidth}, torch::TensorOptions().device(v_outImg.get_device());
+//     torch::Tensor v_outAlpha = torch::zeros_like(v_outImg.index({"...", 0}));
+    
+//     auto t = rasterize_backward_tensor(imgHeight, imgWidth, 
+//                             gaussianIdsSorted,
+//                             tileBins,
+//                             xys,
+//                             conics,
+//                             colors,
+//                             opacity,
+//                             background,
+//                             finalTs,
+//                             finalIdx,
+//                             v_outImg,
+//                             v_outAlpha);
+
+//     torch::Tensor v_xy = std::get<0>(t);
+//     torch::Tensor v_conic = std::get<1>(t);
+//     torch::Tensor v_colors = std::get<2>(t);
+//     torch::Tensor v_opacity = std::get<3>(t);
+//     torch::Tensor none;
+
+//     return { v_xy,
+//             none, // depths
+//             none, // radii
+//             v_conic,
+//             none, // numTilesHit
+//             v_colors,
+//             v_opacity,
+//             none, // imgHeight
+//             none, // imgWidth
+//             none // background
+//     };
+// }
+
+
+torch::Tensor RasterizeGaussiansCPU::forward(AutogradContext *ctx, 
             torch::Tensor xys,
-            torch::Tensor depths,
             torch::Tensor radii,
             torch::Tensor conics,
-            torch::Tensor numTilesHit,
             torch::Tensor colors,
             torch::Tensor opacity,
             torch::Tensor cov2d,
+            torch::Tensor camDepths,
             int imgHeight,
             int imgWidth,
             torch::Tensor background
@@ -53,31 +158,15 @@ torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx,
     
     int numPoints = xys.size(0);
 
-    TileBounds tileBounds = std::make_tuple(
-        (imgWidth + BLOCK_X - 1) / BLOCK_X,
-        (imgHeight + BLOCK_Y - 1) / BLOCK_Y,
-        1
-    );
-    std::tuple<int, int, int> block = std::make_tuple(BLOCK_X, BLOCK_Y, 1);
-    std::tuple<int, int, int> imgSize = std::make_tuple(imgWidth, imgHeight, 1);
-    
-    torch::Tensor cumTilesHit = torch::cumsum(numTilesHit, 0, torch::kInt32);
-    int numIntersects = cumTilesHit[cumTilesHit.size(0) - 1].item<int>();
-
-    auto b = binAndSortGaussians(numPoints, numIntersects, xys, depths, radii, cumTilesHit, tileBounds);
-    torch::Tensor gaussianIdsSorted = std::get<3>(b);
-    torch::Tensor tileBins = std::get<4>(b);
-
-    auto t = rasterize_forward_tensor(tileBounds, block, imgSize, 
-                            gaussianIdsSorted,
-                            tileBins,
+    auto t = rasterize_forward_tensor_cpu(imgWidth, imgHeight, 
                             xys,
                             conics,
                             colors,
                             opacity,
                             background,
                             cov2d,
-                            depths);
+                            camDepths
+                            );
     // Final image
     torch::Tensor outImg = std::get<0>(t);
 
@@ -87,69 +176,68 @@ torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx,
     std::cout << "WROTE";
     exit(1);
 
-
     // Map of alpha-inverse (1 - finalTs = alpha)
     torch::Tensor finalTs = std::get<1>(t);
 
-    // Map of tile bin IDs
+    // Map of gaussian IDs
     torch::Tensor finalIdx = std::get<2>(t);
 
-    if (ctx != nullptr){
-        ctx->saved_data["imgWidth"] = imgWidth;
-        ctx->saved_data["imgHeight"] = imgHeight;
-        ctx->save_for_backward({ gaussianIdsSorted, tileBins, xys, conics, colors, opacity, background, finalTs, finalIdx });
-    }
+    ctx->saved_data["imgWidth"] = imgWidth;
+    ctx->saved_data["imgHeight"] = imgHeight;
+    ctx->save_for_backward({ xys, conics, colors, opacity, background, finalTs, finalIdx });
     
     return outImg;
 }
 
-tensor_list RasterizeGaussians::backward(AutogradContext *ctx, tensor_list grad_outputs) {
-    torch::Tensor v_outImg = grad_outputs[0];
-    int imgHeight = ctx->saved_data["imgHeight"].toInt();
-    int imgWidth = ctx->saved_data["imgWidth"].toInt();
-
-    variable_list saved = ctx->get_saved_variables();
-    torch::Tensor gaussianIdsSorted = saved[0];
-    torch::Tensor tileBins = saved[1];
-    torch::Tensor xys = saved[2];
-    torch::Tensor conics = saved[3];
-    torch::Tensor colors = saved[4];
-    torch::Tensor opacity = saved[5];
-    torch::Tensor background = saved[6];
-    torch::Tensor finalTs = saved[7];
-    torch::Tensor finalIdx = saved[8];
-
-    // torch::Tensor v_outAlpha = torch::zeros({imgHeight, imgWidth}, torch::TensorOptions().device(v_outImg.get_device());
-    torch::Tensor v_outAlpha = torch::zeros_like(v_outImg.index({"...", 0}));
+tensor_list RasterizeGaussiansCPU::backward(AutogradContext *ctx, tensor_list grad_outputs) {
+    // torch::Tensor v_outImg = grad_outputs[0];
+    // int imgHeight = ctx->saved_data["imgHeight"].toInt();
+    // int imgWidth = ctx->saved_data["imgWidth"].toInt();
+
+    // variable_list saved = ctx->get_saved_variables();
+    // torch::Tensor gaussianIdsSorted = saved[0];
+    // torch::Tensor tileBins = saved[1];
+    // torch::Tensor xys = saved[2];
+    // torch::Tensor conics = saved[3];
+    // torch::Tensor colors = saved[4];
+    // torch::Tensor opacity = saved[5];
+    // torch::Tensor background = saved[6];
+    // torch::Tensor finalTs = saved[7];
+    // torch::Tensor finalIdx = saved[8];
+
+    // // torch::Tensor v_outAlpha = torch::zeros({imgHeight, imgWidth}, torch::TensorOptions().device(v_outImg.get_device());
+    // torch::Tensor v_outAlpha = torch::zeros_like(v_outImg.index({"...", 0}));
     
-    auto t = rasterize_backward_tensor(imgHeight, imgWidth, 
-                            gaussianIdsSorted,
-                            tileBins,
-                            xys,
-                            conics,
-                            colors,
-                            opacity,
-                            background,
-                            finalTs,
-                            finalIdx,
-                            v_outImg,
-                            v_outAlpha);
-
-    torch::Tensor v_xy = std::get<0>(t);
-    torch::Tensor v_conic = std::get<1>(t);
-    torch::Tensor v_colors = std::get<2>(t);
-    torch::Tensor v_opacity = std::get<3>(t);
-    torch::Tensor none;
-
-    return { v_xy,
-            none, // depths
-            none, // radii
-            v_conic,
-            none, // numTilesHit
-            v_colors,
-            v_opacity,
-            none, // imgHeight
-            none, // imgWidth
-            none // background
-    };
-}
\ No newline at end of file
+    // auto t = rasterize_backward_tensor(imgHeight, imgWidth, 
+    //                         gaussianIdsSorted,
+    //                         tileBins,
+    //                         xys,
+    //                         conics,
+    //                         colors,
+    //                         opacity,
+    //                         background,
+    //                         finalTs,
+    //                         finalIdx,
+    //                         v_outImg,
+    //                         v_outAlpha);
+
+    // torch::Tensor v_xy = std::get<0>(t);
+    // torch::Tensor v_conic = std::get<1>(t);
+    // torch::Tensor v_colors = std::get<2>(t);
+    // torch::Tensor v_opacity = std::get<3>(t);
+    // torch::Tensor none;
+
+    // return { v_xy,
+    //         none, // depths
+    //         none, // radii
+    //         v_conic,
+    //         none, // numTilesHit
+    //         v_colors,
+    //         v_opacity,
+    //         none, // imgHeight
+    //         none, // imgWidth
+    //         none // background
+    // };
+}
+
+
diff --git a/rasterize_gaussians.hpp b/rasterize_gaussians.hpp
index a97f69d..4d8229a 100644
--- a/rasterize_gaussians.hpp
+++ b/rasterize_gaussians.hpp
@@ -17,17 +17,32 @@ std::tuple<torch::Tensor,
                                             torch::Tensor cumTilesHit,
                                             TileBounds tileBounds);
 
-class RasterizeGaussians : public Function<RasterizeGaussians>{
+// class RasterizeGaussians : public Function<RasterizeGaussians>{
+// public:
+//     static torch::Tensor forward(AutogradContext *ctx, 
+//             torch::Tensor xys,
+//             torch::Tensor depths,
+//             torch::Tensor radii,
+//             torch::Tensor conics,
+//             torch::Tensor numTilesHit,
+//             torch::Tensor colors,
+//             torch::Tensor opacity,
+//             int imgHeight,
+//             int imgWidth,
+//             torch::Tensor background);
+//     static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs);
+// };
+
+class RasterizeGaussiansCPU : public Function<RasterizeGaussiansCPU>{
 public:
     static torch::Tensor forward(AutogradContext *ctx, 
             torch::Tensor xys,
-            torch::Tensor depths,
             torch::Tensor radii,
             torch::Tensor conics,
-            torch::Tensor numTilesHit,
             torch::Tensor colors,
             torch::Tensor opacity,
             torch::Tensor cov2d,
+            torch::Tensor camDepths,
             int imgHeight,
             int imgWidth,
             torch::Tensor background);
diff --git a/simple_trainer.cpp b/simple_trainer.cpp
index 38660b1..495bd3e 100644
--- a/simple_trainer.cpp
+++ b/simple_trainer.cpp
@@ -23,9 +23,6 @@ using namespace torch::indexing;
 
 
 
-
-
-
 int main(int argc, char **argv){
     int width = 128,
         height = 128;
@@ -34,7 +31,7 @@ int main(int argc, char **argv){
     float learningRate = 0.01;
 
     torch::Device device = torch::kCPU;
-    if (torch::cuda::is_available()) {
+    if (torch::cuda::is_available() && !(argc == 2 && std::string(argv[1]) == "--cpu")){
         std::cout << "Using CUDA" << std::endl;
         device = torch::kCUDA;
     }else{
@@ -108,24 +105,44 @@ int main(int argc, char **argv){
     torch::nn::MSELoss mseLoss;
 
     for (size_t i = 0; i < iterations; i++){
-        auto p = ProjectGaussians::forward(nullptr, means, scales, 1, 
+        // auto p = ProjectGaussians::Apply(means, scales, 1, 
+        //                         quats, viewMat, viewMat,
+        //                         focal, focal,
+        //                         width / 2,
+        //                         height / 2,
+        //                         height,
+        //                         width,
+        //                         tileBounds);
+
+        // torch::Tensor outImg = RasterizeGaussians::apply(
+        //     p[0], // xys
+        //     p[1], // depths
+        //     p[2], // radii,
+        //     p[3], // conics
+        //     p[4], // numTilesHit
+        //     torch::sigmoid(rgbs),
+        //     torch::sigmoid(opacities),
+        //     p[6], // cov2d
+        //     height,
+        //     width,
+        //     background);
+
+        auto p = ProjectGaussiansCPU::Apply(means, scales, 1, 
                                 quats, viewMat, viewMat,
                                 focal, focal,
                                 width / 2,
                                 height / 2,
                                 height,
-                                width,
-                                tileBounds);
+                                width);
 
-        torch::Tensor outImg = RasterizeGaussians::forward(nullptr,
+        torch::Tensor outImg = RasterizeGaussiansCPU::apply(
             p[0], // xys
-            p[1], // depths
-            p[2], // radii,
-            p[3], // conics
-            p[4], // numTilesHit
+            p[1], // radii,
+            p[2], // conics
             torch::sigmoid(rgbs),
             torch::sigmoid(opacities),
-            p[6], // cov2d
+            p[3], // cov2d
+            p[4], // camDepths
             height,
             width,
             background);
diff --git a/tile_bounds.hpp b/tile_bounds.hpp
index 7b29dc2..4c1330b 100644
--- a/tile_bounds.hpp
+++ b/tile_bounds.hpp
@@ -1,3 +1,8 @@
+#ifndef TILE_BOUNDS_H
+#define TILE_BOUNDS_H
+
 #include <tuple> 
 
 typedef std::tuple<int, int, int> TileBounds; 
+
+#endif
\ No newline at end of file
diff --git a/vendor/gsplat-cpu/bindings.h b/vendor/gsplat-cpu/bindings.h
index f9ccd8f..5f90386 100644
--- a/vendor/gsplat-cpu/bindings.h
+++ b/vendor/gsplat-cpu/bindings.h
@@ -27,10 +27,8 @@ std::tuple<
     torch::Tensor,
     torch::Tensor,
     torch::Tensor,
-    torch::Tensor,
-    torch::Tensor,
     torch::Tensor>
-project_gaussians_forward_tensor(
+project_gaussians_forward_tensor_cpu(
     const int num_points,
     torch::Tensor &means3d,
     torch::Tensor &scales,
@@ -44,7 +42,6 @@ project_gaussians_forward_tensor(
     const float cy,
     const unsigned img_height,
     const unsigned img_width,
-    const std::tuple<int, int, int> tile_bounds,
     const float clip_thresh
 );
 
@@ -96,19 +93,16 @@ std::tuple<
     torch::Tensor,
     torch::Tensor,
     torch::Tensor
-> rasterize_forward_tensor(
-    const std::tuple<int, int, int> tile_bounds,
-    const std::tuple<int, int, int> block,
-    const std::tuple<int, int, int> img_size,
-    const torch::Tensor &gaussian_ids_sorted,
-    const torch::Tensor &tile_bins,
+> rasterize_forward_tensor_cpu(
+    const int width,
+    const int height,
     const torch::Tensor &xys,
     const torch::Tensor &conics,
     const torch::Tensor &colors,
     const torch::Tensor &opacities,
     const torch::Tensor &background,
     const torch::Tensor &cov2d,
-    const torch::Tensor &depths
+    const torch::Tensor &camDepths
 );
 
 std::
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index 8e0a4c0..893a1d6 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -80,10 +80,8 @@ std::tuple<
     torch::Tensor,
     torch::Tensor,
     torch::Tensor,
-    torch::Tensor,
-    torch::Tensor,
     torch::Tensor>
-project_gaussians_forward_tensor(
+project_gaussians_forward_tensor_cpu(
     const int num_points,
     torch::Tensor &means3d,
     torch::Tensor &scales,
@@ -97,7 +95,6 @@ project_gaussians_forward_tensor(
     const float cy,
     const unsigned img_height,
     const unsigned img_width,
-    const std::tuple<int, int, int> tile_bounds,
     const float clip_thresh
 ){
     float fovx = 0.5f * static_cast<float>(img_width) / fx;
@@ -169,18 +166,10 @@ project_gaussians_forward_tensor(
     torch::Tensor v = 0.5f * ((pProj.index({"...", 1}) + 1.0f) * static_cast<float>(img_width) - 1.0f);
     torch::Tensor xys = torch::stack({u, v}, -1); // center
 
-    auto bbox = getTileBbox(xys, radius, tile_bounds);
-    torch::Tensor tileMin = std::get<0>(bbox);
-    torch::Tensor tileMax = std::get<1>(bbox);
-    torch::Tensor numTilesHit = (tileMax.index({"...", 0}) - tileMin.index({"...", 0})) * 
-                   (tileMax.index({"...", 1}) - tileMin.index({"...", 1}));
-
-    torch::Tensor depths = pView.index({"...", 2});
     torch::Tensor radii = radius.to(torch::kInt32);
+    torch::Tensor camDepths = pProj.index({"...", 2});
 
-    // TODO: compute camDepths as pProj.index({"...", 2});
-
-    return std::make_tuple(cov3d, xys, depths, radii, conic, numTilesHit, cov2d );
+    return std::make_tuple(xys, radii, conic, cov2d, camDepths);
 }
 
 std::tuple<
@@ -297,27 +286,22 @@ std::tuple<
     torch::Tensor,
     torch::Tensor,
     torch::Tensor
-> rasterize_forward_tensor(
-    const std::tuple<int, int, int> tile_bounds,
-    const std::tuple<int, int, int> block,
-    const std::tuple<int, int, int> img_size,
-    const torch::Tensor &gaussian_ids_sorted,
-    const torch::Tensor &tile_bins,
+> rasterize_forward_tensor_cpu(
+    const int width,
+    const int height,
     const torch::Tensor &xys,
     const torch::Tensor &conics,
     const torch::Tensor &colors,
     const torch::Tensor &opacities,
     const torch::Tensor &background,
     const torch::Tensor &cov2d,
-    const torch::Tensor &depths
+    const torch::Tensor &camDepths
 ){
-    // torch::NoGradGuard noGrad;
+    torch::NoGradGuard noGrad;
 
     int channels = colors.size(1);
-    int width = std::get<1>(img_size);
-    int height = std::get<0>(img_size);
     int numPoints = xys.size(0);
-    float *pDepths = static_cast<float *>(depths.data_ptr());
+    float *pDepths = static_cast<float *>(camDepths.data_ptr());
 
     std::vector< size_t > gIndices( numPoints );
     std::iota( gIndices.begin(), gIndices.end(), 0 );
@@ -338,7 +322,6 @@ std::tuple<
     torch::Tensor sqCov2dX = 3.0f * torch::sqrt(cov2d.index({"...", 0, 0}));
     torch::Tensor sqCov2dY = 3.0f * torch::sqrt(cov2d.index({"...", 1, 1}));
     
-    int32_t *pGaussianIds = static_cast<int32_t*>(gaussian_ids_sorted.data_ptr());
     float *pConics = static_cast<float *>(conics.data_ptr());
     float *pCenters = static_cast<float *>(xys.data_ptr());
     float *pSqCov2dX = static_cast<float *>(sqCov2dX.data_ptr());

From 25e724447a28f8ea11d53cf39f49cf04068cad3c Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Sun, 17 Mar 2024 17:52:41 +0000
Subject: [PATCH 11/19] Simultaneous GPU+CPU build

---
 project_gaussians.cpp            | 156 +++++++++---------
 project_gaussians.hpp            |  38 ++---
 rasterize_gaussians.cpp          | 208 ++++++++++++------------
 rasterize_gaussians.hpp          |  30 ++--
 simple_trainer.cpp               | 112 +++++++------
 vendor/gsplat-cpu/bindings.h     |  62 +------
 vendor/gsplat-cpu/gsplat_cpu.cpp | 269 +------------------------------
 7 files changed, 287 insertions(+), 588 deletions(-)

diff --git a/project_gaussians.cpp b/project_gaussians.cpp
index 160ad3f..895ee82 100644
--- a/project_gaussians.cpp
+++ b/project_gaussians.cpp
@@ -1,92 +1,92 @@
 #include "project_gaussians.hpp"
 #include "gsplat.hpp"
 
-// variable_list ProjectGaussians::forward(AutogradContext *ctx, 
-//                 torch::Tensor means,
-//                 torch::Tensor scales,
-//                 float globScale,
-//                 torch::Tensor quats,
-//                 torch::Tensor viewMat,
-//                 torch::Tensor projMat,
-//                 float fx,
-//                 float fy,
-//                 float cx,
-//                 float cy,
-//                 int imgHeight,
-//                 int imgWidth,
-//                 TileBounds tileBounds,
-//                 float clipThresh
-//             ){
+variable_list ProjectGaussians::forward(AutogradContext *ctx, 
+                torch::Tensor means,
+                torch::Tensor scales,
+                float globScale,
+                torch::Tensor quats,
+                torch::Tensor viewMat,
+                torch::Tensor projMat,
+                float fx,
+                float fy,
+                float cx,
+                float cy,
+                int imgHeight,
+                int imgWidth,
+                TileBounds tileBounds,
+                float clipThresh
+            ){
     
-//     int numPoints = means.size(0);
+    int numPoints = means.size(0);
 
-//     auto t = project_gaussians_forward_tensor(numPoints, means, scales, globScale,
-//                                               quats, viewMat, projMat, fx, fy,
-//                                               cx, cy, imgHeight, imgWidth, tileBounds, clipThresh);
-//     torch::Tensor cov3d = std::get<0>(t);
-//     torch::Tensor xys = std::get<1>(t);
-//     torch::Tensor depths = std::get<2>(t);
-//     torch::Tensor radii = std::get<3>(t);
-//     torch::Tensor conics = std::get<4>(t);
-//     torch::Tensor numTilesHit = std::get<5>(t);
+    auto t = project_gaussians_forward_tensor(numPoints, means, scales, globScale,
+                                              quats, viewMat, projMat, fx, fy,
+                                              cx, cy, imgHeight, imgWidth, tileBounds, clipThresh);
+    torch::Tensor cov3d = std::get<0>(t);
+    torch::Tensor xys = std::get<1>(t);
+    torch::Tensor depths = std::get<2>(t);
+    torch::Tensor radii = std::get<3>(t);
+    torch::Tensor conics = std::get<4>(t);
+    torch::Tensor numTilesHit = std::get<5>(t);
     
-//     ctx->saved_data["imgHeight"] = imgHeight;
-//     ctx->saved_data["imgWidth"] = imgWidth;
-//     ctx->saved_data["numPoints"] = numPoints;
-//     ctx->saved_data["globScale"] = globScale;
-//     ctx->saved_data["fx"] = fx;
-//     ctx->saved_data["fy"] = fy;
-//     ctx->saved_data["cx"] = cx;
-//     ctx->saved_data["cy"] = cy;
-//     ctx->save_for_backward({ means, scales, quats, viewMat, projMat, cov3d, radii, conics });
+    ctx->saved_data["imgHeight"] = imgHeight;
+    ctx->saved_data["imgWidth"] = imgWidth;
+    ctx->saved_data["numPoints"] = numPoints;
+    ctx->saved_data["globScale"] = globScale;
+    ctx->saved_data["fx"] = fx;
+    ctx->saved_data["fy"] = fy;
+    ctx->saved_data["cx"] = cx;
+    ctx->saved_data["cy"] = cy;
+    ctx->save_for_backward({ means, scales, quats, viewMat, projMat, cov3d, radii, conics });
 
-//     return { xys, depths, radii, conics, numTilesHit, cov3d };
-// }
+    return { xys, depths, radii, conics, numTilesHit, cov3d };
+}
 
-// tensor_list ProjectGaussians::backward(AutogradContext *ctx, tensor_list grad_outputs) {
-//     torch::Tensor v_xys = grad_outputs[0];
-//     torch::Tensor v_depths = grad_outputs[1];
-//     torch::Tensor v_radii = grad_outputs[2];
-//     torch::Tensor v_conics = grad_outputs[3];
-//     torch::Tensor v_numTiles = grad_outputs[4];
-//     torch::Tensor v_cov3d = grad_outputs[5];
+tensor_list ProjectGaussians::backward(AutogradContext *ctx, tensor_list grad_outputs) {
+    torch::Tensor v_xys = grad_outputs[0];
+    torch::Tensor v_depths = grad_outputs[1];
+    torch::Tensor v_radii = grad_outputs[2];
+    torch::Tensor v_conics = grad_outputs[3];
+    torch::Tensor v_numTiles = grad_outputs[4];
+    torch::Tensor v_cov3d = grad_outputs[5];
 
-//     variable_list saved = ctx->get_saved_variables();
-//     torch::Tensor means = saved[0];
-//     torch::Tensor scales = saved[1];
-//     torch::Tensor quats = saved[2];
-//     torch::Tensor viewMat = saved[3];
-//     torch::Tensor projMat = saved[4];
-//     torch::Tensor cov3d = saved[5];
-//     torch::Tensor radii = saved[6];
-//     torch::Tensor conics = saved[7];
+    variable_list saved = ctx->get_saved_variables();
+    torch::Tensor means = saved[0];
+    torch::Tensor scales = saved[1];
+    torch::Tensor quats = saved[2];
+    torch::Tensor viewMat = saved[3];
+    torch::Tensor projMat = saved[4];
+    torch::Tensor cov3d = saved[5];
+    torch::Tensor radii = saved[6];
+    torch::Tensor conics = saved[7];
     
-//     auto t = project_gaussians_backward_tensor(ctx->saved_data["numPoints"].toInt(), 
-//                                             means, scales, ctx->saved_data["globScale"].toDouble(),
-//                                             quats, viewMat, projMat, 
-//                                             ctx->saved_data["fx"].toDouble(), ctx->saved_data["fy"].toDouble(),
-//                                             ctx->saved_data["cx"].toDouble(), ctx->saved_data["cy"].toDouble(), 
-//                                             ctx->saved_data["imgHeight"].toInt(), ctx->saved_data["imgWidth"].toInt(), 
-//                                             cov3d, radii,
-//                                             conics, v_xys, v_depths, v_conics);
-//     torch::Tensor none;
+    auto t = project_gaussians_backward_tensor(ctx->saved_data["numPoints"].toInt(), 
+                                            means, scales, ctx->saved_data["globScale"].toDouble(),
+                                            quats, viewMat, projMat, 
+                                            ctx->saved_data["fx"].toDouble(), ctx->saved_data["fy"].toDouble(),
+                                            ctx->saved_data["cx"].toDouble(), ctx->saved_data["cy"].toDouble(), 
+                                            ctx->saved_data["imgHeight"].toInt(), ctx->saved_data["imgWidth"].toInt(), 
+                                            cov3d, radii,
+                                            conics, v_xys, v_depths, v_conics);
+    torch::Tensor none;
 
-//     return {std::get<2>(t), // v_mean
-//             std::get<3>(t), // v_scale
-//             none, // globScale
-//             std::get<4>(t), // v_quat
-//             none, // viewMat
-//             none, // projMat
-//             none, // fx
-//             none, // fy
-//             none, // cx
-//             none, // cy
-//             none, // imgHeight
-//             none, // imgWidth
-//             none, // tileBounds
-//             none // clipThresh
-//         };
-// }
+    return {std::get<2>(t), // v_mean
+            std::get<3>(t), // v_scale
+            none, // globScale
+            std::get<4>(t), // v_quat
+            none, // viewMat
+            none, // projMat
+            none, // fx
+            none, // fy
+            none, // cx
+            none, // cy
+            none, // imgHeight
+            none, // imgWidth
+            none, // tileBounds
+            none // clipThresh
+        };
+}
 
 variable_list ProjectGaussiansCPU::Apply(
                 torch::Tensor means,
diff --git a/project_gaussians.hpp b/project_gaussians.hpp
index eb41247..b6c7e6c 100644
--- a/project_gaussians.hpp
+++ b/project_gaussians.hpp
@@ -6,25 +6,25 @@
 
 using namespace torch::autograd;
 
-// class ProjectGaussians : public Function<ProjectGaussians>{
-// public:
-//     static variable_list forward(AutogradContext *ctx, 
-//             torch::Tensor means,
-//             torch::Tensor scales,
-//             float globScale,
-//             torch::Tensor quats,
-//             torch::Tensor viewMat,
-//             torch::Tensor projMat,
-//             float fx,
-//             float fy,
-//             float cx,
-//             float cy,
-//             int imgHeight,
-//             int imgWidth,
-//             TileBounds tileBounds,
-//             float clipThresh = 0.01);
-//     static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs);
-// };
+class ProjectGaussians : public Function<ProjectGaussians>{
+public:
+    static variable_list forward(AutogradContext *ctx, 
+            torch::Tensor means,
+            torch::Tensor scales,
+            float globScale,
+            torch::Tensor quats,
+            torch::Tensor viewMat,
+            torch::Tensor projMat,
+            float fx,
+            float fy,
+            float cx,
+            float cy,
+            int imgHeight,
+            int imgWidth,
+            TileBounds tileBounds,
+            float clipThresh = 0.01);
+    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs);
+};
 
 class ProjectGaussiansCPU{
 public:
diff --git a/rasterize_gaussians.cpp b/rasterize_gaussians.cpp
index f1058f1..976d28a 100644
--- a/rasterize_gaussians.cpp
+++ b/rasterize_gaussians.cpp
@@ -37,110 +37,116 @@ std::tuple<torch::Tensor,
     return std::make_tuple(isectIds, gaussianIds, isectIdsSorted, gaussianIdsSorted, tileBins);
 }
 
-// torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx, 
-//             torch::Tensor xys,
-//             torch::Tensor depths,
-//             torch::Tensor radii,
-//             torch::Tensor conics,
-//             torch::Tensor numTilesHit,
-//             torch::Tensor colors,
-//             torch::Tensor opacity,
-//             int imgHeight,
-//             int imgWidth,
-//             torch::Tensor background
-//         ){
+torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx, 
+            torch::Tensor xys,
+            torch::Tensor depths,
+            torch::Tensor radii,
+            torch::Tensor conics,
+            torch::Tensor numTilesHit,
+            torch::Tensor colors,
+            torch::Tensor opacity,
+            int imgHeight,
+            int imgWidth,
+            torch::Tensor background
+        ){
+    
+    int numPoints = xys.size(0);
+
+    TileBounds tileBounds = std::make_tuple(
+        (imgWidth + BLOCK_X - 1) / BLOCK_X,
+        (imgHeight + BLOCK_Y - 1) / BLOCK_Y,
+        1
+    );
+    std::tuple<int, int, int> block = std::make_tuple(BLOCK_X, BLOCK_Y, 1);
+    std::tuple<int, int, int> imgSize = std::make_tuple(imgWidth, imgHeight, 1);
     
-//     int numPoints = xys.size(0);
-
-//     TileBounds tileBounds = std::make_tuple(
-//         (imgWidth + BLOCK_X - 1) / BLOCK_X,
-//         (imgHeight + BLOCK_Y - 1) / BLOCK_Y,
-//         1
-//     );
-//     std::tuple<int, int, int> block = std::make_tuple(BLOCK_X, BLOCK_Y, 1);
-//     std::tuple<int, int, int> imgSize = std::make_tuple(imgWidth, imgHeight, 1);
+    torch::Tensor cumTilesHit = torch::cumsum(numTilesHit, 0, torch::kInt32);
+    int numIntersects = cumTilesHit[cumTilesHit.size(0) - 1].item<int>();
+
+    auto b = binAndSortGaussians(numPoints, numIntersects, xys, depths, radii, cumTilesHit, tileBounds);
+    torch::Tensor gaussianIdsSorted = std::get<3>(b);
+    torch::Tensor tileBins = std::get<4>(b);
+
+    auto t = rasterize_forward_tensor(tileBounds, block, imgSize, 
+                            gaussianIdsSorted,
+                            tileBins,
+                            xys,
+                            conics,
+                            colors,
+                            opacity,
+                            background);
+    // Final image
+    torch::Tensor outImg = std::get<0>(t);
     
-//     torch::Tensor cumTilesHit = torch::cumsum(numTilesHit, 0, torch::kInt32);
-//     int numIntersects = cumTilesHit[cumTilesHit.size(0) - 1].item<int>();
-
-//     auto b = binAndSortGaussians(numPoints, numIntersects, xys, depths, radii, cumTilesHit, tileBounds);
-//     torch::Tensor gaussianIdsSorted = std::get<3>(b);
-//     torch::Tensor tileBins = std::get<4>(b);
-
-//     auto t = rasterize_forward_tensor(tileBounds, block, imgSize, 
-//                             gaussianIdsSorted,
-//                             tileBins,
-//                             xys,
-//                             conics,
-//                             colors,
-//                             opacity,
-//                             background);
-//     // Final image
-//     torch::Tensor outImg = std::get<0>(t);
-
-//     // Map of alpha-inverse (1 - finalTs = alpha)
-//     torch::Tensor finalTs = std::get<1>(t);
-
-//     // Map of tile bin IDs
-//     torch::Tensor finalIdx = std::get<2>(t);
-
-//     ctx->saved_data["imgWidth"] = imgWidth;
-//     ctx->saved_data["imgHeight"] = imgHeight;
-//     ctx->save_for_backward({ gaussianIdsSorted, tileBins, xys, conics, colors, opacity, background, finalTs, finalIdx });
+    cv::Mat image = tensorToImage(outImg.detach().cpu());
+    cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
+    cv::imwrite("testcuda.png", image);
+    std::cout << "WROTE! " << imgWidth << "x" << imgHeight;
+    exit(1);
+
+    // Map of alpha-inverse (1 - finalTs = alpha)
+    torch::Tensor finalTs = std::get<1>(t);
+
+    // Map of tile bin IDs
+    torch::Tensor finalIdx = std::get<2>(t);
+
+    ctx->saved_data["imgWidth"] = imgWidth;
+    ctx->saved_data["imgHeight"] = imgHeight;
+    ctx->save_for_backward({ gaussianIdsSorted, tileBins, xys, conics, colors, opacity, background, finalTs, finalIdx });
     
-//     return outImg;
-// }
-
-// tensor_list RasterizeGaussians::backward(AutogradContext *ctx, tensor_list grad_outputs) {
-//     torch::Tensor v_outImg = grad_outputs[0];
-//     int imgHeight = ctx->saved_data["imgHeight"].toInt();
-//     int imgWidth = ctx->saved_data["imgWidth"].toInt();
-
-//     variable_list saved = ctx->get_saved_variables();
-//     torch::Tensor gaussianIdsSorted = saved[0];
-//     torch::Tensor tileBins = saved[1];
-//     torch::Tensor xys = saved[2];
-//     torch::Tensor conics = saved[3];
-//     torch::Tensor colors = saved[4];
-//     torch::Tensor opacity = saved[5];
-//     torch::Tensor background = saved[6];
-//     torch::Tensor finalTs = saved[7];
-//     torch::Tensor finalIdx = saved[8];
-
-//     // torch::Tensor v_outAlpha = torch::zeros({imgHeight, imgWidth}, torch::TensorOptions().device(v_outImg.get_device());
-//     torch::Tensor v_outAlpha = torch::zeros_like(v_outImg.index({"...", 0}));
+    return outImg;
+}
+
+tensor_list RasterizeGaussians::backward(AutogradContext *ctx, tensor_list grad_outputs) {
+    torch::Tensor v_outImg = grad_outputs[0];
+    int imgHeight = ctx->saved_data["imgHeight"].toInt();
+    int imgWidth = ctx->saved_data["imgWidth"].toInt();
+
+    variable_list saved = ctx->get_saved_variables();
+    torch::Tensor gaussianIdsSorted = saved[0];
+    torch::Tensor tileBins = saved[1];
+    torch::Tensor xys = saved[2];
+    torch::Tensor conics = saved[3];
+    torch::Tensor colors = saved[4];
+    torch::Tensor opacity = saved[5];
+    torch::Tensor background = saved[6];
+    torch::Tensor finalTs = saved[7];
+    torch::Tensor finalIdx = saved[8];
+
+    // torch::Tensor v_outAlpha = torch::zeros({imgHeight, imgWidth}, torch::TensorOptions().device(v_outImg.get_device());
+    torch::Tensor v_outAlpha = torch::zeros_like(v_outImg.index({"...", 0}));
     
-//     auto t = rasterize_backward_tensor(imgHeight, imgWidth, 
-//                             gaussianIdsSorted,
-//                             tileBins,
-//                             xys,
-//                             conics,
-//                             colors,
-//                             opacity,
-//                             background,
-//                             finalTs,
-//                             finalIdx,
-//                             v_outImg,
-//                             v_outAlpha);
-
-//     torch::Tensor v_xy = std::get<0>(t);
-//     torch::Tensor v_conic = std::get<1>(t);
-//     torch::Tensor v_colors = std::get<2>(t);
-//     torch::Tensor v_opacity = std::get<3>(t);
-//     torch::Tensor none;
-
-//     return { v_xy,
-//             none, // depths
-//             none, // radii
-//             v_conic,
-//             none, // numTilesHit
-//             v_colors,
-//             v_opacity,
-//             none, // imgHeight
-//             none, // imgWidth
-//             none // background
-//     };
-// }
+    auto t = rasterize_backward_tensor(imgHeight, imgWidth, 
+                            gaussianIdsSorted,
+                            tileBins,
+                            xys,
+                            conics,
+                            colors,
+                            opacity,
+                            background,
+                            finalTs,
+                            finalIdx,
+                            v_outImg,
+                            v_outAlpha);
+
+    torch::Tensor v_xy = std::get<0>(t);
+    torch::Tensor v_conic = std::get<1>(t);
+    torch::Tensor v_colors = std::get<2>(t);
+    torch::Tensor v_opacity = std::get<3>(t);
+    torch::Tensor none;
+
+    return { v_xy,
+            none, // depths
+            none, // radii
+            v_conic,
+            none, // numTilesHit
+            v_colors,
+            v_opacity,
+            none, // imgHeight
+            none, // imgWidth
+            none // background
+    };
+}
 
 
 torch::Tensor RasterizeGaussiansCPU::forward(AutogradContext *ctx, 
@@ -173,7 +179,7 @@ torch::Tensor RasterizeGaussiansCPU::forward(AutogradContext *ctx,
     cv::Mat image = tensorToImage(outImg.detach().cpu());
     cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
     cv::imwrite("test.png", image);
-    std::cout << "WROTE";
+    std::cout << "WROTE " << imgWidth << "x" << imgHeight;
     exit(1);
 
     // Map of alpha-inverse (1 - finalTs = alpha)
diff --git a/rasterize_gaussians.hpp b/rasterize_gaussians.hpp
index 4d8229a..aa21a0b 100644
--- a/rasterize_gaussians.hpp
+++ b/rasterize_gaussians.hpp
@@ -17,21 +17,21 @@ std::tuple<torch::Tensor,
                                             torch::Tensor cumTilesHit,
                                             TileBounds tileBounds);
 
-// class RasterizeGaussians : public Function<RasterizeGaussians>{
-// public:
-//     static torch::Tensor forward(AutogradContext *ctx, 
-//             torch::Tensor xys,
-//             torch::Tensor depths,
-//             torch::Tensor radii,
-//             torch::Tensor conics,
-//             torch::Tensor numTilesHit,
-//             torch::Tensor colors,
-//             torch::Tensor opacity,
-//             int imgHeight,
-//             int imgWidth,
-//             torch::Tensor background);
-//     static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs);
-// };
+class RasterizeGaussians : public Function<RasterizeGaussians>{
+public:
+    static torch::Tensor forward(AutogradContext *ctx, 
+            torch::Tensor xys,
+            torch::Tensor depths,
+            torch::Tensor radii,
+            torch::Tensor conics,
+            torch::Tensor numTilesHit,
+            torch::Tensor colors,
+            torch::Tensor opacity,
+            int imgHeight,
+            int imgWidth,
+            torch::Tensor background);
+    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs);
+};
 
 class RasterizeGaussiansCPU : public Function<RasterizeGaussiansCPU>{
 public:
diff --git a/simple_trainer.cpp b/simple_trainer.cpp
index 495bd3e..70261f6 100644
--- a/simple_trainer.cpp
+++ b/simple_trainer.cpp
@@ -4,12 +4,10 @@
 #include <torch/torch.h>
 #ifdef USE_HIP
 #include <hip/hip_runtime.h>
-#else
+#elif defined(USE_CUDA)
 #include <torch/cuda.h>
 #endif
 
-#include <torch/cuda.h>
-
 #include <opencv2/core/core.hpp>
 #include <opencv2/imgcodecs.hpp>
 #include <opencv2/imgproc.hpp>
@@ -24,8 +22,8 @@ using namespace torch::indexing;
 
 
 int main(int argc, char **argv){
-    int width = 128,
-        height = 128;
+    int width = 256,
+        height = 256;
     int numPoints = 100000;
     int iterations = 1000;
     float learningRate = 0.01;
@@ -66,16 +64,36 @@ int main(int argc, char **argv){
 #endif
     torch::manual_seed(0);
 
+    // TODO: remove
     // Random points, scales and colors
-    torch::Tensor means = 2.0 * (torch::rand({numPoints, 3}, device) - 0.5); // Positions [-1, 1]
-    torch::Tensor scales = torch::rand({numPoints, 3}, device);
-    torch::Tensor rgbs = torch::rand({numPoints, 3}, device);
+    torch::Tensor means = 2.0 * (torch::rand({numPoints, 3}, torch::kCPU) - 0.5); // Positions [-1, 1]
+    torch::Tensor scales = torch::rand({numPoints, 3}, torch::kCPU);
+    torch::Tensor rgbs = torch::rand({numPoints, 3}, torch::kCPU);
     
     // Random rotations (quaternions)
     // quats = ( sqrt(1-u) sin(2πv), sqrt(1-u) cos(2πv), sqrt(u) sin(2πw), sqrt(u) cos(2πw))
-    torch::Tensor u = torch::rand({numPoints, 1}, device);
-    torch::Tensor v = torch::rand({numPoints, 1}, device);
-    torch::Tensor w = torch::rand({numPoints, 1}, device);
+    torch::Tensor u = torch::rand({numPoints, 1}, torch::kCPU);
+    torch::Tensor v = torch::rand({numPoints, 1}, torch::kCPU);
+    torch::Tensor w = torch::rand({numPoints, 1}, torch::kCPU);
+
+    means = means.to(device);
+    scales = scales.to(device);
+    rgbs = rgbs.to(device);
+    u = u.to(device);
+    v = v.to(device);
+    w = w.to(device);    
+
+    // TODO: uncomment
+    // // Random points, scales and colors
+    // torch::Tensor means = 2.0 * (torch::rand({numPoints, 3}, device) - 0.5); // Positions [-1, 1]
+    // torch::Tensor scales = torch::rand({numPoints, 3}, device);
+    // torch::Tensor rgbs = torch::rand({numPoints, 3}, device);
+    
+    // // Random rotations (quaternions)
+    // // quats = ( sqrt(1-u) sin(2πv), sqrt(1-u) cos(2πv), sqrt(u) sin(2πw), sqrt(u) cos(2πw))
+    // torch::Tensor u = torch::rand({numPoints, 1}, device);
+    // torch::Tensor v = torch::rand({numPoints, 1}, device);
+    // torch::Tensor w = torch::rand({numPoints, 1}, device);
     torch::Tensor quats = torch::cat({
                 torch::sqrt(1.0 - u) * torch::sin(2.0 * PI * v),
                 torch::sqrt(1.0 - u) * torch::cos(2.0 * PI * v),
@@ -103,31 +121,11 @@ int main(int argc, char **argv){
 
     torch::optim::Adam optimizer({rgbs, means, scales, opacities, quats}, learningRate);
     torch::nn::MSELoss mseLoss;
+    torch::Tensor outImg;
 
     for (size_t i = 0; i < iterations; i++){
-        // auto p = ProjectGaussians::Apply(means, scales, 1, 
-        //                         quats, viewMat, viewMat,
-        //                         focal, focal,
-        //                         width / 2,
-        //                         height / 2,
-        //                         height,
-        //                         width,
-        //                         tileBounds);
-
-        // torch::Tensor outImg = RasterizeGaussians::apply(
-        //     p[0], // xys
-        //     p[1], // depths
-        //     p[2], // radii,
-        //     p[3], // conics
-        //     p[4], // numTilesHit
-        //     torch::sigmoid(rgbs),
-        //     torch::sigmoid(opacities),
-        //     p[6], // cov2d
-        //     height,
-        //     width,
-        //     background);
-
-        auto p = ProjectGaussiansCPU::Apply(means, scales, 1, 
+        if (device == torch::kCPU){
+            auto p = ProjectGaussiansCPU::Apply(means, scales, 1, 
                                 quats, viewMat, viewMat,
                                 focal, focal,
                                 width / 2,
@@ -135,18 +133,40 @@ int main(int argc, char **argv){
                                 height,
                                 width);
 
-        torch::Tensor outImg = RasterizeGaussiansCPU::apply(
-            p[0], // xys
-            p[1], // radii,
-            p[2], // conics
-            torch::sigmoid(rgbs),
-            torch::sigmoid(opacities),
-            p[3], // cov2d
-            p[4], // camDepths
-            height,
-            width,
-            background);
-        
+            torch::Tensor outImg = RasterizeGaussiansCPU::apply(
+                p[0], // xys
+                p[1], // radii,
+                p[2], // conics
+                torch::sigmoid(rgbs),
+                torch::sigmoid(opacities),
+                p[3], // cov2d
+                p[4], // camDepths
+                height,
+                width,
+                background);
+        }else{
+            auto p = ProjectGaussians::apply(means, scales, 1, 
+                                    quats, viewMat, viewMat,
+                                    focal, focal,
+                                    width / 2,
+                                    height / 2,
+                                    height,
+                                    width,
+                                    tileBounds);
+
+            torch::Tensor outImg = RasterizeGaussians::apply(
+                p[0], // xys
+                p[1], // depths
+                p[2], // radii,
+                p[3], // conics
+                p[4], // numTilesHit
+                torch::sigmoid(rgbs),
+                torch::sigmoid(opacities),
+                height,
+                width,
+                background);
+        }
+
         outImg.requires_grad_();
         torch::Tensor loss = mseLoss(outImg, gtImage);
         optimizer.zero_grad();
diff --git a/vendor/gsplat-cpu/bindings.h b/vendor/gsplat-cpu/bindings.h
index 5f90386..6b16b6e 100644
--- a/vendor/gsplat-cpu/bindings.h
+++ b/vendor/gsplat-cpu/bindings.h
@@ -6,22 +6,6 @@
 #include <tuple>
 #include <torch/all.h>
 
-torch::Tensor compute_sh_forward_tensor(
-    unsigned num_points,
-    unsigned degree,
-    unsigned degrees_to_use,
-    torch::Tensor &viewdirs,
-    torch::Tensor &coeffs
-);
-
-torch::Tensor compute_sh_backward_tensor(
-    unsigned num_points,
-    unsigned degree,
-    unsigned degrees_to_use,
-    torch::Tensor &viewdirs,
-    torch::Tensor &v_colors
-);
-
 std::tuple<
     torch::Tensor,
     torch::Tensor,
@@ -45,50 +29,6 @@ project_gaussians_forward_tensor_cpu(
     const float clip_thresh
 );
 
-std::tuple<
-    torch::Tensor,
-    torch::Tensor,
-    torch::Tensor,
-    torch::Tensor,
-    torch::Tensor>
-project_gaussians_backward_tensor(
-    const int num_points,
-    torch::Tensor &means3d,
-    torch::Tensor &scales,
-    const float glob_scale,
-    torch::Tensor &quats,
-    torch::Tensor &viewmat,
-    torch::Tensor &projmat,
-    const float fx,
-    const float fy,
-    const float cx,
-    const float cy,
-    const unsigned img_height,
-    const unsigned img_width,
-    torch::Tensor &cov3d,
-    torch::Tensor &radii,
-    torch::Tensor &conics,
-    torch::Tensor &v_xy,
-    torch::Tensor &v_depth,
-    torch::Tensor &v_conic
-);
-
-
-std::tuple<torch::Tensor, torch::Tensor> map_gaussian_to_intersects_tensor(
-    const int num_points,
-    const int num_intersects,
-    const torch::Tensor &xys,
-    const torch::Tensor &depths,
-    const torch::Tensor &radii,
-    const torch::Tensor &cum_tiles_hit,
-    const std::tuple<int, int, int> tile_bounds
-);
-
-torch::Tensor get_tile_bin_edges_tensor(
-    int num_intersects,
-    const torch::Tensor &isect_ids_sorted
-);
-
 std::tuple<
     torch::Tensor,
     torch::Tensor,
@@ -112,7 +52,7 @@ std::
         torch::Tensor, // dL_dcolors
         torch::Tensor  // dL_dopacity
         >
-    rasterize_backward_tensor(
+    rasterize_backward_tensor_cpu(
         const unsigned img_height,
         const unsigned img_width,
         const torch::Tensor &gaussians_ids_sorted,
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index 893a1d6..20273a7 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -37,44 +37,6 @@ torch::Tensor quatToRotMat(const torch::Tensor &quat){
     
 }
 
-std::tuple<torch::Tensor, torch::Tensor> getTileBbox(const torch::Tensor &pixCenter, const torch::Tensor &pixRadius, const std::tuple<int, int, int> &tileBounds){
-    torch::Tensor tileSize = torch::tensor({BLOCK_X, BLOCK_Y}, torch::TensorOptions().dtype(torch::kFloat32).device(pixCenter.device()));
-    torch::Tensor tileCenter = pixCenter / tileSize;
-    torch::Tensor tileRadius = pixRadius.index({"...", None}) / tileSize;
-    torch::Tensor topLeft = (tileCenter - tileRadius).to(torch::kInt32);
-    torch::Tensor bottomRight = (tileCenter + tileRadius).to(torch::kInt32) + 1;
-    torch::Tensor tileMin = torch::stack({
-        torch::clamp(topLeft.index({"...", 0}), 0, std::get<0>(tileBounds)),
-        torch::clamp(topLeft.index({"...", 1}), 0, std::get<1>(tileBounds))
-    }, -1);
-    torch::Tensor tileMax = torch::stack({
-        torch::clamp(bottomRight.index({"...", 0}), 0, std::get<0>(tileBounds)),
-        torch::clamp(bottomRight.index({"...", 1}), 0, std::get<1>(tileBounds))
-    }, -1);
-
-    return std::make_tuple(tileMin, tileMax);    
-}
-
-torch::Tensor compute_sh_forward_tensor(
-    unsigned num_points,
-    unsigned degree,
-    unsigned degrees_to_use,
-    torch::Tensor &viewdirs,
-    torch::Tensor &coeffs
-){
-    return torch::Tensor();
-}
-
-torch::Tensor compute_sh_backward_tensor(
-    unsigned num_points,
-    unsigned degree,
-    unsigned degrees_to_use,
-    torch::Tensor &viewdirs,
-    torch::Tensor &v_colors
-){
-    return torch::Tensor();
-}
-
 std::tuple<
     torch::Tensor,
     torch::Tensor,
@@ -172,116 +134,6 @@ project_gaussians_forward_tensor_cpu(
     return std::make_tuple(xys, radii, conic, cov2d, camDepths);
 }
 
-std::tuple<
-    torch::Tensor,
-    torch::Tensor,
-    torch::Tensor,
-    torch::Tensor,
-    torch::Tensor>
-project_gaussians_backward_tensor(
-    const int num_points,
-    torch::Tensor &means3d,
-    torch::Tensor &scales,
-    const float glob_scale,
-    torch::Tensor &quats,
-    torch::Tensor &viewmat,
-    torch::Tensor &projmat,
-    const float fx,
-    const float fy,
-    const float cx,
-    const float cy,
-    const unsigned img_height,
-    const unsigned img_width,
-    torch::Tensor &cov3d,
-    torch::Tensor &radii,
-    torch::Tensor &conics,
-    torch::Tensor &v_xy,
-    torch::Tensor &v_depth,
-    torch::Tensor &v_conic
-){
-    return std::make_tuple(torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor());
-}
-
-
-std::tuple<torch::Tensor, torch::Tensor> map_gaussian_to_intersects_tensor(
-    const int num_points,
-    const int num_intersects,
-    const torch::Tensor &xys,
-    const torch::Tensor &depths,
-    const torch::Tensor &radii,
-    const torch::Tensor &cum_tiles_hit,
-    const std::tuple<int, int, int> tile_bounds
-){
-    torch::Device device = xys.device();
-    int numIntersects = cum_tiles_hit[-1].item<int>();
-    torch::Tensor isectIds = torch::zeros(numIntersects, torch::TensorOptions().dtype(torch::kInt64).device(device));
-    torch::Tensor gaussianIds = torch::zeros(numIntersects, torch::TensorOptions().dtype(torch::kInt32).device(device));
-    for (int idx = 0; idx < num_points; idx++){
-        if (radii[idx].item<float>() <= 0.0f) break;
-
-        auto bbox = getTileBbox(xys[idx], radii[idx], tile_bounds);
-        torch::Tensor tileMin = std::get<0>(bbox);
-        torch::Tensor tileMax = std::get<1>(bbox);
-        int curIdx;
-
-        if (idx == 0){
-            curIdx = 0;
-        }else{
-            curIdx = cum_tiles_hit[idx - 1].item<int>();
-        }
-
-        float depth = depths[idx].item<float>();
-        int32_t depthIdN = *(reinterpret_cast<int32_t *>(&depth));
-
-        int iStart = tileMin[1].item<int>();
-        int iEnd = tileMax[1].item<int>();
-        int jStart = tileMin[0].item<int>();
-        int jEnd = tileMax[0].item<int>();
-        int b = std::get<0>(tile_bounds);
-
-        for (int i = iStart; i < iEnd; i++){
-            for (int j = jStart; j < jEnd; j++){
-                int64_t tileId = i * b + j;
-                isectIds[curIdx] = static_cast<int64_t>(tileId << 32) | depthIdN;
-                gaussianIds[curIdx] = idx;
-                curIdx += 1;
-            }
-        }
-    }
-
-    return std::make_tuple(isectIds, gaussianIds); 
-}
-
-torch::Tensor get_tile_bin_edges_tensor(
-    int num_intersects,
-    const torch::Tensor &isect_ids_sorted
-){
-    torch::Tensor tileBins = torch::zeros({num_intersects, 2}, torch::TensorOptions().dtype(torch::kInt32).device(isect_ids_sorted.device()));
-
-    for (int idx = 0; idx < num_intersects; idx++){
-        int32_t curTileIdx = static_cast<int32_t>(isect_ids_sorted[idx].item<int64_t>() >> 32);
-
-        if (idx == 0){
-            tileBins[curTileIdx][0] = 0;
-            continue;
-        }
-
-        if (idx == num_intersects - 1){
-            tileBins[curTileIdx][1] = num_intersects;
-            break;
-        }
-
-        int32_t prevTileIdx = static_cast<int32_t>(isect_ids_sorted[idx - 1].item<int64_t>() >> 32);
-
-        if (curTileIdx != prevTileIdx){
-            tileBins[prevTileIdx][1] = idx;
-            tileBins[curTileIdx][0] = idx;
-        }
-    }
-
-    return tileBins;
-}
-
 std::tuple<
     torch::Tensor,
     torch::Tensor,
@@ -309,10 +161,6 @@ std::tuple<
         return pDepths[a] < pDepths[b];
     });
 
-    std::cout << pDepths[0] << std::endl;
-
-    std::cout << pDepths[100];
-
     torch::Device device = xys.device();
 
     torch::Tensor outImg = torch::zeros({width, height, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
@@ -389,121 +237,6 @@ std::tuple<
     }
 
     return std::make_tuple(outImg, finalTs, finalIdx);
-
-
-/*
-    int minx = 99999;
-    int miny = 99999;
-    int maxx = 0;
-    int maxy = 0;
-    for (int i = 0; i < width; i++){
-        std::cout << i << std::endl;
-        for (int j = 0; j < height; j++){
-            float T = 1.0f;
-            torch::Tensor ji = torch::tensor({j, i}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
-            
-            int idx = 0;
-            for (; idx < 1; idx++){
-                torch::Tensor gaussianId = gaussian_ids_sorted[idx];
-                torch::Tensor conic = conics[gaussianId];
-                torch::Tensor center = xys[gaussianId];
-                torch::Tensor delta = center - ji;
-
-                torch::Tensor sigma = (
-                    0.5f
-                    * (conic[0] * delta[0] * delta[0] + conic[2] * delta[1] * delta[1])
-                    + conic[1] * delta[0] * delta[1]
-                );
-
-                if (sigma.item<float>() < 0.0f) continue;
-
-                float alpha = (std::min)(0.999f, (opacities[gaussianId] * torch::exp(-sigma)).item<float>());
-
-                if (alpha < 1.0f / 255.0f) continue;
-
-                float nextT = T * (1.0f - alpha);
-
-                if (nextT <= 1e-4f){
-                    idx -= 1;
-                    break;
-                }
-
-                float vis = alpha * T;
-                // outImg[i][j] = torch::tensor({1.0f, 1.0f, 1.0f}); 
-                outImg[i][j] += vis * colors[gaussianId];
-
-                maxx = (std::max)(i, maxx);
-                maxy = (std::max)(j, maxy);
-                minx = (std::min)(i, minx);
-                miny = (std::min)(j, miny);               
-                
-
-                T = nextT;
-            }
-
-            finalTs[i][j] = T;
-            finalIdx[i][j] = idx;
-            outImg[i][j] += T * background;
-        }
-    }
-
-    std::cout << "[" << minx << ", " << miny << "], [" << maxx << ", " << maxy << "]" << std::endl;
-
-    return std::make_tuple(outImg, finalTs, finalIdx);
-
-*/
-/*
-    int blockX = std::get<0>(block);
-    int blockY = std::get<1>(block);
-    int tileBoundsX = std::get<0>(tile_bounds);
-    
-    for (int i = 0; i < width; i++){
-        for (int j = 0; j < height; j++){
-            int tileId = (i / blockX) * tileBoundsX + (j / blockY);
-            int tileBinStart = tile_bins[tileId][0].item<int>();
-            int tileBinEnd = tile_bins[tileId][1].item<int>();
-            float T = 1.0f;
-            torch::Tensor ji = torch::tensor({j, i}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
-            
-            int idx = tileBinStart;
-            for (; idx < tileBinEnd; idx++){
-                torch::Tensor gaussianId = gaussian_ids_sorted[idx];
-                torch::Tensor conic = conics[gaussianId];
-                torch::Tensor center = xys[gaussianId];
-                torch::Tensor delta = center - ji;
-pGaussianIds
-                torch::Tensor sigma = (
-                    0.5f
-                    * (conic[0] * delta[0] * delta[0] + conic[2] * delta[1] * delta[1])
-                    + conic[1] * delta[0] * delta[1]
-                );
-
-                if (sigma.item<float>() < 0.0f) continue;
-
-                float alpha = (std::min)(0.999f, (opacities[gaussianId] * torch::exp(-sigma)).item<float>());
-
-                if (alpha < 1.0f / 255.0f) continue;
-
-                float nextT = T * (1.0f - alpha);
-
-                if (nextT <= 1e-4f){
-                    idx -= 1;
-                    break;
-                }
-
-                float vis = alpha * T;
-                outImg[i][j] += vis * colors[gaussianId];
-                T = nextT;
-            }
-
-            finalTs[i][j] = T;
-            finalIdx[i][j] = idx;
-            outImg[i][j] += T * background;
-        }
-    }
-
-    return std::make_tuple(outImg, finalTs, finalIdx);
-*/
 }
 
 
@@ -514,7 +247,7 @@ std::
         torch::Tensor, // dL_dcolors
         torch::Tensor  // dL_dopacity
         >
-    rasterize_backward_tensor(
+    rasterize_backward_tensor_cpu(
         const unsigned img_height,
         const unsigned img_width,
         const torch::Tensor &gaussians_ids_sorted,

From 0d54e998029484819b9128b3b08b46bfa8b2480d Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Mon, 18 Mar 2024 16:38:19 +0000
Subject: [PATCH 12/19] CPU backward pass rasterization working

---
 rasterize_gaussians.cpp          | 122 +++++++++++++++-------------
 simple_trainer.cpp               |   6 +-
 vendor/gsplat-cpu/bindings.h     |   8 +-
 vendor/gsplat-cpu/gsplat_cpu.cpp | 134 +++++++++++++++++++++++++++++--
 4 files changed, 198 insertions(+), 72 deletions(-)

diff --git a/rasterize_gaussians.cpp b/rasterize_gaussians.cpp
index 976d28a..7c613a7 100644
--- a/rasterize_gaussians.cpp
+++ b/rasterize_gaussians.cpp
@@ -78,11 +78,11 @@ torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx,
     // Final image
     torch::Tensor outImg = std::get<0>(t);
     
-    cv::Mat image = tensorToImage(outImg.detach().cpu());
-    cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
-    cv::imwrite("testcuda.png", image);
-    std::cout << "WROTE! " << imgWidth << "x" << imgHeight;
-    exit(1);
+    // cv::Mat image = tensorToImage(outImg.detach().cpu());
+    // cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
+    // cv::imwrite("testcuda.png", image);
+    // std::cout << "WROTE! " << imgWidth << "x" << imgHeight;
+    // exit(1);
 
     // Map of alpha-inverse (1 - finalTs = alpha)
     torch::Tensor finalTs = std::get<1>(t);
@@ -135,6 +135,9 @@ tensor_list RasterizeGaussians::backward(AutogradContext *ctx, tensor_list grad_
     torch::Tensor v_opacity = std::get<3>(t);
     torch::Tensor none;
 
+    std::cout << "CUDA " << v_xy[0] << v_conic[0] << v_colors[0] << v_opacity[0] << std::endl;
+    exit(1);
+
     return { v_xy,
             none, // depths
             none, // radii
@@ -176,11 +179,11 @@ torch::Tensor RasterizeGaussiansCPU::forward(AutogradContext *ctx,
     // Final image
     torch::Tensor outImg = std::get<0>(t);
 
-    cv::Mat image = tensorToImage(outImg.detach().cpu());
-    cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
-    cv::imwrite("test.png", image);
-    std::cout << "WROTE " << imgWidth << "x" << imgHeight;
-    exit(1);
+    // cv::Mat image = tensorToImage(outImg.detach().cpu());
+    // cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
+    // cv::imwrite("test.png", image);
+    // std::cout << "WROTE " << imgWidth << "x" << imgHeight;
+    // exit(1);
 
     // Map of alpha-inverse (1 - finalTs = alpha)
     torch::Tensor finalTs = std::get<1>(t);
@@ -190,60 +193,63 @@ torch::Tensor RasterizeGaussiansCPU::forward(AutogradContext *ctx,
 
     ctx->saved_data["imgWidth"] = imgWidth;
     ctx->saved_data["imgHeight"] = imgHeight;
-    ctx->save_for_backward({ xys, conics, colors, opacity, background, finalTs, finalIdx });
+    ctx->save_for_backward({ xys, conics, colors, opacity, background, cov2d, camDepths, finalTs, finalIdx });
     
     return outImg;
 }
 
 tensor_list RasterizeGaussiansCPU::backward(AutogradContext *ctx, tensor_list grad_outputs) {
-    // torch::Tensor v_outImg = grad_outputs[0];
-    // int imgHeight = ctx->saved_data["imgHeight"].toInt();
-    // int imgWidth = ctx->saved_data["imgWidth"].toInt();
-
-    // variable_list saved = ctx->get_saved_variables();
-    // torch::Tensor gaussianIdsSorted = saved[0];
-    // torch::Tensor tileBins = saved[1];
-    // torch::Tensor xys = saved[2];
-    // torch::Tensor conics = saved[3];
-    // torch::Tensor colors = saved[4];
-    // torch::Tensor opacity = saved[5];
-    // torch::Tensor background = saved[6];
-    // torch::Tensor finalTs = saved[7];
-    // torch::Tensor finalIdx = saved[8];
-
-    // // torch::Tensor v_outAlpha = torch::zeros({imgHeight, imgWidth}, torch::TensorOptions().device(v_outImg.get_device());
-    // torch::Tensor v_outAlpha = torch::zeros_like(v_outImg.index({"...", 0}));
+    torch::Tensor v_outImg = grad_outputs[0];
+    int imgHeight = ctx->saved_data["imgHeight"].toInt();
+    int imgWidth = ctx->saved_data["imgWidth"].toInt();
+
+    variable_list saved = ctx->get_saved_variables();
+    torch::Tensor xys = saved[0];
+    torch::Tensor conics = saved[1];
+    torch::Tensor colors = saved[2];
+    torch::Tensor opacity = saved[3];
+    torch::Tensor background = saved[4];
+    torch::Tensor cov2d = saved[5];
+    torch::Tensor camDepths = saved[6];
+    torch::Tensor finalTs = saved[7];
+    torch::Tensor finalIdx = saved[8];
+
+    // torch::Tensor v_outAlpha = torch::zeros({imgHeight, imgWidth}, torch::TensorOptions().device(v_outImg.get_device());
+    torch::Tensor v_outAlpha = torch::zeros_like(v_outImg.index({"...", 0}));
     
-    // auto t = rasterize_backward_tensor(imgHeight, imgWidth, 
-    //                         gaussianIdsSorted,
-    //                         tileBins,
-    //                         xys,
-    //                         conics,
-    //                         colors,
-    //                         opacity,
-    //                         background,
-    //                         finalTs,
-    //                         finalIdx,
-    //                         v_outImg,
-    //                         v_outAlpha);
-
-    // torch::Tensor v_xy = std::get<0>(t);
-    // torch::Tensor v_conic = std::get<1>(t);
-    // torch::Tensor v_colors = std::get<2>(t);
-    // torch::Tensor v_opacity = std::get<3>(t);
-    // torch::Tensor none;
-
-    // return { v_xy,
-    //         none, // depths
-    //         none, // radii
-    //         v_conic,
-    //         none, // numTilesHit
-    //         v_colors,
-    //         v_opacity,
-    //         none, // imgHeight
-    //         none, // imgWidth
-    //         none // background
-    // };
+    auto t = rasterize_backward_tensor_cpu(imgHeight, imgWidth, 
+                            xys,
+                            conics,
+                            colors,
+                            opacity,
+                            background,
+                            cov2d,
+                            camDepths,
+                            finalTs,
+                            finalIdx,
+                            v_outImg,
+                            v_outAlpha);
+
+    torch::Tensor v_xy = std::get<0>(t);
+    torch::Tensor v_conic = std::get<1>(t);
+    torch::Tensor v_colors = std::get<2>(t);
+    torch::Tensor v_opacity = std::get<3>(t);
+    torch::Tensor none;
+
+    std::cout << "CPU " << v_xy[0] << v_conic[0] << v_colors[0] << v_opacity[0] << std::endl;
+    exit(1);
+
+    return { v_xy,
+            none, // depths
+            none, // radii
+            v_conic,
+            none, // numTilesHit
+            v_colors,
+            v_opacity,
+            none, // imgHeight
+            none, // imgWidth
+            none // background
+    };
 }
 
 
diff --git a/simple_trainer.cpp b/simple_trainer.cpp
index 70261f6..8494f9f 100644
--- a/simple_trainer.cpp
+++ b/simple_trainer.cpp
@@ -24,7 +24,7 @@ using namespace torch::indexing;
 int main(int argc, char **argv){
     int width = 256,
         height = 256;
-    int numPoints = 100000;
+    int numPoints = 32; //100000;
     int iterations = 1000;
     float learningRate = 0.01;
 
@@ -133,7 +133,7 @@ int main(int argc, char **argv){
                                 height,
                                 width);
 
-            torch::Tensor outImg = RasterizeGaussiansCPU::apply(
+            outImg = RasterizeGaussiansCPU::apply(
                 p[0], // xys
                 p[1], // radii,
                 p[2], // conics
@@ -154,7 +154,7 @@ int main(int argc, char **argv){
                                     width,
                                     tileBounds);
 
-            torch::Tensor outImg = RasterizeGaussians::apply(
+            outImg = RasterizeGaussians::apply(
                 p[0], // xys
                 p[1], // depths
                 p[2], // radii,
diff --git a/vendor/gsplat-cpu/bindings.h b/vendor/gsplat-cpu/bindings.h
index 6b16b6e..c09776e 100644
--- a/vendor/gsplat-cpu/bindings.h
+++ b/vendor/gsplat-cpu/bindings.h
@@ -53,15 +53,15 @@ std::
         torch::Tensor  // dL_dopacity
         >
     rasterize_backward_tensor_cpu(
-        const unsigned img_height,
-        const unsigned img_width,
-        const torch::Tensor &gaussians_ids_sorted,
-        const torch::Tensor &tile_bins,
+        const int height,
+        const int width,
         const torch::Tensor &xys,
         const torch::Tensor &conics,
         const torch::Tensor &colors,
         const torch::Tensor &opacities,
         const torch::Tensor &background,
+        const torch::Tensor &cov2d,
+        const torch::Tensor &camDepths,
         const torch::Tensor &final_Ts,
         const torch::Tensor &final_idx,
         const torch::Tensor &v_output, // dL_dout_color
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index 20273a7..bacf7aa 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -1,5 +1,5 @@
-// Originally based on https://github.dev/nerfstudio-project/gsplat
-// This implementation is licensed under the AGPLv3
+// Originally started from https://github.dev/nerfstudio-project/gsplat
+// This implementation has been substantially changed and is licensed under the AGPLv3
 
 #include "bindings.h"
 #include "../gsplat/config.h"
@@ -248,19 +248,139 @@ std::
         torch::Tensor  // dL_dopacity
         >
     rasterize_backward_tensor_cpu(
-        const unsigned img_height,
-        const unsigned img_width,
-        const torch::Tensor &gaussians_ids_sorted,
-        const torch::Tensor &tile_bins,
+        const int height,
+        const int width,
         const torch::Tensor &xys,
         const torch::Tensor &conics,
         const torch::Tensor &colors,
         const torch::Tensor &opacities,
         const torch::Tensor &background,
+        const torch::Tensor &cov2d,
+        const torch::Tensor &camDepths,        
         const torch::Tensor &final_Ts,
         const torch::Tensor &final_idx,
         const torch::Tensor &v_output, // dL_dout_color
         const torch::Tensor &v_output_alpha
     ){
-        return std::make_tuple(torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor());
+    int numPoints = xys.size(0);
+    int channels = colors.size(1);
+    torch::Device device = xys.device();
+
+    torch::Tensor v_xy = torch::zeros({numPoints, 2}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
+    torch::Tensor v_conic = torch::zeros({numPoints, 3}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
+    torch::Tensor v_colors = torch::zeros({numPoints, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
+    torch::Tensor v_opacity = torch::zeros({numPoints, 1}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
+
+    float *pv_xy = static_cast<float *>(v_xy.data_ptr());
+    float *pv_conic = static_cast<float *>(v_conic.data_ptr());
+    float *pv_colors = static_cast<float *>(v_colors.data_ptr());
+    float *pv_opacity = static_cast<float *>(v_opacity.data_ptr());
+
+    torch::Tensor buffer = torch::zeros({width, height, 3}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
+
+    std::vector< size_t > gIndices( numPoints );
+
+    float *pDepths = static_cast<float *>(camDepths.data_ptr());
+    std::iota( gIndices.begin(), gIndices.end(), 0 );
+    std::sort(gIndices.begin(), gIndices.end(), [&pDepths](int a, int b){
+        return pDepths[a] < pDepths[b];
+    });
+
+    torch::Tensor sqCov2dX = 3.0f * torch::sqrt(cov2d.index({"...", 0, 0}));
+    torch::Tensor sqCov2dY = 3.0f * torch::sqrt(cov2d.index({"...", 1, 1}));
+    
+    float *pConics = static_cast<float *>(conics.data_ptr());
+    float *pCenters = static_cast<float *>(xys.data_ptr());
+    float *pSqCov2dX = static_cast<float *>(sqCov2dX.data_ptr());
+    float *pSqCov2dY = static_cast<float *>(sqCov2dY.data_ptr());
+    float *pOpacities = static_cast<float *>(opacities.data_ptr());
+
+    float *pColors = static_cast<float *>(colors.data_ptr());
+    float *pBuffer = static_cast<float *>(buffer.data_ptr());
+    float *pv_output = static_cast<float *>(v_output.data_ptr());
+    float *pv_outputAlpha = static_cast<float *>(v_output_alpha.data_ptr());
+    
+    float bgX = background[0].item<float>();
+    float bgY = background[1].item<float>();
+    float bgZ = background[2].item<float>();
+
+    torch::Tensor T = final_Ts.clone();
+    float *pT = static_cast<float *>(T.data_ptr());
+    float *pFinalTs = static_cast<float *>(final_Ts.data_ptr());
+
+    const float alphaThresh = 1.0f / 255.0f;
+    int idx = numPoints - 1;
+    for (; idx >= 0; idx--){
+        int32_t gaussianId = gIndices[idx];
+
+        float A = pConics[gaussianId * 3 + 0];
+        float B = pConics[gaussianId * 3 + 1];
+        float C = pConics[gaussianId * 3 + 2];
+
+        float gX = pCenters[gaussianId * 2 + 0];
+        float gY = pCenters[gaussianId * 2 + 1];
+
+        float sqx = pSqCov2dX[gaussianId];
+        float sqy = pSqCov2dY[gaussianId];
+
+        int minx = (std::max)(0, static_cast<int>(std::floor(gY - sqy)) - 2);
+        int maxx = (std::min)(width, static_cast<int>(std::ceil(gY + sqy)) + 2);
+        int miny = (std::max)(0, static_cast<int>(std::floor(gX - sqx)) - 2);
+        int maxy = (std::min)(height, static_cast<int>(std::ceil(gX + sqx)) + 2);
+
+        for (int i = minx; i < maxx; i++){
+            for (int j = miny; j < maxy; j++){
+                float xCam = gX - j;
+                float yCam = gY - i;
+                float sigma = (
+                    0.5f
+                    * (A * xCam * xCam + C * yCam * yCam)
+                    + B * xCam * yCam
+                );
+
+                if (sigma < 0.0f) continue;
+                float vis = std::exp(-sigma);
+                float alpha = (std::min)(0.999f, pOpacities[gaussianId] * vis);
+                if (alpha < alphaThresh) continue;
+
+                size_t pixIdx = (i * height + j);
+
+                float ra = 1.0f / (1.0f - alpha);
+                pT[pixIdx] *= ra;
+                float T = pT[pixIdx];
+                float TFinal = pFinalTs[pixIdx];
+
+                float fac = alpha * T;
+                float v_alpha = 0.0f;
+                pv_colors[gaussianId * 3 + 0] += fac * pv_output[pixIdx * 3 + 0];
+                pv_colors[gaussianId * 3 + 1] += fac * pv_output[pixIdx * 3 + 1];
+                pv_colors[gaussianId * 3 + 2] += fac * pv_output[pixIdx * 3 + 2];
+
+                v_alpha += (pColors[gaussianId * 3 + 0] * T - pBuffer[pixIdx * 3 + 0] * ra) * pv_output[pixIdx * 3 + 0];
+                v_alpha += (pColors[gaussianId * 3 + 1] * T - pBuffer[pixIdx * 3 + 1] * ra) * pv_output[pixIdx * 3 + 1];
+                v_alpha += (pColors[gaussianId * 3 + 2] * T - pBuffer[pixIdx * 3 + 2] * ra) * pv_output[pixIdx * 3 + 2];
+                v_alpha += (TFinal * ra * pv_outputAlpha[pixIdx]);
+
+                v_alpha += -TFinal * ra * bgX * pv_output[pixIdx * 3 + 0];
+                v_alpha += -TFinal * ra * bgY * pv_output[pixIdx * 3 + 1];
+                v_alpha += -TFinal * ra * bgZ * pv_output[pixIdx * 3 + 2];
+
+                pBuffer[pixIdx * 3 + 0] += pColors[gaussianId * 3 + 0] * fac;
+                pBuffer[pixIdx * 3 + 1] += pColors[gaussianId * 3 + 1] * fac;
+                pBuffer[pixIdx * 3 + 2] += pColors[gaussianId * 3 + 2] * fac;
+                
+                float v_sigma = -pOpacities[gaussianId] * vis * v_alpha;
+                pv_conic[gaussianId * 3 + 0] += 0.5f * v_sigma * xCam * xCam;
+                pv_conic[gaussianId * 3 + 1] += 0.5f * v_sigma * xCam * yCam;
+                pv_conic[gaussianId * 3 + 2] += 0.5f * v_sigma * yCam * yCam;
+
+                pv_xy[gaussianId * 2 + 0] += v_sigma * (A * xCam + B * yCam);
+                pv_xy[gaussianId * 2 + 1] += v_sigma * (B * xCam + C * yCam);
+
+                pv_opacity[gaussianId] += vis * v_alpha;
+            }
+        }
+    }
+
+    return std::make_tuple(v_xy, v_conic, v_colors, v_opacity);
 }
\ No newline at end of file

From 3be2500bbca99d873fd93d420ec8c9a181a3a338 Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Tue, 19 Mar 2024 05:05:06 +0000
Subject: [PATCH 13/19] Fix width/height dimensions

---
 opensplat.cpp                    |  2 +-
 project_gaussians.cpp            |  4 +-
 rasterize_gaussians.cpp          | 55 ++++++++++++++++++---------
 simple_trainer.cpp               | 65 +++++++++++++++++++-------------
 vendor/gsplat-cpu/gsplat_cpu.cpp | 60 ++++++++++++++++++-----------
 5 files changed, 118 insertions(+), 68 deletions(-)

diff --git a/opensplat.cpp b/opensplat.cpp
index 22eb8a6..e184059 100644
--- a/opensplat.cpp
+++ b/opensplat.cpp
@@ -37,7 +37,7 @@ int main(int argc, char *argv[]){
         ("h,help", "Print usage")
         ;
     options.parse_positional({ "input" });
-    options.positional_help("[nerfstudio project path]");
+    options.positional_help("[colmap or nerfstudio project path]");
     cxxopts::ParseResult result;
     try {
         result = options.parse(argc, argv);
diff --git a/project_gaussians.cpp b/project_gaussians.cpp
index 895ee82..9d41697 100644
--- a/project_gaussians.cpp
+++ b/project_gaussians.cpp
@@ -29,7 +29,7 @@ variable_list ProjectGaussians::forward(AutogradContext *ctx,
     torch::Tensor radii = std::get<3>(t);
     torch::Tensor conics = std::get<4>(t);
     torch::Tensor numTilesHit = std::get<5>(t);
-    
+
     ctx->saved_data["imgHeight"] = imgHeight;
     ctx->saved_data["imgWidth"] = imgWidth;
     ctx->saved_data["numPoints"] = numPoints;
@@ -115,6 +115,6 @@ variable_list ProjectGaussiansCPU::Apply(
     torch::Tensor conics = std::get<2>(t);
     torch::Tensor cov2d = std::get<3>(t);
     torch::Tensor camDepths = std::get<4>(t);
-    
+
     return { xys, radii, conics, cov2d, camDepths };
 }
\ No newline at end of file
diff --git a/rasterize_gaussians.cpp b/rasterize_gaussians.cpp
index 7c613a7..5858f9d 100644
--- a/rasterize_gaussians.cpp
+++ b/rasterize_gaussians.cpp
@@ -77,16 +77,18 @@ torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx,
                             background);
     // Final image
     torch::Tensor outImg = std::get<0>(t);
-    
-    // cv::Mat image = tensorToImage(outImg.detach().cpu());
-    // cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
-    // cv::imwrite("testcuda.png", image);
-    // std::cout << "WROTE! " << imgWidth << "x" << imgHeight;
-    // exit(1);
 
+    cv::Mat image = tensorToImage(outImg.detach().cpu());
+    cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
+    cv::imwrite("cudatest.png", image);
+    std::cout << "WROTE " << imgWidth << "x" << imgHeight;
+    
     // Map of alpha-inverse (1 - finalTs = alpha)
     torch::Tensor finalTs = std::get<1>(t);
 
+    std::cout << finalTs << std::endl;
+    exit(1);
+
     // Map of tile bin IDs
     torch::Tensor finalIdx = std::get<2>(t);
 
@@ -116,6 +118,9 @@ tensor_list RasterizeGaussians::backward(AutogradContext *ctx, tensor_list grad_
     // torch::Tensor v_outAlpha = torch::zeros({imgHeight, imgWidth}, torch::TensorOptions().device(v_outImg.get_device());
     torch::Tensor v_outAlpha = torch::zeros_like(v_outImg.index({"...", 0}));
     
+    // std::cout << xys[0] << finalTs[0] << std::endl;
+    // exit(1);
+
     auto t = rasterize_backward_tensor(imgHeight, imgWidth, 
                             gaussianIdsSorted,
                             tileBins,
@@ -135,8 +140,12 @@ tensor_list RasterizeGaussians::backward(AutogradContext *ctx, tensor_list grad_
     torch::Tensor v_opacity = std::get<3>(t);
     torch::Tensor none;
 
-    std::cout << "CUDA " << v_xy[0] << v_conic[0] << v_colors[0] << v_opacity[0] << std::endl;
-    exit(1);
+    // for (size_t i = 0; i < v_xy.size(0); i++){
+    //     if (v_xy[i][0].item<float>() != 0){
+    //         std::cout << "CUDA " << i << " " << v_xy[i] << v_conic[i] << v_colors[i] << v_opacity[i] << std::endl;
+    //         exit(1);
+    //     }
+    // }
 
     return { v_xy,
             none, // depths
@@ -179,16 +188,19 @@ torch::Tensor RasterizeGaussiansCPU::forward(AutogradContext *ctx,
     // Final image
     torch::Tensor outImg = std::get<0>(t);
 
-    // cv::Mat image = tensorToImage(outImg.detach().cpu());
-    // cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
-    // cv::imwrite("test.png", image);
-    // std::cout << "WROTE " << imgWidth << "x" << imgHeight;
-    // exit(1);
+    cv::Mat image = tensorToImage(outImg.detach().cpu());
+    cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
+    cv::imwrite("test.png", image);
+    std::cout << "WROTE " << imgWidth << "x" << imgHeight;
 
     // Map of alpha-inverse (1 - finalTs = alpha)
     torch::Tensor finalTs = std::get<1>(t);
 
+    std::cout << finalTs << std::endl;
+    exit(1);
+
     // Map of gaussian IDs
+    // TODO: remove finalIdx (not needed)
     torch::Tensor finalIdx = std::get<2>(t);
 
     ctx->saved_data["imgWidth"] = imgWidth;
@@ -217,6 +229,9 @@ tensor_list RasterizeGaussiansCPU::backward(AutogradContext *ctx, tensor_list gr
     // torch::Tensor v_outAlpha = torch::zeros({imgHeight, imgWidth}, torch::TensorOptions().device(v_outImg.get_device());
     torch::Tensor v_outAlpha = torch::zeros_like(v_outImg.index({"...", 0}));
     
+    //   std::cout << xys[0] << finalTs[0] << std::endl;
+    // exit(1);
+
     auto t = rasterize_backward_tensor_cpu(imgHeight, imgWidth, 
                             xys,
                             conics,
@@ -230,22 +245,28 @@ tensor_list RasterizeGaussiansCPU::backward(AutogradContext *ctx, tensor_list gr
                             v_outImg,
                             v_outAlpha);
 
+
     torch::Tensor v_xy = std::get<0>(t);
     torch::Tensor v_conic = std::get<1>(t);
     torch::Tensor v_colors = std::get<2>(t);
     torch::Tensor v_opacity = std::get<3>(t);
     torch::Tensor none;
 
-    std::cout << "CPU " << v_xy[0] << v_conic[0] << v_colors[0] << v_opacity[0] << std::endl;
-    exit(1);
+    // for (size_t i = 0; i < v_xy.size(0); i++){
+    //     if (v_xy[i][0].item<float>() != 0){
+        // size_t i = 11;
+            // std::cout << "CPU " << i << " " << v_xy[i] << v_conic[i] << v_colors[i] << v_opacity[i] << std::endl;
+            // exit(1);
+        // }
+    // }exit(1);
 
     return { v_xy,
-            none, // depths
             none, // radii
             v_conic,
-            none, // numTilesHit
             v_colors,
             v_opacity,
+            none, // cov2d
+            none, // camDepths
             none, // imgHeight
             none, // imgWidth
             none // background
diff --git a/simple_trainer.cpp b/simple_trainer.cpp
index 8494f9f..637e16e 100644
--- a/simple_trainer.cpp
+++ b/simple_trainer.cpp
@@ -16,20 +16,46 @@
 #include "rasterize_gaussians.hpp"
 #include "constants.hpp"
 #include "cv_utils.hpp"
+#include "vendor/cxxopts.hpp"
 
 using namespace torch::indexing;
 
+int main(int argc, char **argv){
+    cxxopts::Options options("simple_trainer", "Test program for gsplat execution");
+    options.add_options()
+        ("cpu", "Force CPU execution")
+        ("width", "Test image width", cxxopts::value<int>()->default_value("256"))
+        ("height", "Test image height", cxxopts::value<int>()->default_value("256"))
+        ("iters", "Number of iterations", cxxopts::value<int>()->default_value("1000"))
+        ("points", "Number of gaussians", cxxopts::value<int>()->default_value("100000"))
+        ("lr", "Learning rate", cxxopts::value<float>()->default_value("0.01"))
+        ("render", "Save rendered images to folder", cxxopts::value<std::string>()->default_value(""))
+        ("h,help", "Print usage")
+        ;
+    cxxopts::ParseResult result;
+    try {
+        result = options.parse(argc, argv);
+    }
+    catch (const std::exception &e) {
+        std::cerr << e.what() << std::endl;
+        std::cerr << options.help() << std::endl;
+        return EXIT_FAILURE;
+    }
 
+    if (result.count("help")) {
+        std::cout << options.help() << std::endl;
+        return EXIT_SUCCESS;
+    }
 
-int main(int argc, char **argv){
-    int width = 256,
-        height = 256;
-    int numPoints = 32; //100000;
-    int iterations = 1000;
-    float learningRate = 0.01;
+    int width = result["width"].as<int>(),
+        height = result["height"].as<int>();
+    int numPoints = result["points"].as<int>();
+    int iterations = result["iters"].as<int>();
+    float learningRate = result["lr"].as<float>();
+    std::string render = result["render"].as<std::string>();
 
     torch::Device device = torch::kCPU;
-    if (torch::cuda::is_available() && !(argc == 2 && std::string(argv[1]) == "--cpu")){
+    if (torch::cuda::is_available() && result.count("cpu") == 0){
         std::cout << "Using CUDA" << std::endl;
         device = torch::kCUDA;
     }else{
@@ -55,16 +81,12 @@ int main(int argc, char **argv){
                       (height + BLOCK_Y - 1) / BLOCK_Y,
                       1);
     
-    // torch::Tensor imgSize = torch::tensor({width, height, 1}, device);
-    // torch::Tensor block = torch::tensor({BLOCK_X, BLOCK_Y, 1}, device);
-    
     // Init gaussians
 #ifdef USE_CUDA
     torch::cuda::manual_seed_all(0);
 #endif
     torch::manual_seed(0);
 
-    // TODO: remove
     // Random points, scales and colors
     torch::Tensor means = 2.0 * (torch::rand({numPoints, 3}, torch::kCPU) - 0.5); // Positions [-1, 1]
     torch::Tensor scales = torch::rand({numPoints, 3}, torch::kCPU);
@@ -83,17 +105,6 @@ int main(int argc, char **argv){
     v = v.to(device);
     w = w.to(device);    
 
-    // TODO: uncomment
-    // // Random points, scales and colors
-    // torch::Tensor means = 2.0 * (torch::rand({numPoints, 3}, device) - 0.5); // Positions [-1, 1]
-    // torch::Tensor scales = torch::rand({numPoints, 3}, device);
-    // torch::Tensor rgbs = torch::rand({numPoints, 3}, device);
-    
-    // // Random rotations (quaternions)
-    // // quats = ( sqrt(1-u) sin(2πv), sqrt(1-u) cos(2πv), sqrt(u) sin(2πw), sqrt(u) cos(2πw))
-    // torch::Tensor u = torch::rand({numPoints, 1}, device);
-    // torch::Tensor v = torch::rand({numPoints, 1}, device);
-    // torch::Tensor w = torch::rand({numPoints, 1}, device);
     torch::Tensor quats = torch::cat({
                 torch::sqrt(1.0 - u) * torch::sin(2.0 * PI * v),
                 torch::sqrt(1.0 - u) * torch::cos(2.0 * PI * v),
@@ -174,9 +185,11 @@ int main(int argc, char **argv){
         optimizer.step();
 
         std::cout << "Iteration " << std::to_string(i + 1) << "/" << std::to_string(iterations) << " Loss: " << loss.item<float>() << std::endl; 
-
-        cv::Mat image = tensorToImage(outImg.detach().cpu());
-        cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
-        cv::imwrite("render/" + std::to_string(i + 1) + ".png", image);
+        
+        if (!render.empty()){
+            cv::Mat image = tensorToImage(outImg.detach().cpu());
+            cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
+            cv::imwrite(render + "/" + std::to_string(i + 1) + ".png", image);
+        }
     }
 }
\ No newline at end of file
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index bacf7aa..b24c212 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -6,7 +6,7 @@
 
 #include <cstdio>
 #include <iostream>
-#include <math.h>
+#include <cmath>
 #include <tuple>
 
 using namespace torch::indexing;
@@ -59,8 +59,8 @@ project_gaussians_forward_tensor_cpu(
     const unsigned img_width,
     const float clip_thresh
 ){
-    float fovx = 0.5f * static_cast<float>(img_width) / fx;
-    float fovy = 0.5f * static_cast<float>(img_height) / fy;
+    float fovx = 0.5f * static_cast<float>(img_height) / fx;
+    float fovy = 0.5f * static_cast<float>(img_width) / fy;
     
     // TODO: no need to recompute W,p,t below (they are the same)
 
@@ -124,8 +124,8 @@ project_gaussians_forward_tensor_cpu(
     pHom = torch::einsum("...ij,...j->...i", {projmat, pHom});
     torch::Tensor rw = 1.0f / torch::clamp_min(pHom.index({"...", 3}), eps);
     torch::Tensor pProj = pHom.index({"...", Slice(None, 3)}) * rw.index({"...", None});
-    torch::Tensor u = 0.5f * ((pProj.index({"...", 0}) + 1.0f) * static_cast<float>(img_height) - 1.0f);
-    torch::Tensor v = 0.5f * ((pProj.index({"...", 1}) + 1.0f) * static_cast<float>(img_width) - 1.0f);
+    torch::Tensor u = 0.5f * ((pProj.index({"...", 0}) + 1.0f) * static_cast<float>(img_width) - 1.0f);
+    torch::Tensor v = 0.5f * ((pProj.index({"...", 1}) + 1.0f) * static_cast<float>(img_height) - 1.0f);
     torch::Tensor xys = torch::stack({u, v}, -1); // center
 
     torch::Tensor radii = radius.to(torch::kInt32);
@@ -163,9 +163,9 @@ std::tuple<
 
     torch::Device device = xys.device();
 
-    torch::Tensor outImg = torch::zeros({width, height, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
-    torch::Tensor finalTs = torch::ones({width, height, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
-    torch::Tensor finalIdx = torch::zeros({width, height, channels}, torch::TensorOptions().dtype(torch::kInt32).device(device));   
+    torch::Tensor outImg = torch::zeros({height, width, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
+    torch::Tensor finalTs = torch::ones({height, width}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
+    torch::Tensor finalIdx = torch::zeros({height, width}, torch::TensorOptions().dtype(torch::kInt32).device(device));   
 
     torch::Tensor sqCov2dX = 3.0f * torch::sqrt(cov2d.index({"...", 0, 0}));
     torch::Tensor sqCov2dY = 3.0f * torch::sqrt(cov2d.index({"...", 1, 1}));
@@ -178,6 +178,7 @@ std::tuple<
 
     float *pOutImg = static_cast<float *>(outImg.data_ptr());
     float *pFinalTs = static_cast<float *>(finalTs.data_ptr());
+    
     int32_t *pFinalIdx = static_cast<int32_t *>(finalIdx.data_ptr());
     float *pColors = static_cast<float *>(colors.data_ptr());
     
@@ -186,9 +187,8 @@ std::tuple<
     float bgZ = background[2].item<float>();
 
     const float alphaThresh = 1.0f / 255.0f;
-    float T = 1.0f;
-    int idx = 0;
-    for (; idx < numPoints; idx++){
+
+    for (int idx = 0; idx < numPoints; idx++){
         int32_t gaussianId = gIndices[idx];
 
         float A = pConics[gaussianId * 3 + 0];
@@ -202,9 +202,9 @@ std::tuple<
         float sqy = pSqCov2dY[gaussianId];
         
         int minx = (std::max)(0, static_cast<int>(std::floor(gY - sqy)) - 2);
-        int maxx = (std::min)(width, static_cast<int>(std::ceil(gY + sqy)) + 2);
+        int maxx = (std::min)(height, static_cast<int>(std::ceil(gY + sqy)) + 2);
         int miny = (std::max)(0, static_cast<int>(std::floor(gX - sqx)) - 2);
-        int maxy = (std::min)(height, static_cast<int>(std::ceil(gX + sqx)) + 2);
+        int maxy = (std::min)(width, static_cast<int>(std::ceil(gX + sqx)) + 2);
 
         for (int i = minx; i < maxx; i++){
             for (int j = miny; j < maxy; j++){
@@ -220,15 +220,18 @@ std::tuple<
                 float alpha = (std::min)(0.999f, (pOpacities[gaussianId] * std::exp(-sigma)));
                 if (alpha < alphaThresh) continue;
 
-                size_t pixIdx = (i * height + j);
+                size_t pixIdx = (i * width + j);
                 float T = pFinalTs[pixIdx];
                 float nextT = T * (1.0f - alpha);
+                if (nextT <= 1e-4f) { // this pixel is done
+                    continue;
+                }
 
-                float alphaT = alpha * T;
+                float vis = alpha * T;
 
-                pOutImg[pixIdx * 3 + 0] += alphaT * (pColors[gaussianId * 3 + 0] + bgX);
-                pOutImg[pixIdx * 3 + 1] += alphaT * (pColors[gaussianId * 3 + 1] + bgY);
-                pOutImg[pixIdx * 3 + 2] += alphaT * (pColors[gaussianId * 3 + 2] + bgZ);
+                pOutImg[pixIdx * 3 + 0] += vis * pColors[gaussianId * 3 + 0];
+                pOutImg[pixIdx * 3 + 1] += vis * pColors[gaussianId * 3 + 1];
+                pOutImg[pixIdx * 3 + 2] += vis * pColors[gaussianId * 3 + 2];
                 
                 pFinalTs[pixIdx] = nextT;
                 pFinalIdx[pixIdx] = idx;
@@ -236,6 +239,18 @@ std::tuple<
         }
     }
 
+    // Background
+    for (int i = 0; i < width; i++){
+        for (int j = 0; j < height; j++){
+            size_t pixIdx = (i * width + j);
+            float T = pFinalTs[pixIdx];
+
+            pOutImg[pixIdx * 3 + 0] += T * bgX;
+            pOutImg[pixIdx * 3 + 1] += T * bgY;
+            pOutImg[pixIdx * 3 + 2] += T * bgZ;
+        }
+    }
+
     return std::make_tuple(outImg, finalTs, finalIdx);
 }
 
@@ -262,6 +277,8 @@ std::
         const torch::Tensor &v_output, // dL_dout_color
         const torch::Tensor &v_output_alpha
     ){
+    torch::NoGradGuard noGrad;
+
     int numPoints = xys.size(0);
     int channels = colors.size(1);
     torch::Device device = xys.device();
@@ -276,7 +293,7 @@ std::
     float *pv_colors = static_cast<float *>(v_colors.data_ptr());
     float *pv_opacity = static_cast<float *>(v_opacity.data_ptr());
 
-    torch::Tensor buffer = torch::zeros({width, height, 3}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
+    torch::Tensor buffer = torch::zeros({height, width, 3}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
 
     std::vector< size_t > gIndices( numPoints );
 
@@ -309,8 +326,7 @@ std::
     float *pFinalTs = static_cast<float *>(final_Ts.data_ptr());
 
     const float alphaThresh = 1.0f / 255.0f;
-    int idx = numPoints - 1;
-    for (; idx >= 0; idx--){
+    for (int idx = numPoints - 1; idx >= 0; idx--){
         int32_t gaussianId = gIndices[idx];
 
         float A = pConics[gaussianId * 3 + 0];
@@ -343,7 +359,7 @@ std::
                 float alpha = (std::min)(0.999f, pOpacities[gaussianId] * vis);
                 if (alpha < alphaThresh) continue;
 
-                size_t pixIdx = (i * height + j);
+                size_t pixIdx = (i * width + j); // TODO!!! CHECK!!!
 
                 float ra = 1.0f / (1.0f - alpha);
                 pT[pixIdx] *= ra;

From c840bf2394181e1995dd9008cc96d1a7f1a9a265 Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Wed, 20 Mar 2024 14:27:52 +0000
Subject: [PATCH 14/19] Backward pass working!

---
 rasterize_gaussians.cpp          |  44 +++------
 simple_trainer.cpp               |   7 +-
 vendor/gsplat-cpu/bindings.h     |   5 +-
 vendor/gsplat-cpu/gsplat_cpu.cpp | 155 +++++++++++++++++++++++--------
 vendor/gsplat/backward.cu        |   6 ++
 vendor/gsplat/bindings.cu        |   1 +
 6 files changed, 148 insertions(+), 70 deletions(-)

diff --git a/rasterize_gaussians.cpp b/rasterize_gaussians.cpp
index 5858f9d..f00d6c6 100644
--- a/rasterize_gaussians.cpp
+++ b/rasterize_gaussians.cpp
@@ -83,12 +83,8 @@ torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx,
     cv::imwrite("cudatest.png", image);
     std::cout << "WROTE " << imgWidth << "x" << imgHeight;
     
-    // Map of alpha-inverse (1 - finalTs = alpha)
     torch::Tensor finalTs = std::get<1>(t);
 
-    std::cout << finalTs << std::endl;
-    exit(1);
-
     // Map of tile bin IDs
     torch::Tensor finalIdx = std::get<2>(t);
 
@@ -118,9 +114,6 @@ tensor_list RasterizeGaussians::backward(AutogradContext *ctx, tensor_list grad_
     // torch::Tensor v_outAlpha = torch::zeros({imgHeight, imgWidth}, torch::TensorOptions().device(v_outImg.get_device());
     torch::Tensor v_outAlpha = torch::zeros_like(v_outImg.index({"...", 0}));
     
-    // std::cout << xys[0] << finalTs[0] << std::endl;
-    // exit(1);
-
     auto t = rasterize_backward_tensor(imgHeight, imgWidth, 
                             gaussianIdsSorted,
                             tileBins,
@@ -140,12 +133,8 @@ tensor_list RasterizeGaussians::backward(AutogradContext *ctx, tensor_list grad_
     torch::Tensor v_opacity = std::get<3>(t);
     torch::Tensor none;
 
-    // for (size_t i = 0; i < v_xy.size(0); i++){
-    //     if (v_xy[i][0].item<float>() != 0){
-    //         std::cout << "CUDA " << i << " " << v_xy[i] << v_conic[i] << v_colors[i] << v_opacity[i] << std::endl;
-    //         exit(1);
-    //     }
-    // }
+    std::cout << v_colors << v_xy;
+    exit(1);
 
     return { v_xy,
             none, // depths
@@ -188,23 +177,22 @@ torch::Tensor RasterizeGaussiansCPU::forward(AutogradContext *ctx,
     // Final image
     torch::Tensor outImg = std::get<0>(t);
 
-    cv::Mat image = tensorToImage(outImg.detach().cpu());
-    cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
-    cv::imwrite("test.png", image);
-    std::cout << "WROTE " << imgWidth << "x" << imgHeight;
+    // cv::Mat image = tensorToImage(outImg.detach().cpu());
+    // cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
+    // cv::imwrite("test.png", image);
+    // std::cout << "WROTE " << imgWidth << "x" << imgHeight;
 
-    // Map of alpha-inverse (1 - finalTs = alpha)
     torch::Tensor finalTs = std::get<1>(t);
 
-    std::cout << finalTs << std::endl;
-    exit(1);
-
     // Map of gaussian IDs
     // TODO: remove finalIdx (not needed)
     torch::Tensor finalIdx = std::get<2>(t);
 
+    std::vector<int32_t> *pxgid = std::get<3>(t);
+
     ctx->saved_data["imgWidth"] = imgWidth;
     ctx->saved_data["imgHeight"] = imgHeight;
+    ctx->saved_data["pxgid"] = reinterpret_cast<int64_t>(pxgid);
     ctx->save_for_backward({ xys, conics, colors, opacity, background, cov2d, camDepths, finalTs, finalIdx });
     
     return outImg;
@@ -214,6 +202,7 @@ tensor_list RasterizeGaussiansCPU::backward(AutogradContext *ctx, tensor_list gr
     torch::Tensor v_outImg = grad_outputs[0];
     int imgHeight = ctx->saved_data["imgHeight"].toInt();
     int imgWidth = ctx->saved_data["imgWidth"].toInt();
+    const std::vector<int32_t> *pxgid = reinterpret_cast<const std::vector<int32_t> *>(ctx->saved_data["pxgid"].toInt());
 
     variable_list saved = ctx->get_saved_variables();
     torch::Tensor xys = saved[0];
@@ -229,9 +218,6 @@ tensor_list RasterizeGaussiansCPU::backward(AutogradContext *ctx, tensor_list gr
     // torch::Tensor v_outAlpha = torch::zeros({imgHeight, imgWidth}, torch::TensorOptions().device(v_outImg.get_device());
     torch::Tensor v_outAlpha = torch::zeros_like(v_outImg.index({"...", 0}));
     
-    //   std::cout << xys[0] << finalTs[0] << std::endl;
-    // exit(1);
-
     auto t = rasterize_backward_tensor_cpu(imgHeight, imgWidth, 
                             xys,
                             conics,
@@ -242,6 +228,7 @@ tensor_list RasterizeGaussiansCPU::backward(AutogradContext *ctx, tensor_list gr
                             camDepths,
                             finalTs,
                             finalIdx,
+                            pxgid,
                             v_outImg,
                             v_outAlpha);
 
@@ -252,13 +239,8 @@ tensor_list RasterizeGaussiansCPU::backward(AutogradContext *ctx, tensor_list gr
     torch::Tensor v_opacity = std::get<3>(t);
     torch::Tensor none;
 
-    // for (size_t i = 0; i < v_xy.size(0); i++){
-    //     if (v_xy[i][0].item<float>() != 0){
-        // size_t i = 11;
-            // std::cout << "CPU " << i << " " << v_xy[i] << v_conic[i] << v_colors[i] << v_opacity[i] << std::endl;
-            // exit(1);
-        // }
-    // }exit(1);
+    // std::cout << v_colors << v_xy;
+    // exit(1);
 
     return { v_xy,
             none, // radii
diff --git a/simple_trainer.cpp b/simple_trainer.cpp
index 637e16e..336f7a9 100644
--- a/simple_trainer.cpp
+++ b/simple_trainer.cpp
@@ -1,5 +1,6 @@
 #include <iostream>
 #include <cmath>
+#include <filesystem>
 
 #include <torch/torch.h>
 #ifdef USE_HIP
@@ -19,6 +20,7 @@
 #include "vendor/cxxopts.hpp"
 
 using namespace torch::indexing;
+namespace fs = std::filesystem;
 
 int main(int argc, char **argv){
     cxxopts::Options options("simple_trainer", "Test program for gsplat execution");
@@ -53,6 +55,7 @@ int main(int argc, char **argv){
     int iterations = result["iters"].as<int>();
     float learningRate = result["lr"].as<float>();
     std::string render = result["render"].as<std::string>();
+    if (!fs::exists(render)) fs::create_directories(render);
 
     torch::Device device = torch::kCPU;
     if (torch::cuda::is_available() && result.count("cpu") == 0){
@@ -90,6 +93,8 @@ int main(int argc, char **argv){
     // Random points, scales and colors
     torch::Tensor means = 2.0 * (torch::rand({numPoints, 3}, torch::kCPU) - 0.5); // Positions [-1, 1]
     torch::Tensor scales = torch::rand({numPoints, 3}, torch::kCPU);
+    // torch::Tensor means = torch::tensor({{0.5f, 0.5f, -5.0f}, {0.5f, 0.5f, -6.0f}, {0.25f, 0.25f, -4.0f}}, torch::kCPU);
+    // torch::Tensor scales = torch::tensor({{0.5f, 0.5f, 0.5f}, {1.0f, 1.0f, 1.0f}, {1.0f, 1.0f, 1.0f}}, torch::kCPU);
     torch::Tensor rgbs = torch::rand({numPoints, 3}, torch::kCPU);
     
     // Random rotations (quaternions)
@@ -189,7 +194,7 @@ int main(int argc, char **argv){
         if (!render.empty()){
             cv::Mat image = tensorToImage(outImg.detach().cpu());
             cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
-            cv::imwrite(render + "/" + std::to_string(i + 1) + ".png", image);
+            cv::imwrite((fs::path(render) / (std::to_string(i + 1) + ".png")).string(), image);
         }
     }
 }
\ No newline at end of file
diff --git a/vendor/gsplat-cpu/bindings.h b/vendor/gsplat-cpu/bindings.h
index c09776e..3f507e1 100644
--- a/vendor/gsplat-cpu/bindings.h
+++ b/vendor/gsplat-cpu/bindings.h
@@ -2,6 +2,7 @@
 
 #include <cstdio>
 #include <iostream>
+#include <vector>
 #include <math.h>
 #include <tuple>
 #include <torch/all.h>
@@ -32,7 +33,8 @@ project_gaussians_forward_tensor_cpu(
 std::tuple<
     torch::Tensor,
     torch::Tensor,
-    torch::Tensor
+    torch::Tensor,
+    std::vector<int32_t> *
 > rasterize_forward_tensor_cpu(
     const int width,
     const int height,
@@ -64,6 +66,7 @@ std::
         const torch::Tensor &camDepths,
         const torch::Tensor &final_Ts,
         const torch::Tensor &final_idx,
+        const std::vector<int32_t> *pxgid,
         const torch::Tensor &v_output, // dL_dout_color
         const torch::Tensor &v_output_alpha
     );
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index b24c212..23330e8 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -137,7 +137,8 @@ project_gaussians_forward_tensor_cpu(
 std::tuple<
     torch::Tensor,
     torch::Tensor,
-    torch::Tensor
+    torch::Tensor,
+    std::vector<int32_t> *
 > rasterize_forward_tensor_cpu(
     const int width,
     const int height,
@@ -154,6 +155,7 @@ std::tuple<
     int channels = colors.size(1);
     int numPoints = xys.size(0);
     float *pDepths = static_cast<float *>(camDepths.data_ptr());
+    std::vector<int32_t> *pxgid = new std::vector<int32_t>[width * height];
 
     std::vector< size_t > gIndices( numPoints );
     std::iota( gIndices.begin(), gIndices.end(), 0 );
@@ -166,6 +168,7 @@ std::tuple<
     torch::Tensor outImg = torch::zeros({height, width, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
     torch::Tensor finalTs = torch::ones({height, width}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
     torch::Tensor finalIdx = torch::zeros({height, width}, torch::TensorOptions().dtype(torch::kInt32).device(device));   
+    torch::Tensor done = torch::zeros({height, width}, torch::TensorOptions().dtype(torch::kBool).device(device));   
 
     torch::Tensor sqCov2dX = 3.0f * torch::sqrt(cov2d.index({"...", 0, 0}));
     torch::Tensor sqCov2dY = 3.0f * torch::sqrt(cov2d.index({"...", 1, 1}));
@@ -178,7 +181,8 @@ std::tuple<
 
     float *pOutImg = static_cast<float *>(outImg.data_ptr());
     float *pFinalTs = static_cast<float *>(finalTs.data_ptr());
-    
+    bool *pDone = static_cast<bool *>(done.data_ptr());
+
     int32_t *pFinalIdx = static_cast<int32_t *>(finalIdx.data_ptr());
     float *pColors = static_cast<float *>(colors.data_ptr());
     
@@ -208,6 +212,9 @@ std::tuple<
 
         for (int i = minx; i < maxx; i++){
             for (int j = miny; j < maxy; j++){
+                size_t pixIdx = (i * width + j);
+                if (pDone[pixIdx]) continue;
+
                 float xCam = gX - j;
                 float yCam = gY - i;
                 float sigma = (
@@ -220,10 +227,10 @@ std::tuple<
                 float alpha = (std::min)(0.999f, (pOpacities[gaussianId] * std::exp(-sigma)));
                 if (alpha < alphaThresh) continue;
 
-                size_t pixIdx = (i * width + j);
                 float T = pFinalTs[pixIdx];
                 float nextT = T * (1.0f - alpha);
                 if (nextT <= 1e-4f) { // this pixel is done
+                    pDone[pixIdx] = true;
                     continue;
                 }
 
@@ -234,7 +241,8 @@ std::tuple<
                 pOutImg[pixIdx * 3 + 2] += vis * pColors[gaussianId * 3 + 2];
                 
                 pFinalTs[pixIdx] = nextT;
-                pFinalIdx[pixIdx] = idx;
+                pFinalIdx[pixIdx] = gaussianId;
+                pxgid[pixIdx].push_back(gaussianId);
             }
         }
     }
@@ -248,10 +256,12 @@ std::tuple<
             pOutImg[pixIdx * 3 + 0] += T * bgX;
             pOutImg[pixIdx * 3 + 1] += T * bgY;
             pOutImg[pixIdx * 3 + 2] += T * bgZ;
+
+            std::reverse(pxgid[pixIdx].begin(), pxgid[pixIdx].end());
         }
     }
 
-    return std::make_tuple(outImg, finalTs, finalIdx);
+    return std::make_tuple(outImg, finalTs, finalIdx, pxgid);
 }
 
 
@@ -274,6 +284,7 @@ std::
         const torch::Tensor &camDepths,        
         const torch::Tensor &final_Ts,
         const torch::Tensor &final_idx,
+        const std::vector<int32_t> *pxgid,
         const torch::Tensor &v_output, // dL_dout_color
         const torch::Tensor &v_output_alpha
     ){
@@ -292,9 +303,27 @@ std::
     float *pv_conic = static_cast<float *>(v_conic.data_ptr());
     float *pv_colors = static_cast<float *>(v_colors.data_ptr());
     float *pv_opacity = static_cast<float *>(v_opacity.data_ptr());
+    
+    // torch::Tensor buffer = torch::zeros({height, width, 3}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
+
+    float *pColors = static_cast<float *>(colors.data_ptr());
+    // float *pBuffer = static_cast<float *>(buffer.data_ptr());
+    float *pv_output = static_cast<float *>(v_output.data_ptr());
+    float *pv_outputAlpha = static_cast<float *>(v_output_alpha.data_ptr());
+    float *pConics = static_cast<float *>(conics.data_ptr());
+    float *pCenters = static_cast<float *>(xys.data_ptr());
+    float *pOpacities = static_cast<float *>(opacities.data_ptr());
 
-    torch::Tensor buffer = torch::zeros({height, width, 3}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
+    float bgX = background[0].item<float>();
+    float bgY = background[1].item<float>();
+    float bgZ = background[2].item<float>();
+
+    // torch::Tensor Ts = final_Ts.clone();
+    // float *pTs = static_cast<float *>(Ts.data_ptr());
+    float *pFinalTs = static_cast<float *>(final_Ts.data_ptr());
 
+    const float alphaThresh = 1.0f / 255.0f;
+/*
     std::vector< size_t > gIndices( numPoints );
 
     float *pDepths = static_cast<float *>(camDepths.data_ptr());
@@ -306,24 +335,9 @@ std::
     torch::Tensor sqCov2dX = 3.0f * torch::sqrt(cov2d.index({"...", 0, 0}));
     torch::Tensor sqCov2dY = 3.0f * torch::sqrt(cov2d.index({"...", 1, 1}));
     
-    float *pConics = static_cast<float *>(conics.data_ptr());
-    float *pCenters = static_cast<float *>(xys.data_ptr());
+
     float *pSqCov2dX = static_cast<float *>(sqCov2dX.data_ptr());
     float *pSqCov2dY = static_cast<float *>(sqCov2dY.data_ptr());
-    float *pOpacities = static_cast<float *>(opacities.data_ptr());
-
-    float *pColors = static_cast<float *>(colors.data_ptr());
-    float *pBuffer = static_cast<float *>(buffer.data_ptr());
-    float *pv_output = static_cast<float *>(v_output.data_ptr());
-    float *pv_outputAlpha = static_cast<float *>(v_output_alpha.data_ptr());
-    
-    float bgX = background[0].item<float>();
-    float bgY = background[1].item<float>();
-    float bgZ = background[2].item<float>();
-
-    torch::Tensor T = final_Ts.clone();
-    float *pT = static_cast<float *>(T.data_ptr());
-    float *pFinalTs = static_cast<float *>(final_Ts.data_ptr());
 
     const float alphaThresh = 1.0f / 255.0f;
     for (int idx = numPoints - 1; idx >= 0; idx--){
@@ -340,12 +354,14 @@ std::
         float sqy = pSqCov2dY[gaussianId];
 
         int minx = (std::max)(0, static_cast<int>(std::floor(gY - sqy)) - 2);
-        int maxx = (std::min)(width, static_cast<int>(std::ceil(gY + sqy)) + 2);
+        int maxx = (std::min)(height, static_cast<int>(std::ceil(gY + sqy)) + 2);
         int miny = (std::max)(0, static_cast<int>(std::floor(gX - sqx)) - 2);
-        int maxy = (std::min)(height, static_cast<int>(std::ceil(gX + sqx)) + 2);
+        int maxy = (std::min)(width, static_cast<int>(std::ceil(gX + sqx)) + 2);
 
         for (int i = minx; i < maxx; i++){
             for (int j = miny; j < maxy; j++){
+                size_t pixIdx = (i * width + j);
+
                 float xCam = gX - j;
                 float yCam = gY - i;
                 float sigma = (
@@ -356,30 +372,31 @@ std::
 
                 if (sigma < 0.0f) continue;
                 float vis = std::exp(-sigma);
-                float alpha = (std::min)(0.999f, pOpacities[gaussianId] * vis);
+                float alpha = (std::min)(0.99f, pOpacities[gaussianId] * vis);
                 if (alpha < alphaThresh) continue;
 
-                size_t pixIdx = (i * width + j); // TODO!!! CHECK!!!
 
                 float ra = 1.0f / (1.0f - alpha);
-                pT[pixIdx] *= ra;
-                float T = pT[pixIdx];
-                float TFinal = pFinalTs[pixIdx];
+                float T = pTs[pixIdx];
+                T *= ra;
+                pTs[pixIdx] = T;
+
+                float Tfinal = pFinalTs[pixIdx];
 
                 float fac = alpha * T;
-                float v_alpha = 0.0f;
+
                 pv_colors[gaussianId * 3 + 0] += fac * pv_output[pixIdx * 3 + 0];
                 pv_colors[gaussianId * 3 + 1] += fac * pv_output[pixIdx * 3 + 1];
                 pv_colors[gaussianId * 3 + 2] += fac * pv_output[pixIdx * 3 + 2];
 
-                v_alpha += (pColors[gaussianId * 3 + 0] * T - pBuffer[pixIdx * 3 + 0] * ra) * pv_output[pixIdx * 3 + 0];
-                v_alpha += (pColors[gaussianId * 3 + 1] * T - pBuffer[pixIdx * 3 + 1] * ra) * pv_output[pixIdx * 3 + 1];
-                v_alpha += (pColors[gaussianId * 3 + 2] * T - pBuffer[pixIdx * 3 + 2] * ra) * pv_output[pixIdx * 3 + 2];
-                v_alpha += (TFinal * ra * pv_outputAlpha[pixIdx]);
+                float v_alpha = ((pColors[gaussianId * 3 + 0] * T - pBuffer[pixIdx * 3 + 0] * ra) * pv_output[pixIdx * 3 + 0]) +
+                                ((pColors[gaussianId * 3 + 1] * T - pBuffer[pixIdx * 3 + 1] * ra) * pv_output[pixIdx * 3 + 1]) +
+                                ((pColors[gaussianId * 3 + 2] * T - pBuffer[pixIdx * 3 + 2] * ra) * pv_output[pixIdx * 3 + 2]) +
+                                (Tfinal * ra * pv_outputAlpha[pixIdx]) +
 
-                v_alpha += -TFinal * ra * bgX * pv_output[pixIdx * 3 + 0];
-                v_alpha += -TFinal * ra * bgY * pv_output[pixIdx * 3 + 1];
-                v_alpha += -TFinal * ra * bgZ * pv_output[pixIdx * 3 + 2];
+                                (-Tfinal * ra * bgX * pv_output[pixIdx * 3 + 0]) +
+                                (-Tfinal * ra * bgY * pv_output[pixIdx * 3 + 1]) +
+                                (-Tfinal * ra * bgZ * pv_output[pixIdx * 3 + 2]);
 
                 pBuffer[pixIdx * 3 + 0] += pColors[gaussianId * 3 + 0] * fac;
                 pBuffer[pixIdx * 3 + 1] += pColors[gaussianId * 3 + 1] * fac;
@@ -393,6 +410,70 @@ std::
                 pv_xy[gaussianId * 2 + 0] += v_sigma * (A * xCam + B * yCam);
                 pv_xy[gaussianId * 2 + 1] += v_sigma * (B * xCam + C * yCam);
 
+                pv_opacity[gaussianId] += vis * v_alpha;
+            }
+        }
+    }*/
+
+    for (int j = 0; j < width; j++){
+        for (int i = 0; i < height; i++){
+            size_t pixIdx = (i * width + j);
+            float Tfinal = pFinalTs[pixIdx];
+            float T = Tfinal;
+            float buffer[3] = {0.0f, 0.0f, 0.0f};
+
+            for (const int32_t &gaussianId : pxgid[pixIdx]){
+                float A = pConics[gaussianId * 3 + 0];
+                float B = pConics[gaussianId * 3 + 1];
+                float C = pConics[gaussianId * 3 + 2];
+
+                float gX = pCenters[gaussianId * 2 + 0];
+                float gY = pCenters[gaussianId * 2 + 1];
+
+                float xCam = gX - j;
+                float yCam = gY - i;
+                float sigma = (
+                    0.5f
+                    * (A * xCam * xCam + C * yCam * yCam)
+                    + B * xCam * yCam
+                );
+
+                if (sigma < 0.0f) continue;
+                float vis = std::exp(-sigma);
+                float alpha = (std::min)(0.99f, pOpacities[gaussianId] * vis);
+                if (alpha < alphaThresh) continue;
+
+                float ra = 1.0f / (1.0f - alpha);
+                T *= ra;
+                float fac = alpha * T;
+
+                pv_colors[gaussianId * 3 + 0] += fac * pv_output[pixIdx * 3 + 0];
+                pv_colors[gaussianId * 3 + 1] += fac * pv_output[pixIdx * 3 + 1];
+                pv_colors[gaussianId * 3 + 2] += fac * pv_output[pixIdx * 3 + 2];
+
+                float v_alpha = ((pColors[gaussianId * 3 + 0] * T - buffer[0] * ra) * pv_output[pixIdx * 3 + 0]) +
+                                ((pColors[gaussianId * 3 + 1] * T - buffer[1] * ra) * pv_output[pixIdx * 3 + 1]) +
+                                ((pColors[gaussianId * 3 + 2] * T - buffer[2] * ra) * pv_output[pixIdx * 3 + 2]) +
+                                (Tfinal * ra * pv_outputAlpha[pixIdx]) +
+
+                                (-Tfinal * ra * bgX * pv_output[pixIdx * 3 + 0]) +
+                                (-Tfinal * ra * bgY * pv_output[pixIdx * 3 + 1]) +
+                                (-Tfinal * ra * bgZ * pv_output[pixIdx * 3 + 2]);
+
+                buffer[0] += pColors[gaussianId * 3 + 0] * fac;
+                buffer[1] += pColors[gaussianId * 3 + 1] * fac;
+                buffer[2] += pColors[gaussianId * 3 + 2] * fac;
+                
+                float v_sigma = -pOpacities[gaussianId] * vis * v_alpha;
+                pv_conic[gaussianId * 3 + 0] += 0.5f * v_sigma * xCam * xCam;
+                pv_conic[gaussianId * 3 + 1] += 0.5f * v_sigma * xCam * yCam;
+                pv_conic[gaussianId * 3 + 2] += 0.5f * v_sigma * yCam * yCam;
+
+                // std::cout << v_sigma << std::endl;
+
+                pv_xy[gaussianId * 2 + 0] += v_sigma * (A * xCam + B * yCam);
+                pv_xy[gaussianId * 2 + 1] += v_sigma * (B * xCam + C * yCam);
+
                 pv_opacity[gaussianId] += vis * v_alpha;
             }
         }
diff --git a/vendor/gsplat/backward.cu b/vendor/gsplat/backward.cu
index c338dba..c0e8f1a 100644
--- a/vendor/gsplat/backward.cu
+++ b/vendor/gsplat/backward.cu
@@ -10,6 +10,8 @@
 #include <cooperative_groups/reduce.h>
 #endif
 
+#include <stdio.h> // TODO REMOVE
+
 namespace cg = cooperative_groups;
 
 __global__ void nd_rasterize_backward_kernel(
@@ -299,6 +301,10 @@ __global__ void rasterize_backward_kernel(
                 // compute the current T for this gaussian
                 float ra = 1.f / (1.f - alpha);
                 T *= ra;
+        // if (pix_id == 4999){
+        //     printf("%f \n", T);
+        // }
+
                 // update v_rgb for this gaussian
                 const float fac = alpha * T;
                 float v_alpha = 0.f;
diff --git a/vendor/gsplat/bindings.cu b/vendor/gsplat/bindings.cu
index 80d581c..8bdbaca 100644
--- a/vendor/gsplat/bindings.cu
+++ b/vendor/gsplat/bindings.cu
@@ -628,6 +628,7 @@ std::
         (float3 *)v_colors.contiguous().data_ptr<float>(),
         v_opacity.contiguous().data_ptr<float>()
     );
+    cudaDeviceSynchronize(); // TODO REMOVE
 
     return std::make_tuple(v_xy, v_conic, v_colors, v_opacity);
 }
\ No newline at end of file

From c0ba9dac8b1c89d0cc0cab4bd2a822e5c636d57f Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Wed, 20 Mar 2024 16:03:40 +0000
Subject: [PATCH 15/19] Started opensplat CPU integration

---
 gsplat.hpp                       |   2 +
 model.cpp                        | 122 ++++++++++++++++++--------
 opensplat.cpp                    |   2 -
 rasterize_gaussians.cpp          |  40 ++-------
 simple_trainer.cpp               |  45 +++++-----
 vendor/gsplat-cpu/bindings.h     |   9 +-
 vendor/gsplat-cpu/gsplat_cpu.cpp | 145 ++++---------------------------
 vendor/gsplat/backward.cu        |   5 --
 8 files changed, 142 insertions(+), 228 deletions(-)

diff --git a/gsplat.hpp b/gsplat.hpp
index fab3520..db0368d 100644
--- a/gsplat.hpp
+++ b/gsplat.hpp
@@ -1,3 +1,5 @@
+#include "vendor/gsplat/config.h"
+
 #if defined(USE_HIP) || defined(USE_CUDA)
 #include "vendor/gsplat/bindings.h"
 #endif
diff --git a/model.cpp b/model.cpp
index ca615c8..3f36ff7 100644
--- a/model.cpp
+++ b/model.cpp
@@ -74,31 +74,62 @@ torch::Tensor Model::forward(Camera& cam, int step){
     float fovY = 2.0f * std::atan(height / (2.0f * fy));
 
     torch::Tensor projMat = projectionMatrix(0.001f, 1000.0f, fovX, fovY, device);
-
-    TileBounds tileBounds = std::make_tuple((width + BLOCK_X - 1) / BLOCK_X,
-                      (height + BLOCK_Y - 1) / BLOCK_Y,
-                      1);
-
     torch::Tensor colors =  torch::cat({featuresDc.index({Slice(), None, Slice()}), featuresRest}, 1);
 
-    auto p = ProjectGaussians::apply(means, 
-                    torch::exp(scales), 
-                    1, 
-                    quats / quats.norm(2, {-1}, true), 
-                    viewMat, 
-                    torch::matmul(projMat, viewMat),
-                    fx, 
-                    fy,
-                    cx,
-                    cy,
-                    height,
-                    width,
-                    tileBounds);
-    xys = p[0];
-    torch::Tensor depths = p[1];
-    radii = p[2];
-    torch::Tensor conics = p[3];
-    torch::Tensor numTilesHit = p[4];
+    torch::Tensor conics;
+    torch::Tensor depths; // GPU-only
+    torch::Tensor numTilesHit; // GPU-only
+    torch::Tensor cov2d; // CPU-only
+    torch::Tensor camDepths; // CPU-only
+    torch::Tensor rgb;
+
+    if (device == torch::kCPU){
+        auto p = ProjectGaussiansCPU::Apply(means, 
+                                torch::exp(scales), 
+                                1, 
+                                quats / quats.norm(2, {-1}, true), 
+                                viewMat, 
+                                torch::matmul(projMat, viewMat),
+                                fx, 
+                                fy,
+                                cx,
+                                cy,
+                                height,
+                                width);
+        xys = p[0];
+        radii = p[1];
+        conics = p[2];
+        cov2d = p[3];
+        camDepths = p[4];
+    }else{
+        #if defined(USE_HIP) || defined(USE_CUDA)
+
+        TileBounds tileBounds = std::make_tuple((width + BLOCK_X - 1) / BLOCK_X,
+                        (height + BLOCK_Y - 1) / BLOCK_Y,
+                        1);
+        auto p = ProjectGaussians::apply(means, 
+                        torch::exp(scales), 
+                        1, 
+                        quats / quats.norm(2, {-1}, true), 
+                        viewMat, 
+                        torch::matmul(projMat, viewMat),
+                        fx, 
+                        fy,
+                        cx,
+                        cy,
+                        height,
+                        width,
+                        tileBounds);
+
+        xys = p[0];
+        depths = p[1];
+        radii = p[2];
+        conics = p[3];
+        numTilesHit = p[4];
+        #else
+            throw std::runtime_error("GPU support not built");
+        #endif
+    }
     
 
     if (radii.sum().item<float>() == 0.0f)
@@ -110,22 +141,39 @@ torch::Tensor Model::forward(Camera& cam, int step){
     torch::Tensor viewDirs = means.detach() - T.transpose(0, 1).to(device);
     viewDirs = viewDirs / viewDirs.norm(2, {-1}, true);
     int degreesToUse = (std::min<int>)(step / shDegreeInterval, shDegree);
+    std::cout << degreesToUse;
+    exit(1);
     torch::Tensor rgbs = SphericalHarmonics::apply(degreesToUse, viewDirs, colors);
-    rgbs = torch::clamp_min(rgbs + 0.5f, 0.0f); 
+    rgbs = torch::clamp_min(rgbs + 0.5f, 0.0f);
+
+    if (device == torch::kCPU){
+        rgb = RasterizeGaussiansCPU::apply(
+                xys,
+                radii,
+                conics,
+                rgbs,
+                torch::sigmoid(opacities),
+                cov2d,
+                camDepths,
+                height,
+                width,
+                backgroundColor);
+    }else{  
+        #if defined(USE_HIP) || defined(USE_CUDA)
+        rgb = RasterizeGaussians::apply(
+                xys,
+                depths,
+                radii,
+                conics,
+                numTilesHit,
+                rgbs,
+                torch::sigmoid(opacities),
+                height,
+                width,
+                backgroundColor);
+        #endif
+    }
 
-    
-    torch::Tensor rgb = RasterizeGaussians::apply(
-            xys,
-            depths,
-            radii,
-            conics,
-            numTilesHit,
-            rgbs, // TODO: why not sigmod?
-            torch::sigmoid(opacities),
-            height,
-            width,
-            backgroundColor);
-    
     rgb = torch::clamp_max(rgb, 1.0f);
 
     return rgb;
diff --git a/opensplat.cpp b/opensplat.cpp
index e184059..4273a3d 100644
--- a/opensplat.cpp
+++ b/opensplat.cpp
@@ -90,8 +90,6 @@ int main(int argc, char *argv[]){
             cam.loadImage(downScaleFactor);
         }
 
-        
-
         // Withhold a validation camera if necessary
         auto t = inputData.getCameras(validate, valImage);
         std::vector<Camera> cams = std::get<0>(t);
diff --git a/rasterize_gaussians.cpp b/rasterize_gaussians.cpp
index f00d6c6..40d0b12 100644
--- a/rasterize_gaussians.cpp
+++ b/rasterize_gaussians.cpp
@@ -1,8 +1,5 @@
 #include "rasterize_gaussians.hpp"
 #include "gsplat.hpp"
-#include "vendor/gsplat/config.h"
-
-#include "cv_utils.hpp" // TODO REMOVE
 
 std::tuple<torch::Tensor,
         torch::Tensor,
@@ -78,11 +75,6 @@ torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx,
     // Final image
     torch::Tensor outImg = std::get<0>(t);
 
-    cv::Mat image = tensorToImage(outImg.detach().cpu());
-    cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
-    cv::imwrite("cudatest.png", image);
-    std::cout << "WROTE " << imgWidth << "x" << imgHeight;
-    
     torch::Tensor finalTs = std::get<1>(t);
 
     // Map of tile bin IDs
@@ -111,7 +103,6 @@ tensor_list RasterizeGaussians::backward(AutogradContext *ctx, tensor_list grad_
     torch::Tensor finalTs = saved[7];
     torch::Tensor finalIdx = saved[8];
 
-    // torch::Tensor v_outAlpha = torch::zeros({imgHeight, imgWidth}, torch::TensorOptions().device(v_outImg.get_device());
     torch::Tensor v_outAlpha = torch::zeros_like(v_outImg.index({"...", 0}));
     
     auto t = rasterize_backward_tensor(imgHeight, imgWidth, 
@@ -133,9 +124,6 @@ tensor_list RasterizeGaussians::backward(AutogradContext *ctx, tensor_list grad_
     torch::Tensor v_opacity = std::get<3>(t);
     torch::Tensor none;
 
-    std::cout << v_colors << v_xy;
-    exit(1);
-
     return { v_xy,
             none, // depths
             none, // radii
@@ -177,23 +165,13 @@ torch::Tensor RasterizeGaussiansCPU::forward(AutogradContext *ctx,
     // Final image
     torch::Tensor outImg = std::get<0>(t);
 
-    // cv::Mat image = tensorToImage(outImg.detach().cpu());
-    // cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
-    // cv::imwrite("test.png", image);
-    // std::cout << "WROTE " << imgWidth << "x" << imgHeight;
-
     torch::Tensor finalTs = std::get<1>(t);
-
-    // Map of gaussian IDs
-    // TODO: remove finalIdx (not needed)
-    torch::Tensor finalIdx = std::get<2>(t);
-
-    std::vector<int32_t> *pxgid = std::get<3>(t);
+    std::vector<int32_t> *px2gid = std::get<2>(t);
 
     ctx->saved_data["imgWidth"] = imgWidth;
     ctx->saved_data["imgHeight"] = imgHeight;
-    ctx->saved_data["pxgid"] = reinterpret_cast<int64_t>(pxgid);
-    ctx->save_for_backward({ xys, conics, colors, opacity, background, cov2d, camDepths, finalTs, finalIdx });
+    ctx->saved_data["px2gid"] = reinterpret_cast<int64_t>(px2gid);
+    ctx->save_for_backward({ xys, conics, colors, opacity, background, cov2d, camDepths, finalTs });
     
     return outImg;
 }
@@ -202,7 +180,7 @@ tensor_list RasterizeGaussiansCPU::backward(AutogradContext *ctx, tensor_list gr
     torch::Tensor v_outImg = grad_outputs[0];
     int imgHeight = ctx->saved_data["imgHeight"].toInt();
     int imgWidth = ctx->saved_data["imgWidth"].toInt();
-    const std::vector<int32_t> *pxgid = reinterpret_cast<const std::vector<int32_t> *>(ctx->saved_data["pxgid"].toInt());
+    const std::vector<int32_t> *px2gid = reinterpret_cast<const std::vector<int32_t> *>(ctx->saved_data["px2gid"].toInt());
 
     variable_list saved = ctx->get_saved_variables();
     torch::Tensor xys = saved[0];
@@ -213,9 +191,7 @@ tensor_list RasterizeGaussiansCPU::backward(AutogradContext *ctx, tensor_list gr
     torch::Tensor cov2d = saved[5];
     torch::Tensor camDepths = saved[6];
     torch::Tensor finalTs = saved[7];
-    torch::Tensor finalIdx = saved[8];
 
-    // torch::Tensor v_outAlpha = torch::zeros({imgHeight, imgWidth}, torch::TensorOptions().device(v_outImg.get_device());
     torch::Tensor v_outAlpha = torch::zeros_like(v_outImg.index({"...", 0}));
     
     auto t = rasterize_backward_tensor_cpu(imgHeight, imgWidth, 
@@ -227,11 +203,12 @@ tensor_list RasterizeGaussiansCPU::backward(AutogradContext *ctx, tensor_list gr
                             cov2d,
                             camDepths,
                             finalTs,
-                            finalIdx,
-                            pxgid,
+                            px2gid,
                             v_outImg,
                             v_outAlpha);
 
+    delete[] px2gid;
+
 
     torch::Tensor v_xy = std::get<0>(t);
     torch::Tensor v_conic = std::get<1>(t);
@@ -239,9 +216,6 @@ tensor_list RasterizeGaussiansCPU::backward(AutogradContext *ctx, tensor_list gr
     torch::Tensor v_opacity = std::get<3>(t);
     torch::Tensor none;
 
-    // std::cout << v_colors << v_xy;
-    // exit(1);
-
     return { v_xy,
             none, // radii
             v_conic,
diff --git a/simple_trainer.cpp b/simple_trainer.cpp
index 336f7a9..c11ed5b 100644
--- a/simple_trainer.cpp
+++ b/simple_trainer.cpp
@@ -12,7 +12,6 @@
 #include <opencv2/core/core.hpp>
 #include <opencv2/imgcodecs.hpp>
 #include <opencv2/imgproc.hpp>
-#include "vendor/gsplat/config.h"
 #include "project_gaussians.hpp"
 #include "rasterize_gaussians.hpp"
 #include "constants.hpp"
@@ -161,26 +160,30 @@ int main(int argc, char **argv){
                 width,
                 background);
         }else{
-            auto p = ProjectGaussians::apply(means, scales, 1, 
-                                    quats, viewMat, viewMat,
-                                    focal, focal,
-                                    width / 2,
-                                    height / 2,
-                                    height,
-                                    width,
-                                    tileBounds);
-
-            outImg = RasterizeGaussians::apply(
-                p[0], // xys
-                p[1], // depths
-                p[2], // radii,
-                p[3], // conics
-                p[4], // numTilesHit
-                torch::sigmoid(rgbs),
-                torch::sigmoid(opacities),
-                height,
-                width,
-                background);
+            #if defined(USE_HIP) || defined(USE_CUDA)
+                auto p = ProjectGaussians::apply(means, scales, 1, 
+                                        quats, viewMat, viewMat,
+                                        focal, focal,
+                                        width / 2,
+                                        height / 2,
+                                        height,
+                                        width,
+                                        tileBounds);
+
+                outImg = RasterizeGaussians::apply(
+                    p[0], // xys
+                    p[1], // depths
+                    p[2], // radii,
+                    p[3], // conics
+                    p[4], // numTilesHit
+                    torch::sigmoid(rgbs),
+                    torch::sigmoid(opacities),
+                    height,
+                    width,
+                    background);
+            #else
+                throw std::runtime_error("GPU support not built");
+            #endif
         }
 
         outImg.requires_grad_();
diff --git a/vendor/gsplat-cpu/bindings.h b/vendor/gsplat-cpu/bindings.h
index 3f507e1..af0c593 100644
--- a/vendor/gsplat-cpu/bindings.h
+++ b/vendor/gsplat-cpu/bindings.h
@@ -1,4 +1,7 @@
-// Originally based on https://github.dev/nerfstudio-project/gsplat
+// Originally based on https://github.com/nerfstudio-project/gsplat
+// This implementation has been substantially changed and optimized 
+// Licensed under the AGPLv3
+// Piero Toffanin - 2024
 
 #include <cstdio>
 #include <iostream>
@@ -31,7 +34,6 @@ project_gaussians_forward_tensor_cpu(
 );
 
 std::tuple<
-    torch::Tensor,
     torch::Tensor,
     torch::Tensor,
     std::vector<int32_t> *
@@ -65,8 +67,7 @@ std::
         const torch::Tensor &cov2d,
         const torch::Tensor &camDepths,
         const torch::Tensor &final_Ts,
-        const torch::Tensor &final_idx,
-        const std::vector<int32_t> *pxgid,
+        const std::vector<int32_t> *px2gid,
         const torch::Tensor &v_output, // dL_dout_color
         const torch::Tensor &v_output_alpha
     );
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index 23330e8..8b39fec 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -1,5 +1,7 @@
-// Originally started from https://github.dev/nerfstudio-project/gsplat
-// This implementation has been substantially changed and is licensed under the AGPLv3
+// Originally started from https://github.com/nerfstudio-project/gsplat
+// This implementation has been substantially changed and optimized 
+// Licensed under the AGPLv3
+// Piero Toffanin - 2024
 
 #include "bindings.h"
 #include "../gsplat/config.h"
@@ -11,7 +13,7 @@
 
 using namespace torch::indexing;
 
-torch::Tensor quatToRotMat(const torch::Tensor &quat){
+torch::Tensor quatToRot(const torch::Tensor &quat){
     auto u = torch::unbind(torch::nn::functional::normalize(quat, torch::nn::functional::NormalizeFuncOptions().dim(-1)), -1);
     torch::Tensor w = u[0];
     torch::Tensor x = u[1];
@@ -62,8 +64,6 @@ project_gaussians_forward_tensor_cpu(
     float fovx = 0.5f * static_cast<float>(img_height) / fx;
     float fovy = 0.5f * static_cast<float>(img_width) / fy;
     
-    // TODO: no need to recompute W,p,t below (they are the same)
-
     // clip_near_plane
     torch::Tensor Rclip = viewmat.index({"...", Slice(None, 3), Slice(None, 3)}); 
     torch::Tensor Tclip = viewmat.index({"...", Slice(None, 3), 3});
@@ -71,22 +71,18 @@ project_gaussians_forward_tensor_cpu(
     // torch::Tensor isClose = pView.index({"...", 2}) < clip_thresh;
 
     // scale_rot_to_cov3d
-    torch::Tensor R = quatToRotMat(quats);
+    torch::Tensor R = quatToRot(quats);
     torch::Tensor M = R * glob_scale * scales.index({"...", None, Slice()});
     torch::Tensor cov3d = torch::matmul(M, M.transpose(-1, -2));
 
     // project_cov3d_ewa
-    torch::Tensor W = viewmat.index({"...", Slice(None, 3), Slice(None, 3)});
-    torch::Tensor p = viewmat.index({"...", Slice(None, 3), 3});
-    torch::Tensor t = torch::matmul(W, means3d.index({"...", None})).index({"...", 0}) + p;
-
     torch::Tensor limX = 1.3f * torch::tensor({fovx}, means3d.device());
     torch::Tensor limY = 1.3f * torch::tensor({fovy}, means3d.device());
     
-    torch::Tensor minLimX = t.index({"...", 2}) * torch::min(limX, torch::max(-limX, t.index({"...", 0}) / t.index({"...", 2})));
-    torch::Tensor minLimY = t.index({"...", 2}) * torch::min(limY, torch::max(-limY, t.index({"...", 1}) / t.index({"...", 2})));
+    torch::Tensor minLimX = pView.index({"...", 2}) * torch::min(limX, torch::max(-limX, pView.index({"...", 0}) / pView.index({"...", 2})));
+    torch::Tensor minLimY = pView.index({"...", 2}) * torch::min(limY, torch::max(-limY, pView.index({"...", 1}) / pView.index({"...", 2})));
     
-    t = torch::cat({minLimX.index({"...", None}), minLimY.index({"...", None}), t.index({"...", 2, None})}, -1);
+    torch::Tensor t = torch::cat({minLimX.index({"...", None}), minLimY.index({"...", None}), pView.index({"...", 2, None})}, -1);
     torch::Tensor rz = 1.0f / t.index({"...", 2});
     torch::Tensor rz2 = rz.pow(2);
 
@@ -95,7 +91,7 @@ project_gaussians_forward_tensor_cpu(
         torch::stack({torch::zeros_like(rz), fy * rz, -fy * t.index({"...", 1}) * rz2}, -1)
     }, -2);
 
-    torch::Tensor T = torch::matmul(J, W);
+    torch::Tensor T = torch::matmul(J, Rclip);
     torch::Tensor cov2d = torch::matmul(T, torch::matmul(cov3d, T.transpose(-1, -2)));
 
     // Add blur along axes
@@ -135,7 +131,6 @@ project_gaussians_forward_tensor_cpu(
 }
 
 std::tuple<
-    torch::Tensor,
     torch::Tensor,
     torch::Tensor,
     std::vector<int32_t> *
@@ -155,7 +150,7 @@ std::tuple<
     int channels = colors.size(1);
     int numPoints = xys.size(0);
     float *pDepths = static_cast<float *>(camDepths.data_ptr());
-    std::vector<int32_t> *pxgid = new std::vector<int32_t>[width * height];
+    std::vector<int32_t> *px2gid = new std::vector<int32_t>[width * height];
 
     std::vector< size_t > gIndices( numPoints );
     std::iota( gIndices.begin(), gIndices.end(), 0 );
@@ -167,7 +162,6 @@ std::tuple<
 
     torch::Tensor outImg = torch::zeros({height, width, channels}, torch::TensorOptions().dtype(torch::kFloat32).device(device));
     torch::Tensor finalTs = torch::ones({height, width}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
-    torch::Tensor finalIdx = torch::zeros({height, width}, torch::TensorOptions().dtype(torch::kInt32).device(device));   
     torch::Tensor done = torch::zeros({height, width}, torch::TensorOptions().dtype(torch::kBool).device(device));   
 
     torch::Tensor sqCov2dX = 3.0f * torch::sqrt(cov2d.index({"...", 0, 0}));
@@ -183,7 +177,6 @@ std::tuple<
     float *pFinalTs = static_cast<float *>(finalTs.data_ptr());
     bool *pDone = static_cast<bool *>(done.data_ptr());
 
-    int32_t *pFinalIdx = static_cast<int32_t *>(finalIdx.data_ptr());
     float *pColors = static_cast<float *>(colors.data_ptr());
     
     float bgX = background[0].item<float>();
@@ -209,7 +202,7 @@ std::tuple<
         int maxx = (std::min)(height, static_cast<int>(std::ceil(gY + sqy)) + 2);
         int miny = (std::max)(0, static_cast<int>(std::floor(gX - sqx)) - 2);
         int maxy = (std::min)(width, static_cast<int>(std::ceil(gX + sqx)) + 2);
-
+        
         for (int i = minx; i < maxx; i++){
             for (int j = miny; j < maxy; j++){
                 size_t pixIdx = (i * width + j);
@@ -241,8 +234,7 @@ std::tuple<
                 pOutImg[pixIdx * 3 + 2] += vis * pColors[gaussianId * 3 + 2];
                 
                 pFinalTs[pixIdx] = nextT;
-                pFinalIdx[pixIdx] = gaussianId;
-                pxgid[pixIdx].push_back(gaussianId);
+                px2gid[pixIdx].push_back(gaussianId);
             }
         }
     }
@@ -257,11 +249,11 @@ std::tuple<
             pOutImg[pixIdx * 3 + 1] += T * bgY;
             pOutImg[pixIdx * 3 + 2] += T * bgZ;
 
-            std::reverse(pxgid[pixIdx].begin(), pxgid[pixIdx].end());
+            std::reverse(px2gid[pixIdx].begin(), px2gid[pixIdx].end());
         }
     }
 
-    return std::make_tuple(outImg, finalTs, finalIdx, pxgid);
+    return std::make_tuple(outImg, finalTs, px2gid);
 }
 
 
@@ -283,8 +275,7 @@ std::
         const torch::Tensor &cov2d,
         const torch::Tensor &camDepths,        
         const torch::Tensor &final_Ts,
-        const torch::Tensor &final_idx,
-        const std::vector<int32_t> *pxgid,
+        const std::vector<int32_t> *px2gid,
         const torch::Tensor &v_output, // dL_dout_color
         const torch::Tensor &v_output_alpha
     ){
@@ -304,10 +295,7 @@ std::
     float *pv_colors = static_cast<float *>(v_colors.data_ptr());
     float *pv_opacity = static_cast<float *>(v_opacity.data_ptr());
     
-    // torch::Tensor buffer = torch::zeros({height, width, 3}, torch::TensorOptions().dtype(torch::kFloat32).device(device));   
-
     float *pColors = static_cast<float *>(colors.data_ptr());
-    // float *pBuffer = static_cast<float *>(buffer.data_ptr());
     float *pv_output = static_cast<float *>(v_output.data_ptr());
     float *pv_outputAlpha = static_cast<float *>(v_output_alpha.data_ptr());
     float *pConics = static_cast<float *>(conics.data_ptr());
@@ -318,111 +306,18 @@ std::
     float bgY = background[1].item<float>();
     float bgZ = background[2].item<float>();
 
-    // torch::Tensor Ts = final_Ts.clone();
-    // float *pTs = static_cast<float *>(Ts.data_ptr());
     float *pFinalTs = static_cast<float *>(final_Ts.data_ptr());
 
     const float alphaThresh = 1.0f / 255.0f;
-/*
-    std::vector< size_t > gIndices( numPoints );
-
-    float *pDepths = static_cast<float *>(camDepths.data_ptr());
-    std::iota( gIndices.begin(), gIndices.end(), 0 );
-    std::sort(gIndices.begin(), gIndices.end(), [&pDepths](int a, int b){
-        return pDepths[a] < pDepths[b];
-    });
-
-    torch::Tensor sqCov2dX = 3.0f * torch::sqrt(cov2d.index({"...", 0, 0}));
-    torch::Tensor sqCov2dY = 3.0f * torch::sqrt(cov2d.index({"...", 1, 1}));
-    
-
-    float *pSqCov2dX = static_cast<float *>(sqCov2dX.data_ptr());
-    float *pSqCov2dY = static_cast<float *>(sqCov2dY.data_ptr());
-
-    const float alphaThresh = 1.0f / 255.0f;
-    for (int idx = numPoints - 1; idx >= 0; idx--){
-        int32_t gaussianId = gIndices[idx];
-
-        float A = pConics[gaussianId * 3 + 0];
-        float B = pConics[gaussianId * 3 + 1];
-        float C = pConics[gaussianId * 3 + 2];
-
-        float gX = pCenters[gaussianId * 2 + 0];
-        float gY = pCenters[gaussianId * 2 + 1];
-
-        float sqx = pSqCov2dX[gaussianId];
-        float sqy = pSqCov2dY[gaussianId];
-
-        int minx = (std::max)(0, static_cast<int>(std::floor(gY - sqy)) - 2);
-        int maxx = (std::min)(height, static_cast<int>(std::ceil(gY + sqy)) + 2);
-        int miny = (std::max)(0, static_cast<int>(std::floor(gX - sqx)) - 2);
-        int maxy = (std::min)(width, static_cast<int>(std::ceil(gX + sqx)) + 2);
-
-        for (int i = minx; i < maxx; i++){
-            for (int j = miny; j < maxy; j++){
-                size_t pixIdx = (i * width + j);
-
-                float xCam = gX - j;
-                float yCam = gY - i;
-                float sigma = (
-                    0.5f
-                    * (A * xCam * xCam + C * yCam * yCam)
-                    + B * xCam * yCam
-                );
-
-                if (sigma < 0.0f) continue;
-                float vis = std::exp(-sigma);
-                float alpha = (std::min)(0.99f, pOpacities[gaussianId] * vis);
-                if (alpha < alphaThresh) continue;
-
-
-                float ra = 1.0f / (1.0f - alpha);
-                float T = pTs[pixIdx];
-                T *= ra;
-                pTs[pixIdx] = T;
-
-                float Tfinal = pFinalTs[pixIdx];
 
-                float fac = alpha * T;
-
-                pv_colors[gaussianId * 3 + 0] += fac * pv_output[pixIdx * 3 + 0];
-                pv_colors[gaussianId * 3 + 1] += fac * pv_output[pixIdx * 3 + 1];
-                pv_colors[gaussianId * 3 + 2] += fac * pv_output[pixIdx * 3 + 2];
-
-                float v_alpha = ((pColors[gaussianId * 3 + 0] * T - pBuffer[pixIdx * 3 + 0] * ra) * pv_output[pixIdx * 3 + 0]) +
-                                ((pColors[gaussianId * 3 + 1] * T - pBuffer[pixIdx * 3 + 1] * ra) * pv_output[pixIdx * 3 + 1]) +
-                                ((pColors[gaussianId * 3 + 2] * T - pBuffer[pixIdx * 3 + 2] * ra) * pv_output[pixIdx * 3 + 2]) +
-                                (Tfinal * ra * pv_outputAlpha[pixIdx]) +
-
-                                (-Tfinal * ra * bgX * pv_output[pixIdx * 3 + 0]) +
-                                (-Tfinal * ra * bgY * pv_output[pixIdx * 3 + 1]) +
-                                (-Tfinal * ra * bgZ * pv_output[pixIdx * 3 + 2]);
-
-                pBuffer[pixIdx * 3 + 0] += pColors[gaussianId * 3 + 0] * fac;
-                pBuffer[pixIdx * 3 + 1] += pColors[gaussianId * 3 + 1] * fac;
-                pBuffer[pixIdx * 3 + 2] += pColors[gaussianId * 3 + 2] * fac;
-                
-                float v_sigma = -pOpacities[gaussianId] * vis * v_alpha;
-                pv_conic[gaussianId * 3 + 0] += 0.5f * v_sigma * xCam * xCam;
-                pv_conic[gaussianId * 3 + 1] += 0.5f * v_sigma * xCam * yCam;
-                pv_conic[gaussianId * 3 + 2] += 0.5f * v_sigma * yCam * yCam;
-
-                pv_xy[gaussianId * 2 + 0] += v_sigma * (A * xCam + B * yCam);
-                pv_xy[gaussianId * 2 + 1] += v_sigma * (B * xCam + C * yCam);
-
-                pv_opacity[gaussianId] += vis * v_alpha;
-            }
-        }
-    }*/
-
-    for (int j = 0; j < width; j++){
-        for (int i = 0; i < height; i++){
+    for (int i = 0; i < height; i++){
+        for (int j = 0; j < width; j++){
             size_t pixIdx = (i * width + j);
             float Tfinal = pFinalTs[pixIdx];
             float T = Tfinal;
             float buffer[3] = {0.0f, 0.0f, 0.0f};
 
-            for (const int32_t &gaussianId : pxgid[pixIdx]){
+            for (const int32_t &gaussianId : px2gid[pixIdx]){
                 float A = pConics[gaussianId * 3 + 0];
                 float B = pConics[gaussianId * 3 + 1];
                 float C = pConics[gaussianId * 3 + 2];
@@ -469,8 +364,6 @@ std::
                 pv_conic[gaussianId * 3 + 1] += 0.5f * v_sigma * xCam * yCam;
                 pv_conic[gaussianId * 3 + 2] += 0.5f * v_sigma * yCam * yCam;
 
-                // std::cout << v_sigma << std::endl;
-
                 pv_xy[gaussianId * 2 + 0] += v_sigma * (A * xCam + B * yCam);
                 pv_xy[gaussianId * 2 + 1] += v_sigma * (B * xCam + C * yCam);
 
diff --git a/vendor/gsplat/backward.cu b/vendor/gsplat/backward.cu
index c0e8f1a..5f85cd1 100644
--- a/vendor/gsplat/backward.cu
+++ b/vendor/gsplat/backward.cu
@@ -10,8 +10,6 @@
 #include <cooperative_groups/reduce.h>
 #endif
 
-#include <stdio.h> // TODO REMOVE
-
 namespace cg = cooperative_groups;
 
 __global__ void nd_rasterize_backward_kernel(
@@ -301,9 +299,6 @@ __global__ void rasterize_backward_kernel(
                 // compute the current T for this gaussian
                 float ra = 1.f / (1.f - alpha);
                 T *= ra;
-        // if (pix_id == 4999){
-        //     printf("%f \n", T);
-        // }
 
                 // update v_rgb for this gaussian
                 const float fac = alpha * T;

From b46fd3aab06672096b350b3e21cf26ef7fa9e151 Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Wed, 20 Mar 2024 18:35:06 +0000
Subject: [PATCH 16/19] opensplat CPU integration completed

---
 gsplat.hpp                       |   7 +-
 model.cpp                        |  18 ++++--
 opensplat.cpp                    |   3 +-
 project_gaussians.cpp            |   7 +-
 project_gaussians.hpp            |   7 +-
 rasterize_gaussians.cpp          |   3 +
 rasterize_gaussians.hpp          |   4 ++
 simple_trainer.cpp               |   4 +-
 spherical_harmonics.cpp          |  30 ++++-----
 spherical_harmonics.hpp          |  13 +++-
 vendor/gsplat-cpu/bindings.h     |  10 +++
 vendor/gsplat-cpu/gsplat_cpu.cpp | 107 +++++++++++++++++++++++++++++++
 12 files changed, 184 insertions(+), 29 deletions(-)

diff --git a/gsplat.hpp b/gsplat.hpp
index db0368d..d427655 100644
--- a/gsplat.hpp
+++ b/gsplat.hpp
@@ -1,7 +1,12 @@
+#ifndef GSPLAT_H
+#define GSPLAT_H
+
 #include "vendor/gsplat/config.h"
 
 #if defined(USE_HIP) || defined(USE_CUDA)
 #include "vendor/gsplat/bindings.h"
 #endif
 
-#include "vendor/gsplat-cpu/bindings.h"
\ No newline at end of file
+#include "vendor/gsplat-cpu/bindings.h"
+
+#endif
\ No newline at end of file
diff --git a/model.cpp b/model.cpp
index 3f36ff7..ebf56f0 100644
--- a/model.cpp
+++ b/model.cpp
@@ -84,7 +84,7 @@ torch::Tensor Model::forward(Camera& cam, int step){
     torch::Tensor rgb;
 
     if (device == torch::kCPU){
-        auto p = ProjectGaussiansCPU::Apply(means, 
+        auto p = ProjectGaussiansCPU::apply(means, 
                                 torch::exp(scales), 
                                 1, 
                                 quats / quats.norm(2, {-1}, true), 
@@ -141,11 +141,21 @@ torch::Tensor Model::forward(Camera& cam, int step){
     torch::Tensor viewDirs = means.detach() - T.transpose(0, 1).to(device);
     viewDirs = viewDirs / viewDirs.norm(2, {-1}, true);
     int degreesToUse = (std::min<int>)(step / shDegreeInterval, shDegree);
-    std::cout << degreesToUse;
-    exit(1);
-    torch::Tensor rgbs = SphericalHarmonics::apply(degreesToUse, viewDirs, colors);
+    torch::Tensor rgbs;
+    
+    std::cerr << "HERE";
+    if (device == torch::kCPU){
+        rgbs = SphericalHarmonicsCPU::apply(degreesToUse, viewDirs, colors);
+    }else{
+        #if defined(USE_HIP) || defined(USE_CUDA)
+        rgbs = SphericalHarmonics::apply(degreesToUse, viewDirs, colors);
+        #endif
+    }
+    
+    std::cerr << "THERE" << camDepths;
     rgbs = torch::clamp_min(rgbs + 0.5f, 0.0f);
 
+
     if (device == torch::kCPU){
         rgb = RasterizeGaussiansCPU::apply(
                 xys,
diff --git a/opensplat.cpp b/opensplat.cpp
index 4273a3d..f21c03d 100644
--- a/opensplat.cpp
+++ b/opensplat.cpp
@@ -17,6 +17,7 @@ int main(int argc, char *argv[]){
         ("s,save-every", "Save output scene every these many steps (set to -1 to disable)", cxxopts::value<int>()->default_value("-1"))
         ("val", "Withhold a camera shot for validating the scene loss")
         ("val-image", "Filename of the image to withhold for validating scene loss", cxxopts::value<std::string>()->default_value("random"))
+        ("cpu", "Force CPU execution")
         
         ("n,num-iters", "Number of iterations to run", cxxopts::value<int>()->default_value("30000"))
         ("d,downscale-factor", "Scale input images by this factor.", cxxopts::value<float>()->default_value("1"))
@@ -77,7 +78,7 @@ int main(int argc, char *argv[]){
 
     torch::Device device = torch::kCPU;
 
-    if (torch::cuda::is_available()) {
+    if (torch::cuda::is_available() && result.count("cpu") == 0) {
         std::cout << "Using CUDA" << std::endl;
         device = torch::kCUDA;
     }else{
diff --git a/project_gaussians.cpp b/project_gaussians.cpp
index 9d41697..d57e1d6 100644
--- a/project_gaussians.cpp
+++ b/project_gaussians.cpp
@@ -1,5 +1,6 @@
 #include "project_gaussians.hpp"
-#include "gsplat.hpp"
+
+#if defined(USE_HIP) || defined(USE_CUDA)
 
 variable_list ProjectGaussians::forward(AutogradContext *ctx, 
                 torch::Tensor means,
@@ -88,7 +89,9 @@ tensor_list ProjectGaussians::backward(AutogradContext *ctx, tensor_list grad_ou
         };
 }
 
-variable_list ProjectGaussiansCPU::Apply(
+#endif
+
+variable_list ProjectGaussiansCPU::apply(
                 torch::Tensor means,
                 torch::Tensor scales,
                 float globScale,
diff --git a/project_gaussians.hpp b/project_gaussians.hpp
index b6c7e6c..8891d22 100644
--- a/project_gaussians.hpp
+++ b/project_gaussians.hpp
@@ -3,9 +3,12 @@
 
 #include <torch/torch.h>
 #include "tile_bounds.hpp"
+#include "gsplat.hpp"
 
 using namespace torch::autograd;
 
+#if defined(USE_HIP) || defined(USE_CUDA)
+
 class ProjectGaussians : public Function<ProjectGaussians>{
 public:
     static variable_list forward(AutogradContext *ctx, 
@@ -26,9 +29,11 @@ class ProjectGaussians : public Function<ProjectGaussians>{
     static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs);
 };
 
+#endif
+
 class ProjectGaussiansCPU{
 public:
-    static variable_list Apply( 
+    static variable_list apply( 
             torch::Tensor means,
             torch::Tensor scales,
             float globScale,
diff --git a/rasterize_gaussians.cpp b/rasterize_gaussians.cpp
index 40d0b12..9023b9b 100644
--- a/rasterize_gaussians.cpp
+++ b/rasterize_gaussians.cpp
@@ -34,6 +34,8 @@ std::tuple<torch::Tensor,
     return std::make_tuple(isectIds, gaussianIds, isectIdsSorted, gaussianIdsSorted, tileBins);
 }
 
+#if defined(USE_HIP) || defined(USE_CUDA)
+
 torch::Tensor RasterizeGaussians::forward(AutogradContext *ctx, 
             torch::Tensor xys,
             torch::Tensor depths,
@@ -137,6 +139,7 @@ tensor_list RasterizeGaussians::backward(AutogradContext *ctx, tensor_list grad_
     };
 }
 
+#endif
 
 torch::Tensor RasterizeGaussiansCPU::forward(AutogradContext *ctx, 
             torch::Tensor xys,
diff --git a/rasterize_gaussians.hpp b/rasterize_gaussians.hpp
index aa21a0b..adb7692 100644
--- a/rasterize_gaussians.hpp
+++ b/rasterize_gaussians.hpp
@@ -17,6 +17,8 @@ std::tuple<torch::Tensor,
                                             torch::Tensor cumTilesHit,
                                             TileBounds tileBounds);
 
+#if defined(USE_HIP) || defined(USE_CUDA)
+
 class RasterizeGaussians : public Function<RasterizeGaussians>{
 public:
     static torch::Tensor forward(AutogradContext *ctx, 
@@ -33,6 +35,8 @@ class RasterizeGaussians : public Function<RasterizeGaussians>{
     static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs);
 };
 
+#endif
+
 class RasterizeGaussiansCPU : public Function<RasterizeGaussiansCPU>{
 public:
     static torch::Tensor forward(AutogradContext *ctx, 
diff --git a/simple_trainer.cpp b/simple_trainer.cpp
index c11ed5b..c15a0f6 100644
--- a/simple_trainer.cpp
+++ b/simple_trainer.cpp
@@ -54,7 +54,7 @@ int main(int argc, char **argv){
     int iterations = result["iters"].as<int>();
     float learningRate = result["lr"].as<float>();
     std::string render = result["render"].as<std::string>();
-    if (!fs::exists(render)) fs::create_directories(render);
+    if (!render.empty() && !fs::exists(render)) fs::create_directories(render);
 
     torch::Device device = torch::kCPU;
     if (torch::cuda::is_available() && result.count("cpu") == 0){
@@ -140,7 +140,7 @@ int main(int argc, char **argv){
 
     for (size_t i = 0; i < iterations; i++){
         if (device == torch::kCPU){
-            auto p = ProjectGaussiansCPU::Apply(means, scales, 1, 
+            auto p = ProjectGaussiansCPU::apply(means, scales, 1, 
                                 quats, viewMat, viewMat,
                                 focal, focal,
                                 width / 2,
diff --git a/spherical_harmonics.cpp b/spherical_harmonics.cpp
index c6d4036..6581188 100644
--- a/spherical_harmonics.cpp
+++ b/spherical_harmonics.cpp
@@ -1,20 +1,4 @@
 #include "spherical_harmonics.hpp"
-#include "gsplat.hpp"
-
-int numShBases(int degree){
-    switch(degree){
-        case 0:
-            return 1;
-        case 1:
-            return 4;
-        case 2:
-            return 9;
-        case 3:
-            return 16;
-        default:
-            return 25;
-    }
-}
 
 int degFromSh(int numBases){
     switch(numBases){
@@ -37,6 +21,8 @@ torch::Tensor rgb2sh(const torch::Tensor &rgb){
     return (rgb - 0.5) / C0;
 }
 
+#if defined(USE_HIP) || defined(USE_CUDA)
+
 torch::Tensor SphericalHarmonics::forward(AutogradContext *ctx, 
             int degreesToUse, 
             torch::Tensor viewDirs, 
@@ -67,5 +53,15 @@ tensor_list SphericalHarmonics::backward(AutogradContext *ctx, tensor_list grad_
         none,
         compute_sh_backward_tensor(numPoints, degree, degreesToUse, viewDirs, v_colors)
     };
-    
+}
+
+#endif
+
+torch::Tensor SphericalHarmonicsCPU::apply(int degreesToUse, 
+            torch::Tensor viewDirs, 
+            torch::Tensor coeffs){
+    long long numPoints = coeffs.size(0);
+    int degree = degFromSh(coeffs.size(-2)); 
+
+    return compute_sh_forward_tensor_cpu(numPoints, degree, degreesToUse, viewDirs, coeffs);
 }
\ No newline at end of file
diff --git a/spherical_harmonics.hpp b/spherical_harmonics.hpp
index a84762d..2ebe1d1 100644
--- a/spherical_harmonics.hpp
+++ b/spherical_harmonics.hpp
@@ -2,13 +2,15 @@
 #define SPHERICAL_HARMONICS_H
 
 #include <torch/torch.h>
+#include "gsplat.hpp"
 
 using namespace torch::autograd;
 
-int numShBases(int degree);
 int degFromSh(int numBases);
 torch::Tensor rgb2sh(const torch::Tensor &rgb);
 
+#if defined(USE_HIP) || defined(USE_CUDA)
+
 class SphericalHarmonics : public Function<SphericalHarmonics>{
 public:
     static torch::Tensor forward(AutogradContext *ctx, 
@@ -18,4 +20,13 @@ class SphericalHarmonics : public Function<SphericalHarmonics>{
     static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs);
 };
 
+#endif
+
+class SphericalHarmonicsCPU{
+public:
+    static torch::Tensor apply(int degreesToUse, 
+            torch::Tensor viewDirs, 
+            torch::Tensor coeffs);
+};
+
 #endif
\ No newline at end of file
diff --git a/vendor/gsplat-cpu/bindings.h b/vendor/gsplat-cpu/bindings.h
index af0c593..bfcce48 100644
--- a/vendor/gsplat-cpu/bindings.h
+++ b/vendor/gsplat-cpu/bindings.h
@@ -71,3 +71,13 @@ std::
         const torch::Tensor &v_output, // dL_dout_color
         const torch::Tensor &v_output_alpha
     );
+
+int numShBases(int degree);
+
+torch::Tensor compute_sh_forward_tensor_cpu(
+    const int num_points,
+    const int degree,
+    const int degrees_to_use,
+    const torch::Tensor &viewdirs,
+    const torch::Tensor &coeffs
+);
\ No newline at end of file
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index 8b39fec..0fff7d3 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -373,4 +373,111 @@ std::
     }
 
     return std::make_tuple(v_xy, v_conic, v_colors, v_opacity);
+}
+
+
+const float SH_C0 = 0.28209479177387814f;
+const float SH_C1 = 0.4886025119029199f;
+const float SH_C2[] = {
+    1.0925484305920792f,
+    -1.0925484305920792f,
+    0.31539156525252005f,
+    -1.0925484305920792f,
+    0.5462742152960396f
+};
+const float SH_C3[] = {
+    -0.5900435899266435f,
+    2.890611442640554f,
+    -0.4570457994644658f,
+    0.3731763325901154f,
+    -0.4570457994644658f,
+    1.445305721320277f,
+    -0.5900435899266435f
+};
+const float SH_C4[] = {
+    2.5033429417967046f,
+    -1.7701307697799304f,
+    0.9461746957575601f,
+    -0.6690465435572892f,
+    0.10578554691520431f,
+    -0.6690465435572892f,
+    0.47308734787878004f,
+    -1.7701307697799304f,
+    0.6258357354491761f
+};
+
+int numShBases(int degree){
+    switch(degree){
+        case 0:
+            return 1;
+        case 1:
+            return 4;
+        case 2:
+            return 9;
+        case 3:
+            return 16;
+        default:
+            return 25;
+    }
+}
+
+// Evaluates spherical harmonics on the CPU: fills one basis value per view
+// direction and SH index, then contracts that basis with coeffs over dim -2.
+// Bases above degrees_to_use stay zero; num_points is not used by this code.
+torch::Tensor compute_sh_forward_tensor_cpu(
+    const int num_points,
+    const int degree,
+    const int degrees_to_use,
+    const torch::Tensor &viewdirs,
+    const torch::Tensor &coeffs
+) {
+    unsigned numBases = numShBases(degrees_to_use);
+    // Sized by the full degree (not degrees_to_use); assumes coeffs holds numShBases(degree) bases on dim -2.
+    torch::Tensor result = torch::zeros({viewdirs.size(0), numShBases(degree)}, torch::TensorOptions().dtype(torch::kFloat32).device(viewdirs.device()));
+    result.index_put_({"...", 0}, SH_C0);
+    if (numBases > 1){
+        std::vector<torch::Tensor> xyz = viewdirs.unbind(-1);
+        torch::Tensor x = xyz[0];
+        torch::Tensor y = xyz[1];
+        torch::Tensor z = xyz[2];
+        // Degree 1 bases: -y, z, -x scaled by SH_C1.
+        result.index_put_({"...", 1}, -SH_C1 * y);
+        result.index_put_({"...", 2}, SH_C1 * z);
+        result.index_put_({"...", 3}, -SH_C1 * x);
+        if (numBases > 4){
+            torch::Tensor xx = x * x;
+            torch::Tensor yy = y * y;
+            torch::Tensor zz = z * z;
+            torch::Tensor xy = x * y;
+            torch::Tensor yz = y * z;
+            torch::Tensor xz = x * z;
+            result.index_put_({"...", 4}, SH_C2[0] * xy);
+            result.index_put_({"...", 5}, SH_C2[1] * yz);
+            result.index_put_({"...", 6}, SH_C2[2] * (2.0f * zz - xx - yy));
+            result.index_put_({"...", 7}, SH_C2[3] * xz);
+            result.index_put_({"...", 8}, SH_C2[4] * (xx - yy));
+            if (numBases > 9){
+                result.index_put_({"...", 9},  SH_C3[0] * y * (3 * xx - yy));
+                result.index_put_({"...", 10}, SH_C3[1] * xy * z);
+                result.index_put_({"...", 11}, SH_C3[2] * y * (4 * zz - xx - yy));
+                result.index_put_({"...", 12}, SH_C3[3] * z * (2 * zz - 3 * xx - 3 * yy));
+                result.index_put_({"...", 13}, SH_C3[4] * x * (4 * zz - xx - yy) );
+                result.index_put_({"...", 14}, SH_C3[5] * z * (xx - yy));
+                result.index_put_({"...", 15}, SH_C3[6] * x * (xx - 3 * yy));
+                if (numBases > 16){
+                    result.index_put_({"...", 16}, SH_C4[0] * xy * (xx - yy));
+                    result.index_put_({"...", 17}, SH_C4[1] * yz * (3 * xx - yy));
+                    result.index_put_({"...", 18}, SH_C4[2] * xy * (7 * zz - 1));
+                    result.index_put_({"...", 19}, SH_C4[3] * yz * (7 * zz - 3));
+                    result.index_put_({"...", 20}, SH_C4[4] * (zz * (35 * zz - 30) + 3));
+                    result.index_put_({"...", 21}, SH_C4[5] * xz * (7 * zz - 3));
+                    result.index_put_({"...", 22}, SH_C4[6] * (xx - yy) * (7 * zz - 1));
+                    result.index_put_({"...", 23}, SH_C4[7] * xz * (xx - 3 * yy));
+                    result.index_put_({"...", 24}, SH_C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)));
+                }
+            }
+        }
+    }
+    // result is (N, bases); the added trailing axis lets it broadcast against coeffs before summing over bases.
+    return (result.index({"...", None}) * coeffs).sum(-2);
 }
\ No newline at end of file

From 17211cc044e470bfcebfcd6b30f7c08d4bdf21e0 Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Wed, 20 Mar 2024 19:06:37 +0000
Subject: [PATCH 17/19] width/height mismatch fix, cleanup

---
 model.cpp                        | 4 +---
 opensplat.cpp                    | 4 +++-
 simple_trainer.cpp               | 4 ++--
 vendor/gsplat-cpu/gsplat_cpu.cpp | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/model.cpp b/model.cpp
index ebf56f0..ae02d2b 100644
--- a/model.cpp
+++ b/model.cpp
@@ -127,7 +127,7 @@ torch::Tensor Model::forward(Camera& cam, int step){
         conics = p[3];
         numTilesHit = p[4];
         #else
-            throw std::runtime_error("GPU support not built");
+            throw std::runtime_error("GPU support not built, use --cpu");
         #endif
     }
     
@@ -143,7 +143,6 @@ torch::Tensor Model::forward(Camera& cam, int step){
     int degreesToUse = (std::min<int>)(step / shDegreeInterval, shDegree);
     torch::Tensor rgbs;
     
-    std::cerr << "HERE";
     if (device == torch::kCPU){
         rgbs = SphericalHarmonicsCPU::apply(degreesToUse, viewDirs, colors);
     }else{
@@ -152,7 +151,6 @@ torch::Tensor Model::forward(Camera& cam, int step){
         #endif
     }
     
-    std::cerr << "THERE" << camDepths;
     rgbs = torch::clamp_min(rgbs + 0.5f, 0.0f);
 
 
diff --git a/opensplat.cpp b/opensplat.cpp
index f21c03d..e0c410d 100644
--- a/opensplat.cpp
+++ b/opensplat.cpp
@@ -77,10 +77,12 @@ int main(int argc, char *argv[]){
     const float splitScreenSize = result["split-screen-size"].as<float>();
 
     torch::Device device = torch::kCPU;
+    int displayStep = 1;
 
     if (torch::cuda::is_available() && result.count("cpu") == 0) {
         std::cout << "Using CUDA" << std::endl;
         device = torch::kCUDA;
+        displayStep = 10;
     }else{
         std::cout << "Using CPU" << std::endl;
     }
@@ -120,7 +122,7 @@ int main(int argc, char *argv[]){
             torch::Tensor mainLoss = model.mainLoss(rgb, gt, ssimWeight);
             mainLoss.backward();
             
-            if (step % 10 == 0) std::cout << "Step " << step << ": " << mainLoss.item<float>() << std::endl;
+            if (step % displayStep == 0) std::cout << "Step " << step << ": " << mainLoss.item<float>() << std::endl;
 
             model.optimizersStep();
             model.schedulersStep(step);
diff --git a/simple_trainer.cpp b/simple_trainer.cpp
index c15a0f6..2da1704 100644
--- a/simple_trainer.cpp
+++ b/simple_trainer.cpp
@@ -147,7 +147,7 @@ int main(int argc, char **argv){
                                 height / 2,
                                 height,
                                 width);
-
+            
             outImg = RasterizeGaussiansCPU::apply(
                 p[0], // xys
                 p[1], // radii,
@@ -182,7 +182,7 @@ int main(int argc, char **argv){
                     width,
                     background);
             #else
-                throw std::runtime_error("GPU support not built");
+                throw std::runtime_error("GPU support not built, use --cpu");
             #endif
         }
 
diff --git a/vendor/gsplat-cpu/gsplat_cpu.cpp b/vendor/gsplat-cpu/gsplat_cpu.cpp
index 0fff7d3..250b394 100644
--- a/vendor/gsplat-cpu/gsplat_cpu.cpp
+++ b/vendor/gsplat-cpu/gsplat_cpu.cpp
@@ -240,8 +240,8 @@ std::tuple<
     }
 
     // Background
-    for (int i = 0; i < width; i++){
-        for (int j = 0; j < height; j++){
+    for (int i = 0; i < height; i++){
+        for (int j = 0; j < width; j++){
             size_t pixIdx = (i * width + j);
             float T = pFinalTs[pixIdx];
 

From 19343685083128c0147c302784faf80df5c2112d Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Wed, 20 Mar 2024 15:20:44 -0400
Subject: [PATCH 18/19] Cleanup

---
 model.cpp                 | 13 ++++++++-----
 vendor/gsplat/backward.cu |  1 -
 vendor/gsplat/bindings.cu |  1 -
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/model.cpp b/model.cpp
index bc391ae..86a6d2f 100644
--- a/model.cpp
+++ b/model.cpp
@@ -448,11 +448,14 @@ void Model::afterTrain(int step){
         xysGradNorm = torch::Tensor();
         visCounts = torch::Tensor();
         max2DSize = torch::Tensor();
-#ifdef USE_HIP
-        c10::hip::HIPCachingAllocator::emptyCache();
-#elif defined(USE_CUDA)
-        c10::cuda::CUDACachingAllocator::emptyCache();
-#endif
+
+        if (device != torch::kCPU){
+            #ifdef USE_HIP
+                    c10::hip::HIPCachingAllocator::emptyCache();
+            #elif defined(USE_CUDA)
+                    c10::cuda::CUDACachingAllocator::emptyCache();
+            #endif
+        }
     }
 }
 
diff --git a/vendor/gsplat/backward.cu b/vendor/gsplat/backward.cu
index 5f85cd1..c338dba 100644
--- a/vendor/gsplat/backward.cu
+++ b/vendor/gsplat/backward.cu
@@ -299,7 +299,6 @@ __global__ void rasterize_backward_kernel(
                 // compute the current T for this gaussian
                 float ra = 1.f / (1.f - alpha);
                 T *= ra;
-
                 // update v_rgb for this gaussian
                 const float fac = alpha * T;
                 float v_alpha = 0.f;
diff --git a/vendor/gsplat/bindings.cu b/vendor/gsplat/bindings.cu
index 8bdbaca..80d581c 100644
--- a/vendor/gsplat/bindings.cu
+++ b/vendor/gsplat/bindings.cu
@@ -628,7 +628,6 @@ std::
         (float3 *)v_colors.contiguous().data_ptr<float>(),
         v_opacity.contiguous().data_ptr<float>()
     );
-    cudaDeviceSynchronize(); // TODO REMOVE
 
     return std::make_tuple(v_xy, v_conic, v_colors, v_opacity);
 }
\ No newline at end of file

From 4457ff7686a27dc448e1dbdb8d52461f88cbbb45 Mon Sep 17 00:00:00 2001
From: Piero Toffanin <pt@masseranolabs.com>
Date: Wed, 20 Mar 2024 15:34:06 -0400
Subject: [PATCH 19/19] Update README

---
 README.md | 57 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 43 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 3d8d43c..a882108 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,37 @@ A free and open source implementation of 3D gaussian splatting written in C++, f
 
 OpenSplat takes camera poses + sparse points in [COLMAP](https://colmap.github.io/) or [nerfstudio](https://docs.nerf.studio/quickstart/custom_dataset.html) project format and computes a [scene file](https://drive.google.com/file/d/1w-CBxyWNXF3omA8B_IeOsRmSJel3iwyr/view?usp=sharing) (.ply) that can be later imported for viewing, editing and rendering in other [software](https://github.com/MrNeRF/awesome-3D-gaussian-splatting?tab=readme-ov-file#open-source-implementations).
 
+Graphics card recommended, but not required! OpenSplat runs the fastest on NVIDIA and AMD GPUs, but can also run entirely on CPU power (~100x slower).
+
 Commercial use allowed and encouraged under the terms of the [AGPLv3](https://www.tldrlegal.com/license/gnu-affero-general-public-license-v3-agpl-3-0). ✅
 
-## Build (CUDA)
+## Build
 
 Requirements:
 
- * **CUDA**: Make sure you have the CUDA compiler (`nvcc`) in your PATH and that `nvidia-smi` is working. https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html 
- * **libtorch**: Visit https://pytorch.org/get-started/locally/ and select your OS, for package select "LibTorch". Make sure to match your version of CUDA if you want to leverage GPU support in libtorch.
  * **OpenCV**: `sudo apt install libopencv-dev` should do it.
+ * **libtorch**: See instructions below
+
+### CPU
+
+ **libtorch**: Visit https://pytorch.org/get-started/locally/ and select your OS, for package select "LibTorch". For compute platform you can select "CPU".
+
+ Then:
+
+ ```bash
+ git clone https://github.com/pierotofy/OpenSplat OpenSplat
+ cd OpenSplat
+ mkdir build && cd build
+ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch/ .. && make -j$(nproc)
+ ```
+
+### CUDA
+
+Additional requirement:
+
+ * **CUDA**: Make sure you have the CUDA compiler (`nvcc`) in your PATH and that `nvidia-smi` is working. https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html 
+ 
+ **libtorch**: Visit https://pytorch.org/get-started/locally/ and select your OS, for package select "LibTorch". Make sure to match your version of CUDA if you want to leverage GPU support in libtorch.
  
  Then:
 
@@ -27,12 +49,13 @@ Requirements:
 
  The software has been tested on Ubuntu 20.04 and Windows. With some changes it could run on macOS (help us by opening a PR?).
 
-## Build (ROCm via HIP)
-Requirements:
+### ROCm via HIP
+
+Additional requirement:
 
 * **ROCm**: Make sure you have the ROCm installed at /opt/rocm. https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html
-* **libtorch**: Visit https://pytorch.org/get-started/locally/ and select your OS, for package select "LibTorch". Make sure to match your version of ROCm (5.7) if you want to leverage AMD GPU support in libtorch.
-* **OpenCV**: `sudo apt install libopencv-dev` should do it.
+
+**libtorch**: Visit https://pytorch.org/get-started/locally/ and select your OS, for package select "LibTorch". Make sure to match your version of ROCm (5.7) if you want to leverage AMD GPU support in libtorch.
 
 Then:
 
@@ -44,13 +67,18 @@ Then:
  cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch/ -DGPU_RUNTIME="HIP" -DHIP_ROOT_DIR=/opt/rocm -DOPENSPLAT_BUILD_SIMPLE_TRAINER=ON ..
  make
  ```
+
 In addition, you can leverage Jinja to build the project
- ```
- cmake -GNinja -DCMAKE_PREFIX_PATH=/path/to/libtorch/ -DGPU_RUNTIME="HIP" -DHIP_ROOT_DIR=/opt/rocm -DOPENSPLAT_BUILD_SIMPLE_TRAINER=ON ..
- jinja
- ```
 
-## Docker Build (CUDA)
+```bash
+cmake -GNinja -DCMAKE_PREFIX_PATH=/path/to/libtorch/ -DGPU_RUNTIME="HIP" -DHIP_ROOT_DIR=/opt/rocm -DOPENSPLAT_BUILD_SIMPLE_TRAINER=ON ..
+ninja
+```
+
+## Docker Build
+
+### CUDA
+
 Navigate to the root directory of OpenSplat repo that has Dockerfile and run the following command to build the Docker image:
 
 ```bash
@@ -70,7 +98,8 @@ docker build \
   --build-arg CMAKE_BUILD_TYPE=Release .
 ```
 
-## Docker Build (ROCm via HIP)
+### ROCm via HIP
+
 Navigate to the root directory of OpenSplat repo that has Dockerfile and run the following command to build the Docker image:
 ```bash
 docker build \
@@ -138,11 +167,11 @@ cd /code/build
 We recently released OpenSplat, so there's lots of work to do.
 
  * Support for running on AMD cards (more testing needed)
- * Support for running on CPU-only
  * Improve speed / reduce memory usage
  * Distributed computation using multiple machines
  * Real-time training viewer output
  * Compressed scene outputs
+ * Automatic filtering
  * Your ideas?
 
  https://github.com/pierotofy/OpenSplat/issues?q=is%3Aopen+is%3Aissue+label%3Aenhancement