From 345ac7becf33e52a0ee3037c799ee1722764b9ce Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 19 Jul 2024 22:07:49 +0530
Subject: [PATCH 01/50] Add 26_Autoexposure

---
 23_Autoexposure/CMakeLists.txt                | 12 ---------
 26_Autoexposure/CMakeLists.txt                | 25 +++++++++++++++++++
 .../config.json.template                      |  0
 {23_Autoexposure => 26_Autoexposure}/main.cpp |  0
 .../pipeline.groovy                           |  0
 CMakeLists.txt                                |  1 +
 6 files changed, 26 insertions(+), 12 deletions(-)
 delete mode 100644 23_Autoexposure/CMakeLists.txt
 create mode 100644 26_Autoexposure/CMakeLists.txt
 rename {23_Autoexposure => 26_Autoexposure}/config.json.template (100%)
 rename {23_Autoexposure => 26_Autoexposure}/main.cpp (100%)
 rename {23_Autoexposure => 26_Autoexposure}/pipeline.groovy (100%)

diff --git a/23_Autoexposure/CMakeLists.txt b/23_Autoexposure/CMakeLists.txt
deleted file mode 100644
index 8604e54c4..000000000
--- a/23_Autoexposure/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
-
-set(EXAMPLE_SOURCES
-	../../src/nbl/ext/LumaMeter/CLumaMeter.cpp
-	../../src/nbl/ext/ToneMapper/CToneMapper.cpp
-)
-
-nbl_create_executable_project("${EXAMPLE_SOURCES}" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
\ No newline at end of file
diff --git a/26_Autoexposure/CMakeLists.txt b/26_Autoexposure/CMakeLists.txt
new file mode 100644
index 000000000..0724366c9
--- /dev/null
+++ b/26_Autoexposure/CMakeLists.txt
@@ -0,0 +1,25 @@
+
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
\ No newline at end of file
diff --git a/23_Autoexposure/config.json.template b/26_Autoexposure/config.json.template
similarity index 100%
rename from 23_Autoexposure/config.json.template
rename to 26_Autoexposure/config.json.template
diff --git a/23_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
similarity index 100%
rename from 23_Autoexposure/main.cpp
rename to 26_Autoexposure/main.cpp
diff --git a/23_Autoexposure/pipeline.groovy b/26_Autoexposure/pipeline.groovy
similarity index 100%
rename from 23_Autoexposure/pipeline.groovy
rename to 26_Autoexposure/pipeline.groovy
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9bc4ffc23..1c173f573 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,6 +44,7 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(23_ArithmeticUnitTest EXCLUDE_FROM_ALL)
 	add_subdirectory(24_ColorSpaceTest EXCLUDE_FROM_ALL)
 	add_subdirectory(25_FilterTest EXCLUDE_FROM_ALL)
+	add_subdirectory(26_Autoexposure EXCLUDE_FROM_ALL)
 	# add_subdirectory(36_CUDAInterop EXCLUDE_FROM_ALL)
 
 	add_subdirectory(38_EXRSplit EXCLUDE_FROM_ALL)

From 87d4794dcc5de8264528292c4a30b5284979754a Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Sat, 20 Jul 2024 00:33:17 +0530
Subject: [PATCH 02/50] Change 26_Autoexposure to SimpleWindowedApplication

---
 26_Autoexposure/CMakeLists.txt |  2 +-
 26_Autoexposure/main.cpp       | 95 +++++++++++++++++++++++++++++++++-
 2 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/26_Autoexposure/CMakeLists.txt b/26_Autoexposure/CMakeLists.txt
index 0724366c9..34040e8c1 100644
--- a/26_Autoexposure/CMakeLists.txt
+++ b/26_Autoexposure/CMakeLists.txt
@@ -22,4 +22,4 @@ if(NBL_EMBED_BUILTIN_RESOURCES)
 	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
 
 	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
-endif()
\ No newline at end of file
+endif()
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 83b62c88d..7b89917b5 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -1,3 +1,94 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "../common/SimpleWindowedApplication.hpp"
+
+#include "nbl/video/surface/CSurfaceVulkan.h"
+
+using namespace nbl;
+using namespace core;
+using namespace hlsl;
+using namespace system;
+using namespace asset;
+using namespace ui;
+using namespace video;
+
+//#include "app_resources/push_constants.hlsl"
+
+class AutoexposureApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+{
+	using device_base_t = examples::SimpleWindowedApplication;
+	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+	using clock_t = std::chrono::steady_clock;
+
+public:
+	// Yay thanks to multiple inheritance we cannot forward ctors anymore
+	inline AutoexposureApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+	// Will get called mid-initialization, via `filterDevices` between when the API Connection is created and Physical Device is chosen
+	inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
+	{
+		// So let's create our Window and Surface then!
+		if (!m_surface)
+		{
+			{
+				IWindow::SCreationParams params = {};
+				params.callback = core::make_smart_refctd_ptr<nbl::video::ISimpleManagedSurface::ICallback>();
+				params.width = 256;
+				params.height = 256;
+				params.x = 32;
+				params.y = 32;
+				// Don't want to have a window lingering about before we're ready so create it hidden.
+				// Only programmatic resize, not regular.
+				params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
+				params.windowCaption = "AutoexposureApp";
+				const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
+			}
+			auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
+			const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = nbl::video::CSimpleResizeSurface<nbl::video::CDefaultSwapchainFramebuffers>::create(std::move(surface));
+		}
+		if (m_surface)
+			return { {m_surface->getSurface()/*,EQF_NONE*/} };
+		return {};
+	}
+
+	inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		// Remember to call the base class initialization!
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+		if (!asset_base_t::onAppInitialized(std::move(system)))
+			return false;
+
+		return true;
+	}
+
+	// We do a very simple thing, display an image and wait `DisplayImageMs` to show it
+	inline void workLoopBody() override
+	{
+	}
+
+	inline bool keepRunning() override
+	{
+		return false;
+	}
+
+	inline bool onAppTerminated() override
+	{
+		return device_base_t::onAppTerminated();
+	}
+
+protected:
+	smart_refctd_ptr<IWindow> m_window;
+	smart_refctd_ptr<CSimpleResizeSurface<CDefaultSwapchainFramebuffers>> m_surface;
+};
+
+NBL_MAIN_FUNC(AutoexposureApp)
+
+#if 0
+
 // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
@@ -174,4 +265,6 @@ int main()
 	}
 
 	return 0;
-}
\ No newline at end of file
+}
+
+#endif
\ No newline at end of file

From 7a5ea7c0c217d515550f67b7a36ebdd1462870f4 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 24 Jul 2024 18:12:34 +0530
Subject: [PATCH 03/50] Build a staging buffer and upload exr image

---
 26_Autoexposure/main.cpp | 219 +++++++++++++++++++++++++++++++++------
 1 file changed, 187 insertions(+), 32 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 7b89917b5..fdabdd7f9 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -5,6 +5,7 @@
 #include "../common/SimpleWindowedApplication.hpp"
 
 #include "nbl/video/surface/CSurfaceVulkan.h"
+#include "nbl/asset/interchange/IAssetLoader.h"
 
 using namespace nbl;
 using namespace core;
@@ -22,6 +23,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 	using clock_t = std::chrono::steady_clock;
 
+	constexpr static inline std::string_view DefaultImagePathsFile = "../../media/noises/spp_benchmark_4k_512.exr";
+
+
 public:
 	// Yay thanks to multiple inheritance we cannot forward ctors anymore
 	inline AutoexposureApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
@@ -62,6 +66,178 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 		if (!asset_base_t::onAppInitialized(std::move(system)))
 			return false;
 
+		/*
+			* We'll be using a combined image sampler for this example, which lets us assign both a sampled image and a sampler to the same binding.
+			* In this example we provide a sampler at descriptor set creation time, via the SBinding struct below. This specifies that the sampler for this binding is immutable,
+			* as evidenced by the name of the field in the SBinding.
+			* Samplers for combined image samplers can also be mutable, which for a binding of a descriptor set is specified also at creation time by leaving the immutableSamplers
+			* field set to its default (nullptr).
+			*/
+		smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
+		{
+			auto defaultSampler = m_device->createSampler({
+				.AnisotropicFilter = 0
+				});
+
+			const IGPUDescriptorSetLayout::SBinding bindings[1] = { {
+				.binding = 0,
+				.type = IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+				.count = 1,
+				.immutableSamplers = &defaultSampler
+			}
+			};
+			dsLayout = m_device->createDescriptorSetLayout(bindings);
+			if (!dsLayout)
+				return logFail("Failed to Create Descriptor Layout");
+
+		}
+
+		// create the descriptor set and with enough room for one image sampler
+		{
+			const uint32_t setCount = 1;
+			auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, { &dsLayout.get(),1 }, &setCount);
+			if (!pool)
+				return logFail("Failed to Create Descriptor Pool");
+
+			m_descriptorSets[0] = pool->createDescriptorSet(core::smart_refctd_ptr(dsLayout));
+			if (!m_descriptorSets[0])
+				return logFail("Could not create Descriptor Set!");
+		}
+
+		auto queue = getGraphicsQueue();
+
+		// need resetttable commandbuffers for the upload utility
+		{
+			m_cmdPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+			// create the commandbuffers
+			if (!m_cmdPool)
+				return logFail("Couldn't create Command Pool!");
+			if (!m_cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data(), 1 }))
+				return logFail("Couldn't create Command Buffer!");
+		}
+
+		// things for IUtilities
+		{
+			m_scratchSemaphore = m_device->createSemaphore(0);
+			if (!m_scratchSemaphore)
+				return logFail("Could not create Scratch Semaphore");
+			m_scratchSemaphore->setObjectDebugName("Scratch Semaphore");
+			// we don't want to overcomplicate the example with multi-queue
+			m_intendedSubmit.queue = queue;
+			// wait for nothing before upload
+			m_intendedSubmit.waitSemaphores = {};
+			// fill later
+			m_intendedSubmit.commandBuffers = {};
+			m_intendedSubmit.scratchSemaphore = {
+				.semaphore = m_scratchSemaphore.get(),
+				.value = 0,
+				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+			};
+		}
+
+		// Allocate and Leave 1/4 for image uploads, to test image copy with small memory remaining
+		{
+			uint32_t localOffset = video::StreamingTransientDataBufferMT<>::invalid_value;
+			uint32_t maxFreeBlock = m_utils->getDefaultUpStreamingBuffer()->max_size();
+			const uint32_t allocationAlignment = 64u;
+			const uint32_t allocationSize = (maxFreeBlock / 4) * 3;
+			m_utils->getDefaultUpStreamingBuffer()->multi_allocate(std::chrono::steady_clock::now() + std::chrono::microseconds(500u), 1u, &localOffset, &allocationSize, &allocationAlignment);
+		}
+
+		// Load exr file into gpu
+		{
+			IAssetLoader::SAssetLoadParams params;
+			auto imageBundle = m_assetMgr->getAsset(DefaultImagePathsFile.data(), params);
+			auto cpuImg = IAsset::castDown<ICPUImage>(imageBundle.getContents().begin()[0]);
+			auto format = cpuImg->getCreationParameters().format;
+
+			ICPUImageView::SCreationParams viewParams = {
+				.flags = ICPUImageView::E_CREATE_FLAGS::ECF_NONE,
+				.image = std::move(cpuImg),
+				.viewType = IImageView<ICPUImage>::E_TYPE::ET_2D,
+				.format = format,
+				.subresourceRange = {
+					.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT,
+					.baseMipLevel = 0u,
+					.levelCount = ICPUImageView::remaining_mip_levels,
+					.baseArrayLayer = 0u,
+					.layerCount = ICPUImageView::remaining_array_layers
+				}
+			};
+
+			const auto cpuImgView = ICPUImageView::create(std::move(viewParams));
+			const auto& cpuImgParams = cpuImgView->getCreationParameters();
+
+			// create matching size image
+			IGPUImage::SCreationParams imageParams = {};
+			imageParams = cpuImgParams.image->getCreationParameters();
+			imageParams.usage |= IGPUImage::EUF_TRANSFER_DST_BIT | IGPUImage::EUF_SAMPLED_BIT | IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT;
+			// promote format because RGB8 and friends don't actually exist in HW
+			{
+				const IPhysicalDevice::SImageFormatPromotionRequest request = {
+					.originalFormat = imageParams.format,
+					.usages = IPhysicalDevice::SFormatImageUsages::SUsage(imageParams.usage)
+				};
+				imageParams.format = m_physicalDevice->promoteImageFormat(request, imageParams.tiling);
+			}
+			if (imageParams.type == IGPUImage::ET_3D)
+				imageParams.flags |= IGPUImage::ECF_2D_ARRAY_COMPATIBLE_BIT;
+			m_gpuImg = m_device->createImage(std::move(imageParams));
+			if (!m_gpuImg || !m_device->allocate(m_gpuImg->getMemoryReqs(), m_gpuImg.get()).isValid())
+				return false;
+			m_gpuImg->setObjectDebugName("Autoexposure Image");
+
+			// we don't want to overcomplicate the example with multi-queue
+			auto queue = getGraphicsQueue();
+			auto cmdbuf = m_cmdBufs[0].get();
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = { cmdbuf };
+			m_intendedSubmit.commandBuffers = { &cmdbufInfo, 1 };
+
+			// there's no previous operation to wait for
+			const SMemoryBarrier toTransferBarrier = {
+				.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+				.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
+			};
+
+			// upload image and write to descriptor set
+			queue->startCapture();
+			auto ds = m_descriptorSets[0].get();
+
+			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			// change the layout of the image
+			const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers[] = { {
+				.barrier = {
+					.dep = toTransferBarrier
+					// no ownership transfers
+				},
+				.image = m_gpuImg.get(),
+				// transition the whole view
+				.subresourceRange = cpuImgParams.subresourceRange,
+				// a wiping transition
+				.newLayout = IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL
+			} };
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers });
+			// upload contents and submit right away
+			m_utils->updateImageViaStagingBufferAutoSubmit(
+				m_intendedSubmit,
+				cpuImgParams.image->getBuffer(),
+				cpuImgParams.image->getCreationParameters().format,
+				m_gpuImg.get(),
+				IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+				cpuImgParams.image->getRegions()
+			);
+
+			IGPUImageView::SCreationParams gpuImgViewParams = {
+				.image = m_gpuImg,
+				.viewType = IGPUImageView::ET_2D_ARRAY,
+				.format = m_gpuImg->getCreationParameters().format
+			};
+
+			m_gpuImgView = m_device->createImageView(std::move(gpuImgViewParams));
+		}
+
 		return true;
 	}
 
@@ -83,6 +259,17 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 protected:
 	smart_refctd_ptr<IWindow> m_window;
 	smart_refctd_ptr<CSimpleResizeSurface<CDefaultSwapchainFramebuffers>> m_surface;
+	smart_refctd_ptr<IGPUImage> m_gpuImg;
+	smart_refctd_ptr<IGPUImageView> m_gpuImgView;
+
+	// for image uploads
+	smart_refctd_ptr<ISemaphore> m_scratchSemaphore;
+	SIntendedSubmitInfo m_intendedSubmit;
+
+	// Command Buffers and other resources
+	std::array<smart_refctd_ptr<IGPUDescriptorSet>, ISwapchain::MaxImages> m_descriptorSets;
+	smart_refctd_ptr<IGPUCommandPool> m_cmdPool;
+	std::array<smart_refctd_ptr<IGPUCommandBuffer>, ISwapchain::MaxImages> m_cmdBufs;
 };
 
 NBL_MAIN_FUNC(AutoexposureApp)
@@ -136,38 +323,6 @@ int main()
 	IAssetLoader::SAssetLoadParams lp;
 	auto imageBundle = am->getAsset("../../media/noises/spp_benchmark_4k_512.exr", lp);
 
-	E_FORMAT inFormat;
-	constexpr auto outFormat = EF_R8G8B8A8_SRGB;
-	smart_refctd_ptr<IGPUImage> outImg;
-	smart_refctd_ptr<IGPUImageView> imgToTonemapView,outImgView;
-	{
-		auto cpuImg = IAsset::castDown<ICPUImage>(imageBundle.getContents().begin()[0]);
-		IGPUImage::SCreationParams imgInfo = cpuImg->getCreationParameters();
-		inFormat = imgInfo.format;
-
-		auto gpuImages = driver->getGPUObjectsFromAssets(&cpuImg.get(),&cpuImg.get()+1);
-		auto gpuImage = gpuImages->operator[](0u);
-
-		IGPUImageView::SCreationParams imgViewInfo;
-		imgViewInfo.flags = static_cast<IGPUImageView::E_CREATE_FLAGS>(0u);
-		imgViewInfo.image = std::move(gpuImage);
-		imgViewInfo.viewType = IGPUImageView::ET_2D_ARRAY;
-		imgViewInfo.format = inFormat;
-		imgViewInfo.subresourceRange.aspectMask = static_cast<IImage::E_ASPECT_FLAGS>(0u);
-		imgViewInfo.subresourceRange.baseMipLevel = 0;
-		imgViewInfo.subresourceRange.levelCount = 1;
-		imgViewInfo.subresourceRange.baseArrayLayer = 0;
-		imgViewInfo.subresourceRange.layerCount = 1;
-		imgToTonemapView = driver->createImageView(IGPUImageView::SCreationParams(imgViewInfo));
-
-		imgInfo.format = outFormat;
-		outImg = driver->createDeviceLocalGPUImageOnDedMem(std::move(imgInfo));
-
-		imgViewInfo.image = outImg;
-		imgViewInfo.format = outFormat;
-		outImgView = driver->createImageView(IGPUImageView::SCreationParams(imgViewInfo));
-	}
-
 	auto glslCompiler = am->getCompilerSet();
 	const auto inputColorSpace = std::make_tuple(inFormat,ECP_SRGB,EOTF_IDENTITY);
 

From 5d63d041d08fcf08dd0fc061732f10fa1274e6f3 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 25 Jul 2024 22:17:09 +0530
Subject: [PATCH 04/50] Init surface and create the swapchain

---
 26_Autoexposure/main.cpp | 66 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 63 insertions(+), 3 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index fdabdd7f9..43f39c917 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -25,7 +25,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 	constexpr static inline std::string_view DefaultImagePathsFile = "../../media/noises/spp_benchmark_4k_512.exr";
 
-
 public:
 	// Yay thanks to multiple inheritance we cannot forward ctors anymore
 	inline AutoexposureApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
@@ -108,6 +107,65 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 		auto queue = getGraphicsQueue();
 
+		// init the surface and create the swapchain
+		{
+			ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
+			// Need to choose a surface format
+			if (!swapchainParams.deduceFormat(m_physicalDevice))
+				return logFail("Could not choose a Surface Format for the Swapchain!");
+			// We actually need external dependencies to ensure ordering of the Implicit Layout Transitions relative to the semaphore signals
+			constexpr IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = {
+				// wipe-transition to ATTACHMENT_OPTIMAL
+				{
+					.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+					.dstSubpass = 0,
+					.memoryBarrier = {
+					// since we're uploading the image data we're about to draw
+					.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
+					.srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+					.dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+					// because we clear and don't blend
+					.dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+					}
+					// leave view offsets and flags default
+				},
+				// ATTACHMENT_OPTIMAL to PRESENT_SRC
+				{
+					.srcSubpass = 0,
+					.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+					.memoryBarrier = {
+						.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+						.srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+						// we can have NONE as the Destinations because the spec says so about presents
+						}
+					// leave view offsets and flags default
+				},
+				IGPURenderpass::SCreationParams::DependenciesEnd
+			};
+			auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
+			if (!scResources->getRenderpass())
+				return logFail("Failed to create Renderpass!");
+			if (!m_surface || !m_surface->init(queue, std::move(scResources), swapchainParams.sharedParams))
+				return logFail("Could not create Window & Surface or initialize the Surface!");
+		}
+
+		// Now create the pipeline
+		/* {
+			const asset::SPushConstantRange range = {
+				.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+				.offset = 0,
+				.size = sizeof(push_constants_t)
+			};
+			auto layout = m_device->createPipelineLayout({ &range,1 }, nullptr, nullptr, nullptr, core::smart_refctd_ptr(dsLayout));
+			const IGPUShader::SSpecInfo fragSpec = {
+				.entryPoint = "main",
+				.shader = fragmentShader.get()
+			};
+			m_pipeline = fsTriProtoPPln.createPipeline(fragSpec, layout.get(), scResources->getRenderpass());
+			if (!m_pipeline)
+				return logFail("Could not create Graphics Pipeline!");
+		}*/
+
 		// need resetttable commandbuffers for the upload utility
 		{
 			m_cmdPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
@@ -257,8 +315,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	}
 
 protected:
-	smart_refctd_ptr<IWindow> m_window;
-	smart_refctd_ptr<CSimpleResizeSurface<CDefaultSwapchainFramebuffers>> m_surface;
 	smart_refctd_ptr<IGPUImage> m_gpuImg;
 	smart_refctd_ptr<IGPUImageView> m_gpuImgView;
 
@@ -270,6 +326,10 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	std::array<smart_refctd_ptr<IGPUDescriptorSet>, ISwapchain::MaxImages> m_descriptorSets;
 	smart_refctd_ptr<IGPUCommandPool> m_cmdPool;
 	std::array<smart_refctd_ptr<IGPUCommandBuffer>, ISwapchain::MaxImages> m_cmdBufs;
+
+	// window
+	smart_refctd_ptr<IWindow> m_window;
+	smart_refctd_ptr<CSimpleResizeSurface<CDefaultSwapchainFramebuffers>> m_surface;
 };
 
 NBL_MAIN_FUNC(AutoexposureApp)

From 640e6a38223306851d68b44b36b64fc4a863333e Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 25 Jul 2024 22:51:28 +0530
Subject: [PATCH 05/50] Load shaders and create the pipeline for full screen
 triangle

---
 .../app_resources/present.frag.hlsl           | 17 ++++++
 26_Autoexposure/main.cpp                      | 57 ++++++++++++++-----
 2 files changed, 60 insertions(+), 14 deletions(-)
 create mode 100644 26_Autoexposure/app_resources/present.frag.hlsl

diff --git a/26_Autoexposure/app_resources/present.frag.hlsl b/26_Autoexposure/app_resources/present.frag.hlsl
new file mode 100644
index 000000000..fcddeb743
--- /dev/null
+++ b/26_Autoexposure/app_resources/present.frag.hlsl
@@ -0,0 +1,17 @@
+// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#pragma wave shader_stage(fragment)
+
+// vertex shader is provided by the fullScreenTriangle extension
+#include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
+using namespace nbl::hlsl::ext::FullScreenTriangle;
+
+[[vk::combinedImageSampler]] [[vk::binding(0, 3)]] Texture2D texture;
+[[vk::combinedImageSampler]] [[vk::binding(0, 3)]] SamplerState samplerState;
+
+[[vk::location(0)]] float32_t4 main(SVertexAttributes vxAttr) : SV_Target0
+{
+    return texture.Sample(samplerState, vxAttr.uv);
+}
\ No newline at end of file
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 43f39c917..cc048a3f5 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -6,6 +6,8 @@
 
 #include "nbl/video/surface/CSurfaceVulkan.h"
 #include "nbl/asset/interchange/IAssetLoader.h"
+#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
+
 
 using namespace nbl;
 using namespace core;
@@ -107,9 +109,11 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 		auto queue = getGraphicsQueue();
 
-		// init the surface and create the swapchain
+		// Gather swapchain resources
+		std::unique_ptr<CDefaultSwapchainFramebuffers> scResources;
+		ISwapchain::SCreationParams swapchainParams;
 		{
-			ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
+			swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
 			// Need to choose a surface format
 			if (!swapchainParams.deduceFormat(m_physicalDevice))
 				return logFail("Could not choose a Surface Format for the Swapchain!");
@@ -142,21 +146,41 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				},
 				IGPURenderpass::SCreationParams::DependenciesEnd
 			};
-			auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
+			scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
 			if (!scResources->getRenderpass())
 				return logFail("Failed to create Renderpass!");
-			if (!m_surface || !m_surface->init(queue, std::move(scResources), swapchainParams.sharedParams))
-				return logFail("Could not create Window & Surface or initialize the Surface!");
 		}
 
-		// Now create the pipeline
-		/* {
-			const asset::SPushConstantRange range = {
-				.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-				.offset = 0,
-				.size = sizeof(push_constants_t)
-			};
-			auto layout = m_device->createPipelineLayout({ &range,1 }, nullptr, nullptr, nullptr, core::smart_refctd_ptr(dsLayout));
+		// Load the shaders and create the pipeline
+		{
+			// Load FSTri Shader
+			ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
+			if (!fsTriProtoPPln)
+				return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
+
+			// Load Custom Shader
+			auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
+				{
+					IAssetLoader::SAssetLoadParams lp = {};
+					lp.logger = m_logger.get();
+					lp.workingDirectory = ""; // virtual root
+					auto assetBundle = m_assetMgr->getAsset(relPath, lp);
+					const auto assets = assetBundle.getContents();
+					if (assets.empty())
+						return nullptr;
+
+					// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
+					auto source = IAsset::castDown<ICPUShader>(assets[0]);
+					if (!source)
+						return nullptr;
+
+					return m_device->createShader(source.get());
+				};
+			auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
+			if (!fragmentShader)
+				return logFail("Failed to Load and Compile Fragment Shader!");
+
+			auto layout = m_device->createPipelineLayout({}, nullptr, nullptr, nullptr, core::smart_refctd_ptr(dsLayout));
 			const IGPUShader::SSpecInfo fragSpec = {
 				.entryPoint = "main",
 				.shader = fragmentShader.get()
@@ -164,7 +188,11 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			m_pipeline = fsTriProtoPPln.createPipeline(fragSpec, layout.get(), scResources->getRenderpass());
 			if (!m_pipeline)
 				return logFail("Could not create Graphics Pipeline!");
-		}*/
+		}
+
+		// Init the surface and create the swapchain
+		if (!m_surface || !m_surface->init(queue, std::move(scResources), swapchainParams.sharedParams))
+			return logFail("Could not create Window & Surface or initialize the Surface!");
 
 		// need resetttable commandbuffers for the upload utility
 		{
@@ -326,6 +354,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	std::array<smart_refctd_ptr<IGPUDescriptorSet>, ISwapchain::MaxImages> m_descriptorSets;
 	smart_refctd_ptr<IGPUCommandPool> m_cmdPool;
 	std::array<smart_refctd_ptr<IGPUCommandBuffer>, ISwapchain::MaxImages> m_cmdBufs;
+	smart_refctd_ptr<IGPUGraphicsPipeline> m_pipeline;
 
 	// window
 	smart_refctd_ptr<IWindow> m_window;

From d69a11179989b116f936a722300691273dc15e0d Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 26 Jul 2024 02:36:09 +0530
Subject: [PATCH 06/50] Set window size according to loaded image

---
 26_Autoexposure/main.cpp | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index cc048a3f5..2f49bda6d 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -275,6 +275,21 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				return false;
 			m_gpuImg->setObjectDebugName("Autoexposure Image");
 
+			// set window size
+			const auto imageExtent = m_gpuImg->getCreationParameters().extent;
+			const VkExtent2D newWindowResolution = { imageExtent.width, imageExtent.height };
+
+			if (newWindowResolution.width != m_window->getWidth() || newWindowResolution.height != m_window->getHeight())
+			{
+				// Resize the window
+				m_winMgr->setWindowSize(m_window.get(), newWindowResolution.width, newWindowResolution.height);
+				// Don't want to rely on the Swapchain OUT_OF_DATE causing an implicit re-create in the `acquireNextImage` because the
+				// swapchain may report OUT_OF_DATE after the next VBlank after the resize, not getting the message right away.
+				m_surface->recreateSwapchain();
+			}
+			// Now show the window (ideally should happen just after present, but don't want to mess with acquire/recreation)
+			m_winMgr->show(m_window.get());
+
 			// we don't want to overcomplicate the example with multi-queue
 			auto queue = getGraphicsQueue();
 			auto cmdbuf = m_cmdBufs[0].get();
@@ -314,7 +329,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL,
 				cpuImgParams.image->getRegions()
 			);
-
 			IGPUImageView::SCreationParams gpuImgViewParams = {
 				.image = m_gpuImg,
 				.viewType = IGPUImageView::ET_2D_ARRAY,
@@ -322,6 +336,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			};
 
 			m_gpuImgView = m_device->createImageView(std::move(gpuImgViewParams));
+			queue->endCapture();
 		}
 
 		return true;
@@ -334,7 +349,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 	inline bool keepRunning() override
 	{
-		return false;
+		return true;
 	}
 
 	inline bool onAppTerminated() override

From 54bf38f6661b17a1fbfe6b4c30bc4b287bf45467 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 26 Jul 2024 02:39:26 +0530
Subject: [PATCH 07/50] Stop running if window is closed

---
 26_Autoexposure/main.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 2f49bda6d..95fcab225 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -349,6 +349,10 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 	inline bool keepRunning() override
 	{
+		// Keep running as long as we have a surface to present to (usually this means, as long as the window is open)
+		if (m_surface->irrecoverable())
+			return false;
+
 		return true;
 	}
 

From 461efd36efd02b48b07e14b027f6e51d431d296c Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 26 Jul 2024 16:24:19 +0530
Subject: [PATCH 08/50] Acquire swapchain image and present uploaded image to
 it

---
 26_Autoexposure/main.cpp | 142 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 138 insertions(+), 4 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 95fcab225..fbbc31524 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -8,7 +8,6 @@
 #include "nbl/asset/interchange/IAssetLoader.h"
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 
-
 using namespace nbl;
 using namespace core;
 using namespace hlsl;
@@ -95,6 +94,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 		}
 
+		// Create semaphore
+		m_semaphore = m_device->createSemaphore(m_submitIx);
+
 		// create the descriptor set and with enough room for one image sampler
 		{
 			const uint32_t setCount = 1;
@@ -107,6 +109,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				return logFail("Could not create Descriptor Set!");
 		}
 
+		auto ds = m_descriptorSets[0].get();
 		auto queue = getGraphicsQueue();
 
 		// Gather swapchain resources
@@ -256,7 +259,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			const auto cpuImgView = ICPUImageView::create(std::move(viewParams));
 			const auto& cpuImgParams = cpuImgView->getCreationParameters();
 
-			// create matching size image
+			// create matching size image upto dimensions
 			IGPUImage::SCreationParams imageParams = {};
 			imageParams = cpuImgParams.image->getCreationParameters();
 			imageParams.usage |= IGPUImage::EUF_TRANSFER_DST_BIT | IGPUImage::EUF_SAMPLED_BIT | IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT;
@@ -304,7 +307,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			// upload image and write to descriptor set
 			queue->startCapture();
-			auto ds = m_descriptorSets[0].get();
 
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 			// change the layout of the image
@@ -331,11 +333,28 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			);
 			IGPUImageView::SCreationParams gpuImgViewParams = {
 				.image = m_gpuImg,
-				.viewType = IGPUImageView::ET_2D_ARRAY,
+				.viewType = IGPUImageView::ET_2D,
 				.format = m_gpuImg->getCreationParameters().format
 			};
 
 			m_gpuImgView = m_device->createImageView(std::move(gpuImgViewParams));
+
+			IGPUDescriptorSet::SDescriptorInfo info = {};
+			info.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+			info.desc = m_gpuImgView;
+
+			IGPUDescriptorSet::SWriteDescriptorSet writeDescriptors[] = {
+				{
+					.dstSet = ds,
+					.binding = 0,
+					.arrayElement = 0,
+					.count = 1,
+					.info = &info
+				}
+			};
+
+			m_device->updateDescriptorSets(1, writeDescriptors, 0, nullptr);
+
 			queue->endCapture();
 		}
 
@@ -345,6 +364,119 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	// We do a very simple thing, display an image and wait `DisplayImageMs` to show it
 	inline void workLoopBody() override
 	{
+		// Acquire
+		auto acquire = m_surface->acquireNextImage();
+		if (!acquire)
+			return;
+
+		auto queue = getGraphicsQueue();
+		auto cmdbuf = m_cmdBufs[0].get();
+		auto ds = m_descriptorSets[0].get();
+
+		// there's no previous operation to wait for
+		const SMemoryBarrier toTransferBarrier = {
+			.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+			.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
+		};
+		const auto gpuImgCreationParams = m_gpuImg->getCreationParameters();
+		const auto gpuImgViewCreationParams = m_gpuImgView->getCreationParameters();
+
+		queue->startCapture();
+		// Render to the Image
+		{
+			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+			// need a pipeline barrier to transition layout
+			const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers[] = { {
+				.barrier = {
+					.dep = toTransferBarrier.nextBarrier(PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT,ACCESS_FLAGS::SAMPLED_READ_BIT)
+				},
+				.image = m_gpuImg.get(),
+				.subresourceRange = gpuImgViewCreationParams.subresourceRange,
+				.oldLayout = IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+				.newLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL
+			} };
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers });
+
+			const VkRect2D currentRenderArea =
+			{
+				.offset = {0,0},
+				.extent = {gpuImgCreationParams.extent.width, gpuImgCreationParams.extent.height}
+			};
+			// set viewport
+			{
+				const asset::SViewport viewport =
+				{
+					.width = float(gpuImgCreationParams.extent.width),
+					.height = float(gpuImgCreationParams.extent.height)
+				};
+				cmdbuf->setViewport({ &viewport,1 });
+			}
+			cmdbuf->setScissor({ &currentRenderArea,1 });
+
+			// begin the renderpass
+			{
+				const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {1.f,0.f,1.f,1.f} };
+				auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+				const IGPUCommandBuffer::SRenderpassBeginInfo info = {
+					.framebuffer = scRes->getFramebuffer(acquire.imageIndex),
+					.colorClearValues = &clearValue,
+					.depthStencilClearValues = nullptr,
+					.renderArea = currentRenderArea
+				};
+				cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+			}
+			cmdbuf->bindGraphicsPipeline(m_pipeline.get());
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_pipeline->getLayout(), 3, 1, &ds);
+			ext::FullScreenTriangle::recordDrawCall(cmdbuf);
+			cmdbuf->endRenderPass();
+
+			cmdbuf->end();
+		}
+
+		// submit
+		const IQueue::SSubmitInfo::SSemaphoreInfo rendered[1] = { {
+			.semaphore = m_semaphore.get(),
+			.value = ++m_submitIx,
+			// just as we've outputted all pixels, signal
+			.stageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT
+		} };
+		{
+			{
+				const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
+					.cmdbuf = cmdbuf
+				} };
+				// we don't need to wait for the transfer semaphore, because we submit everything to the same queue
+				const IQueue::SSubmitInfo::SSemaphoreInfo acquired[1] = { {
+					.semaphore = acquire.semaphore,
+					.value = acquire.acquireCount,
+					.stageMask = PIPELINE_STAGE_FLAGS::NONE
+				} };
+				const IQueue::SSubmitInfo infos[1] = { {
+					.waitSemaphores = acquired,
+					.commandBuffers = commandBuffers,
+					.signalSemaphores = rendered
+				} };
+				// we won't signal the sema if no success
+				if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
+					m_submitIx--;
+			}
+		}
+
+		// Present
+		m_surface->present(acquire.imageIndex, rendered);
+		getGraphicsQueue()->endCapture();
+
+		{
+			const ISemaphore::SWaitInfo cmdbufDonePending[] = {
+				{
+					.semaphore = m_semaphore.get(),
+					.value = m_submitIx
+				}
+			};
+			if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
+				return;
+		}
 	}
 
 	inline bool keepRunning() override
@@ -374,6 +506,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	smart_refctd_ptr<IGPUCommandPool> m_cmdPool;
 	std::array<smart_refctd_ptr<IGPUCommandBuffer>, ISwapchain::MaxImages> m_cmdBufs;
 	smart_refctd_ptr<IGPUGraphicsPipeline> m_pipeline;
+	smart_refctd_ptr<ISemaphore> m_semaphore;
+	uint64_t m_submitIx = 0;
 
 	// window
 	smart_refctd_ptr<IWindow> m_window;

From 734fea90599aefa5b431057eb7c8796854cac6ca Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 26 Jul 2024 22:03:54 +0530
Subject: [PATCH 09/50] Set window size directly and use that for swapchain
 rendering

---
 26_Autoexposure/main.cpp | 53 +++++++++-------------------------------
 1 file changed, 12 insertions(+), 41 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index fbbc31524..d34be555c 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -25,6 +25,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	using clock_t = std::chrono::steady_clock;
 
 	constexpr static inline std::string_view DefaultImagePathsFile = "../../media/noises/spp_benchmark_4k_512.exr";
+	constexpr static inline std::array<int, 2> Dimensions = { 1280, 720 };
 
 public:
 	// Yay thanks to multiple inheritance we cannot forward ctors anymore
@@ -40,8 +41,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			{
 				IWindow::SCreationParams params = {};
 				params.callback = core::make_smart_refctd_ptr<nbl::video::ISimpleManagedSurface::ICallback>();
-				params.width = 256;
-				params.height = 256;
+				params.width = Dimensions[0];
+				params.height = Dimensions[1];
 				params.x = 32;
 				params.y = 32;
 				// Don't want to have a window lingering about before we're ready so create it hidden.
@@ -278,19 +279,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				return false;
 			m_gpuImg->setObjectDebugName("Autoexposure Image");
 
-			// set window size
-			const auto imageExtent = m_gpuImg->getCreationParameters().extent;
-			const VkExtent2D newWindowResolution = { imageExtent.width, imageExtent.height };
-
-			if (newWindowResolution.width != m_window->getWidth() || newWindowResolution.height != m_window->getHeight())
-			{
-				// Resize the window
-				m_winMgr->setWindowSize(m_window.get(), newWindowResolution.width, newWindowResolution.height);
-				// Don't want to rely on the Swapchain OUT_OF_DATE causing an implicit re-create in the `acquireNextImage` because the
-				// swapchain may report OUT_OF_DATE after the next VBlank after the resize, not getting the message right away.
-				m_surface->recreateSwapchain();
-			}
-			// Now show the window (ideally should happen just after present, but don't want to mess with acquire/recreation)
+			// Now show the window
 			m_winMgr->show(m_window.get());
 
 			// we don't want to overcomplicate the example with multi-queue
@@ -373,46 +362,26 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 		auto cmdbuf = m_cmdBufs[0].get();
 		auto ds = m_descriptorSets[0].get();
 
-		// there's no previous operation to wait for
-		const SMemoryBarrier toTransferBarrier = {
-			.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
-			.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
-		};
-		const auto gpuImgCreationParams = m_gpuImg->getCreationParameters();
-		const auto gpuImgViewCreationParams = m_gpuImgView->getCreationParameters();
-
 		queue->startCapture();
-		// Render to the Image
+		// Render to the swapchain
 		{
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 
-			// need a pipeline barrier to transition layout
-			const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers[] = { {
-				.barrier = {
-					.dep = toTransferBarrier.nextBarrier(PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT,ACCESS_FLAGS::SAMPLED_READ_BIT)
-				},
-				.image = m_gpuImg.get(),
-				.subresourceRange = gpuImgViewCreationParams.subresourceRange,
-				.oldLayout = IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL,
-				.newLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL
-			} };
-			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers });
-
 			const VkRect2D currentRenderArea =
 			{
 				.offset = {0,0},
-				.extent = {gpuImgCreationParams.extent.width, gpuImgCreationParams.extent.height}
+				.extent = { m_window->getWidth(), m_window->getHeight() }
 			};
 			// set viewport
 			{
 				const asset::SViewport viewport =
 				{
-					.width = float(gpuImgCreationParams.extent.width),
-					.height = float(gpuImgCreationParams.extent.height)
+					.width = float(m_window->getWidth()),
+					.height = float(m_window->getHeight())
 				};
-				cmdbuf->setViewport({ &viewport,1 });
+				cmdbuf->setViewport({ &viewport, 1 });
 			}
-			cmdbuf->setScissor({ &currentRenderArea,1 });
+			cmdbuf->setScissor({ &currentRenderArea, 1 });
 
 			// begin the renderpass
 			{
@@ -426,6 +395,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				};
 				cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
 			}
+
 			cmdbuf->bindGraphicsPipeline(m_pipeline.get());
 			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_pipeline->getLayout(), 3, 1, &ds);
 			ext::FullScreenTriangle::recordDrawCall(cmdbuf);
@@ -467,6 +437,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 		m_surface->present(acquire.imageIndex, rendered);
 		getGraphicsQueue()->endCapture();
 
+		// Wait for completion
 		{
 			const ISemaphore::SWaitInfo cmdbufDonePending[] = {
 				{

From 4a117243aa4cb096ee59a140617e88e7e6532f31 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Mon, 5 Aug 2024 19:02:28 +0530
Subject: [PATCH 10/50] m_computeSubgroupSize

---
 26_Autoexposure/app_resources/present.frag.hlsl | 2 ++
 26_Autoexposure/main.cpp                        | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/26_Autoexposure/app_resources/present.frag.hlsl b/26_Autoexposure/app_resources/present.frag.hlsl
index fcddeb743..4f76de4cd 100644
--- a/26_Autoexposure/app_resources/present.frag.hlsl
+++ b/26_Autoexposure/app_resources/present.frag.hlsl
@@ -8,6 +8,8 @@
 #include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
 using namespace nbl::hlsl::ext::FullScreenTriangle;
 
+#include <nbl/builtin/hlsl/luma_meter/luma_meter.hlsl>
+
 [[vk::combinedImageSampler]] [[vk::binding(0, 3)]] Texture2D texture;
 [[vk::combinedImageSampler]] [[vk::binding(0, 3)]] SamplerState samplerState;
 
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index d34be555c..b285c930f 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -347,12 +347,15 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			queue->endCapture();
 		}
 
+		m_computeSubgroupSize = m_physicalDevice->getLimits().maxComputeWorkgroupSubgroups;
+
 		return true;
 	}
 
 	// We do a very simple thing, display an image and wait `DisplayImageMs` to show it
 	inline void workLoopBody() override
 	{
+
 		// Acquire
 		auto acquire = m_surface->acquireNextImage();
 		if (!acquire)
@@ -483,6 +486,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	// window
 	smart_refctd_ptr<IWindow> m_window;
 	smart_refctd_ptr<CSimpleResizeSurface<CDefaultSwapchainFramebuffers>> m_surface;
+
+	// constants
+	uint32_t m_computeSubgroupSize = 0;
 };
 
 NBL_MAIN_FUNC(AutoexposureApp)

From 7d4895a7a5287d6e3912657d168fdf385c39ec38 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 9 Aug 2024 03:10:32 +0530
Subject: [PATCH 11/50] Allocate buffer for gathered luma values

---
 26_Autoexposure/app_resources/common.hlsl     | 13 ++++
 .../app_resources/luma_meter.comp.hlsl        | 17 ++++++
 .../app_resources/present.frag.hlsl           |  2 -
 .../app_resources/tonemap.comp.hlsl           | 17 ++++++
 26_Autoexposure/main.cpp                      | 60 +++++++++++++------
 5 files changed, 89 insertions(+), 20 deletions(-)
 create mode 100644 26_Autoexposure/app_resources/common.hlsl
 create mode 100644 26_Autoexposure/app_resources/luma_meter.comp.hlsl
 create mode 100644 26_Autoexposure/app_resources/tonemap.comp.hlsl

diff --git a/26_Autoexposure/app_resources/common.hlsl b/26_Autoexposure/app_resources/common.hlsl
new file mode 100644
index 000000000..f2b21b7e4
--- /dev/null
+++ b/26_Autoexposure/app_resources/common.hlsl
@@ -0,0 +1,13 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _AUTOEXPOSURE_COMMON_INCLUDED_
+#define _AUTOEXPOSURE_COMMON_INCLUDED_
+
+struct AutoexposurePushData // type of the `[[vk::push_constant]]` block declared by luma_meter.comp.hlsl and tonemap.comp.hlsl
+{
+    uint32_t viewportSizeX, viewportSizeY; // presumably the viewport width and height - TODO confirm against the code that fills them
+};
+
+#endif
\ No newline at end of file
diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
new file mode 100644
index 000000000..4a0797d6d
--- /dev/null
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -0,0 +1,17 @@
+// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "app_resources/common.hlsl"
+
+[[vk::push_constant]] AutoexposurePushData pushData;
+
+uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() // defines nbl::hlsl::glsl::gl_WorkGroupSize() for this shader
+{
+    return uint32_t3(WorkgroupSize, 1, 1); // `WorkgroupSize` is not defined in this file - presumably supplied at shader compile time, TODO confirm
+}
+
+[numthreads(SubgroupSize, SubgroupSize, 1)] // NOTE(review): SubgroupSize x SubgroupSize x 1 here, but gl_WorkGroupSize() above returns (WorkgroupSize, 1, 1) - confirm the two agree
+void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID) // compute entry point taking the thread-in-group ID and the group ID
+{ // empty body: no work is done yet
+}
\ No newline at end of file
diff --git a/26_Autoexposure/app_resources/present.frag.hlsl b/26_Autoexposure/app_resources/present.frag.hlsl
index 4f76de4cd..fcddeb743 100644
--- a/26_Autoexposure/app_resources/present.frag.hlsl
+++ b/26_Autoexposure/app_resources/present.frag.hlsl
@@ -8,8 +8,6 @@
 #include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
 using namespace nbl::hlsl::ext::FullScreenTriangle;
 
-#include <nbl/builtin/hlsl/luma_meter/luma_meter.hlsl>
-
 [[vk::combinedImageSampler]] [[vk::binding(0, 3)]] Texture2D texture;
 [[vk::combinedImageSampler]] [[vk::binding(0, 3)]] SamplerState samplerState;
 
diff --git a/26_Autoexposure/app_resources/tonemap.comp.hlsl b/26_Autoexposure/app_resources/tonemap.comp.hlsl
new file mode 100644
index 000000000..4a0797d6d
--- /dev/null
+++ b/26_Autoexposure/app_resources/tonemap.comp.hlsl
@@ -0,0 +1,17 @@
+// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "app_resources/common.hlsl"
+
+[[vk::push_constant]] AutoexposurePushData pushData;
+
+uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() // defines nbl::hlsl::glsl::gl_WorkGroupSize() for this shader
+{
+    return uint32_t3(WorkgroupSize, 1, 1); // `WorkgroupSize` is not defined in this file - presumably supplied at shader compile time, TODO confirm
+}
+
+[numthreads(SubgroupSize, SubgroupSize, 1)] // NOTE(review): SubgroupSize x SubgroupSize x 1 here, but gl_WorkGroupSize() above returns (WorkgroupSize, 1, 1) - confirm the two agree
+void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID) // compute entry point taking the thread-in-group ID and the group ID
+{ // empty body: no work is done yet
+}
\ No newline at end of file
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index b285c930f..b31984844 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -227,6 +227,46 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			};
 		}
 
+		// Allocate and create buffer for Luma Gather
+		{
+			// Filled in by `build_buffer`: the GPU buffer and the device memory bound to it
+			nbl::video::IDeviceMemoryAllocator::SAllocation allocation = {};
+			smart_refctd_ptr<IGPUBuffer> buffer;
+			// NOTE(review): both are scope-local and only the raw address is stored below - confirm something else keeps the buffer alive
+			{
+				// Creates a storage + device-address buffer of `buffer_size` bytes backed by host-visible memory; logs and returns false on failure
+				auto build_buffer = [this](
+					smart_refctd_ptr<ILogicalDevice> m_device,
+					nbl::video::IDeviceMemoryAllocator::SAllocation* allocation,
+					smart_refctd_ptr<IGPUBuffer>& buffer,
+					size_t buffer_size,
+					const char* label) -> bool
+				{
+					IGPUBuffer::SCreationParams params;
+					params.size = buffer_size;
+					params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+					buffer = m_device->createBuffer(std::move(params));
+					if (!buffer)
+						return logFail("Failed to create GPU buffer of size %zu!\n", buffer_size);
+
+					buffer->setObjectDebugName(label);
+
+					auto reqs = buffer->getMemoryReqs();
+					reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
+
+					*allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+					if (!allocation->isValid())
+						return logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
+
+					assert(allocation->memory.get() == buffer->getBoundMemory().memory);
+					return true;
+				};
+				if (!build_buffer(m_device, &allocation, buffer, m_physicalDevice->getLimits().maxSubgroupSize, "Luma Gather Buffer"))
+					return false;
+			}
+			m_lumaGatherBDA = buffer->getDeviceAddress();
+		}
+
 		// Allocate and Leave 1/4 for image uploads, to test image copy with small memory remaining
 		{
 			uint32_t localOffset = video::StreamingTransientDataBufferMT<>::invalid_value;
@@ -347,8 +387,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			queue->endCapture();
 		}
 
-		m_computeSubgroupSize = m_physicalDevice->getLimits().maxComputeWorkgroupSubgroups;
-
 		return true;
 	}
 
@@ -487,8 +525,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	smart_refctd_ptr<IWindow> m_window;
 	smart_refctd_ptr<CSimpleResizeSurface<CDefaultSwapchainFramebuffers>> m_surface;
 
-	// constants
-	uint32_t m_computeSubgroupSize = 0;
+	// luma gather
+	uint64_t m_lumaGatherBDA;
 };
 
 NBL_MAIN_FUNC(AutoexposureApp)
@@ -531,20 +569,6 @@ int main()
 	if (!device)
 		return 1; // could not create selected driver.
 
-	QToQuitEventReceiver receiver;
-	device->setEventReceiver(&receiver);
-
-	IVideoDriver* driver = device->getVideoDriver();
-	
-	nbl::io::IFileSystem* filesystem = device->getFileSystem();
-	IAssetManager* am = device->getAssetManager();
-
-	IAssetLoader::SAssetLoadParams lp;
-	auto imageBundle = am->getAsset("../../media/noises/spp_benchmark_4k_512.exr", lp);
-
-	auto glslCompiler = am->getCompilerSet();
-	const auto inputColorSpace = std::make_tuple(inFormat,ECP_SRGB,EOTF_IDENTITY);
-
 	using LumaMeterClass = ext::LumaMeter::CLumaMeter;
 	constexpr auto MeterMode = LumaMeterClass::EMM_MEDIAN;
 	const float minLuma = 1.f/2048.f;

From 0e3e125bac2a53ab1692257db53f184b209417c4 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 9 Aug 2024 17:59:14 +0530
Subject: [PATCH 12/50] Create gpu resources for all passes

---
 26_Autoexposure/main.cpp | 132 +++++++++++++++++++++++++--------------
 1 file changed, 86 insertions(+), 46 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index b31984844..23717516f 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -74,43 +74,74 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			* Samplers for combined image samplers can also be mutable, which for a binding of a descriptor set is specified also at creation time by leaving the immutableSamplers
 			* field set to its default (nullptr).
 			*/
-		smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
+		smart_refctd_ptr<IGPUDescriptorSetLayout> lumaPresentDSLayout, tonemapperDSLayout;
 		{
-			auto defaultSampler = m_device->createSampler({
-				.AnisotropicFilter = 0
-				});
-
-			const IGPUDescriptorSetLayout::SBinding bindings[1] = { {
-				.binding = 0,
-				.type = IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
-				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-				.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-				.count = 1,
-				.immutableSamplers = &defaultSampler
-			}
+			auto defaultSampler = m_device->createSampler(
+				{
+					.AnisotropicFilter = 0
+				}
+			);
+
+			const IGPUDescriptorSetLayout::SBinding lumaPresentBindings[1] = {
+				{
+					.binding = 0,
+					.type = IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT | IShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.count = 1,
+					.immutableSamplers = &defaultSampler
+				}
 			};
-			dsLayout = m_device->createDescriptorSetLayout(bindings);
-			if (!dsLayout)
-				return logFail("Failed to Create Descriptor Layout");
+			lumaPresentDSLayout = m_device->createDescriptorSetLayout(lumaPresentBindings);
+			if (!lumaPresentDSLayout)
+				return logFail("Failed to Create Descriptor Layout: lumaPresentDSLayout");
 
+			const IGPUDescriptorSetLayout::SBinding tonemapperBindings[1] = {
+				{
+					.binding = 1,
+					.type = IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.count = 1,
+					.immutableSamplers = &defaultSampler
+				}
+			};
+			tonemapperDSLayout = m_device->createDescriptorSetLayout(tonemapperBindings);
+			if (!tonemapperDSLayout)
+				return logFail("Failed to Create Descriptor Layout: tonemapperDSLayout");
 		}
 
-		// Create semaphore
-		m_semaphore = m_device->createSemaphore(m_submitIx);
+		// Create semaphores
+		m_lumaMeterSemaphore = m_device->createSemaphore(m_submitIx);
+		m_tonemapperSemaphore = m_device->createSemaphore(m_submitIx);
+		m_presentSemaphore = m_device->createSemaphore(m_submitIx);
 
-		// create the descriptor set and with enough room for one image sampler
+		// create the descriptor sets and with enough room
 		{
-			const uint32_t setCount = 1;
-			auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, { &dsLayout.get(),1 }, &setCount);
-			if (!pool)
-				return logFail("Failed to Create Descriptor Pool");
-
-			m_descriptorSets[0] = pool->createDescriptorSet(core::smart_refctd_ptr(dsLayout));
-			if (!m_descriptorSets[0])
-				return logFail("Could not create Descriptor Set!");
+			constexpr uint32_t lumaPresentSetCount = 2, tonemapperSetCount = 1;
+			auto lumaPresentPool = m_device->createDescriptorPoolForDSLayouts(
+				IDescriptorPool::E_CREATE_FLAGS::ECF_NONE,
+				{ &lumaPresentDSLayout.get(), 1 },
+				&lumaPresentSetCount
+			);
+			auto tonemapperPool = m_device->createDescriptorPoolForDSLayouts(
+				IDescriptorPool::E_CREATE_FLAGS::ECF_NONE,
+				{ &tonemapperDSLayout.get(), 1 },
+				&tonemapperSetCount
+			);
+
+			if (!lumaPresentPool || !tonemapperPool)
+				return logFail("Failed to Create Descriptor Pools");
+
+			m_lumaPresentDS[0] = lumaPresentPool->createDescriptorSet(core::smart_refctd_ptr(lumaPresentDSLayout));
+			if (!m_lumaPresentDS[0])
+				return logFail("Could not create Descriptor Set: lumaPresentDS!");
+			m_tonemapperDS[0] = tonemapperPool->createDescriptorSet(core::smart_refctd_ptr(tonemapperDSLayout));
+			if (!m_tonemapperDS[0])
+				return logFail("Could not create Descriptor Set: tonemapperDS!");
+
 		}
 
-		auto ds = m_descriptorSets[0].get();
 		auto queue = getGraphicsQueue();
 
 		// Gather swapchain resources
@@ -184,13 +215,13 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			if (!fragmentShader)
 				return logFail("Failed to Load and Compile Fragment Shader!");
 
-			auto layout = m_device->createPipelineLayout({}, nullptr, nullptr, nullptr, core::smart_refctd_ptr(dsLayout));
+			auto layout = m_device->createPipelineLayout({}, nullptr, nullptr, nullptr, core::smart_refctd_ptr(lumaPresentDSLayout));
 			const IGPUShader::SSpecInfo fragSpec = {
 				.entryPoint = "main",
 				.shader = fragmentShader.get()
 			};
-			m_pipeline = fsTriProtoPPln.createPipeline(fragSpec, layout.get(), scResources->getRenderpass());
-			if (!m_pipeline)
+			m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, layout.get(), scResources->getRenderpass());
+			if (!m_presentPipeline)
 				return logFail("Could not create Graphics Pipeline!");
 		}
 
@@ -374,7 +405,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			IGPUDescriptorSet::SWriteDescriptorSet writeDescriptors[] = {
 				{
-					.dstSet = ds,
+					.dstSet = m_lumaPresentDS[0].get(),
 					.binding = 0,
 					.arrayElement = 0,
 					.count = 1,
@@ -387,6 +418,10 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			queue->endCapture();
 		}
 
+		// Allocate and create texture for tonemapping
+		{
+		}
+
 		return true;
 	}
 
@@ -401,7 +436,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 		auto queue = getGraphicsQueue();
 		auto cmdbuf = m_cmdBufs[0].get();
-		auto ds = m_descriptorSets[0].get();
+		auto ds = m_lumaPresentDS[0].get();
 
 		queue->startCapture();
 		// Render to the swapchain
@@ -437,8 +472,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
 			}
 
-			cmdbuf->bindGraphicsPipeline(m_pipeline.get());
-			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_pipeline->getLayout(), 3, 1, &ds);
+			cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_presentPipeline->getLayout(), 3, 1, &ds);
 			ext::FullScreenTriangle::recordDrawCall(cmdbuf);
 			cmdbuf->endRenderPass();
 
@@ -447,7 +482,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 		// submit
 		const IQueue::SSubmitInfo::SSemaphoreInfo rendered[1] = { {
-			.semaphore = m_semaphore.get(),
+			.semaphore = m_presentSemaphore.get(),
 			.value = ++m_submitIx,
 			// just as we've outputted all pixels, signal
 			.stageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT
@@ -482,7 +517,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 		{
 			const ISemaphore::SWaitInfo cmdbufDonePending[] = {
 				{
-					.semaphore = m_semaphore.get(),
+					.semaphore = m_presentSemaphore.get(),
 					.value = m_submitIx
 				}
 			};
@@ -506,27 +541,32 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	}
 
 protected:
-	smart_refctd_ptr<IGPUImage> m_gpuImg;
-	smart_refctd_ptr<IGPUImageView> m_gpuImgView;
+	uint64_t m_lumaGatherBDA;
+	smart_refctd_ptr<IGPUImage> m_gpuImg, m_gpuTonemapImg;
+	smart_refctd_ptr<IGPUImageView> m_gpuImgView, m_gpuTonemapImgView;
 
 	// for image uploads
 	smart_refctd_ptr<ISemaphore> m_scratchSemaphore;
 	SIntendedSubmitInfo m_intendedSubmit;
 
-	// Command Buffers and other resources
-	std::array<smart_refctd_ptr<IGPUDescriptorSet>, ISwapchain::MaxImages> m_descriptorSets;
+	// Pipelines
+	smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
+	smart_refctd_ptr<IGPUComputePipeline> m_lumaMeterPipeline, m_tonemapperPipeline;
+
+	// Descriptor Sets
+	std::array<smart_refctd_ptr<IGPUDescriptorSet>, ISwapchain::MaxImages> m_lumaPresentDS, m_tonemapperDS;
+
+	// Command Buffers
 	smart_refctd_ptr<IGPUCommandPool> m_cmdPool;
 	std::array<smart_refctd_ptr<IGPUCommandBuffer>, ISwapchain::MaxImages> m_cmdBufs;
-	smart_refctd_ptr<IGPUGraphicsPipeline> m_pipeline;
-	smart_refctd_ptr<ISemaphore> m_semaphore;
+
+	// Semaphores
+	smart_refctd_ptr<ISemaphore> m_lumaMeterSemaphore, m_tonemapperSemaphore, m_presentSemaphore;
 	uint64_t m_submitIx = 0;
 
 	// window
 	smart_refctd_ptr<IWindow> m_window;
 	smart_refctd_ptr<CSimpleResizeSurface<CDefaultSwapchainFramebuffers>> m_surface;
-
-	// luma gather
-	uint64_t m_lumaGatherBDA;
 };
 
 NBL_MAIN_FUNC(AutoexposureApp)

From cef80b3e5b961029344c4516cd5682ad2254cab7 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 9 Aug 2024 20:06:57 +0530
Subject: [PATCH 13/50] Create shaders and pipelines

---
 .../app_resources/luma_meter.comp.hlsl        |  3 +-
 ...tonemap.comp.hlsl => tonemapper.comp.hlsl} |  8 +-
 26_Autoexposure/main.cpp                      | 97 ++++++++++++++-----
 3 files changed, 78 insertions(+), 30 deletions(-)
 rename 26_Autoexposure/app_resources/{tonemap.comp.hlsl => tonemapper.comp.hlsl} (72%)

diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index 4a0797d6d..0902baa59 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -2,6 +2,7 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
+#include "nbl/builtin/hlsl/luma_meter/luma_meter.hlsl"
 #include "app_resources/common.hlsl"
 
 [[vk::push_constant]] AutoexposurePushData pushData;
@@ -11,7 +12,7 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize()
     return uint32_t3(WorkgroupSize, 1, 1);
 }
 
-[numthreads(SubgroupSize, SubgroupSize, 1)]
+[numthreads(DeviceSubgroupSize, DeviceSubgroupSize, 1)]
 void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
 {
 }
\ No newline at end of file
diff --git a/26_Autoexposure/app_resources/tonemap.comp.hlsl b/26_Autoexposure/app_resources/tonemapper.comp.hlsl
similarity index 72%
rename from 26_Autoexposure/app_resources/tonemap.comp.hlsl
rename to 26_Autoexposure/app_resources/tonemapper.comp.hlsl
index 4a0797d6d..15b543469 100644
--- a/26_Autoexposure/app_resources/tonemap.comp.hlsl
+++ b/26_Autoexposure/app_resources/tonemapper.comp.hlsl
@@ -2,16 +2,12 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "app_resources/common.hlsl"
 
 [[vk::push_constant]] AutoexposurePushData pushData;
 
-uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize()
-{
-    return uint32_t3(WorkgroupSize, 1, 1);
-}
-
-[numthreads(SubgroupSize, SubgroupSize, 1)]
+[numthreads(DeviceSubgroupSize, DeviceSubgroupSize, 1)]
 void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
 {
 }
\ No newline at end of file
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 23717516f..aaf2ecf80 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -8,6 +8,8 @@
 #include "nbl/asset/interchange/IAssetLoader.h"
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 
+#include "app_resources/common.hlsl"
+
 using namespace nbl;
 using namespace core;
 using namespace hlsl;
@@ -186,41 +188,90 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				return logFail("Failed to create Renderpass!");
 		}
 
-		// Load the shaders and create the pipeline
+		// Load the shaders and create the pipelines
 		{
+			auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
+			{
+				IAssetLoader::SAssetLoadParams lp = {};
+				lp.logger = m_logger.get();
+				lp.workingDirectory = ""; // virtual root
+				auto assetBundle = m_assetMgr->getAsset(relPath, lp);
+				const auto assets = assetBundle.getContents();
+				if (assets.empty())
+					return nullptr;
+
+				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
+				auto source = IAsset::castDown<ICPUShader>(assets[0]);
+				if (!source)
+					return nullptr;
+				const uint32_t workgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
+				const uint32_t subgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+				auto overriddenSource = CHLSLCompiler::createOverridenCopy(
+					source.get(),
+					"#define WorkgroupSize %d\n#define DeviceSubgroupSize %d\n",
+					workgroupSize,
+					subgroupSize
+				);
+
+				return m_device->createShader(overriddenSource.get());
+			};
+
+			auto createComputePipeline = [&](smart_refctd_ptr<IGPUShader> shader, smart_refctd_ptr<IGPUComputePipeline> pipeline) -> bool
+			{
+				const nbl::asset::SPushConstantRange pcRange = {
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.offset = 0,
+					.size = sizeof(AutoexposurePushData)
+				};
+
+				smart_refctd_ptr<IGPUPipelineLayout> layout;
+				{
+					layout = m_device->createPipelineLayout({ &pcRange,1 });
+					IGPUComputePipeline::SCreationParams params = {};
+					params.layout = layout.get();
+					params.shader.shader = shader.get();
+					params.shader.entryPoint = "main";
+					params.shader.entries = nullptr;
+					params.shader.requireFullSubgroups = true;
+					params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+					if (!m_device->createComputePipelines(nullptr, { &params,1 }, &pipeline))
+						return logFail("Failed to create compute pipeline!\n");
+				}
+
+				return true;
+			};
+
+			// Luma Meter
+			auto lumaMeterShader = loadCompileAndCreateShader("app_resources/luma_meter.comp.hlsl");
+			if (!lumaMeterShader)
+				return logFail("Failed to Load and Compile Compute Shader: lumaMeterShader!");
+			auto lumaPresentLayout = m_device->createPipelineLayout({}, nullptr, nullptr, nullptr, core::smart_refctd_ptr(lumaPresentDSLayout));
+			if (!createComputePipeline(lumaMeterShader, m_lumaMeterPipeline))
+				return logFail("Could not create Luma Meter Pipeline!");
+
+			// Tonemapper
+			auto tonemapperShader = loadCompileAndCreateShader("app_resources/tonemapper.comp.hlsl");
+			if (!tonemapperShader)
+				return logFail("Failed to Load and Compile Compute Shader: tonemapperShader!");
+			auto tonemapperLayout = m_device->createPipelineLayout({}, nullptr, nullptr, nullptr, core::smart_refctd_ptr(tonemapperDSLayout));
+			if (!createComputePipeline(tonemapperShader, m_tonemapperPipeline))
+				return logFail("Could not create Luma Meter Pipeline!");
+
 			// Load FSTri Shader
 			ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
 			if (!fsTriProtoPPln)
 				return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
 
-			// Load Custom Shader
-			auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
-				{
-					IAssetLoader::SAssetLoadParams lp = {};
-					lp.logger = m_logger.get();
-					lp.workingDirectory = ""; // virtual root
-					auto assetBundle = m_assetMgr->getAsset(relPath, lp);
-					const auto assets = assetBundle.getContents();
-					if (assets.empty())
-						return nullptr;
-
-					// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-					auto source = IAsset::castDown<ICPUShader>(assets[0]);
-					if (!source)
-						return nullptr;
-
-					return m_device->createShader(source.get());
-				};
-			auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
+			// Load Fragment Shader
+			auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");;
 			if (!fragmentShader)
-				return logFail("Failed to Load and Compile Fragment Shader!");
+				return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
 
-			auto layout = m_device->createPipelineLayout({}, nullptr, nullptr, nullptr, core::smart_refctd_ptr(lumaPresentDSLayout));
 			const IGPUShader::SSpecInfo fragSpec = {
 				.entryPoint = "main",
 				.shader = fragmentShader.get()
 			};
-			m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, layout.get(), scResources->getRenderpass());
+			m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, lumaPresentLayout.get(), scResources->getRenderpass());
 			if (!m_presentPipeline)
 				return logFail("Could not create Graphics Pipeline!");
 		}

From 15e489f2bcfa1407b9e831452178abb6c97d22b8 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Mon, 12 Aug 2024 17:02:52 +0530
Subject: [PATCH 14/50] Allocate and create texture for tonemapping

---
 26_Autoexposure/main.cpp | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index aaf2ecf80..77636304b 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -471,6 +471,30 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 		// Allocate and create texture for tonemapping
 		{
+			IGPUImage::SCreationParams imageParams = {};
+			imageParams = m_gpuImg->getCreationParameters();
+			// promote format because RGB8 and friends don't actually exist in HW
+			{
+				const IPhysicalDevice::SImageFormatPromotionRequest request = {
+					.originalFormat = imageParams.format,
+					.usages = IPhysicalDevice::SFormatImageUsages::SUsage(imageParams.usage)
+				};
+				imageParams.format = m_physicalDevice->promoteImageFormat(request, imageParams.tiling);
+			}
+			if (imageParams.type == IGPUImage::ET_3D)
+				imageParams.flags |= IGPUImage::ECF_2D_ARRAY_COMPATIBLE_BIT;
+			m_gpuTonemapImg = m_device->createImage(std::move(imageParams));
+			if (!m_gpuTonemapImg || !m_device->allocate(m_gpuTonemapImg->getMemoryReqs(), m_gpuTonemapImg.get()).isValid())
+				return false;
+			m_gpuTonemapImg->setObjectDebugName("Autoexposure Tonemapper Image");
+
+			IGPUImageView::SCreationParams gpuTonemapImgViewParams = {
+				.image = m_gpuTonemapImg,
+				.viewType = IGPUImageView::ET_2D,
+				.format = m_gpuTonemapImg->getCreationParameters().format
+			};
+
+			m_gpuTonemapImgView = m_device->createImageView(std::move(gpuTonemapImgViewParams));
 		}
 
 		return true;
@@ -479,7 +503,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	// We do a very simple thing, display an image and wait `DisplayImageMs` to show it
 	inline void workLoopBody() override
 	{
-
 		// Acquire
 		auto acquire = m_surface->acquireNextImage();
 		if (!acquire)

From c646c7d6f22247f74ab82877e3302250a20966b0 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Mon, 12 Aug 2024 21:09:04 +0530
Subject: [PATCH 15/50] Create separate ds for luma and present

---
 26_Autoexposure/main.cpp | 58 +++++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 24 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 77636304b..baa031dd4 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -136,7 +136,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				return logFail("Failed to Create Descriptor Pools");
 
 			m_lumaPresentDS[0] = lumaPresentPool->createDescriptorSet(core::smart_refctd_ptr(lumaPresentDSLayout));
-			if (!m_lumaPresentDS[0])
+			m_lumaPresentDS[1] = lumaPresentPool->createDescriptorSet(core::smart_refctd_ptr(lumaPresentDSLayout));
+			if (!m_lumaPresentDS[0] || !m_lumaPresentDS[1])
 				return logFail("Could not create Descriptor Set: lumaPresentDS!");
 			m_tonemapperDS[0] = tonemapperPool->createDescriptorSet(core::smart_refctd_ptr(tonemapperDSLayout));
 			if (!m_tonemapperDS[0])
@@ -450,28 +451,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			m_gpuImgView = m_device->createImageView(std::move(gpuImgViewParams));
 
-			IGPUDescriptorSet::SDescriptorInfo info = {};
-			info.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-			info.desc = m_gpuImgView;
-
-			IGPUDescriptorSet::SWriteDescriptorSet writeDescriptors[] = {
-				{
-					.dstSet = m_lumaPresentDS[0].get(),
-					.binding = 0,
-					.arrayElement = 0,
-					.count = 1,
-					.info = &info
-				}
-			};
-
-			m_device->updateDescriptorSets(1, writeDescriptors, 0, nullptr);
-
-			queue->endCapture();
-		}
-
-		// Allocate and create texture for tonemapping
-		{
-			IGPUImage::SCreationParams imageParams = {};
+			// Allocate and create texture for tonemapping
+			imageParams = {};
 			imageParams = m_gpuImg->getCreationParameters();
 			// promote format because RGB8 and friends don't actually exist in HW
 			{
@@ -495,6 +476,35 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			};
 
 			m_gpuTonemapImgView = m_device->createImageView(std::move(gpuTonemapImgViewParams));
+
+			IGPUDescriptorSet::SDescriptorInfo info1 = {};
+			info1.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+			info1.desc = m_gpuImgView;
+
+			IGPUDescriptorSet::SDescriptorInfo info2 = {};
+			info2.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+			info2.desc = m_gpuImgView;
+
+			IGPUDescriptorSet::SWriteDescriptorSet writeDescriptors[] = {
+				{
+					.dstSet = m_lumaPresentDS[0].get(),
+					.binding = 0,
+					.arrayElement = 0,
+					.count = 1,
+					.info = &info1
+				},
+				{
+					.dstSet = m_lumaPresentDS[1].get(),
+					.binding = 0,
+					.arrayElement = 0,
+					.count = 1,
+					.info = &info2
+				}
+			};
+
+			m_device->updateDescriptorSets(2, writeDescriptors, 0, nullptr);
+
+			queue->endCapture();
 		}
 
 		return true;
@@ -510,7 +520,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 		auto queue = getGraphicsQueue();
 		auto cmdbuf = m_cmdBufs[0].get();
-		auto ds = m_lumaPresentDS[0].get();
+		auto ds = m_lumaPresentDS[1].get();
 
 		queue->startCapture();
 		// Render to the swapchain

From 36d70978bd1a180245b26d8d62871c335f23ba93 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 13 Aug 2024 14:19:11 +0530
Subject: [PATCH 16/50] Record luma meter commands

---
 .../app_resources/luma_meter.comp.hlsl        |  3 +
 26_Autoexposure/main.cpp                      | 59 ++++++++++++-------
 2 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index 0902baa59..e7d080da2 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -5,6 +5,9 @@
 #include "nbl/builtin/hlsl/luma_meter/luma_meter.hlsl"
 #include "app_resources/common.hlsl"
 
+[[vk::combinedImageSampler]] [[vk::binding(0)]] Texture2D texture;
+[[vk::combinedImageSampler]] [[vk::binding(0)]] SamplerState samplerState;
+
 [[vk::push_constant]] AutoexposurePushData pushData;
 
 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize()
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index baa031dd4..6f214c7fe 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -28,6 +28,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 	constexpr static inline std::string_view DefaultImagePathsFile = "../../media/noises/spp_benchmark_4k_512.exr";
 	constexpr static inline std::array<int, 2> Dimensions = { 1280, 720 };
+	constexpr static inline std::array<int, 2> SampleCount = { 10000, 10000 };
 
 public:
 	// Yay thanks to multiple inheritance we cannot forward ctors anymore
@@ -100,7 +101,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			const IGPUDescriptorSetLayout::SBinding tonemapperBindings[1] = {
 				{
-					.binding = 1,
+					.binding = 0,
 					.type = IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
 					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
 					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
@@ -217,7 +218,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				return m_device->createShader(overriddenSource.get());
 			};
 
-			auto createComputePipeline = [&](smart_refctd_ptr<IGPUShader> shader, smart_refctd_ptr<IGPUComputePipeline> pipeline) -> bool
+			auto createComputePipeline = [&](smart_refctd_ptr<IGPUShader>& shader, smart_refctd_ptr<IGPUComputePipeline>& pipeline) -> bool
 			{
 				const nbl::asset::SPushConstantRange pcRange = {
 					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
@@ -287,7 +288,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			// create the commandbuffers
 			if (!m_cmdPool)
 				return logFail("Couldn't create Command Pool!");
-			if (!m_cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data(), 1 }))
+			if (!m_cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data(), 3 }))
 				return logFail("Couldn't create Command Buffer!");
 		}
 
@@ -301,6 +302,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			m_intendedSubmit.queue = queue;
 			// wait for nothing before upload
 			m_intendedSubmit.waitSemaphores = {};
+			m_intendedSubmit.waitSemaphores = {};
 			// fill later
 			m_intendedSubmit.commandBuffers = {};
 			m_intendedSubmit.scratchSemaphore = {
@@ -514,19 +516,32 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	inline void workLoopBody() override
 	{
 		// Acquire
-		auto acquire = m_surface->acquireNextImage();
-		if (!acquire)
-			return;
+		//auto acquire = m_surface->acquireNextImage();
+		//if (!acquire)
+		//	return;
 
-		auto queue = getGraphicsQueue();
-		auto cmdbuf = m_cmdBufs[0].get();
-		auto ds = m_lumaPresentDS[1].get();
-
-		queue->startCapture();
-		// Render to the swapchain
+		// Luma Meter
 		{
+			auto queue = getComputeQueue();
+			auto cmdbuf = m_cmdBufs[0].get();
+			auto ds = m_lumaPresentDS[0].get();
+
+			const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+
+			queue->startCapture();
+
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 
+			cmdbuf->bindComputePipeline(m_lumaMeterPipeline.get());
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_lumaMeterPipeline->getLayout(), 0, 1, &ds);
+			cmdbuf->dispatch(1 + (SampleCount[0] - 1) / SubgroupSize, 1 + (SampleCount[1] - 1) / SubgroupSize);
+			cmdbuf->end();
+		}
+
+		// Render to the swapchain
+		/*{
+			cmdbuf3->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
 			const VkRect2D currentRenderArea =
 			{
 				.offset = {0,0},
@@ -539,9 +554,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					.width = float(m_window->getWidth()),
 					.height = float(m_window->getHeight())
 				};
-				cmdbuf->setViewport({ &viewport, 1 });
+				cmdbuf3->setViewport({ &viewport, 1 });
 			}
-			cmdbuf->setScissor({ &currentRenderArea, 1 });
+			cmdbuf3->setScissor({ &currentRenderArea, 1 });
 
 			// begin the renderpass
 			{
@@ -553,15 +568,15 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					.depthStencilClearValues = nullptr,
 					.renderArea = currentRenderArea
 				};
-				cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+				cmdbuf3->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
 			}
 
-			cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
-			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_presentPipeline->getLayout(), 3, 1, &ds);
-			ext::FullScreenTriangle::recordDrawCall(cmdbuf);
-			cmdbuf->endRenderPass();
+			cmdbuf3->bindGraphicsPipeline(m_presentPipeline.get());
+			cmdbuf3->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_presentPipeline->getLayout(), 3, 1, &ds);
+			ext::FullScreenTriangle::recordDrawCall(cmdbuf3);
+			cmdbuf3->endRenderPass();
 
-			cmdbuf->end();
+			cmdbuf3->end();
 		}
 
 		// submit
@@ -574,7 +589,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 		{
 			{
 				const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
-					.cmdbuf = cmdbuf
+					.cmdbuf = cmdbuf3
 				} };
 				// we don't need to wait for the transfer semaphore, because we submit everything to the same queue
 				const IQueue::SSubmitInfo::SSemaphoreInfo acquired[1] = { {
@@ -607,7 +622,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			};
 			if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
 				return;
-		}
+		}*/
 	}
 
 	inline bool keepRunning() override

From 6addbf18516bd3f7cd06d446f285b08deb04c4fc Mon Sep 17 00:00:00 2001
From: AnastaZIuk <areklachowicz@gmail.com>
Date: Tue, 13 Aug 2024 12:04:40 +0200
Subject: [PATCH 17/50] fix layout issues with compute pipeline in
 26_Autoexposure example + update luma DSes

---
 .../app_resources/luma_meter.comp.hlsl        |  5 ++-
 .../app_resources/present.frag.hlsl           |  1 +
 26_Autoexposure/main.cpp                      | 37 ++++++++-----------
 3 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index e7d080da2..ccdf42256 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -5,8 +5,9 @@
 #include "nbl/builtin/hlsl/luma_meter/luma_meter.hlsl"
 #include "app_resources/common.hlsl"
 
-[[vk::combinedImageSampler]] [[vk::binding(0)]] Texture2D texture;
-[[vk::combinedImageSampler]] [[vk::binding(0)]] SamplerState samplerState;
+// shared accross frag & compute - binding 0 set 3
+[[vk::combinedImageSampler]] [[vk::binding(0, 3)]] Texture2D texture;
+[[vk::combinedImageSampler]] [[vk::binding(0, 3)]] SamplerState samplerState;
 
 [[vk::push_constant]] AutoexposurePushData pushData;
 
diff --git a/26_Autoexposure/app_resources/present.frag.hlsl b/26_Autoexposure/app_resources/present.frag.hlsl
index fcddeb743..8c3be5573 100644
--- a/26_Autoexposure/app_resources/present.frag.hlsl
+++ b/26_Autoexposure/app_resources/present.frag.hlsl
@@ -8,6 +8,7 @@
 #include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
 using namespace nbl::hlsl::ext::FullScreenTriangle;
 
+// shared accross frag & compute - binding 0 set 3
 [[vk::combinedImageSampler]] [[vk::binding(0, 3)]] Texture2D texture;
 [[vk::combinedImageSampler]] [[vk::binding(0, 3)]] SamplerState samplerState;
 
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 6f214c7fe..db3816703 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -121,12 +121,15 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 		// create the descriptor sets and with enough room
 		{
-			constexpr uint32_t lumaPresentSetCount = 2, tonemapperSetCount = 1;
-			auto lumaPresentPool = m_device->createDescriptorPoolForDSLayouts(
-				IDescriptorPool::E_CREATE_FLAGS::ECF_NONE,
-				{ &lumaPresentDSLayout.get(), 1 },
-				&lumaPresentSetCount
-			);
+			constexpr uint32_t tonemapperSetCount = 1;
+
+			core::smart_refctd_ptr<IDescriptorPool> lumaPresentPool;
+			{
+				const video::IGPUDescriptorSetLayout* const layouts[] = { nullptr, nullptr, nullptr, lumaPresentDSLayout.get() };
+				const uint32_t setCounts[] = { 0u, 0u, 0u, 1u }; // leaving you one for 3th set, but you can increase if you really want 2 separate DSs but I think you want single to be shared (then you also need to create 2 DSes as you did)
+				lumaPresentPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
+			}
+
 			auto tonemapperPool = m_device->createDescriptorPoolForDSLayouts(
 				IDescriptorPool::E_CREATE_FLAGS::ECF_NONE,
 				{ &tonemapperDSLayout.get(), 1 },
@@ -136,9 +139,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			if (!lumaPresentPool || !tonemapperPool)
 				return logFail("Failed to Create Descriptor Pools");
 
+			// why do you need 2 separate DSs for combined sampler? from stage flags it looks like you want them shared between compute & fragment
 			m_lumaPresentDS[0] = lumaPresentPool->createDescriptorSet(core::smart_refctd_ptr(lumaPresentDSLayout));
-			m_lumaPresentDS[1] = lumaPresentPool->createDescriptorSet(core::smart_refctd_ptr(lumaPresentDSLayout));
-			if (!m_lumaPresentDS[0] || !m_lumaPresentDS[1])
+			if (!m_lumaPresentDS[0])
 				return logFail("Could not create Descriptor Set: lumaPresentDS!");
 			m_tonemapperDS[0] = tonemapperPool->createDescriptorSet(core::smart_refctd_ptr(tonemapperDSLayout));
 			if (!m_tonemapperDS[0])
@@ -228,7 +231,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 				smart_refctd_ptr<IGPUPipelineLayout> layout;
 				{
-					layout = m_device->createPipelineLayout({ &pcRange,1 });
+					layout = m_device->createPipelineLayout({ &pcRange,1 }, nullptr, nullptr, nullptr, core::smart_refctd_ptr(lumaPresentDSLayout)); // dont forget your compute uses combinedImageSampler, cause of your cmd buffer errors is here
+
 					IGPUComputePipeline::SCreationParams params = {};
 					params.layout = layout.get();
 					params.shader.shader = shader.get();
@@ -483,10 +487,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			info1.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
 			info1.desc = m_gpuImgView;
 
-			IGPUDescriptorSet::SDescriptorInfo info2 = {};
-			info2.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-			info2.desc = m_gpuImgView;
-
 			IGPUDescriptorSet::SWriteDescriptorSet writeDescriptors[] = {
 				{
 					.dstSet = m_lumaPresentDS[0].get(),
@@ -494,17 +494,10 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					.arrayElement = 0,
 					.count = 1,
 					.info = &info1
-				},
-				{
-					.dstSet = m_lumaPresentDS[1].get(),
-					.binding = 0,
-					.arrayElement = 0,
-					.count = 1,
-					.info = &info2
 				}
 			};
 
-			m_device->updateDescriptorSets(2, writeDescriptors, 0, nullptr);
+			m_device->updateDescriptorSets(1, writeDescriptors, 0, nullptr);
 
 			queue->endCapture();
 		}
@@ -533,7 +526,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 
 			cmdbuf->bindComputePipeline(m_lumaMeterPipeline.get());
-			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_lumaMeterPipeline->getLayout(), 0, 1, &ds);
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_lumaMeterPipeline->getLayout(), 3, 1, &ds); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
 			cmdbuf->dispatch(1 + (SampleCount[0] - 1) / SubgroupSize, 1 + (SampleCount[1] - 1) / SubgroupSize);
 			cmdbuf->end();
 		}

From 8434f20746f399d04bff8490946b9d342b39fa55 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 13 Aug 2024 18:14:17 +0530
Subject: [PATCH 18/50] Create two sets from common lumaPresentLayout correctly

---
 .../app_resources/luma_meter.comp.hlsl        |  7 ++-
 .../app_resources/present.frag.hlsl           |  8 +--
 26_Autoexposure/main.cpp                      | 51 +++++++++++--------
 3 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index ccdf42256..9a3b5c98a 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -5,9 +5,8 @@
 #include "nbl/builtin/hlsl/luma_meter/luma_meter.hlsl"
 #include "app_resources/common.hlsl"
 
-// shared accross frag & compute - binding 0 set 3
-[[vk::combinedImageSampler]] [[vk::binding(0, 3)]] Texture2D texture;
-[[vk::combinedImageSampler]] [[vk::binding(0, 3)]] SamplerState samplerState;
+[[vk::combinedImageSampler]] [[vk::binding(0, 1)]] Texture2D texture;
+[[vk::combinedImageSampler]] [[vk::binding(0, 1)]] SamplerState samplerState;
 
 [[vk::push_constant]] AutoexposurePushData pushData;
 
@@ -19,4 +18,4 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize()
 [numthreads(DeviceSubgroupSize, DeviceSubgroupSize, 1)]
 void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
 {
-}
\ No newline at end of file
+}
diff --git a/26_Autoexposure/app_resources/present.frag.hlsl b/26_Autoexposure/app_resources/present.frag.hlsl
index 8c3be5573..9a53c19eb 100644
--- a/26_Autoexposure/app_resources/present.frag.hlsl
+++ b/26_Autoexposure/app_resources/present.frag.hlsl
@@ -8,11 +8,11 @@
 #include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
 using namespace nbl::hlsl::ext::FullScreenTriangle;
 
-// shared accross frag & compute - binding 0 set 3
-[[vk::combinedImageSampler]] [[vk::binding(0, 3)]] Texture2D texture;
-[[vk::combinedImageSampler]] [[vk::binding(0, 3)]] SamplerState samplerState;
+// binding 0 set 1
+[[vk::combinedImageSampler]] [[vk::binding(0, 1)]] Texture2D texture;
+[[vk::combinedImageSampler]] [[vk::binding(0, 1)]] SamplerState samplerState;
 
 [[vk::location(0)]] float32_t4 main(SVertexAttributes vxAttr) : SV_Target0
 {
     return texture.Sample(samplerState, vxAttr.uv);
-}
\ No newline at end of file
+}
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index db3816703..23689f6fe 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -125,8 +125,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			core::smart_refctd_ptr<IDescriptorPool> lumaPresentPool;
 			{
-				const video::IGPUDescriptorSetLayout* const layouts[] = { nullptr, nullptr, nullptr, lumaPresentDSLayout.get() };
-				const uint32_t setCounts[] = { 0u, 0u, 0u, 1u }; // leaving you one for 3th set, but you can increase if you really want 2 separate DSs but I think you want single to be shared (then you also need to create 2 DSes as you did)
+				const video::IGPUDescriptorSetLayout* const layouts[] = { lumaPresentDSLayout.get(), lumaPresentDSLayout.get() };
+				const uint32_t setCounts[] = { 1u, 1u };
 				lumaPresentPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
 			}
 
@@ -139,9 +139,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			if (!lumaPresentPool || !tonemapperPool)
 				return logFail("Failed to Create Descriptor Pools");
 
-			// why do you need 2 separate DSs for combined sampler? from stage flags it looks like you want them shared between compute & fragment
 			m_lumaPresentDS[0] = lumaPresentPool->createDescriptorSet(core::smart_refctd_ptr(lumaPresentDSLayout));
-			if (!m_lumaPresentDS[0])
+			m_lumaPresentDS[1] = lumaPresentPool->createDescriptorSet(core::smart_refctd_ptr(lumaPresentDSLayout));
+			if (!m_lumaPresentDS[0] || !m_lumaPresentDS[1])
 				return logFail("Could not create Descriptor Set: lumaPresentDS!");
 			m_tonemapperDS[0] = tonemapperPool->createDescriptorSet(core::smart_refctd_ptr(tonemapperDSLayout));
 			if (!m_tonemapperDS[0])
@@ -221,20 +221,11 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				return m_device->createShader(overriddenSource.get());
 			};
 
-			auto createComputePipeline = [&](smart_refctd_ptr<IGPUShader>& shader, smart_refctd_ptr<IGPUComputePipeline>& pipeline) -> bool
+			auto createComputePipeline = [&](smart_refctd_ptr<IGPUShader>& shader, smart_refctd_ptr<IGPUComputePipeline>& pipeline, smart_refctd_ptr<IGPUPipelineLayout> pipelineLayout) -> bool
 			{
-				const nbl::asset::SPushConstantRange pcRange = {
-					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
-					.offset = 0,
-					.size = sizeof(AutoexposurePushData)
-				};
-
-				smart_refctd_ptr<IGPUPipelineLayout> layout;
 				{
-					layout = m_device->createPipelineLayout({ &pcRange,1 }, nullptr, nullptr, nullptr, core::smart_refctd_ptr(lumaPresentDSLayout)); // dont forget your compute uses combinedImageSampler, cause of your cmd buffer errors is here
-
 					IGPUComputePipeline::SCreationParams params = {};
-					params.layout = layout.get();
+					params.layout = pipelineLayout.get();
 					params.shader.shader = shader.get();
 					params.shader.entryPoint = "main";
 					params.shader.entries = nullptr;
@@ -247,20 +238,26 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				return true;
 			};
 
+			const nbl::asset::SPushConstantRange pcRange = {
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.offset = 0,
+					.size = sizeof(AutoexposurePushData)
+			};
+
 			// Luma Meter
 			auto lumaMeterShader = loadCompileAndCreateShader("app_resources/luma_meter.comp.hlsl");
 			if (!lumaMeterShader)
 				return logFail("Failed to Load and Compile Compute Shader: lumaMeterShader!");
-			auto lumaPresentLayout = m_device->createPipelineLayout({}, nullptr, nullptr, nullptr, core::smart_refctd_ptr(lumaPresentDSLayout));
-			if (!createComputePipeline(lumaMeterShader, m_lumaMeterPipeline))
+			auto lumaPresentLayout = m_device->createPipelineLayout({ &pcRange, 1 }, core::smart_refctd_ptr(lumaPresentDSLayout), core::smart_refctd_ptr(lumaPresentDSLayout), nullptr, nullptr);
+			if (!createComputePipeline(lumaMeterShader, m_lumaMeterPipeline, lumaPresentLayout))
 				return logFail("Could not create Luma Meter Pipeline!");
 
 			// Tonemapper
 			auto tonemapperShader = loadCompileAndCreateShader("app_resources/tonemapper.comp.hlsl");
 			if (!tonemapperShader)
 				return logFail("Failed to Load and Compile Compute Shader: tonemapperShader!");
-			auto tonemapperLayout = m_device->createPipelineLayout({}, nullptr, nullptr, nullptr, core::smart_refctd_ptr(tonemapperDSLayout));
-			if (!createComputePipeline(tonemapperShader, m_tonemapperPipeline))
+			auto tonemapperLayout = m_device->createPipelineLayout({ &pcRange, 1 }, core::smart_refctd_ptr(tonemapperDSLayout), nullptr, nullptr, nullptr);
+			if (!createComputePipeline(tonemapperShader, m_tonemapperPipeline, tonemapperLayout))
 				return logFail("Could not create Luma Meter Pipeline!");
 
 			// Load FSTri Shader
@@ -321,7 +318,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			// Allocate memory
 			nbl::video::IDeviceMemoryAllocator::SAllocation allocation = {};
 			smart_refctd_ptr<IGPUBuffer> buffer;
-			//smart_refctd_ptr<nbl::video::IGPUDescriptorSet> ds;
 			{
 				auto build_buffer = [this](
 					smart_refctd_ptr<ILogicalDevice> m_device,
@@ -487,6 +483,10 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			info1.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
 			info1.desc = m_gpuImgView;
 
+			IGPUDescriptorSet::SDescriptorInfo info2 = {};
+			info2.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+			info2.desc = m_gpuTonemapImgView;
+
 			IGPUDescriptorSet::SWriteDescriptorSet writeDescriptors[] = {
 				{
 					.dstSet = m_lumaPresentDS[0].get(),
@@ -494,10 +494,17 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					.arrayElement = 0,
 					.count = 1,
 					.info = &info1
+				},
+				{
+					.dstSet = m_lumaPresentDS[1].get(),
+					.binding = 0,
+					.arrayElement = 0,
+					.count = 1,
+					.info = &info2
 				}
 			};
 
-			m_device->updateDescriptorSets(1, writeDescriptors, 0, nullptr);
+			m_device->updateDescriptorSets(2, writeDescriptors, 0, nullptr);
 
 			queue->endCapture();
 		}
@@ -526,7 +533,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 
 			cmdbuf->bindComputePipeline(m_lumaMeterPipeline.get());
-			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_lumaMeterPipeline->getLayout(), 3, 1, &ds); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_lumaMeterPipeline->getLayout(), 0, 1, &ds); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
 			cmdbuf->dispatch(1 + (SampleCount[0] - 1) / SubgroupSize, 1 + (SampleCount[1] - 1) / SubgroupSize);
 			cmdbuf->end();
 		}

From bf08caa961286ca414602873717d123d26941b41 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 13 Aug 2024 18:56:57 +0530
Subject: [PATCH 19/50] Create compute and graphics resources separately and
 finish luma meter

---
 26_Autoexposure/main.cpp | 75 ++++++++++++++++++++++++++++++----------
 1 file changed, 57 insertions(+), 18 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 23689f6fe..e6c814bc0 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -149,7 +149,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 		}
 
-		auto queue = getGraphicsQueue();
+		auto graphicsQueue = getGraphicsQueue();
+		auto computeQueue = getComputeQueue();
 
 		// Gather swapchain resources
 		std::unique_ptr<CDefaultSwapchainFramebuffers> scResources;
@@ -280,17 +281,23 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 		}
 
 		// Init the surface and create the swapchain
-		if (!m_surface || !m_surface->init(queue, std::move(scResources), swapchainParams.sharedParams))
+		if (!m_surface || !m_surface->init(graphicsQueue, std::move(scResources), swapchainParams.sharedParams))
 			return logFail("Could not create Window & Surface or initialize the Surface!");
 
 		// need resetttable commandbuffers for the upload utility
 		{
-			m_cmdPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+			m_graphicsCmdPool = m_device->createCommandPool(graphicsQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+			m_computeCmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+
 			// create the commandbuffers
-			if (!m_cmdPool)
-				return logFail("Couldn't create Command Pool!");
-			if (!m_cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data(), 3 }))
-				return logFail("Couldn't create Command Buffer!");
+			if (!m_graphicsCmdPool || !m_computeCmdPool)
+				return logFail("Couldn't create Command Pools!");
+
+			if (
+				!m_graphicsCmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_graphicsCmdBufs.data(), 1 }) ||
+				!m_computeCmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_computeCmdBufs.data(), 2 })
+			)
+				return logFail("Couldn't create Command Buffers!");
 		}
 
 		// things for IUtilities
@@ -300,7 +307,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				return logFail("Could not create Scratch Semaphore");
 			m_scratchSemaphore->setObjectDebugName("Scratch Semaphore");
 			// we don't want to overcomplicate the example with multi-queue
-			m_intendedSubmit.queue = queue;
+			m_intendedSubmit.queue = graphicsQueue;
 			// wait for nothing before upload
 			m_intendedSubmit.waitSemaphores = {};
 			m_intendedSubmit.waitSemaphores = {};
@@ -409,7 +416,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			// we don't want to overcomplicate the example with multi-queue
 			auto queue = getGraphicsQueue();
-			auto cmdbuf = m_cmdBufs[0].get();
+			auto cmdbuf = m_graphicsCmdBufs[0].get();
 			IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = { cmdbuf };
 			m_intendedSubmit.commandBuffers = { &cmdbufInfo, 1 };
 
@@ -515,15 +522,11 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	// We do a very simple thing, display an image and wait `DisplayImageMs` to show it
 	inline void workLoopBody() override
 	{
-		// Acquire
-		//auto acquire = m_surface->acquireNextImage();
-		//if (!acquire)
-		//	return;
-
 		// Luma Meter
 		{
 			auto queue = getComputeQueue();
-			auto cmdbuf = m_cmdBufs[0].get();
+			auto cmdbuf = m_computeCmdBufs[0].get();
+			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
 			auto ds = m_lumaPresentDS[0].get();
 
 			const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
@@ -533,11 +536,47 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 
 			cmdbuf->bindComputePipeline(m_lumaMeterPipeline.get());
-			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_lumaMeterPipeline->getLayout(), 0, 1, &ds); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_lumaMeterPipeline->getLayout(), 0, 1, &ds); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
 			cmdbuf->dispatch(1 + (SampleCount[0] - 1) / SubgroupSize, 1 + (SampleCount[1] - 1) / SubgroupSize);
 			cmdbuf->end();
+
+			{
+				IQueue::SSubmitInfo submit_infos[1];
+				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
+					{
+						.cmdbuf = cmdbuf
+					}
+				};
+				submit_infos[0].commandBuffers = cmdBufs;
+				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
+					{
+						.semaphore = m_lumaMeterSemaphore.get(),
+						.value = m_submitIx + 1,
+						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+					}
+				};
+				submit_infos[0].signalSemaphores = signals;
+
+				queue->submit(submit_infos);
+				queue->endCapture();
+			}
+
+			const ISemaphore::SWaitInfo wait_infos[] = {
+				{
+					.semaphore = m_lumaMeterSemaphore.get(),
+					.value = m_submitIx + 1
+				}
+			};
+			m_device->blockForSemaphores(wait_infos);
 		}
 
+		m_submitIx++;
+
+		// Acquire
+		//auto acquire = m_surface->acquireNextImage();
+		//if (!acquire)
+		//	return;
+
 		// Render to the swapchain
 		/*{
 			cmdbuf3->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
@@ -656,8 +695,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	std::array<smart_refctd_ptr<IGPUDescriptorSet>, ISwapchain::MaxImages> m_lumaPresentDS, m_tonemapperDS;
 
 	// Command Buffers
-	smart_refctd_ptr<IGPUCommandPool> m_cmdPool;
-	std::array<smart_refctd_ptr<IGPUCommandBuffer>, ISwapchain::MaxImages> m_cmdBufs;
+	smart_refctd_ptr<IGPUCommandPool> m_graphicsCmdPool, m_computeCmdPool;
+	std::array<smart_refctd_ptr<IGPUCommandBuffer>, ISwapchain::MaxImages> m_graphicsCmdBufs, m_computeCmdBufs;
 
 	// Semaphores
 	smart_refctd_ptr<ISemaphore> m_lumaMeterSemaphore, m_tonemapperSemaphore, m_presentSemaphore;

From b342c6c05e06dd348ce4cdd66fd65e12b89503bf Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 13 Aug 2024 19:28:42 +0530
Subject: [PATCH 20/50] Fix descriptor binding for luma_meter

---
 26_Autoexposure/app_resources/luma_meter.comp.hlsl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index 9a3b5c98a..bd05198b8 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -5,8 +5,8 @@
 #include "nbl/builtin/hlsl/luma_meter/luma_meter.hlsl"
 #include "app_resources/common.hlsl"
 
-[[vk::combinedImageSampler]] [[vk::binding(0, 1)]] Texture2D texture;
-[[vk::combinedImageSampler]] [[vk::binding(0, 1)]] SamplerState samplerState;
+[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] Texture2D texture;
+[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] SamplerState samplerState;
 
 [[vk::push_constant]] AutoexposurePushData pushData;
 

From 817c4a7bb2dfc99251299ed908b7126690986441 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 13 Aug 2024 21:18:19 +0530
Subject: [PATCH 21/50] Create separate pipeline layouts for luma and present

---
 26_Autoexposure/main.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index e6c814bc0..99af2093a 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -249,8 +249,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			auto lumaMeterShader = loadCompileAndCreateShader("app_resources/luma_meter.comp.hlsl");
 			if (!lumaMeterShader)
 				return logFail("Failed to Load and Compile Compute Shader: lumaMeterShader!");
-			auto lumaPresentLayout = m_device->createPipelineLayout({ &pcRange, 1 }, core::smart_refctd_ptr(lumaPresentDSLayout), core::smart_refctd_ptr(lumaPresentDSLayout), nullptr, nullptr);
-			if (!createComputePipeline(lumaMeterShader, m_lumaMeterPipeline, lumaPresentLayout))
+			auto lumaLayout = m_device->createPipelineLayout({ &pcRange, 1 }, core::smart_refctd_ptr(lumaPresentDSLayout), nullptr, nullptr, nullptr);
+			if (!createComputePipeline(lumaMeterShader, m_lumaMeterPipeline, lumaLayout))
 				return logFail("Could not create Luma Meter Pipeline!");
 
 			// Tonemapper
@@ -275,7 +275,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				.entryPoint = "main",
 				.shader = fragmentShader.get()
 			};
-			m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, lumaPresentLayout.get(), scResources->getRenderpass());
+			auto presentLayout = m_device->createPipelineLayout({ &pcRange, 1 }, nullptr, core::smart_refctd_ptr(lumaPresentDSLayout), nullptr, nullptr);
+			m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scResources->getRenderpass());
 			if (!m_presentPipeline)
 				return logFail("Could not create Graphics Pipeline!");
 		}

From 7f89542ebd21ddc9b23f1acffb95111f49b1ae52 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 13 Aug 2024 21:18:43 +0530
Subject: [PATCH 22/50] Setup luma_meter.comp.hlsl

---
 26_Autoexposure/app_resources/common.hlsl     |  7 +++
 .../app_resources/luma_meter.comp.hlsl        | 48 +++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/26_Autoexposure/app_resources/common.hlsl b/26_Autoexposure/app_resources/common.hlsl
index f2b21b7e4..3735da6a9 100644
--- a/26_Autoexposure/app_resources/common.hlsl
+++ b/26_Autoexposure/app_resources/common.hlsl
@@ -5,9 +5,16 @@
 #ifndef _AUTOEXPOSURE_COMMON_INCLUDED_
 #define _AUTOEXPOSURE_COMMON_INCLUDED_
 
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
 struct AutoexposurePushData
 {
+    float meteringWindowScaleX, meteringWindowScaleY;
+    float meteringWindowOffsetX, meteringWindowOffsetY;
+    float lumaMin, lumaMax;
+    uint32_t sampleCountX, sampleCountY;
     uint32_t viewportSizeX, viewportSizeY;
+    uint64_t lumaMeterBDA;
 };
 
 #endif
\ No newline at end of file
diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index bd05198b8..fffd80988 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -3,6 +3,7 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 
 #include "nbl/builtin/hlsl/luma_meter/luma_meter.hlsl"
+#include "nbl/builtin/hlsl/bda/bda_accessor.hlsl"
 #include "app_resources/common.hlsl"
 
 [[vk::combinedImageSampler]] [[vk::binding(0, 0)]] Texture2D texture;
@@ -10,6 +11,36 @@
 
 [[vk::push_constant]] AutoexposurePushData pushData;
 
+using Ptr = nbl::hlsl::bda::__ptr < uint32_t >;
+using PtrAccessor = nbl::hlsl::BdaAccessor < uint32_t >;
+
+groupshared float32_t sdata[WorkgroupSize];
+
+struct SharedAccessor
+{
+    uint32_t get(const uint32_t index)
+    {
+        return sdata[index];
+    }
+
+    void set(const uint32_t index, const uint32_t value)
+    {
+        sdata[index] = value;
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        nbl::hlsl::glsl::barrier();
+    }
+};
+
+struct TexAccessor
+{
+    float32_t3 get(float32_t2 uv) {
+        return texture.Sample(samplerState, uv).rgb;
+    }
+};
+
 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize()
 {
     return uint32_t3(WorkgroupSize, 1, 1);
@@ -18,4 +49,21 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize()
 [numthreads(DeviceSubgroupSize, DeviceSubgroupSize, 1)]
 void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
 {
+    nbl::hlsl::luma_meter::LumaMeteringWindow luma_meter_window;
+    luma_meter_window.meteringWindowScale = float32_t2(pushData.meteringWindowScaleX, pushData.meteringWindowScaleY);
+    luma_meter_window.meteringWindowOffset = float32_t2(pushData.meteringWindowOffsetX, pushData.meteringWindowOffsetY);
+
+    const Ptr val_ptr = Ptr::create(pushData.lumaMeterBDA);
+    PtrAccessor val_accessor = PtrAccessor::create(val_ptr);
+
+    SharedAccessor sdata;
+    TexAccessor tex;
+
+    using LumaMeter = nbl::hlsl::luma_meter::geom_luma_meter< WorkgroupSize, PtrAccessor, SharedAccessor, TexAccessor>;
+    LumaMeter meter = LumaMeter::create(luma_meter_window, pushData.lumaMin, pushData.lumaMax);
+
+    uint32_t2 sampleCount = uint32_t2(pushData.sampleCountX, pushData.sampleCountY);
+    uint32_t2 viewportSize = uint32_t2(pushData.viewportSizeX, pushData.viewportSizeY);
+
+    meter.gatherLuma(val_accessor, tex, sdata, sampleCount, viewportSize);
 }

From defd45eaec2130b6c2b5aefbae524638755afa1a Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 13 Aug 2024 21:40:04 +0530
Subject: [PATCH 23/50] Pass push constants

---
 26_Autoexposure/main.cpp | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 99af2093a..471d7f169 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -27,8 +27,11 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	using clock_t = std::chrono::steady_clock;
 
 	constexpr static inline std::string_view DefaultImagePathsFile = "../../media/noises/spp_benchmark_4k_512.exr";
-	constexpr static inline std::array<int, 2> Dimensions = { 1280, 720 };
-	constexpr static inline std::array<int, 2> SampleCount = { 10000, 10000 };
+	constexpr static inline std::array<uint32_t, 2> Dimensions = { 1280, 720 };
+	constexpr static inline std::array<uint32_t, 2> SampleCount = { 10000, 10000 };
+	constexpr static inline std::array<float, 2> MeteringWindowScale = { 0.5f, 0.5f };
+	constexpr static inline std::array<float, 2> MeteringWindowOffset = { 0.25f, 0.25f };
+	constexpr static inline std::array<float, 2> LumaMinMax = { 1.0f / 4096.0f, 32768.0f };
 
 public:
 	// Yay thanks to multiple inheritance we cannot forward ctors anymore
@@ -353,8 +356,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					assert(allocation->memory.get() == buffer->getBoundMemory().memory);
 				};
 
-				auto x = m_physicalDevice->getLimits();
-
 				build_buffer(m_device, &allocation, buffer, m_physicalDevice->getLimits().maxSubgroupSize, "Luma Gather Buffer");
 			}
 			m_lumaGatherBDA = buffer->getDeviceAddress();
@@ -531,13 +532,27 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			auto ds = m_lumaPresentDS[0].get();
 
 			const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+			auto pc = AutoexposurePushData
+			{
+				.meteringWindowScaleX = MeteringWindowScale[0] * m_gpuImg->getCreationParameters().extent.width,
+				.meteringWindowScaleY = MeteringWindowScale[1] * m_gpuImg->getCreationParameters().extent.height,
+				.meteringWindowOffsetX = MeteringWindowOffset[0] * m_gpuImg->getCreationParameters().extent.width,
+				.meteringWindowOffsetY = MeteringWindowOffset[1] * m_gpuImg->getCreationParameters().extent.height,
+				.lumaMin = LumaMinMax[0],
+				.lumaMax = LumaMinMax[1],
+				.sampleCountX = SampleCount[0],
+				.sampleCountY = SampleCount[1],
+				.viewportSizeX = m_gpuImg->getCreationParameters().extent.width,
+				.viewportSizeY = m_gpuImg->getCreationParameters().extent.height,
+				.lumaMeterBDA = m_lumaGatherBDA
+			};
 
 			queue->startCapture();
 
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-
 			cmdbuf->bindComputePipeline(m_lumaMeterPipeline.get());
 			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_lumaMeterPipeline->getLayout(), 0, 1, &ds); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
+			cmdbuf->pushConstants(m_lumaMeterPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
 			cmdbuf->dispatch(1 + (SampleCount[0] - 1) / SubgroupSize, 1 + (SampleCount[1] - 1) / SubgroupSize);
 			cmdbuf->end();
 

From f6f8154146010a09c240aa3576a99955b5779429 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 14 Aug 2024 19:01:24 +0530
Subject: [PATCH 24/50] Record draw pass correctly

---
 26_Autoexposure/main.cpp | 114 +++++++++++++++++++++++----------------
 1 file changed, 69 insertions(+), 45 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 471d7f169..4a193e9dc 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -494,7 +494,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			IGPUDescriptorSet::SDescriptorInfo info2 = {};
 			info2.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-			info2.desc = m_gpuTonemapImgView;
+			info2.desc = m_gpuImgView; // FIXME: temporarily pass in input image
 
 			IGPUDescriptorSet::SWriteDescriptorSet writeDescriptors[] = {
 				{
@@ -586,16 +586,41 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			m_device->blockForSemaphores(wait_infos);
 		}
 
-		m_submitIx++;
+		// Tonemapper
+		{
+		}
+
+		// Render to swapchain
+		{
+			// Acquire
+			auto acquire = m_surface->acquireNextImage();
+			if (!acquire)
+				return;
+
+			auto queue = getGraphicsQueue();
+			auto cmdbuf = m_graphicsCmdBufs[0].get();
+			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
+			auto ds = m_lumaPresentDS[1].get();
+
+			const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+			auto pc = AutoexposurePushData
+			{
+				.meteringWindowScaleX = MeteringWindowScale[0] * m_gpuImg->getCreationParameters().extent.width,
+				.meteringWindowScaleY = MeteringWindowScale[1] * m_gpuImg->getCreationParameters().extent.height,
+				.meteringWindowOffsetX = MeteringWindowOffset[0] * m_gpuImg->getCreationParameters().extent.width,
+				.meteringWindowOffsetY = MeteringWindowOffset[1] * m_gpuImg->getCreationParameters().extent.height,
+				.lumaMin = LumaMinMax[0],
+				.lumaMax = LumaMinMax[1],
+				.sampleCountX = SampleCount[0],
+				.sampleCountY = SampleCount[1],
+				.viewportSizeX = m_gpuImg->getCreationParameters().extent.width,
+				.viewportSizeY = m_gpuImg->getCreationParameters().extent.height,
+				.lumaMeterBDA = m_lumaGatherBDA
+			};
 
-		// Acquire
-		//auto acquire = m_surface->acquireNextImage();
-		//if (!acquire)
-		//	return;
+			queue->startCapture();
 
-		// Render to the swapchain
-		/*{
-			cmdbuf3->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 
 			const VkRect2D currentRenderArea =
 			{
@@ -609,9 +634,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					.width = float(m_window->getWidth()),
 					.height = float(m_window->getHeight())
 				};
-				cmdbuf3->setViewport({ &viewport, 1 });
+				cmdbuf->setViewport({ &viewport, 1 });
 			}
-			cmdbuf3->setScissor({ &currentRenderArea, 1 });
+			cmdbuf->setScissor({ &currentRenderArea, 1 });
 
 			// begin the renderpass
 			{
@@ -623,28 +648,26 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					.depthStencilClearValues = nullptr,
 					.renderArea = currentRenderArea
 				};
-				cmdbuf3->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+				cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
 			}
 
-			cmdbuf3->bindGraphicsPipeline(m_presentPipeline.get());
-			cmdbuf3->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_presentPipeline->getLayout(), 3, 1, &ds);
-			ext::FullScreenTriangle::recordDrawCall(cmdbuf3);
-			cmdbuf3->endRenderPass();
+			cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_presentPipeline->getLayout(), 1, 1, &ds);
+			ext::FullScreenTriangle::recordDrawCall(cmdbuf);
+			cmdbuf->endRenderPass();
 
-			cmdbuf3->end();
-		}
+			cmdbuf->end();
 
-		// submit
-		const IQueue::SSubmitInfo::SSemaphoreInfo rendered[1] = { {
-			.semaphore = m_presentSemaphore.get(),
-			.value = ++m_submitIx,
-			// just as we've outputted all pixels, signal
-			.stageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT
-		} };
-		{
+			// submit
+			const IQueue::SSubmitInfo::SSemaphoreInfo rendered[1] = { {
+				.semaphore = m_presentSemaphore.get(),
+				.value = m_submitIx + 1,
+				// just as we've outputted all pixels, signal
+				.stageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT
+			} };
 			{
 				const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
-					.cmdbuf = cmdbuf3
+					.cmdbuf = cmdbuf
 				} };
 				// we don't need to wait for the transfer semaphore, because we submit everything to the same queue
 				const IQueue::SSubmitInfo::SSemaphoreInfo acquired[1] = { {
@@ -657,27 +680,28 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					.commandBuffers = commandBuffers,
 					.signalSemaphores = rendered
 				} };
-				// we won't signal the sema if no success
-				if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
-					m_submitIx--;
+
+				queue->submit(infos);
 			}
-		}
 
-		// Present
-		m_surface->present(acquire.imageIndex, rendered);
-		getGraphicsQueue()->endCapture();
+			// Present
+			m_surface->present(acquire.imageIndex, rendered);
+			queue->endCapture();
 
-		// Wait for completion
-		{
-			const ISemaphore::SWaitInfo cmdbufDonePending[] = {
-				{
-					.semaphore = m_presentSemaphore.get(),
-					.value = m_submitIx
-				}
-			};
-			if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
-				return;
-		}*/
+			// Wait for completion
+			{
+				const ISemaphore::SWaitInfo cmdbufDonePending[] = {
+					{
+						.semaphore = m_presentSemaphore.get(),
+						.value = m_submitIx
+					}
+				};
+				if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
+					return;
+			}
+		}
+
+		m_submitIx++;
 	}
 
 	inline bool keepRunning() override

From 64eb610d6ecaac71cb3cfa8e051c7623d754793d Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 14 Aug 2024 19:31:35 +0530
Subject: [PATCH 25/50] Add a pipeline barrier to transition image layout

---
 26_Autoexposure/main.cpp | 59 ++++++++++++++++++++++++++++------------
 1 file changed, 42 insertions(+), 17 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 4a193e9dc..8f0895686 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -423,9 +423,15 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			m_intendedSubmit.commandBuffers = { &cmdbufInfo, 1 };
 
 			// there's no previous operation to wait for
-			const SMemoryBarrier toTransferBarrier = {
-				.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
-				.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
+			const SMemoryBarrier transferBarriers[] = {
+				{
+					.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+					.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
+				},
+				{
+					.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+					.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+				}
 			};
 
 			// upload image and write to descriptor set
@@ -433,20 +439,36 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 			// change the layout of the image
-			const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers[] = { {
-				.barrier = {
-					.dep = toTransferBarrier
-					// no ownership transfers
-				},
-				.image = m_gpuImg.get(),
-				// transition the whole view
-				.subresourceRange = cpuImgParams.subresourceRange,
-				// a wiping transition
-				.newLayout = IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL
-			} };
-			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers });
-			// upload contents and submit right away
-			m_utils->updateImageViaStagingBufferAutoSubmit(
+			const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers1[] = {
+				{
+					.barrier = {
+						.dep = transferBarriers[0]
+						// no ownership transfers
+					},
+					.image = m_gpuImg.get(),
+					// transition the whole view
+					.subresourceRange = cpuImgParams.subresourceRange,
+					// a wiping transition
+					.newLayout = IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL
+				}
+			};
+			const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers2[] = {
+				{
+					.barrier = {
+						.dep = transferBarriers[1]
+						// no ownership transfers
+					},
+					.image = m_gpuImg.get(),
+					// transition the whole view
+					.subresourceRange = cpuImgParams.subresourceRange,
+					// a wiping transition
+					.oldLayout = IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+					.newLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL
+				}
+			};
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers1 });
+			// upload contents
+			m_utils->updateImageViaStagingBuffer(
 				m_intendedSubmit,
 				cpuImgParams.image->getBuffer(),
 				cpuImgParams.image->getCreationParameters().format,
@@ -454,6 +476,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL,
 				cpuImgParams.image->getRegions()
 			);
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers2 });
+			m_utils->autoSubmit(m_intendedSubmit, [&](SIntendedSubmitInfo& nextSubmit) -> bool { return true; });
+
 			IGPUImageView::SCreationParams gpuImgViewParams = {
 				.image = m_gpuImg,
 				.viewType = IGPUImageView::ET_2D,

From 3d3d64693993846a008988757a8ed4effbccceab Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 14 Aug 2024 20:18:06 +0530
Subject: [PATCH 26/50] Record tonemapping pass

---
 26_Autoexposure/main.cpp | 60 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 8f0895686..8686ed77d 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -79,7 +79,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			* as evidenced by the name of the field in the SBinding.
 			* Samplers for combined image samplers can also be mutable, which for a binding of a descriptor set is specified also at creation time by leaving the immutableSamplers
 			* field set to its default (nullptr).
-			*/
+		*/
 		smart_refctd_ptr<IGPUDescriptorSetLayout> lumaPresentDSLayout, tonemapperDSLayout;
 		{
 			auto defaultSampler = m_device->createSampler(
@@ -613,6 +613,64 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 		// Tonemapper
 		{
+			auto queue = getComputeQueue();
+			auto cmdbuf = m_computeCmdBufs[1].get();
+			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
+			auto ds = m_tonemapperDS[0].get();
+
+			const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+			auto pc = AutoexposurePushData
+			{
+				.meteringWindowScaleX = MeteringWindowScale[0] * m_gpuImg->getCreationParameters().extent.width,
+				.meteringWindowScaleY = MeteringWindowScale[1] * m_gpuImg->getCreationParameters().extent.height,
+				.meteringWindowOffsetX = MeteringWindowOffset[0] * m_gpuImg->getCreationParameters().extent.width,
+				.meteringWindowOffsetY = MeteringWindowOffset[1] * m_gpuImg->getCreationParameters().extent.height,
+				.lumaMin = LumaMinMax[0],
+				.lumaMax = LumaMinMax[1],
+				.sampleCountX = SampleCount[0],
+				.sampleCountY = SampleCount[1],
+				.viewportSizeX = m_gpuImg->getCreationParameters().extent.width,
+				.viewportSizeY = m_gpuImg->getCreationParameters().extent.height,
+				.lumaMeterBDA = m_lumaGatherBDA
+			};
+
+			queue->startCapture();
+
+			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			cmdbuf->bindComputePipeline(m_tonemapperPipeline.get());
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_tonemapperPipeline->getLayout(), 0, 1, &ds); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
+			cmdbuf->pushConstants(m_tonemapperPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
+			cmdbuf->dispatch(1 + (SampleCount[0] - 1) / SubgroupSize, 1 + (SampleCount[1] - 1) / SubgroupSize);
+			cmdbuf->end();
+
+			{
+				IQueue::SSubmitInfo submit_infos[1];
+				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
+					{
+						.cmdbuf = cmdbuf
+					}
+				};
+				submit_infos[0].commandBuffers = cmdBufs;
+				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
+					{
+						.semaphore = m_tonemapperSemaphore.get(),
+						.value = m_submitIx + 1,
+						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+					}
+				};
+				submit_infos[0].signalSemaphores = signals;
+
+				queue->submit(submit_infos);
+				queue->endCapture();
+			}
+
+			const ISemaphore::SWaitInfo wait_infos[] = {
+				{
+					.semaphore = m_tonemapperSemaphore.get(),
+					.value = m_submitIx + 1
+				}
+			};
+			m_device->blockForSemaphores(wait_infos);
 		}
 
 		// Render to swapchain

From b4102dc43c84a06dda323e316c5d815f2899497e Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 16 Aug 2024 03:19:22 +0530
Subject: [PATCH 27/50] Revert "Record tonemapping pass"

This reverts commit 3d3d64693993846a008988757a8ed4effbccceab.
---
 26_Autoexposure/main.cpp | 60 +---------------------------------------
 1 file changed, 1 insertion(+), 59 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 8686ed77d..8f0895686 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -79,7 +79,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			* as evidenced by the name of the field in the SBinding.
 			* Samplers for combined image samplers can also be mutable, which for a binding of a descriptor set is specified also at creation time by leaving the immutableSamplers
 			* field set to its default (nullptr).
-		*/
+			*/
 		smart_refctd_ptr<IGPUDescriptorSetLayout> lumaPresentDSLayout, tonemapperDSLayout;
 		{
 			auto defaultSampler = m_device->createSampler(
@@ -613,64 +613,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 		// Tonemapper
 		{
-			auto queue = getComputeQueue();
-			auto cmdbuf = m_computeCmdBufs[1].get();
-			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
-			auto ds = m_tonemapperDS[0].get();
-
-			const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
-			auto pc = AutoexposurePushData
-			{
-				.meteringWindowScaleX = MeteringWindowScale[0] * m_gpuImg->getCreationParameters().extent.width,
-				.meteringWindowScaleY = MeteringWindowScale[1] * m_gpuImg->getCreationParameters().extent.height,
-				.meteringWindowOffsetX = MeteringWindowOffset[0] * m_gpuImg->getCreationParameters().extent.width,
-				.meteringWindowOffsetY = MeteringWindowOffset[1] * m_gpuImg->getCreationParameters().extent.height,
-				.lumaMin = LumaMinMax[0],
-				.lumaMax = LumaMinMax[1],
-				.sampleCountX = SampleCount[0],
-				.sampleCountY = SampleCount[1],
-				.viewportSizeX = m_gpuImg->getCreationParameters().extent.width,
-				.viewportSizeY = m_gpuImg->getCreationParameters().extent.height,
-				.lumaMeterBDA = m_lumaGatherBDA
-			};
-
-			queue->startCapture();
-
-			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			cmdbuf->bindComputePipeline(m_tonemapperPipeline.get());
-			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_tonemapperPipeline->getLayout(), 0, 1, &ds); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
-			cmdbuf->pushConstants(m_tonemapperPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
-			cmdbuf->dispatch(1 + (SampleCount[0] - 1) / SubgroupSize, 1 + (SampleCount[1] - 1) / SubgroupSize);
-			cmdbuf->end();
-
-			{
-				IQueue::SSubmitInfo submit_infos[1];
-				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
-					{
-						.cmdbuf = cmdbuf
-					}
-				};
-				submit_infos[0].commandBuffers = cmdBufs;
-				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
-					{
-						.semaphore = m_tonemapperSemaphore.get(),
-						.value = m_submitIx + 1,
-						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
-					}
-				};
-				submit_infos[0].signalSemaphores = signals;
-
-				queue->submit(submit_infos);
-				queue->endCapture();
-			}
-
-			const ISemaphore::SWaitInfo wait_infos[] = {
-				{
-					.semaphore = m_tonemapperSemaphore.get(),
-					.value = m_submitIx + 1
-				}
-			};
-			m_device->blockForSemaphores(wait_infos);
 		}
 
 		// Render to swapchain

From 8307e926f95b568d319ff83c109abe3313fc7fed Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 16 Aug 2024 03:32:16 +0530
Subject: [PATCH 28/50] Remove separate tonemapping pass

---
 26_Autoexposure/main.cpp | 81 ++++------------------------------------
 1 file changed, 7 insertions(+), 74 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 8f0895686..0be4c9c3a 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -18,8 +18,6 @@ using namespace asset;
 using namespace ui;
 using namespace video;
 
-//#include "app_resources/push_constants.hlsl"
-
 class AutoexposureApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
 	using device_base_t = examples::SimpleWindowedApplication;
@@ -80,7 +78,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			* Samplers for combined image samplers can also be mutable, which for a binding of a descriptor set is specified also at creation time by leaving the immutableSamplers
 			* field set to its default (nullptr).
 			*/
-		smart_refctd_ptr<IGPUDescriptorSetLayout> lumaPresentDSLayout, tonemapperDSLayout;
+		smart_refctd_ptr<IGPUDescriptorSetLayout> lumaPresentDSLayout;
 		{
 			auto defaultSampler = m_device->createSampler(
 				{
@@ -101,31 +99,14 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			lumaPresentDSLayout = m_device->createDescriptorSetLayout(lumaPresentBindings);
 			if (!lumaPresentDSLayout)
 				return logFail("Failed to Create Descriptor Layout: lumaPresentDSLayout");
-
-			const IGPUDescriptorSetLayout::SBinding tonemapperBindings[1] = {
-				{
-					.binding = 0,
-					.type = IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
-					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
-					.count = 1,
-					.immutableSamplers = &defaultSampler
-				}
-			};
-			tonemapperDSLayout = m_device->createDescriptorSetLayout(tonemapperBindings);
-			if (!tonemapperDSLayout)
-				return logFail("Failed to Create Descriptor Layout: tonemapperDSLayout");
 		}
 
 		// Create semaphores
 		m_lumaMeterSemaphore = m_device->createSemaphore(m_submitIx);
-		m_tonemapperSemaphore = m_device->createSemaphore(m_submitIx);
 		m_presentSemaphore = m_device->createSemaphore(m_submitIx);
 
 		// create the descriptor sets and with enough room
 		{
-			constexpr uint32_t tonemapperSetCount = 1;
-
 			core::smart_refctd_ptr<IDescriptorPool> lumaPresentPool;
 			{
 				const video::IGPUDescriptorSetLayout* const layouts[] = { lumaPresentDSLayout.get(), lumaPresentDSLayout.get() };
@@ -133,23 +114,13 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				lumaPresentPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
 			}
 
-			auto tonemapperPool = m_device->createDescriptorPoolForDSLayouts(
-				IDescriptorPool::E_CREATE_FLAGS::ECF_NONE,
-				{ &tonemapperDSLayout.get(), 1 },
-				&tonemapperSetCount
-			);
-
-			if (!lumaPresentPool || !tonemapperPool)
+			if (!lumaPresentPool)
 				return logFail("Failed to Create Descriptor Pools");
 
 			m_lumaPresentDS[0] = lumaPresentPool->createDescriptorSet(core::smart_refctd_ptr(lumaPresentDSLayout));
 			m_lumaPresentDS[1] = lumaPresentPool->createDescriptorSet(core::smart_refctd_ptr(lumaPresentDSLayout));
 			if (!m_lumaPresentDS[0] || !m_lumaPresentDS[1])
 				return logFail("Could not create Descriptor Set: lumaPresentDS!");
-			m_tonemapperDS[0] = tonemapperPool->createDescriptorSet(core::smart_refctd_ptr(tonemapperDSLayout));
-			if (!m_tonemapperDS[0])
-				return logFail("Could not create Descriptor Set: tonemapperDS!");
-
 		}
 
 		auto graphicsQueue = getGraphicsQueue();
@@ -256,14 +227,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			if (!createComputePipeline(lumaMeterShader, m_lumaMeterPipeline, lumaLayout))
 				return logFail("Could not create Luma Meter Pipeline!");
 
-			// Tonemapper
-			auto tonemapperShader = loadCompileAndCreateShader("app_resources/tonemapper.comp.hlsl");
-			if (!tonemapperShader)
-				return logFail("Failed to Load and Compile Compute Shader: tonemapperShader!");
-			auto tonemapperLayout = m_device->createPipelineLayout({ &pcRange, 1 }, core::smart_refctd_ptr(tonemapperDSLayout), nullptr, nullptr, nullptr);
-			if (!createComputePipeline(tonemapperShader, m_tonemapperPipeline, tonemapperLayout))
-				return logFail("Could not create Luma Meter Pipeline!");
-
 			// Load FSTri Shader
 			ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
 			if (!fsTriProtoPPln)
@@ -487,32 +450,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			m_gpuImgView = m_device->createImageView(std::move(gpuImgViewParams));
 
-			// Allocate and create texture for tonemapping
-			imageParams = {};
-			imageParams = m_gpuImg->getCreationParameters();
-			// promote format because RGB8 and friends don't actually exist in HW
-			{
-				const IPhysicalDevice::SImageFormatPromotionRequest request = {
-					.originalFormat = imageParams.format,
-					.usages = IPhysicalDevice::SFormatImageUsages::SUsage(imageParams.usage)
-				};
-				imageParams.format = m_physicalDevice->promoteImageFormat(request, imageParams.tiling);
-			}
-			if (imageParams.type == IGPUImage::ET_3D)
-				imageParams.flags |= IGPUImage::ECF_2D_ARRAY_COMPATIBLE_BIT;
-			m_gpuTonemapImg = m_device->createImage(std::move(imageParams));
-			if (!m_gpuTonemapImg || !m_device->allocate(m_gpuTonemapImg->getMemoryReqs(), m_gpuTonemapImg.get()).isValid())
-				return false;
-			m_gpuTonemapImg->setObjectDebugName("Autoexposure Tonemapper Image");
-
-			IGPUImageView::SCreationParams gpuTonemapImgViewParams = {
-				.image = m_gpuTonemapImg,
-				.viewType = IGPUImageView::ET_2D,
-				.format = m_gpuTonemapImg->getCreationParameters().format
-			};
-
-			m_gpuTonemapImgView = m_device->createImageView(std::move(gpuTonemapImgViewParams));
-
 			IGPUDescriptorSet::SDescriptorInfo info1 = {};
 			info1.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
 			info1.desc = m_gpuImgView;
@@ -611,10 +548,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			m_device->blockForSemaphores(wait_infos);
 		}
 
-		// Tonemapper
-		{
-		}
-
 		// Render to swapchain
 		{
 			// Acquire
@@ -745,8 +678,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 protected:
 	uint64_t m_lumaGatherBDA;
-	smart_refctd_ptr<IGPUImage> m_gpuImg, m_gpuTonemapImg;
-	smart_refctd_ptr<IGPUImageView> m_gpuImgView, m_gpuTonemapImgView;
+	smart_refctd_ptr<IGPUImage> m_gpuImg;
+	smart_refctd_ptr<IGPUImageView> m_gpuImgView;
 
 	// for image uploads
 	smart_refctd_ptr<ISemaphore> m_scratchSemaphore;
@@ -754,17 +687,17 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 	// Pipelines
 	smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
-	smart_refctd_ptr<IGPUComputePipeline> m_lumaMeterPipeline, m_tonemapperPipeline;
+	smart_refctd_ptr<IGPUComputePipeline> m_lumaMeterPipeline;
 
 	// Descriptor Sets
-	std::array<smart_refctd_ptr<IGPUDescriptorSet>, ISwapchain::MaxImages> m_lumaPresentDS, m_tonemapperDS;
+	std::array<smart_refctd_ptr<IGPUDescriptorSet>, ISwapchain::MaxImages> m_lumaPresentDS;
 
 	// Command Buffers
 	smart_refctd_ptr<IGPUCommandPool> m_graphicsCmdPool, m_computeCmdPool;
 	std::array<smart_refctd_ptr<IGPUCommandBuffer>, ISwapchain::MaxImages> m_graphicsCmdBufs, m_computeCmdBufs;
 
 	// Semaphores
-	smart_refctd_ptr<ISemaphore> m_lumaMeterSemaphore, m_tonemapperSemaphore, m_presentSemaphore;
+	smart_refctd_ptr<ISemaphore> m_lumaMeterSemaphore, m_presentSemaphore;
 	uint64_t m_submitIx = 0;
 
 	// window

From 7b5ca0522d21fcd670ecea6d4193b159dd7fb2de Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 16 Aug 2024 15:32:40 +0530
Subject: [PATCH 29/50] Compute final EV value on CPU

---
 26_Autoexposure/app_resources/common.hlsl |  1 +
 26_Autoexposure/main.cpp                  | 37 ++++++++++++++++++++---
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/26_Autoexposure/app_resources/common.hlsl b/26_Autoexposure/app_resources/common.hlsl
index 3735da6a9..07993d58d 100644
--- a/26_Autoexposure/app_resources/common.hlsl
+++ b/26_Autoexposure/app_resources/common.hlsl
@@ -12,6 +12,7 @@ struct AutoexposurePushData
     float meteringWindowScaleX, meteringWindowScaleY;
     float meteringWindowOffsetX, meteringWindowOffsetY;
     float lumaMin, lumaMax;
+    float EV;
     uint32_t sampleCountX, sampleCountY;
     uint32_t viewportSizeX, viewportSizeY;
     uint64_t lumaMeterBDA;
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 0be4c9c3a..faba912f4 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -290,7 +290,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 		// Allocate and create buffer for Luma Gather
 		{
 			// Allocate memory
-			nbl::video::IDeviceMemoryAllocator::SAllocation allocation = {};
+			m_lumaGatherAllocation = {};
 			smart_refctd_ptr<IGPUBuffer> buffer;
 			{
 				auto build_buffer = [this](
@@ -319,9 +319,13 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					assert(allocation->memory.get() == buffer->getBoundMemory().memory);
 				};
 
-				build_buffer(m_device, &allocation, buffer, m_physicalDevice->getLimits().maxSubgroupSize, "Luma Gather Buffer");
+				build_buffer(m_device, &m_lumaGatherAllocation, buffer, m_physicalDevice->getLimits().maxSubgroupSize, "Luma Gather Buffer");
 			}
 			m_lumaGatherBDA = buffer->getDeviceAddress();
+
+			auto mapped_memory = m_lumaGatherAllocation.memory->map({ 0ull, m_lumaGatherAllocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ);
+			if (!mapped_memory)
+				return logFail("Failed to map the Device Memory!\n");
 		}
 
 		// Allocate and Leave 1/4 for image uploads, to test image copy with small memory remaining
@@ -486,6 +490,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	// We do a very simple thing, display an image and wait `DisplayImageMs` to show it
 	inline void workLoopBody() override
 	{
+		const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+
 		// Luma Meter
 		{
 			auto queue = getComputeQueue();
@@ -493,7 +499,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
 			auto ds = m_lumaPresentDS[0].get();
 
-			const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 			auto pc = AutoexposurePushData
 			{
 				.meteringWindowScaleX = MeteringWindowScale[0] * m_gpuImg->getCreationParameters().extent.width,
@@ -502,6 +507,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				.meteringWindowOffsetY = MeteringWindowOffset[1] * m_gpuImg->getCreationParameters().extent.height,
 				.lumaMin = LumaMinMax[0],
 				.lumaMax = LumaMinMax[1],
+				.EV = 0.0f,
 				.sampleCountX = SampleCount[0],
 				.sampleCountY = SampleCount[1],
 				.viewportSizeX = m_gpuImg->getCreationParameters().extent.width,
@@ -548,6 +554,27 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			m_device->blockForSemaphores(wait_infos);
 		}
 
+		// Get EV
+		{
+			const auto memory_range = ILogicalDevice::MappedMemoryRange(
+				m_lumaGatherAllocation.memory.get(),
+				0ull,
+				m_lumaGatherAllocation.memory->getAllocationSize()
+			);
+
+			if (!m_lumaGatherAllocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+				m_device->invalidateMappedMemoryRanges(1, &memory_range);
+
+			const uint32_t* buffData = reinterpret_cast<const uint32_t*>(m_lumaGatherAllocation.memory->getMappedPointer());
+
+			assert(m_lumaGatherAllocation.offset == 0); // simpler than writing out all the pointer arithmetic
+
+			m_EV = 0.0f;
+			for (int index = 0; index < SubgroupSize; index++) {
+				m_EV += buffData[index];
+			}
+		}
+
 		// Render to swapchain
 		{
 			// Acquire
@@ -560,7 +587,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
 			auto ds = m_lumaPresentDS[1].get();
 
-			const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 			auto pc = AutoexposurePushData
 			{
 				.meteringWindowScaleX = MeteringWindowScale[0] * m_gpuImg->getCreationParameters().extent.width,
@@ -569,6 +595,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				.meteringWindowOffsetY = MeteringWindowOffset[1] * m_gpuImg->getCreationParameters().extent.height,
 				.lumaMin = LumaMinMax[0],
 				.lumaMax = LumaMinMax[1],
+				.EV = m_EV,
 				.sampleCountX = SampleCount[0],
 				.sampleCountY = SampleCount[1],
 				.viewportSizeX = m_gpuImg->getCreationParameters().extent.width,
@@ -677,7 +704,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	}
 
 protected:
+	nbl::video::IDeviceMemoryAllocator::SAllocation m_lumaGatherAllocation;
 	uint64_t m_lumaGatherBDA;
+	float m_EV = 0;
 	smart_refctd_ptr<IGPUImage> m_gpuImg;
 	smart_refctd_ptr<IGPUImageView> m_gpuImgView;
 

From edbf8d11854f3f4508316937d196e9c0e17c8b55 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 16 Aug 2024 16:36:29 +0530
Subject: [PATCH 30/50] Compute EV correctly and tonemap in fragment shader

---
 .../app_resources/present.frag.hlsl            | 18 +++++++++++++++++-
 .../app_resources/tonemapper.comp.hlsl         | 13 -------------
 26_Autoexposure/main.cpp                       |  3 ++-
 3 files changed, 19 insertions(+), 15 deletions(-)
 delete mode 100644 26_Autoexposure/app_resources/tonemapper.comp.hlsl

diff --git a/26_Autoexposure/app_resources/present.frag.hlsl b/26_Autoexposure/app_resources/present.frag.hlsl
index 9a53c19eb..5f0259fe5 100644
--- a/26_Autoexposure/app_resources/present.frag.hlsl
+++ b/26_Autoexposure/app_resources/present.frag.hlsl
@@ -4,6 +4,13 @@
 
 #pragma wave shader_stage(fragment)
 
+#include "nbl/builtin/hlsl/colorspace/EOTF.hlsl"
+#include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl"
+#include "nbl/builtin/hlsl/colorspace/decodeCIEXYZ.hlsl"
+#include "nbl/builtin/hlsl/colorspace/OETF.hlsl"
+#include "nbl/builtin/hlsl/tonemapper/operators.hlsl"
+#include "app_resources/common.hlsl"
+
 // vertex shader is provided by the fullScreenTriangle extension
 #include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
 using namespace nbl::hlsl::ext::FullScreenTriangle;
@@ -12,7 +19,16 @@ using namespace nbl::hlsl::ext::FullScreenTriangle;
 [[vk::combinedImageSampler]] [[vk::binding(0, 1)]] Texture2D texture;
 [[vk::combinedImageSampler]] [[vk::binding(0, 1)]] SamplerState samplerState;
 
+[[vk::push_constant]] AutoexposurePushData pushData;
+
 [[vk::location(0)]] float32_t4 main(SVertexAttributes vxAttr) : SV_Target0
 {
-    return texture.Sample(samplerState, vxAttr.uv);
+    float32_t3 color = nbl::hlsl::colorspace::oetf::sRGB(texture.Sample(samplerState, vxAttr.uv).rgb);
+    float32_t3 CIEColor = mul(nbl::hlsl::colorspace::sRGBtoXYZ, color);
+
+    nbl::hlsl::tonemapper::ReinhardParams params = nbl::hlsl::tonemapper::ReinhardParams::create(pushData.EV);
+
+    float32_t3 tonemappedColor = mul(nbl::hlsl::colorspace::decode::XYZtoscRGB, nbl::hlsl::tonemapper::reinhard(params, CIEColor));
+
+    return float32_t4(nbl::hlsl::colorspace::eotf::sRGB(tonemappedColor), 1.0);
 }
diff --git a/26_Autoexposure/app_resources/tonemapper.comp.hlsl b/26_Autoexposure/app_resources/tonemapper.comp.hlsl
deleted file mode 100644
index 15b543469..000000000
--- a/26_Autoexposure/app_resources/tonemapper.comp.hlsl
+++ /dev/null
@@ -1,13 +0,0 @@
-// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
-#include "app_resources/common.hlsl"
-
-[[vk::push_constant]] AutoexposurePushData pushData;
-
-[numthreads(DeviceSubgroupSize, DeviceSubgroupSize, 1)]
-void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
-{
-}
\ No newline at end of file
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index faba912f4..fe770e395 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -571,8 +571,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			m_EV = 0.0f;
 			for (int index = 0; index < SubgroupSize; index++) {
-				m_EV += buffData[index];
+				m_EV += static_cast<float>(buffData[index]) / (log2(LumaMinMax[1]) - log2(LumaMinMax[0])) + log2(LumaMinMax[0]);
 			}
+			m_EV /= (SampleCount[0] * SampleCount[1]);
 		}
 
 		// Render to swapchain

From dca49d2048ceb42ef9eba4b24fb527101847fcbe Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 18:27:38 +0530
Subject: [PATCH 31/50] Separate LumaMeteringWindow into a common header

---
 26_Autoexposure/app_resources/luma_meter.comp.hlsl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index fffd80988..dbb214a8c 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -49,9 +49,9 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize()
 [numthreads(DeviceSubgroupSize, DeviceSubgroupSize, 1)]
 void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
 {
-    nbl::hlsl::luma_meter::LumaMeteringWindow luma_meter_window;
-    luma_meter_window.meteringWindowScale = float32_t2(pushData.meteringWindowScaleX, pushData.meteringWindowScaleY);
-    luma_meter_window.meteringWindowOffset = float32_t2(pushData.meteringWindowOffsetX, pushData.meteringWindowOffsetY);
+    nbl::hlsl::luma_meter::MeteringWindow meter_window;
+    meter_window.meteringWindowScale = float32_t2(pushData.meteringWindowScaleX, pushData.meteringWindowScaleY);
+    meter_window.meteringWindowOffset = float32_t2(pushData.meteringWindowOffsetX, pushData.meteringWindowOffsetY);
 
     const Ptr val_ptr = Ptr::create(pushData.lumaMeterBDA);
     PtrAccessor val_accessor = PtrAccessor::create(val_ptr);
@@ -60,7 +60,7 @@ void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
     TexAccessor tex;
 
     using LumaMeter = nbl::hlsl::luma_meter::geom_luma_meter< WorkgroupSize, PtrAccessor, SharedAccessor, TexAccessor>;
-    LumaMeter meter = LumaMeter::create(luma_meter_window, pushData.lumaMin, pushData.lumaMax);
+    LumaMeter meter = LumaMeter::create(meter_window, pushData.lumaMin, pushData.lumaMax);
 
     uint32_t2 sampleCount = uint32_t2(pushData.sampleCountX, pushData.sampleCountY);
     uint32_t2 viewportSize = uint32_t2(pushData.viewportSizeX, pushData.viewportSizeY);

From 9e283950262bae829ed1330d83cd424a14eb39e7 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 18:32:55 +0530
Subject: [PATCH 32/50] Simplify luma_meter naming

---
 26_Autoexposure/app_resources/luma_meter.comp.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index dbb214a8c..241a499b7 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -59,7 +59,7 @@ void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
     SharedAccessor sdata;
     TexAccessor tex;
 
-    using LumaMeter = nbl::hlsl::luma_meter::geom_luma_meter< WorkgroupSize, PtrAccessor, SharedAccessor, TexAccessor>;
+    using LumaMeter = nbl::hlsl::luma_meter::geom_meter< WorkgroupSize, PtrAccessor, SharedAccessor, TexAccessor>;
     LumaMeter meter = LumaMeter::create(meter_window, pushData.lumaMin, pushData.lumaMax);
 
     uint32_t2 sampleCount = uint32_t2(pushData.sampleCountX, pushData.sampleCountY);

From 18fae9f1f93f07b91a642363689435d3f1092606 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 19:05:22 +0530
Subject: [PATCH 33/50] Update luma examples to shared accessor api

---
 .../app_resources/luma_meter.comp.hlsl         | 18 +++++++++---------
 .../app_resources/present.frag.hlsl            | 13 +++++++------
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index 241a499b7..1cd451286 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -11,16 +11,16 @@
 
 [[vk::push_constant]] AutoexposurePushData pushData;
 
-using Ptr = nbl::hlsl::bda::__ptr < uint32_t >;
-using PtrAccessor = nbl::hlsl::BdaAccessor < uint32_t >;
+using namespace nbl::hlsl;
+using Ptr = bda::__ptr < uint32_t >;
+using PtrAccessor = BdaAccessor < uint32_t >;
 
 groupshared float32_t sdata[WorkgroupSize];
-
 struct SharedAccessor
 {
-    uint32_t get(const uint32_t index)
+    void get(const uint32_t index, NBL_REF_ARG(uint32_t) value)
     {
-        return sdata[index];
+        value = sdata[index];
     }
 
     void set(const uint32_t index, const uint32_t value)
@@ -30,7 +30,7 @@ struct SharedAccessor
 
     void workgroupExecutionAndMemoryBarrier()
     {
-        nbl::hlsl::glsl::barrier();
+        glsl::barrier();
     }
 };
 
@@ -41,7 +41,7 @@ struct TexAccessor
     }
 };
 
-uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize()
+uint32_t3 glsl::gl_WorkGroupSize()
 {
     return uint32_t3(WorkgroupSize, 1, 1);
 }
@@ -49,7 +49,7 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize()
 [numthreads(DeviceSubgroupSize, DeviceSubgroupSize, 1)]
 void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
 {
-    nbl::hlsl::luma_meter::MeteringWindow meter_window;
+    luma_meter::MeteringWindow meter_window;
     meter_window.meteringWindowScale = float32_t2(pushData.meteringWindowScaleX, pushData.meteringWindowScaleY);
     meter_window.meteringWindowOffset = float32_t2(pushData.meteringWindowOffsetX, pushData.meteringWindowOffsetY);
 
@@ -59,7 +59,7 @@ void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
     SharedAccessor sdata;
     TexAccessor tex;
 
-    using LumaMeter = nbl::hlsl::luma_meter::geom_meter< WorkgroupSize, PtrAccessor, SharedAccessor, TexAccessor>;
+    using LumaMeter = luma_meter::geom_meter< WorkgroupSize, PtrAccessor, SharedAccessor, TexAccessor>;
     LumaMeter meter = LumaMeter::create(meter_window, pushData.lumaMin, pushData.lumaMax);
 
     uint32_t2 sampleCount = uint32_t2(pushData.sampleCountX, pushData.sampleCountY);
diff --git a/26_Autoexposure/app_resources/present.frag.hlsl b/26_Autoexposure/app_resources/present.frag.hlsl
index 5f0259fe5..2e8142823 100644
--- a/26_Autoexposure/app_resources/present.frag.hlsl
+++ b/26_Autoexposure/app_resources/present.frag.hlsl
@@ -13,7 +13,8 @@
 
 // vertex shader is provided by the fullScreenTriangle extension
 #include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
-using namespace nbl::hlsl::ext::FullScreenTriangle;
+using namespace nbl::hlsl;
+using namespace ext::FullScreenTriangle;
 
 // binding 0 set 1
 [[vk::combinedImageSampler]] [[vk::binding(0, 1)]] Texture2D texture;
@@ -23,12 +24,12 @@ using namespace nbl::hlsl::ext::FullScreenTriangle;
 
 [[vk::location(0)]] float32_t4 main(SVertexAttributes vxAttr) : SV_Target0
 {
-    float32_t3 color = nbl::hlsl::colorspace::oetf::sRGB(texture.Sample(samplerState, vxAttr.uv).rgb);
-    float32_t3 CIEColor = mul(nbl::hlsl::colorspace::sRGBtoXYZ, color);
+    float32_t3 color = colorspace::oetf::sRGB(texture.Sample(samplerState, vxAttr.uv).rgb);
+    float32_t3 CIEColor = mul(colorspace::sRGBtoXYZ, color);
 
-    nbl::hlsl::tonemapper::ReinhardParams params = nbl::hlsl::tonemapper::ReinhardParams::create(pushData.EV);
+    tonemapper::ReinhardParams params = tonemapper::ReinhardParams::create(pushData.EV);
 
-    float32_t3 tonemappedColor = mul(nbl::hlsl::colorspace::decode::XYZtoscRGB, nbl::hlsl::tonemapper::reinhard(params, CIEColor));
+    float32_t3 tonemappedColor = mul(colorspace::decode::XYZtoscRGB, tonemapper::reinhard(params, CIEColor));
 
-    return float32_t4(nbl::hlsl::colorspace::eotf::sRGB(tonemappedColor), 1.0);
+    return float32_t4(colorspace::eotf::sRGB(tonemappedColor), 1.0);
 }

From 9b31c2c70eca0bcfb12d7a6a1327435954979707 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 20:08:55 +0530
Subject: [PATCH 34/50] Refactor tonemapping operators

---
 26_Autoexposure/app_resources/present.frag.hlsl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/26_Autoexposure/app_resources/present.frag.hlsl b/26_Autoexposure/app_resources/present.frag.hlsl
index 2e8142823..b8ad803ff 100644
--- a/26_Autoexposure/app_resources/present.frag.hlsl
+++ b/26_Autoexposure/app_resources/present.frag.hlsl
@@ -27,9 +27,9 @@ using namespace ext::FullScreenTriangle;
     float32_t3 color = colorspace::oetf::sRGB(texture.Sample(samplerState, vxAttr.uv).rgb);
     float32_t3 CIEColor = mul(colorspace::sRGBtoXYZ, color);
 
-    tonemapper::ReinhardParams params = tonemapper::ReinhardParams::create(pushData.EV);
+    tonemapper::Reinhard<float32_t> reinhard = tonemapper::Reinhard<float32_t>::create(pushData.EV);
 
-    float32_t3 tonemappedColor = mul(colorspace::decode::XYZtoscRGB, tonemapper::reinhard(params, CIEColor));
+    float32_t3 tonemappedColor = mul(colorspace::decode::XYZtoscRGB, reinhard(CIEColor));
 
     return float32_t4(colorspace::eotf::sRGB(tonemappedColor), 1.0);
 }

From e987452090e0b3a321b1c92a61542f659353d4a6 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 21 Aug 2024 17:08:09 +0530
Subject: [PATCH 35/50] Simplify push constants and remove explicit sample
 counts

---
 26_Autoexposure/app_resources/common.hlsl | 19 ++++++---
 26_Autoexposure/main.cpp                  | 48 ++++++++---------------
 2 files changed, 30 insertions(+), 37 deletions(-)

diff --git a/26_Autoexposure/app_resources/common.hlsl b/26_Autoexposure/app_resources/common.hlsl
index 07993d58d..887607fb1 100644
--- a/26_Autoexposure/app_resources/common.hlsl
+++ b/26_Autoexposure/app_resources/common.hlsl
@@ -6,16 +6,23 @@
 #define _AUTOEXPOSURE_COMMON_INCLUDED_
 
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/luma_meter/common.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
 
 struct AutoexposurePushData
 {
-    float meteringWindowScaleX, meteringWindowScaleY;
-    float meteringWindowOffsetX, meteringWindowOffsetY;
-    float lumaMin, lumaMax;
-    float EV;
-    uint32_t sampleCountX, sampleCountY;
-    uint32_t viewportSizeX, viewportSizeY;
+    nbl::hlsl::luma_meter::MeteringWindow window;
+    float32_t2 lumaMinMax;
+    float32_t EV;
+    uint32_t2 viewportSize;
     uint64_t lumaMeterBDA;
 };
 
+}
+}
+
 #endif
\ No newline at end of file
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index fe770e395..913a68d0f 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -25,11 +25,10 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	using clock_t = std::chrono::steady_clock;
 
 	constexpr static inline std::string_view DefaultImagePathsFile = "../../media/noises/spp_benchmark_4k_512.exr";
-	constexpr static inline std::array<uint32_t, 2> Dimensions = { 1280, 720 };
-	constexpr static inline std::array<uint32_t, 2> SampleCount = { 10000, 10000 };
-	constexpr static inline std::array<float, 2> MeteringWindowScale = { 0.5f, 0.5f };
-	constexpr static inline std::array<float, 2> MeteringWindowOffset = { 0.25f, 0.25f };
-	constexpr static inline std::array<float, 2> LumaMinMax = { 1.0f / 4096.0f, 32768.0f };
+	constexpr static inline uint32_t2 Dimensions = { 1280, 720 };
+	constexpr static inline float32_t2 MeteringWindowScale = { 0.5f, 0.5f };
+	constexpr static inline float32_t2 MeteringWindowOffset = { 0.25f, 0.25f };
+	constexpr static inline float32_t2 LumaMinMax = { 1.0f / 4096.0f, 32768.0f };
 
 public:
 	// Yay thanks to multiple inheritance we cannot forward ctors anymore
@@ -491,6 +490,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	inline void workLoopBody() override
 	{
 		const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+		uint32_t2 viewportSize = { m_gpuImg->getCreationParameters().extent.width, m_gpuImg->getCreationParameters().extent.height };
 
 		// Luma Meter
 		{
@@ -501,17 +501,10 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			auto pc = AutoexposurePushData
 			{
-				.meteringWindowScaleX = MeteringWindowScale[0] * m_gpuImg->getCreationParameters().extent.width,
-				.meteringWindowScaleY = MeteringWindowScale[1] * m_gpuImg->getCreationParameters().extent.height,
-				.meteringWindowOffsetX = MeteringWindowOffset[0] * m_gpuImg->getCreationParameters().extent.width,
-				.meteringWindowOffsetY = MeteringWindowOffset[1] * m_gpuImg->getCreationParameters().extent.height,
-				.lumaMin = LumaMinMax[0],
-				.lumaMax = LumaMinMax[1],
+				.window = nbl::hlsl::luma_meter::MeteringWindow::create(MeteringWindowScale, MeteringWindowOffset),
+				.lumaMinMax = LumaMinMax,
 				.EV = 0.0f,
-				.sampleCountX = SampleCount[0],
-				.sampleCountY = SampleCount[1],
-				.viewportSizeX = m_gpuImg->getCreationParameters().extent.width,
-				.viewportSizeY = m_gpuImg->getCreationParameters().extent.height,
+				.viewportSize = viewportSize,
 				.lumaMeterBDA = m_lumaGatherBDA
 			};
 
@@ -521,7 +514,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			cmdbuf->bindComputePipeline(m_lumaMeterPipeline.get());
 			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_lumaMeterPipeline->getLayout(), 0, 1, &ds); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
 			cmdbuf->pushConstants(m_lumaMeterPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
-			cmdbuf->dispatch(1 + (SampleCount[0] - 1) / SubgroupSize, 1 + (SampleCount[1] - 1) / SubgroupSize);
+			cmdbuf->dispatch(viewportSize.x / SubgroupSize, viewportSize.y / SubgroupSize);
 			cmdbuf->end();
 
 			{
@@ -571,9 +564,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			m_EV = 0.0f;
 			for (int index = 0; index < SubgroupSize; index++) {
-				m_EV += static_cast<float>(buffData[index]) / (log2(LumaMinMax[1]) - log2(LumaMinMax[0])) + log2(LumaMinMax[0]);
+				m_EV += static_cast<float32_t>(buffData[index]) / (log2(LumaMinMax[1]) - log2(LumaMinMax[0])) + log2(LumaMinMax[0]);
 			}
-			m_EV /= (SampleCount[0] * SampleCount[1]);
+			m_EV /= (viewportSize.x * viewportSize.y) / 4;
 		}
 
 		// Render to swapchain
@@ -590,17 +583,10 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			auto pc = AutoexposurePushData
 			{
-				.meteringWindowScaleX = MeteringWindowScale[0] * m_gpuImg->getCreationParameters().extent.width,
-				.meteringWindowScaleY = MeteringWindowScale[1] * m_gpuImg->getCreationParameters().extent.height,
-				.meteringWindowOffsetX = MeteringWindowOffset[0] * m_gpuImg->getCreationParameters().extent.width,
-				.meteringWindowOffsetY = MeteringWindowOffset[1] * m_gpuImg->getCreationParameters().extent.height,
-				.lumaMin = LumaMinMax[0],
-				.lumaMax = LumaMinMax[1],
+				.window = nbl::hlsl::luma_meter::MeteringWindow::create(MeteringWindowScale, MeteringWindowOffset),
+				.lumaMinMax = LumaMinMax,
 				.EV = m_EV,
-				.sampleCountX = SampleCount[0],
-				.sampleCountY = SampleCount[1],
-				.viewportSizeX = m_gpuImg->getCreationParameters().extent.width,
-				.viewportSizeY = m_gpuImg->getCreationParameters().extent.height,
+				.viewportSize = viewportSize,
 				.lumaMeterBDA = m_lumaGatherBDA
 			};
 
@@ -617,8 +603,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			{
 				const asset::SViewport viewport =
 				{
-					.width = float(m_window->getWidth()),
-					.height = float(m_window->getHeight())
+					.width = float32_t(m_window->getWidth()),
+					.height = float32_t(m_window->getHeight())
 				};
 				cmdbuf->setViewport({ &viewport, 1 });
 			}
@@ -707,7 +693,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 protected:
 	nbl::video::IDeviceMemoryAllocator::SAllocation m_lumaGatherAllocation;
 	uint64_t m_lumaGatherBDA;
-	float m_EV = 0;
+	float32_t m_EV = 0;
 	smart_refctd_ptr<IGPUImage> m_gpuImg;
 	smart_refctd_ptr<IGPUImageView> m_gpuImgView;
 

From e135e434d13df7932f87a67b30a4b731aa58b5d4 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 21 Aug 2024 18:32:13 +0530
Subject: [PATCH 36/50] Infer sample count from viewportSize and simplify
 userspace HLSL

---
 .../app_resources/luma_meter.comp.hlsl            | 15 ++++-----------
 26_Autoexposure/main.cpp                          |  8 ++++++--
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index 1cd451286..0cd9d78c7 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -9,12 +9,12 @@
 [[vk::combinedImageSampler]] [[vk::binding(0, 0)]] Texture2D texture;
 [[vk::combinedImageSampler]] [[vk::binding(0, 0)]] SamplerState samplerState;
 
-[[vk::push_constant]] AutoexposurePushData pushData;
-
 using namespace nbl::hlsl;
 using Ptr = bda::__ptr < uint32_t >;
 using PtrAccessor = BdaAccessor < uint32_t >;
 
+[[vk::push_constant]] AutoexposurePushData pushData;
+
 groupshared float32_t sdata[WorkgroupSize];
 struct SharedAccessor
 {
@@ -49,10 +49,6 @@ uint32_t3 glsl::gl_WorkGroupSize()
 [numthreads(DeviceSubgroupSize, DeviceSubgroupSize, 1)]
 void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
 {
-    luma_meter::MeteringWindow meter_window;
-    meter_window.meteringWindowScale = float32_t2(pushData.meteringWindowScaleX, pushData.meteringWindowScaleY);
-    meter_window.meteringWindowOffset = float32_t2(pushData.meteringWindowOffsetX, pushData.meteringWindowOffsetY);
-
     const Ptr val_ptr = Ptr::create(pushData.lumaMeterBDA);
     PtrAccessor val_accessor = PtrAccessor::create(val_ptr);
 
@@ -60,10 +56,7 @@ void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
     TexAccessor tex;
 
     using LumaMeter = luma_meter::geom_meter< WorkgroupSize, PtrAccessor, SharedAccessor, TexAccessor>;
-    LumaMeter meter = LumaMeter::create(meter_window, pushData.lumaMin, pushData.lumaMax);
-
-    uint32_t2 sampleCount = uint32_t2(pushData.sampleCountX, pushData.sampleCountY);
-    uint32_t2 viewportSize = uint32_t2(pushData.viewportSizeX, pushData.viewportSizeY);
+    LumaMeter meter = LumaMeter::create(pushData.lumaMinMax);
 
-    meter.gatherLuma(val_accessor, tex, sdata, sampleCount, viewportSize);
+    meter.gatherLuma(pushData.window, val_accessor, tex, sdata, (float32_t2)(glsl::gl_WorkGroupID() * glsl::gl_WorkGroupSize()));
 }
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 913a68d0f..8b8ca771b 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -490,6 +490,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	inline void workLoopBody() override
 	{
 		const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+
 		uint32_t2 viewportSize = { m_gpuImg->getCreationParameters().extent.width, m_gpuImg->getCreationParameters().extent.height };
 
 		// Luma Meter
@@ -514,7 +515,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			cmdbuf->bindComputePipeline(m_lumaMeterPipeline.get());
 			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_lumaMeterPipeline->getLayout(), 0, 1, &ds); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
 			cmdbuf->pushConstants(m_lumaMeterPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
-			cmdbuf->dispatch(viewportSize.x / SubgroupSize, viewportSize.y / SubgroupSize);
+			cmdbuf->dispatch(1 + (viewportSize.x - 1) / SubgroupSize, 1 + (viewportSize.y - 1) / SubgroupSize);
 			cmdbuf->end();
 
 			{
@@ -566,7 +567,10 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			for (int index = 0; index < SubgroupSize; index++) {
 				m_EV += static_cast<float32_t>(buffData[index]) / (log2(LumaMinMax[1]) - log2(LumaMinMax[0])) + log2(LumaMinMax[0]);
 			}
-			m_EV /= (viewportSize.x * viewportSize.y) / 4;
+			uint64_t sampleCount = (viewportSize.x * viewportSize.y) / 4;
+			uint64_t workgroupSize = SubgroupSize * SubgroupSize;
+			sampleCount = workgroupSize * (1 + (sampleCount - 1) / workgroupSize);
+			m_EV /= sampleCount;
 		}
 
 		// Render to swapchain

From 57e49ae17b66dd74a4ad5b945d127f7059be2452 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 22 Aug 2024 23:03:31 +0530
Subject: [PATCH 37/50] Templatize float type and add toXYZ method to
 TexAccessor

---
 26_Autoexposure/app_resources/luma_meter.comp.hlsl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index 0cd9d78c7..1bcec5918 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -4,6 +4,7 @@
 
 #include "nbl/builtin/hlsl/luma_meter/luma_meter.hlsl"
 #include "nbl/builtin/hlsl/bda/bda_accessor.hlsl"
+#include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl"
 #include "app_resources/common.hlsl"
 
 [[vk::combinedImageSampler]] [[vk::binding(0, 0)]] Texture2D texture;
@@ -18,6 +19,7 @@ using PtrAccessor = BdaAccessor < uint32_t >;
 groupshared float32_t sdata[WorkgroupSize];
 struct SharedAccessor
 {
+    using type = float32_t;
     void get(const uint32_t index, NBL_REF_ARG(uint32_t) value)
     {
         value = sdata[index];
@@ -36,6 +38,10 @@ struct SharedAccessor
 
 struct TexAccessor
 {
+    static float32_t3 toXYZ(float32_t3 srgbColor) {
+        return dot(colorspace::sRGBtoXYZ[1], srgbColor);
+    }
+
     float32_t3 get(float32_t2 uv) {
         return texture.Sample(samplerState, uv).rgb;
     }
@@ -58,5 +64,5 @@ void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
     using LumaMeter = luma_meter::geom_meter< WorkgroupSize, PtrAccessor, SharedAccessor, TexAccessor>;
     LumaMeter meter = LumaMeter::create(pushData.lumaMinMax);
 
-    meter.gatherLuma(pushData.window, val_accessor, tex, sdata, (float32_t2)(glsl::gl_WorkGroupID() * glsl::gl_WorkGroupSize()));
+    meter.sampleLuma(pushData.window, val_accessor, tex, sdata, (float32_t2)(glsl::gl_WorkGroupID() * glsl::gl_WorkGroupSize()));
 }

From f8d50e804424eecd2f6e8a0b02285c623ec66376 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 27 Aug 2024 22:32:53 +0530
Subject: [PATCH 38/50] Refactor the example into using a 2-compute, 1-fragment
 architecture

---
 26_Autoexposure/app_resources/common.hlsl     |   2 +-
 .../app_resources/luma_gather.comp.hlsl       |  89 +++++
 .../app_resources/luma_meter.comp.hlsl        |   4 +-
 .../app_resources/present.frag.hlsl           |  22 +-
 26_Autoexposure/main.cpp                      | 351 ++++++++++++++----
 5 files changed, 365 insertions(+), 103 deletions(-)
 create mode 100644 26_Autoexposure/app_resources/luma_gather.comp.hlsl

diff --git a/26_Autoexposure/app_resources/common.hlsl b/26_Autoexposure/app_resources/common.hlsl
index 887607fb1..b270c38ce 100644
--- a/26_Autoexposure/app_resources/common.hlsl
+++ b/26_Autoexposure/app_resources/common.hlsl
@@ -17,7 +17,7 @@ struct AutoexposurePushData
 {
     nbl::hlsl::luma_meter::MeteringWindow window;
     float32_t2 lumaMinMax;
-    float32_t EV;
+    float32_t sampleCount;
     uint32_t2 viewportSize;
     uint64_t lumaMeterBDA;
 };
diff --git a/26_Autoexposure/app_resources/luma_gather.comp.hlsl b/26_Autoexposure/app_resources/luma_gather.comp.hlsl
new file mode 100644
index 000000000..7b14ee5be
--- /dev/null
+++ b/26_Autoexposure/app_resources/luma_gather.comp.hlsl
@@ -0,0 +1,89 @@
+// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "nbl/builtin/hlsl/luma_meter/luma_meter.hlsl"
+#include "nbl/builtin/hlsl/bda/bda_accessor.hlsl"
+#include "nbl/builtin/hlsl/colorspace/EOTF.hlsl"
+#include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl"
+#include "nbl/builtin/hlsl/colorspace/decodeCIEXYZ.hlsl"
+#include "nbl/builtin/hlsl/colorspace/OETF.hlsl"
+#include "nbl/builtin/hlsl/tonemapper/operators.hlsl"
+#include "app_resources/common.hlsl"
+
+[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] Texture2D textureIn;
+[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] SamplerState samplerStateIn;
+[[vk::binding(0, 3)]] RWTexture2D<float32_t4> textureOut;
+
+using namespace nbl::hlsl;
+using Ptr = bda::__ptr < uint32_t >;
+using PtrAccessor = BdaAccessor < uint32_t >;
+
+[[vk::push_constant]] AutoexposurePushData pushData;
+
+groupshared float32_t sdata[WorkgroupSize];
+// Scratch-memory accessor supplied to luma_meter::geom_meter; every call forwards to the
+// groupshared `sdata` array declared above.
+// NOTE(review): get/set exchange uint32_t while `sdata` and `type` are float32_t, so each
+// access goes through an implicit uint<->float conversion — confirm this is what the meter expects.
+struct SharedAccessor
+{
+    // Element type exposed to users of the accessor.
+    using type = float32_t;
+
+    // Copies scratch slot `slot` into `value`.
+    void get(const uint32_t slot, NBL_REF_ARG(uint32_t) value) { value = sdata[slot]; }
+
+    // Overwrites scratch slot `slot` with `value`.
+    void set(const uint32_t slot, const uint32_t value) { sdata[slot] = value; }
+
+    // Synchronises the workgroup by delegating to glsl::barrier().
+    void workgroupExecutionAndMemoryBarrier() { glsl::barrier(); }
+};
+
+// Read-only accessor over the input image, handed to luma_meter::geom_meter as its TexAccessor.
+struct TexAccessor
+{
+    // Dots the colour with row 1 of sRGBtoXYZ; the scalar result is splatted across the float32_t3.
+    static float32_t3 toXYZ(float32_t3 srgbColor) { return dot(colorspace::sRGBtoXYZ[1], srgbColor); }
+
+    // Fetches RGB at normalised `uv`. Uses an explicit LOD of 0 because Sample() needs implicit
+    // derivatives, which HLSL only provides in pixel shaders; this entry point is a compute shader.
+    float32_t3 get(float32_t2 uv) { return textureIn.SampleLevel(samplerStateIn, uv, 0.0f).rgb; }
+};
+
+// Out-of-line definition of glsl::gl_WorkGroupSize(): reports a linear WorkgroupSize x 1 x 1 shape.
+// NOTE(review): main's [numthreads] is DeviceSubgroupSize x DeviceSubgroupSize x 1 — presumably
+// WorkgroupSize equals that product; confirm against where both macros are defined.
+uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(WorkgroupSize, 1, 1); }
+
+// Tonemap pass: obtains EV from the luma meter via gatherLuma(), then Reinhard-tonemaps one texel per invocation.
+[numthreads(DeviceSubgroupSize, DeviceSubgroupSize, 1)]
+void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
+{
+    const Ptr val_ptr = Ptr::create(pushData.lumaMeterBDA);
+    PtrAccessor val_accessor = PtrAccessor::create(val_ptr);
+
+    TexAccessor tex;
+
+    using LumaMeter = luma_meter::geom_meter< WorkgroupSize, PtrAccessor, SharedAccessor, TexAccessor>;
+    LumaMeter meter = LumaMeter::create(pushData.lumaMinMax, pushData.sampleCount);
+
+    float32_t EV = meter.gatherLuma(val_accessor);
+
+    // Turn the linear invocation index into a 2D in-tile coordinate with the morton2d_decode helpers.
+    uint32_t tid = workgroup::SubgroupContiguousIndex();
+    uint32_t2 coord = { morton2d_decode_x(tid), morton2d_decode_y(tid) };
+
+    // NOTE(review): gl_WorkGroupSize() is (WorkgroupSize, 1, 1) here, so the tile origin steps by WorkgroupSize in x and by 1 in y — confirm this matches the dispatch grid.
+    uint32_t2 pos = glsl::gl_WorkGroupID() * glsl::gl_WorkGroupSize() + coord;
+
+    // Address the texel centre (+0.5); pos / size alone lands on the corner shared by four texels.
+    float32_t2 uv = ((float32_t2)(pos) + 0.5f) / pushData.viewportSize;
+    float32_t3 color = colorspace::oetf::sRGB(tex.get(uv).rgb);
+    float32_t3 CIEColor = mul(colorspace::sRGBtoXYZ, color);
+    tonemapper::Reinhard<float32_t> reinhard = tonemapper::Reinhard<float32_t>::create(EV);
+    float32_t3 tonemappedColor = mul(colorspace::decode::XYZtoscRGB, reinhard(CIEColor));
+
+    textureOut[pos] = float32_t4(colorspace::eotf::sRGB(tonemappedColor), 1.0f);
+}
diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index 1bcec5918..f936d8d37 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -62,7 +62,7 @@ void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
     TexAccessor tex;
 
     using LumaMeter = luma_meter::geom_meter< WorkgroupSize, PtrAccessor, SharedAccessor, TexAccessor>;
-    LumaMeter meter = LumaMeter::create(pushData.lumaMinMax);
+    LumaMeter meter = LumaMeter::create(pushData.lumaMinMax, pushData.sampleCount);
 
-    meter.sampleLuma(pushData.window, val_accessor, tex, sdata, (float32_t2)(glsl::gl_WorkGroupID() * glsl::gl_WorkGroupSize()));
+    meter.sampleLuma(pushData.window, val_accessor, tex, sdata, (float32_t2)(glsl::gl_WorkGroupID() * glsl::gl_WorkGroupSize()), pushData.viewportSize);
 }
diff --git a/26_Autoexposure/app_resources/present.frag.hlsl b/26_Autoexposure/app_resources/present.frag.hlsl
index b8ad803ff..b436e248f 100644
--- a/26_Autoexposure/app_resources/present.frag.hlsl
+++ b/26_Autoexposure/app_resources/present.frag.hlsl
@@ -4,11 +4,6 @@
 
 #pragma wave shader_stage(fragment)
 
-#include "nbl/builtin/hlsl/colorspace/EOTF.hlsl"
-#include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl"
-#include "nbl/builtin/hlsl/colorspace/decodeCIEXYZ.hlsl"
-#include "nbl/builtin/hlsl/colorspace/OETF.hlsl"
-#include "nbl/builtin/hlsl/tonemapper/operators.hlsl"
 #include "app_resources/common.hlsl"
 
 // vertex shader is provided by the fullScreenTriangle extension
@@ -17,19 +12,10 @@ using namespace nbl::hlsl;
 using namespace ext::FullScreenTriangle;
 
-// binding 0 set 1
+// binding 0 set 3
-[[vk::combinedImageSampler]] [[vk::binding(0, 1)]] Texture2D texture;
-[[vk::combinedImageSampler]] [[vk::binding(0, 1)]] SamplerState samplerState;
-
-[[vk::push_constant]] AutoexposurePushData pushData;
+[[vk::combinedImageSampler]] [[vk::binding(0, 3)]] Texture2D texture;
+[[vk::combinedImageSampler]] [[vk::binding(0, 3)]] SamplerState samplerState;
 
 [[vk::location(0)]] float32_t4 main(SVertexAttributes vxAttr) : SV_Target0
 {
-    float32_t3 color = colorspace::oetf::sRGB(texture.Sample(samplerState, vxAttr.uv).rgb);
-    float32_t3 CIEColor = mul(colorspace::sRGBtoXYZ, color);
-
-    tonemapper::Reinhard<float32_t> reinhard = tonemapper::Reinhard<float32_t>::create(pushData.EV);
-
-    float32_t3 tonemappedColor = mul(colorspace::decode::XYZtoscRGB, reinhard(CIEColor));
-
-    return float32_t4(colorspace::eotf::sRGB(tonemappedColor), 1.0);
-}
+    return float32_t4(texture.Sample(samplerState, vxAttr.uv).rgb, 1.0f);
+}
\ No newline at end of file
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 8b8ca771b..570e96807 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -77,7 +77,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			* Samplers for combined image samplers can also be mutable, which for a binding of a descriptor set is specified also at creation time by leaving the immutableSamplers
 			* field set to its default (nullptr).
 			*/
-		smart_refctd_ptr<IGPUDescriptorSetLayout> lumaPresentDSLayout;
+		std::array<smart_refctd_ptr<IGPUDescriptorSetLayout>, 3> dsLayouts;
 		{
 			auto defaultSampler = m_device->createSampler(
 				{
@@ -85,41 +85,96 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				}
 			);
 
-			const IGPUDescriptorSetLayout::SBinding lumaPresentBindings[1] = {
+			const IGPUDescriptorSetLayout::SBinding imgBindings[3][1] = {
 				{
-					.binding = 0,
-					.type = IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
-					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-					.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT | IShader::E_SHADER_STAGE::ESS_COMPUTE,
-					.count = 1,
-					.immutableSamplers = &defaultSampler
+					{
+						.binding = 0,
+						.type = IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+						.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+						.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+						.count = 1,
+						.immutableSamplers = &defaultSampler
+					}
+				},
+				{
+					{
+						.binding = 0,
+						.type = IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+						.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+						.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+						.count = 1,
+						.immutableSamplers = nullptr
+					}
+				},
+				{
+					{
+						.binding = 0,
+						.type = IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+						.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+						.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+						.count = 1,
+						.immutableSamplers = &defaultSampler
+					}
 				}
 			};
-			lumaPresentDSLayout = m_device->createDescriptorSetLayout(lumaPresentBindings);
-			if (!lumaPresentDSLayout)
-				return logFail("Failed to Create Descriptor Layout: lumaPresentDSLayout");
+
+			bool dsLayoutCreation = true;
+			for (uint32_t index = 0; index < dsLayouts.size(); index++) {
+				dsLayouts[index] = m_device->createDescriptorSetLayout(imgBindings[index]);
+				dsLayoutCreation = dsLayoutCreation && dsLayouts[index];
+			}
+
+			if (!dsLayoutCreation)
+				return logFail("Failed to Create Descriptor Layouts");
 		}
 
 		// Create semaphores
-		m_lumaMeterSemaphore = m_device->createSemaphore(m_submitIx);
+		m_meterSemaphore = m_device->createSemaphore(m_submitIx);
+		m_gatherSemaphore = m_device->createSemaphore(m_submitIx);
 		m_presentSemaphore = m_device->createSemaphore(m_submitIx);
 
 		// create the descriptor sets and with enough room
 		{
-			core::smart_refctd_ptr<IDescriptorPool> lumaPresentPool;
+			std::array<core::smart_refctd_ptr<IDescriptorPool>, 3> dsPools;
+			bool dsPoolCreation = true;
+			{
+				const video::IGPUDescriptorSetLayout* const layouts[] = { dsLayouts[0].get() };
+				const uint32_t setCounts[] = { 1u };
+				dsPools[0] = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
+				dsPoolCreation = dsPoolCreation && dsPools[0];
+			}
 			{
-				const video::IGPUDescriptorSetLayout* const layouts[] = { lumaPresentDSLayout.get(), lumaPresentDSLayout.get() };
-				const uint32_t setCounts[] = { 1u, 1u };
-				lumaPresentPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
+				const video::IGPUDescriptorSetLayout* const layouts[] = { dsLayouts[1].get() };
+				const uint32_t setCounts[] = { 1u };
+				dsPools[1] = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
+				dsPoolCreation = dsPoolCreation && dsPools[1];
+			}
+			{
+				const video::IGPUDescriptorSetLayout* const layouts[] = { dsLayouts[2].get() };
+				const uint32_t setCounts[] = { 1u };
+				dsPools[2] = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
+				dsPoolCreation = dsPoolCreation && dsPools[2];
 			}
 
-			if (!lumaPresentPool)
+			if (!dsPoolCreation)
 				return logFail("Failed to Create Descriptor Pools");
 
-			m_lumaPresentDS[0] = lumaPresentPool->createDescriptorSet(core::smart_refctd_ptr(lumaPresentDSLayout));
-			m_lumaPresentDS[1] = lumaPresentPool->createDescriptorSet(core::smart_refctd_ptr(lumaPresentDSLayout));
-			if (!m_lumaPresentDS[0] || !m_lumaPresentDS[1])
-				return logFail("Could not create Descriptor Set: lumaPresentDS!");
+			bool dsCreation = true;
+			{
+				m_ds[0] = dsPools[0]->createDescriptorSet(dsLayouts[0]);
+				dsCreation = dsCreation && m_ds[0];
+			}
+			{
+				m_ds[1] = dsPools[1]->createDescriptorSet(dsLayouts[1]);
+				dsCreation = dsCreation && m_ds[1];
+			}
+			{
+				m_ds[2] = dsPools[2]->createDescriptorSet(dsLayouts[2]);
+				dsCreation = dsCreation && m_ds[2];
+			}
+
+			if (!dsCreation)
+				return logFail("Could not create Descriptor Sets!");
 		}
 
 		auto graphicsQueue = getGraphicsQueue();
@@ -219,13 +274,33 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			};
 
 			// Luma Meter
-			auto lumaMeterShader = loadCompileAndCreateShader("app_resources/luma_meter.comp.hlsl");
-			if (!lumaMeterShader)
-				return logFail("Failed to Load and Compile Compute Shader: lumaMeterShader!");
-			auto lumaLayout = m_device->createPipelineLayout({ &pcRange, 1 }, core::smart_refctd_ptr(lumaPresentDSLayout), nullptr, nullptr, nullptr);
-			if (!createComputePipeline(lumaMeterShader, m_lumaMeterPipeline, lumaLayout))
+			auto meterShader = loadCompileAndCreateShader("app_resources/luma_meter.comp.hlsl");
+			if (!meterShader)
+				return logFail("Failed to Load and Compile Compute Shader: meterShader!");
+			auto meterLayout = m_device->createPipelineLayout(
+				{ &pcRange, 1 },
+				core::smart_refctd_ptr(dsLayouts[0]),
+				nullptr,
+				nullptr,
+				nullptr
+			);
+			if (!createComputePipeline(meterShader, m_meterPipeline, meterLayout))
 				return logFail("Could not create Luma Meter Pipeline!");
 
+			// Luma Gather
+			auto gatherShader = loadCompileAndCreateShader("app_resources/luma_gather.comp.hlsl");
+			if (!gatherShader)
+				return logFail("Failed to Load and Compile Compute Shader: gatherShader!");
+			auto gatherLayout = m_device->createPipelineLayout(
+				{ &pcRange, 1 },
+				core::smart_refctd_ptr(dsLayouts[0]),
+				nullptr,
+				nullptr,
+				core::smart_refctd_ptr(dsLayouts[1])
+			);
+			if (!createComputePipeline(gatherShader, m_gatherPipeline, gatherLayout))
+				return logFail("Could not create Luma Gather Pipeline!");
+
 			// Load FSTri Shader
 			ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
 			if (!fsTriProtoPPln)
@@ -240,7 +315,13 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				.entryPoint = "main",
 				.shader = fragmentShader.get()
 			};
-			auto presentLayout = m_device->createPipelineLayout({ &pcRange, 1 }, nullptr, core::smart_refctd_ptr(lumaPresentDSLayout), nullptr, nullptr);
+			auto presentLayout = m_device->createPipelineLayout(
+				{ &pcRange, 1 },
+				nullptr,
+				nullptr,
+				nullptr,
+				core::smart_refctd_ptr(dsLayouts[2])
+			);
 			m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scResources->getRenderpass());
 			if (!m_presentPipeline)
 				return logFail("Could not create Graphics Pipeline!");
@@ -289,7 +370,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 		// Allocate and create buffer for Luma Gather
 		{
 			// Allocate memory
-			m_lumaGatherAllocation = {};
+			m_gatherAllocation = {};
 			smart_refctd_ptr<IGPUBuffer> buffer;
 			{
 				auto build_buffer = [this](
@@ -318,11 +399,11 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					assert(allocation->memory.get() == buffer->getBoundMemory().memory);
 				};
 
-				build_buffer(m_device, &m_lumaGatherAllocation, buffer, m_physicalDevice->getLimits().maxSubgroupSize, "Luma Gather Buffer");
+				build_buffer(m_device, &m_gatherAllocation, buffer, m_physicalDevice->getLimits().maxSubgroupSize, "Luma Gather Buffer");
 			}
-			m_lumaGatherBDA = buffer->getDeviceAddress();
+			m_gatherBDA = buffer->getDeviceAddress();
 
-			auto mapped_memory = m_lumaGatherAllocation.memory->map({ 0ull, m_lumaGatherAllocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ);
+			auto mapped_memory = m_gatherAllocation.memory->map({ 0ull, m_gatherAllocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ);
 			if (!mapped_memory)
 				return logFail("Failed to map the Device Memory!\n");
 		}
@@ -379,6 +460,13 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				return false;
 			m_gpuImg->setObjectDebugName("Autoexposure Image");
 
+			imageParams = m_gpuImg->getCreationParameters();
+			imageParams.usage = IGPUImage::EUF_SAMPLED_BIT | IGPUImage::EUF_STORAGE_BIT;
+			m_tonemappedImg = m_device->createImage(std::move(imageParams));
+			if (!m_tonemappedImg || !m_device->allocate(m_tonemappedImg->getMemoryReqs(), m_tonemappedImg.get()).isValid())
+				return false;
+			m_tonemappedImg->setObjectDebugName("Tonemapped Image");
+
 			// Now show the window
 			m_winMgr->show(m_window.get());
 
@@ -448,37 +536,51 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			IGPUImageView::SCreationParams gpuImgViewParams = {
 				.image = m_gpuImg,
 				.viewType = IGPUImageView::ET_2D,
-				.format = m_gpuImg->getCreationParameters().format
+				.format = m_gpuImg->getCreationParameters().format,
+			};
+			IGPUImageView::SCreationParams tonemappedImgViewParams = {
+				.image = m_tonemappedImg,
+				.viewType = IGPUImageView::ET_2D,
+				.format = m_tonemappedImg->getCreationParameters().format
 			};
 
 			m_gpuImgView = m_device->createImageView(std::move(gpuImgViewParams));
+			m_tonemappedImgView = m_device->createImageView(std::move(tonemappedImgViewParams));
 
-			IGPUDescriptorSet::SDescriptorInfo info1 = {};
-			info1.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-			info1.desc = m_gpuImgView;
+			IGPUDescriptorSet::SDescriptorInfo infos[3];
+			infos[0].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+			infos[0].desc = m_gpuImgView;
+			infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
+			infos[1].desc = m_tonemappedImgView;
+			infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+			infos[2].desc = m_tonemappedImgView;
 
-			IGPUDescriptorSet::SDescriptorInfo info2 = {};
-			info2.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-			info2.desc = m_gpuImgView; // FIXME: temporarily pass in input image
 
 			IGPUDescriptorSet::SWriteDescriptorSet writeDescriptors[] = {
 				{
-					.dstSet = m_lumaPresentDS[0].get(),
+					.dstSet = m_ds[0].get(),
 					.binding = 0,
 					.arrayElement = 0,
 					.count = 1,
-					.info = &info1
+					.info = infos
 				},
 				{
-					.dstSet = m_lumaPresentDS[1].get(),
+					.dstSet = m_ds[1].get(),
 					.binding = 0,
 					.arrayElement = 0,
 					.count = 1,
-					.info = &info2
+					.info = infos
+				},
+				{
+					.dstSet = m_ds[2].get(),
+					.binding = 0,
+					.arrayElement = 0,
+					.count = 1,
+					.info = infos
 				}
 			};
 
-			m_device->updateDescriptorSets(2, writeDescriptors, 0, nullptr);
+			m_device->updateDescriptorSets(3, writeDescriptors, 0, nullptr);
 
 			queue->endCapture();
 		}
@@ -492,30 +594,38 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 		const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 
 		uint32_t2 viewportSize = { m_gpuImg->getCreationParameters().extent.width, m_gpuImg->getCreationParameters().extent.height };
+		float32_t sampleCount = (viewportSize.x * viewportSize.y) / 4;
+		uint32_t workgroupSize = SubgroupSize * SubgroupSize;
+		sampleCount = workgroupSize * (1 + (sampleCount - 1) / workgroupSize);
 
 		// Luma Meter
 		{
 			auto queue = getComputeQueue();
 			auto cmdbuf = m_computeCmdBufs[0].get();
 			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
-			auto ds = m_lumaPresentDS[0].get();
+			auto ds = m_ds[0].get();
 
 			auto pc = AutoexposurePushData
 			{
 				.window = nbl::hlsl::luma_meter::MeteringWindow::create(MeteringWindowScale, MeteringWindowOffset),
 				.lumaMinMax = LumaMinMax,
-				.EV = 0.0f,
+				.sampleCount = sampleCount,
 				.viewportSize = viewportSize,
-				.lumaMeterBDA = m_lumaGatherBDA
+				.lumaMeterBDA = m_gatherBDA
+			};
+
+			const uint32_t2 dispatchSize = {
+				1 + ((viewportSize.x / 2) - 1) / SubgroupSize,
+				1 + ((viewportSize.y / 2) - 1) / SubgroupSize
 			};
 
 			queue->startCapture();
 
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			cmdbuf->bindComputePipeline(m_lumaMeterPipeline.get());
-			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_lumaMeterPipeline->getLayout(), 0, 1, &ds); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
-			cmdbuf->pushConstants(m_lumaMeterPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
-			cmdbuf->dispatch(1 + (viewportSize.x - 1) / SubgroupSize, 1 + (viewportSize.y - 1) / SubgroupSize);
+			cmdbuf->bindComputePipeline(m_meterPipeline.get());
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_meterPipeline->getLayout(), 0, 1, &ds); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
+			cmdbuf->pushConstants(m_meterPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
+			cmdbuf->dispatch(dispatchSize.x, dispatchSize.y);
 			cmdbuf->end();
 
 			{
@@ -528,7 +638,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				submit_infos[0].commandBuffers = cmdBufs;
 				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
 					{
-						.semaphore = m_lumaMeterSemaphore.get(),
+						.semaphore = m_meterSemaphore.get(),
 						.value = m_submitIx + 1,
 						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
 					}
@@ -541,36 +651,114 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			const ISemaphore::SWaitInfo wait_infos[] = {
 				{
-					.semaphore = m_lumaMeterSemaphore.get(),
+					.semaphore = m_meterSemaphore.get(),
 					.value = m_submitIx + 1
 				}
 			};
 			m_device->blockForSemaphores(wait_infos);
 		}
 
-		// Get EV
+		// Luma Gather and Tonemapping
 		{
-			const auto memory_range = ILogicalDevice::MappedMemoryRange(
-				m_lumaGatherAllocation.memory.get(),
-				0ull,
-				m_lumaGatherAllocation.memory->getAllocationSize()
-			);
+			auto queue = getComputeQueue();
+			auto cmdbuf = m_computeCmdBufs[1].get();
+			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
+			auto ds1 = m_ds[0].get();
+			auto ds2 = m_ds[1].get();
+
+			auto pc = AutoexposurePushData
+			{
+				.window = nbl::hlsl::luma_meter::MeteringWindow::create(MeteringWindowScale, MeteringWindowOffset),
+				.lumaMinMax = LumaMinMax,
+				.sampleCount = sampleCount,
+				.viewportSize = viewportSize,
+				.lumaMeterBDA = m_gatherBDA
+			};
+
+			const uint32_t2 dispatchSize = {
+				1 + ((viewportSize.x) - 1) / SubgroupSize,
+				1 + ((viewportSize.y) - 1) / SubgroupSize
+			};
 
-			if (!m_lumaGatherAllocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-				m_device->invalidateMappedMemoryRanges(1, &memory_range);
+			const SMemoryBarrier computeBarriers[] = {
+				{
+					.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+				},
+				{
+					.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+					.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+				}
+			};
 
-			const uint32_t* buffData = reinterpret_cast<const uint32_t*>(m_lumaGatherAllocation.memory->getMappedPointer());
+			// change the layout of the image
+			const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers1[] = {
+				{
+					.barrier = {
+						.dep = computeBarriers[0]
+						// no ownership transfers
+					},
+					.image = m_gpuImg.get(),
+				// transition the whole view
+				.subresourceRange = m_tonemappedImgView->getCreationParameters().subresourceRange,
+				// a wiping transition
+				.newLayout = IGPUImage::LAYOUT::GENERAL
+				}
+			};
+			const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers2[] = {
+				{
+					.barrier = {
+						.dep = computeBarriers[1]
+						// no ownership transfers
+					},
+					.image = m_gpuImg.get(),
+				// transition the whole view
+				.subresourceRange = m_tonemappedImgView->getCreationParameters().subresourceRange,
+				// a wiping transition
+				.oldLayout = IGPUImage::LAYOUT::GENERAL,
+				.newLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL
+				}
+			};
 
-			assert(m_lumaGatherAllocation.offset == 0); // simpler than writing out all the pointer arithmetic
+			queue->startCapture();
 
-			m_EV = 0.0f;
-			for (int index = 0; index < SubgroupSize; index++) {
-				m_EV += static_cast<float32_t>(buffData[index]) / (log2(LumaMinMax[1]) - log2(LumaMinMax[0])) + log2(LumaMinMax[0]);
+			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			cmdbuf->bindComputePipeline(m_gatherPipeline.get());
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_gatherPipeline->getLayout(), 0, 1, &ds1); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_gatherPipeline->getLayout(), 3, 1, &ds2);
+			cmdbuf->pushConstants(m_gatherPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers1 });
+			cmdbuf->dispatch(dispatchSize.x, dispatchSize.y);
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers2 });
+			cmdbuf->end();
+
+			{
+				IQueue::SSubmitInfo submit_infos[1];
+				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
+					{
+						.cmdbuf = cmdbuf
+					}
+				};
+				submit_infos[0].commandBuffers = cmdBufs;
+				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
+					{
+						.semaphore = m_gatherSemaphore.get(),
+						.value = m_submitIx + 1,
+						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+					}
+				};
+				submit_infos[0].signalSemaphores = signals;
+
+				queue->submit(submit_infos);
+				queue->endCapture();
 			}
-			uint64_t sampleCount = (viewportSize.x * viewportSize.y) / 4;
-			uint64_t workgroupSize = SubgroupSize * SubgroupSize;
-			sampleCount = workgroupSize * (1 + (sampleCount - 1) / workgroupSize);
-			m_EV /= sampleCount;
+
+			const ISemaphore::SWaitInfo wait_infos[] = {
+				{
+					.semaphore = m_gatherSemaphore.get(),
+					.value = m_submitIx + 1
+				}
+			};
+			m_device->blockForSemaphores(wait_infos);
 		}
 
 		// Render to swapchain
@@ -583,15 +771,15 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			auto queue = getGraphicsQueue();
 			auto cmdbuf = m_graphicsCmdBufs[0].get();
 			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
-			auto ds = m_lumaPresentDS[1].get();
+			auto ds = m_ds[2].get();
 
 			auto pc = AutoexposurePushData
 			{
 				.window = nbl::hlsl::luma_meter::MeteringWindow::create(MeteringWindowScale, MeteringWindowOffset),
 				.lumaMinMax = LumaMinMax,
-				.EV = m_EV,
+				.sampleCount = sampleCount,
 				.viewportSize = viewportSize,
-				.lumaMeterBDA = m_lumaGatherBDA
+				.lumaMeterBDA = m_gatherBDA
 			};
 
 			queue->startCapture();
@@ -628,7 +816,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			}
 
 			cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
-			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_presentPipeline->getLayout(), 1, 1, &ds);
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, m_presentPipeline->getLayout(), 3, 1, &ds);
 			ext::FullScreenTriangle::recordDrawCall(cmdbuf);
 			cmdbuf->endRenderPass();
 
@@ -695,29 +883,28 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	}
 
 protected:
-	nbl::video::IDeviceMemoryAllocator::SAllocation m_lumaGatherAllocation;
-	uint64_t m_lumaGatherBDA;
-	float32_t m_EV = 0;
-	smart_refctd_ptr<IGPUImage> m_gpuImg;
-	smart_refctd_ptr<IGPUImageView> m_gpuImgView;
+	nbl::video::IDeviceMemoryAllocator::SAllocation m_gatherAllocation;
+	uint64_t m_gatherBDA;
+	smart_refctd_ptr<IGPUImage> m_gpuImg, m_tonemappedImg;
+	smart_refctd_ptr<IGPUImageView> m_gpuImgView, m_tonemappedImgView;
 
 	// for image uploads
 	smart_refctd_ptr<ISemaphore> m_scratchSemaphore;
 	SIntendedSubmitInfo m_intendedSubmit;
 
 	// Pipelines
+	smart_refctd_ptr<IGPUComputePipeline> m_meterPipeline, m_gatherPipeline;
 	smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
-	smart_refctd_ptr<IGPUComputePipeline> m_lumaMeterPipeline;
 
 	// Descriptor Sets
-	std::array<smart_refctd_ptr<IGPUDescriptorSet>, ISwapchain::MaxImages> m_lumaPresentDS;
+	std::array<smart_refctd_ptr<IGPUDescriptorSet>, 3> m_ds;
 
 	// Command Buffers
 	smart_refctd_ptr<IGPUCommandPool> m_graphicsCmdPool, m_computeCmdPool;
-	std::array<smart_refctd_ptr<IGPUCommandBuffer>, ISwapchain::MaxImages> m_graphicsCmdBufs, m_computeCmdBufs;
+	std::array<smart_refctd_ptr<IGPUCommandBuffer>, 2> m_graphicsCmdBufs, m_computeCmdBufs;
 
 	// Semaphores
-	smart_refctd_ptr<ISemaphore> m_lumaMeterSemaphore, m_presentSemaphore;
+	smart_refctd_ptr<ISemaphore> m_meterSemaphore, m_gatherSemaphore, m_presentSemaphore;
 	uint64_t m_submitIx = 0;
 
 	// window

From d3b5765eb82c268e7bdadd7369d83e6273b38570 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 28 Aug 2024 18:00:25 +0530
Subject: [PATCH 39/50] Handle image layouts correctly

---
 26_Autoexposure/main.cpp | 52 +++++++---------------------------------
 1 file changed, 8 insertions(+), 44 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 570e96807..6e61573d1 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -504,6 +504,11 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					.subresourceRange = cpuImgParams.subresourceRange,
 					// a wiping transition
 					.newLayout = IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL
+				},
+				{
+					.image = m_tonemappedImg.get(),
+					.subresourceRange = cpuImgParams.subresourceRange,
+					.newLayout = IGPUImage::LAYOUT::GENERAL
 				}
 			};
 			const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers2[] = {
@@ -552,7 +557,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			infos[0].desc = m_gpuImgView;
 			infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
 			infos[1].desc = m_tonemappedImgView;
-			infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+			infos[2].info.image.imageLayout = IImage::LAYOUT::GENERAL;
 			infos[2].desc = m_tonemappedImgView;
 
 
@@ -569,14 +574,14 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					.binding = 0,
 					.arrayElement = 0,
 					.count = 1,
-					.info = infos
+					.info = infos + 1
 				},
 				{
 					.dstSet = m_ds[2].get(),
 					.binding = 0,
 					.arrayElement = 0,
 					.count = 1,
-					.info = infos
+					.info = infos + 2
 				}
 			};
 
@@ -680,45 +685,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				1 + ((viewportSize.y) - 1) / SubgroupSize
 			};
 
-			const SMemoryBarrier computeBarriers[] = {
-				{
-					.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
-				},
-				{
-					.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
-					.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
-				}
-			};
-
-			// change the layout of the image
-			const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers1[] = {
-				{
-					.barrier = {
-						.dep = computeBarriers[0]
-						// no ownership transfers
-					},
-					.image = m_gpuImg.get(),
-				// transition the whole view
-				.subresourceRange = m_tonemappedImgView->getCreationParameters().subresourceRange,
-				// a wiping transition
-				.newLayout = IGPUImage::LAYOUT::GENERAL
-				}
-			};
-			const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers2[] = {
-				{
-					.barrier = {
-						.dep = computeBarriers[1]
-						// no ownership transfers
-					},
-					.image = m_gpuImg.get(),
-				// transition the whole view
-				.subresourceRange = m_tonemappedImgView->getCreationParameters().subresourceRange,
-				// a wiping transition
-				.oldLayout = IGPUImage::LAYOUT::GENERAL,
-				.newLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL
-				}
-			};
-
 			queue->startCapture();
 
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
@@ -726,9 +692,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_gatherPipeline->getLayout(), 0, 1, &ds1); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
 			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_gatherPipeline->getLayout(), 3, 1, &ds2);
 			cmdbuf->pushConstants(m_gatherPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
-			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers1 });
 			cmdbuf->dispatch(dispatchSize.x, dispatchSize.y);
-			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers2 });
 			cmdbuf->end();
 
 			{

From 612f0f6b7b4d7d4a52d1a1af3aa8b8aaddab6bfc Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 27 Sep 2024 12:18:20 +0100
Subject: [PATCH 40/50] Simplify type

---
 26_Autoexposure/app_resources/common.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/26_Autoexposure/app_resources/common.hlsl b/26_Autoexposure/app_resources/common.hlsl
index b270c38ce..bf2c19920 100644
--- a/26_Autoexposure/app_resources/common.hlsl
+++ b/26_Autoexposure/app_resources/common.hlsl
@@ -15,7 +15,7 @@ namespace hlsl
 
 struct AutoexposurePushData
 {
-    nbl::hlsl::luma_meter::MeteringWindow window;
+    luma_meter::MeteringWindow window;
     float32_t2 lumaMinMax;
     float32_t sampleCount;
     uint32_t2 viewportSize;

From cb46d82fe03e6b4f41f2dbddb45c6d9056bfa5ad Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 27 Sep 2024 12:19:02 +0100
Subject: [PATCH 41/50] Wait for correct semaphore value

---
 26_Autoexposure/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 6e61573d1..fdce953bd 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -821,7 +821,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				const ISemaphore::SWaitInfo cmdbufDonePending[] = {
 					{
 						.semaphore = m_presentSemaphore.get(),
-						.value = m_submitIx
+						.value = m_submitIx + 1
 					}
 				};
 				if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)

From 1996cf33d03308c43ec503a8ca298c40c5386fe1 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Sun, 29 Sep 2024 18:17:39 +0100
Subject: [PATCH 42/50] Remove unnecessary data members

---
 26_Autoexposure/main.cpp | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index fdce953bd..77d2c8e62 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -455,17 +455,17 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			}
 			if (imageParams.type == IGPUImage::ET_3D)
 				imageParams.flags |= IGPUImage::ECF_2D_ARRAY_COMPATIBLE_BIT;
-			m_gpuImg = m_device->createImage(std::move(imageParams));
-			if (!m_gpuImg || !m_device->allocate(m_gpuImg->getMemoryReqs(), m_gpuImg.get()).isValid())
+			auto gpuImg = m_device->createImage(std::move(imageParams));
+			if (!gpuImg || !m_device->allocate(gpuImg->getMemoryReqs(), gpuImg.get()).isValid())
 				return false;
-			m_gpuImg->setObjectDebugName("Autoexposure Image");
+			gpuImg->setObjectDebugName("Autoexposure Image");
 
-			imageParams = m_gpuImg->getCreationParameters();
+			imageParams = gpuImg->getCreationParameters();
 			imageParams.usage = IGPUImage::EUF_SAMPLED_BIT | IGPUImage::EUF_STORAGE_BIT;
-			m_tonemappedImg = m_device->createImage(std::move(imageParams));
-			if (!m_tonemappedImg || !m_device->allocate(m_tonemappedImg->getMemoryReqs(), m_tonemappedImg.get()).isValid())
+			auto tonemappedImg = m_device->createImage(std::move(imageParams));
+			if (!tonemappedImg || !m_device->allocate(tonemappedImg->getMemoryReqs(), tonemappedImg.get()).isValid())
 				return false;
-			m_tonemappedImg->setObjectDebugName("Tonemapped Image");
+			tonemappedImg->setObjectDebugName("Tonemapped Image");
 
 			// Now show the window
 			m_winMgr->show(m_window.get());
@@ -499,14 +499,14 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 						.dep = transferBarriers[0]
 						// no ownership transfers
 					},
-					.image = m_gpuImg.get(),
+					.image = gpuImg.get(),
 					// transition the whole view
 					.subresourceRange = cpuImgParams.subresourceRange,
 					// a wiping transition
 					.newLayout = IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL
 				},
 				{
-					.image = m_tonemappedImg.get(),
+					.image = tonemappedImg.get(),
 					.subresourceRange = cpuImgParams.subresourceRange,
 					.newLayout = IGPUImage::LAYOUT::GENERAL
 				}
@@ -517,7 +517,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 						.dep = transferBarriers[1]
 						// no ownership transfers
 					},
-					.image = m_gpuImg.get(),
+					.image = gpuImg.get(),
 					// transition the whole view
 					.subresourceRange = cpuImgParams.subresourceRange,
 					// a wiping transition
@@ -531,7 +531,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				m_intendedSubmit,
 				cpuImgParams.image->getBuffer(),
 				cpuImgParams.image->getCreationParameters().format,
-				m_gpuImg.get(),
+				gpuImg.get(),
 				IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL,
 				cpuImgParams.image->getRegions()
 			);
@@ -539,14 +539,14 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			m_utils->autoSubmit(m_intendedSubmit, [&](SIntendedSubmitInfo& nextSubmit) -> bool { return true; });
 
 			IGPUImageView::SCreationParams gpuImgViewParams = {
-				.image = m_gpuImg,
+				.image = gpuImg,
 				.viewType = IGPUImageView::ET_2D,
-				.format = m_gpuImg->getCreationParameters().format,
+				.format = gpuImg->getCreationParameters().format,
 			};
 			IGPUImageView::SCreationParams tonemappedImgViewParams = {
-				.image = m_tonemappedImg,
+				.image = tonemappedImg,
 				.viewType = IGPUImageView::ET_2D,
-				.format = m_tonemappedImg->getCreationParameters().format
+				.format = tonemappedImg->getCreationParameters().format
 			};
 
 			m_gpuImgView = m_device->createImageView(std::move(gpuImgViewParams));
@@ -598,7 +598,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	{
 		const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 
-		uint32_t2 viewportSize = { m_gpuImg->getCreationParameters().extent.width, m_gpuImg->getCreationParameters().extent.height };
+		auto gpuImgExtent = m_gpuImgView->getCreationParameters().image->getCreationParameters().extent;
+		uint32_t2 viewportSize = { gpuImgExtent.width, gpuImgExtent.height };
 		float32_t sampleCount = (viewportSize.x * viewportSize.y) / 4;
 		uint32_t workgroupSize = SubgroupSize * SubgroupSize;
 		sampleCount = workgroupSize * (1 + (sampleCount - 1) / workgroupSize);
@@ -849,7 +850,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 protected:
 	nbl::video::IDeviceMemoryAllocator::SAllocation m_gatherAllocation;
 	uint64_t m_gatherBDA;
-	smart_refctd_ptr<IGPUImage> m_gpuImg, m_tonemappedImg;
 	smart_refctd_ptr<IGPUImageView> m_gpuImgView, m_tonemappedImgView;
 
 	// for image uploads

From bc11b4a249881acab17bacb2918cd78f5c08ba58 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Sat, 14 Dec 2024 17:55:20 +0000
Subject: [PATCH 43/50] Use asset converter for images and descriptors

---
 ...ather.comp.hlsl => luma_tonemap.comp.hlsl} |   0
 26_Autoexposure/main.cpp                      | 790 +++++++++---------
 2 files changed, 401 insertions(+), 389 deletions(-)
 rename 26_Autoexposure/app_resources/{luma_gather.comp.hlsl => luma_tonemap.comp.hlsl} (100%)

diff --git a/26_Autoexposure/app_resources/luma_gather.comp.hlsl b/26_Autoexposure/app_resources/luma_tonemap.comp.hlsl
similarity index 100%
rename from 26_Autoexposure/app_resources/luma_gather.comp.hlsl
rename to 26_Autoexposure/app_resources/luma_tonemap.comp.hlsl
diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 77d2c8e62..f6d690a00 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -2,7 +2,7 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-#include "../common/SimpleWindowedApplication.hpp"
+#include "SimpleWindowedApplication.hpp"
 
 #include "nbl/video/surface/CSurfaceVulkan.h"
 #include "nbl/asset/interchange/IAssetLoader.h"
@@ -24,7 +24,12 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 	using clock_t = std::chrono::steady_clock;
 
-	constexpr static inline std::string_view DefaultImagePathsFile = "../../media/noises/spp_benchmark_4k_512.exr";
+	static inline std::string DefaultImagePathsFile = "../../media/noises/spp_benchmark_4k_512.exr";
+	static inline std::array<std::string, 3> ShaderPaths = {
+		"app_resources/luma_meter.comp.hlsl",
+		"app_resources/luma_tonemap.comp.hlsl" ,
+		"app_resources/present.frag.hlsl"
+	};
 	constexpr static inline uint32_t2 Dimensions = { 1280, 720 };
 	constexpr static inline float32_t2 MeteringWindowScale = { 0.5f, 0.5f };
 	constexpr static inline float32_t2 MeteringWindowOffset = { 0.25f, 0.25f };
@@ -70,124 +75,29 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 		if (!asset_base_t::onAppInitialized(std::move(system)))
 			return false;
 
-		/*
-			* We'll be using a combined image sampler for this example, which lets us assign both a sampled image and a sampler to the same binding.
-			* In this example we provide a sampler at descriptor set creation time, via the SBinding struct below. This specifies that the sampler for this binding is immutable,
-			* as evidenced by the name of the field in the SBinding.
-			* Samplers for combined image samplers can also be mutable, which for a binding of a descriptor set is specified also at creation time by leaving the immutableSamplers
-			* field set to its default (nullptr).
-			*/
-		std::array<smart_refctd_ptr<IGPUDescriptorSetLayout>, 3> dsLayouts;
-		{
-			auto defaultSampler = m_device->createSampler(
-				{
-					.AnisotropicFilter = 0
-				}
-			);
-
-			const IGPUDescriptorSetLayout::SBinding imgBindings[3][1] = {
-				{
-					{
-						.binding = 0,
-						.type = IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
-						.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-						.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
-						.count = 1,
-						.immutableSamplers = &defaultSampler
-					}
-				},
-				{
-					{
-						.binding = 0,
-						.type = IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
-						.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-						.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
-						.count = 1,
-						.immutableSamplers = nullptr
-					}
-				},
-				{
-					{
-						.binding = 0,
-						.type = IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
-						.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-						.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-						.count = 1,
-						.immutableSamplers = &defaultSampler
-					}
-				}
-			};
-
-			bool dsLayoutCreation = true;
-			for (uint32_t index = 0; index < dsLayouts.size(); index++) {
-				dsLayouts[index] = m_device->createDescriptorSetLayout(imgBindings[index]);
-				dsLayoutCreation = dsLayoutCreation && dsLayouts[index];
-			}
-
-			if (!dsLayoutCreation)
-				return logFail("Failed to Create Descriptor Layouts");
-		}
-
 		// Create semaphores
 		m_meterSemaphore = m_device->createSemaphore(m_submitIx);
-		m_gatherSemaphore = m_device->createSemaphore(m_submitIx);
+		m_tonemapSemaphore = m_device->createSemaphore(m_submitIx);
 		m_presentSemaphore = m_device->createSemaphore(m_submitIx);
 
-		// create the descriptor sets and with enough room
+		// Create command pool and buffers
 		{
-			std::array<core::smart_refctd_ptr<IDescriptorPool>, 3> dsPools;
-			bool dsPoolCreation = true;
-			{
-				const video::IGPUDescriptorSetLayout* const layouts[] = { dsLayouts[0].get() };
-				const uint32_t setCounts[] = { 1u };
-				dsPools[0] = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
-				dsPoolCreation = dsPoolCreation && dsPools[0];
-			}
-			{
-				const video::IGPUDescriptorSetLayout* const layouts[] = { dsLayouts[1].get() };
-				const uint32_t setCounts[] = { 1u };
-				dsPools[1] = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
-				dsPoolCreation = dsPoolCreation && dsPools[1];
-			}
-			{
-				const video::IGPUDescriptorSetLayout* const layouts[] = { dsLayouts[2].get() };
-				const uint32_t setCounts[] = { 1u };
-				dsPools[2] = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
-				dsPoolCreation = dsPoolCreation && dsPools[2];
-			}
+			auto gQueue = getGraphicsQueue();
+			m_cmdPool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+			if (!m_cmdPool)
+				return logFail("Couldn't create Command Pool!");
 
-			if (!dsPoolCreation)
-				return logFail("Failed to Create Descriptor Pools");
-
-			bool dsCreation = true;
-			{
-				m_ds[0] = dsPools[0]->createDescriptorSet(dsLayouts[0]);
-				dsCreation = dsCreation && m_ds[0];
-			}
-			{
-				m_ds[1] = dsPools[1]->createDescriptorSet(dsLayouts[1]);
-				dsCreation = dsCreation && m_ds[1];
-			}
-			{
-				m_ds[2] = dsPools[2]->createDescriptorSet(dsLayouts[2]);
-				dsCreation = dsCreation && m_ds[2];
-			}
-
-			if (!dsCreation)
-				return logFail("Could not create Descriptor Sets!");
+			if (!m_cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data(), 1 }))
+				return logFail("Couldn't create Command Buffer!");
 		}
 
-		auto graphicsQueue = getGraphicsQueue();
-		auto computeQueue = getComputeQueue();
-
-		// Gather swapchain resources
-		std::unique_ptr<CDefaultSwapchainFramebuffers> scResources;
-		ISwapchain::SCreationParams swapchainParams;
+		// Create renderpass and init surface
+		nbl::video::IGPURenderpass* renderpass;
 		{
-			swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
-			// Need to choose a surface format
+			ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
 			if (!swapchainParams.deduceFormat(m_physicalDevice))
 				return logFail("Could not choose a Surface Format for the Swapchain!");
+
 			// We actually need external dependencies to ensure ordering of the Implicit Layout Transitions relative to the semaphore signals
 			constexpr IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = {
 				// wipe-transition to ATTACHMENT_OPTIMAL
@@ -202,7 +112,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					// because we clear and don't blend
 					.dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
 					}
-					// leave view offsets and flags default
+				// leave view offsets and flags default
 				},
 				// ATTACHMENT_OPTIMAL to PRESENT_SRC
 				{
@@ -213,31 +123,154 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 						.srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
 						// we can have NONE as the Destinations because the spec says so about presents
 						}
-					// leave view offsets and flags default
+				// leave view offsets and flags default
 				},
 				IGPURenderpass::SCreationParams::DependenciesEnd
 			};
-			scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
-			if (!scResources->getRenderpass())
+
+			auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
+
+			renderpass = scResources->getRenderpass();
+
+			if (!renderpass)
 				return logFail("Failed to create Renderpass!");
+
+			auto gQueue = getGraphicsQueue();
+			if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
+				return logFail("Could not create Window & Surface or initialize the Surface!");
 		}
 
-		// Load the shaders and create the pipelines
+		// Create descriptors and pipelines
 		{
-			auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
-			{
+			// Converts a span of CPU descriptor set layouts into GPU layouts and collects them into a vector.
+			// A layout that fails to convert is logged and terminates the process.
+			auto convertDSLayoutCPU2GPU = [&](std::span<ICPUDescriptorSetLayout *> cpuLayouts) {
+				auto converter = CAssetConverter::create({ .device = m_device.get() });
+				CAssetConverter::SInputs inputs = {};
+				inputs.readCache = converter.get();
+				inputs.logger = m_logger.get();
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSetLayout>>(inputs.assets) = cpuLayouts;
+
+				// no patches are supplied for the layouts; only `reserve` is called, so no `SConvertParams` are built here
+				auto reservation = converter->reserve(inputs);
+				// the `.value` is just a funny way to make the `smart_refctd_ptr` copyable
+				auto gpuLayouts = reservation.getGPUObjects<ICPUDescriptorSetLayout>();
+
+				std::vector<smart_refctd_ptr<IGPUDescriptorSetLayout>> result;
+				result.reserve(cpuLayouts.size());
+				for (auto& gpuLayout : gpuLayouts) {
+					auto layout = gpuLayout.value;
+					if (!layout) {
+						// no varargs are passed to the logger, so the message must not contain a `%s` conversion
+						m_logger->log("Failed to convert an ICPUDescriptorSetLayout into an IGPUDescriptorSetLayout handle", ILogger::ELL_ERROR);
+						std::exit(-1);
+					}
+					result.push_back(layout);
+				}
+
+				return result;
+			};
+			// Converts CPU descriptor sets into GPU descriptor sets; a failed conversion is logged and terminates the process.
+			auto convertDSCPU2GPU = [&](std::span<ICPUDescriptorSet *> cpuDS) {
+				auto converter = CAssetConverter::create({ .device = m_device.get() });
+				CAssetConverter::SInputs inputs = {};
+				inputs.readCache = converter.get();
+				inputs.logger = m_logger.get();
+				CAssetConverter::SConvertParams params = {};
+				params.utilities = m_utils.get();
+				// NOTE(review): `params` is never passed on and only `reserve` is called below - confirm whether `reservation.convert(params)` is required for descriptor sets
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSet>>(inputs.assets) = cpuDS;
+				auto reservation = converter->reserve(inputs);
+				// the `.value` is just a funny way to make the `smart_refctd_ptr` copyable
+				auto gpuDS = reservation.getGPUObjects<ICPUDescriptorSet>();
+
+				std::vector<smart_refctd_ptr<IGPUDescriptorSet>> result;
+				result.reserve(cpuDS.size());
+				for (auto& ds : gpuDS) {
+					if (!ds.value) {
+						// no varargs are passed to the logger, so the message must not contain a `%s` conversion
+						m_logger->log("Failed to convert an ICPUDescriptorSet into an IGPUDescriptorSet handle", ILogger::ELL_ERROR);
+						std::exit(-1);
+					}
+					result.push_back(ds.value);
+				}
+
+				return result;
+			};
+
+			ISampler::SParams samplerParams = {
+				.AnisotropicFilter = 0
+			};
+			auto defaultSampler = make_smart_refctd_ptr<ICPUSampler>(samplerParams);
+
+			std::array<ICPUDescriptorSetLayout::SBinding, 1> meterBindings = {};
+			std::array<ICPUDescriptorSetLayout::SBinding, 1> tonemapBindings = {};
+			std::array<ICPUDescriptorSetLayout::SBinding, 1> presentBindings = {};
+
+			meterBindings[0] = {
+				.binding = 0u,
+				.type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+				.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+				.count = 1u,
+				.immutableSamplers = &defaultSampler
+			};
+			tonemapBindings[0] = {
+				.binding = 0u,
+				.type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+				.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+				.count = 1u,
+				.immutableSamplers = nullptr
+			};
+			presentBindings[0] = {
+				.binding = 0u,
+				.type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+				.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+				.count = 1u,
+				.immutableSamplers = &defaultSampler
+			};
+
+			auto cpuMeterLayout = make_smart_refctd_ptr<ICPUDescriptorSetLayout>(meterBindings);
+			auto cpuTonemapLayout = make_smart_refctd_ptr<ICPUDescriptorSetLayout>(tonemapBindings);
+			auto cpuPresentLayout = make_smart_refctd_ptr<ICPUDescriptorSetLayout>(presentBindings);
+
+			std::array<ICPUDescriptorSetLayout*, 3> cpuLayouts = {
+				cpuMeterLayout.get(),
+				cpuTonemapLayout.get(),
+				cpuPresentLayout.get()
+			};
+
+			auto gpuLayouts = convertDSLayoutCPU2GPU(cpuLayouts);
+
+			auto cpuMeterDS = make_smart_refctd_ptr<ICPUDescriptorSet>(std::move(cpuMeterLayout));
+			auto cpuTonemapDS = make_smart_refctd_ptr<ICPUDescriptorSet>(std::move(cpuTonemapLayout));
+			auto cpuPresentDS = make_smart_refctd_ptr<ICPUDescriptorSet>(std::move(cpuPresentLayout));
+
+			std::array<ICPUDescriptorSet*, 3> cpuDS = {
+				cpuMeterDS.get(),
+				cpuTonemapDS.get(),
+				cpuPresentDS.get()
+			};
+
+			auto gpuDS = convertDSCPU2GPU(cpuDS);
+			m_meterDS = gpuDS[0];
+			m_tonemapDS = gpuDS[1];
+			m_presentDS = gpuDS[2];
+
+			// Create Shaders
+			auto loadAndCompileShader = [&](std::string pathToShader) {
 				IAssetLoader::SAssetLoadParams lp = {};
-				lp.logger = m_logger.get();
-				lp.workingDirectory = ""; // virtual root
-				auto assetBundle = m_assetMgr->getAsset(relPath, lp);
+				auto assetBundle = m_assetMgr->getAsset(pathToShader, lp);
 				const auto assets = assetBundle.getContents();
 				if (assets.empty())
-					return nullptr;
+				{ // the path needs a `%s` conversion and must go through the logger's varargs as a C string
+					m_logger->log("Could not load shader: %s", ILogger::ELL_ERROR, pathToShader.c_str());
+					std::exit(-1);
+				}
 
-				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
 				auto source = IAsset::castDown<ICPUShader>(assets[0]);
-				if (!source)
-					return nullptr;
 				const uint32_t workgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
 				const uint32_t subgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 				auto overriddenSource = CHLSLCompiler::createOverridenCopy(
@@ -246,125 +279,89 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					workgroupSize,
 					subgroupSize
 				);
+				// The down-cast should not fail!
+				assert(overriddenSource);
 
-				return m_device->createShader(overriddenSource.get());
-			};
-
-			auto createComputePipeline = [&](smart_refctd_ptr<IGPUShader>& shader, smart_refctd_ptr<IGPUComputePipeline>& pipeline, smart_refctd_ptr<IGPUPipelineLayout> pipelineLayout) -> bool
-			{
+				// this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple
+				auto shader = m_device->createShader(overriddenSource.get());
+				if (!shader)
 				{
-					IGPUComputePipeline::SCreationParams params = {};
-					params.layout = pipelineLayout.get();
-					params.shader.shader = shader.get();
-					params.shader.entryPoint = "main";
-					params.shader.entries = nullptr;
-					params.shader.requireFullSubgroups = true;
-					params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
-					if (!m_device->createComputePipelines(nullptr, { &params,1 }, &pipeline))
-						return logFail("Failed to create compute pipeline!\n");
+					m_logger->log("Shader creation failed: %s!", ILogger::ELL_ERROR, pathToShader.c_str()); // `%s` consumes a C string, not a std::string object
+					std::exit(-1);
 				}
 
-				return true;
+				return shader;
 			};
 
-			const nbl::asset::SPushConstantRange pcRange = {
-					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
-					.offset = 0,
-					.size = sizeof(AutoexposurePushData)
-			};
+			// Create compute pipelines
+			{
+				std::array<IGPUComputePipeline::SCreationParams, 2> params;
+				std::array<smart_refctd_ptr<IGPUShader>, 2> shaders;
+				std::array<smart_refctd_ptr<IGPUPipelineLayout>, 2> pipelineLayouts;
+				std::array<smart_refctd_ptr<IGPUComputePipeline>, 2> pipelines;
+				for (int index = 0; index < 2; index++) {
+					shaders[index] = loadAndCompileShader(ShaderPaths[index]);
+					const nbl::asset::SPushConstantRange pcRange = {
+							.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+							.offset = 0,
+							.size = sizeof(AutoexposurePushData)
+					};
+					pipelineLayouts[index] = m_device->createPipelineLayout(
+						{ &pcRange, 1 },
+						nullptr,
+						nullptr,
+						smart_refctd_ptr(gpuLayouts[index]),
+						nullptr
+					);
+					if (!pipelineLayouts[index]) {
+						return logFail("Failed to create pipeline layout");
+					}
 
-			// Luma Meter
-			auto meterShader = loadCompileAndCreateShader("app_resources/luma_meter.comp.hlsl");
-			if (!meterShader)
-				return logFail("Failed to Load and Compile Compute Shader: meterShader!");
-			auto meterLayout = m_device->createPipelineLayout(
-				{ &pcRange, 1 },
-				core::smart_refctd_ptr(dsLayouts[0]),
-				nullptr,
-				nullptr,
-				nullptr
-			);
-			if (!createComputePipeline(meterShader, m_meterPipeline, meterLayout))
-				return logFail("Could not create Luma Meter Pipeline!");
-
-			// Luma Gather
-			auto gatherShader = loadCompileAndCreateShader("app_resources/luma_gather.comp.hlsl");
-			if (!gatherShader)
-				return logFail("Failed to Load and Compile Compute Shader: gatherShader!");
-			auto gatherLayout = m_device->createPipelineLayout(
-				{ &pcRange, 1 },
-				core::smart_refctd_ptr(dsLayouts[0]),
-				nullptr,
-				nullptr,
-				core::smart_refctd_ptr(dsLayouts[1])
-			);
-			if (!createComputePipeline(gatherShader, m_gatherPipeline, gatherLayout))
-				return logFail("Could not create Luma Gather Pipeline!");
-
-			// Load FSTri Shader
-			ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
-			if (!fsTriProtoPPln)
-				return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
-
-			// Load Fragment Shader
-			auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");;
-			if (!fragmentShader)
-				return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
-
-			const IGPUShader::SSpecInfo fragSpec = {
-				.entryPoint = "main",
-				.shader = fragmentShader.get()
-			};
-			auto presentLayout = m_device->createPipelineLayout(
-				{ &pcRange, 1 },
-				nullptr,
-				nullptr,
-				nullptr,
-				core::smart_refctd_ptr(dsLayouts[2])
-			);
-			m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scResources->getRenderpass());
-			if (!m_presentPipeline)
-				return logFail("Could not create Graphics Pipeline!");
-		}
+					params[index] = {};
+					params[index].layout = pipelineLayouts[index].get();
+					params[index].shader.shader = shaders[index].get();
+					params[index].shader.entryPoint = "main";
+					params[index].shader.entries = nullptr;
+					params[index].shader.requireFullSubgroups = true;
+					params[index].shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+				}
+				
+				if (!m_device->createComputePipelines(nullptr, params, pipelines.data())) {
+					return logFail("Failed to create compute pipeline!\n");
+				}
 
-		// Init the surface and create the swapchain
-		if (!m_surface || !m_surface->init(graphicsQueue, std::move(scResources), swapchainParams.sharedParams))
-			return logFail("Could not create Window & Surface or initialize the Surface!");
+				m_meterPipeline = std::move(pipelines[0]);
+				m_tonemapPipeline = std::move(pipelines[1]);
+			}
 
-		// need resetttable commandbuffers for the upload utility
-		{
-			m_graphicsCmdPool = m_device->createCommandPool(graphicsQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-			m_computeCmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-
-			// create the commandbuffers
-			if (!m_graphicsCmdPool || !m_computeCmdPool)
-				return logFail("Couldn't create Command Pools!");
-
-			if (
-				!m_graphicsCmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_graphicsCmdBufs.data(), 1 }) ||
-				!m_computeCmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_computeCmdBufs.data(), 2 })
-			)
-				return logFail("Couldn't create Command Buffers!");
-		}
+			// Create graphics pipeline
+			{
+				auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+				ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
+				if (!fsTriProtoPPln)
+					return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
+
+				// Load Fragment Shader
+				auto fragmentShader = loadAndCompileShader(ShaderPaths[2]);
+				if (!fragmentShader)
+					return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
+
+				const IGPUShader::SSpecInfo fragSpec = {
+					.entryPoint = "main",
+					.shader = fragmentShader.get()
+				};
 
-		// things for IUtilities
-		{
-			m_scratchSemaphore = m_device->createSemaphore(0);
-			if (!m_scratchSemaphore)
-				return logFail("Could not create Scratch Semaphore");
-			m_scratchSemaphore->setObjectDebugName("Scratch Semaphore");
-			// we don't want to overcomplicate the example with multi-queue
-			m_intendedSubmit.queue = graphicsQueue;
-			// wait for nothing before upload
-			m_intendedSubmit.waitSemaphores = {};
-			m_intendedSubmit.waitSemaphores = {};
-			// fill later
-			m_intendedSubmit.commandBuffers = {};
-			m_intendedSubmit.scratchSemaphore = {
-				.semaphore = m_scratchSemaphore.get(),
-				.value = 0,
-				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
-			};
+				auto presentLayout = m_device->createPipelineLayout(
+					{},
+					nullptr,
+					nullptr,
+					std::move(gpuLayouts[2]),
+					nullptr
+				);
+				m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass());
+				if (!m_presentPipeline)
+					return logFail("Could not create Graphics Pipeline!");
+			}
 		}
 
 		// Allocate and create buffer for Luma Gather
@@ -397,6 +394,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 						return logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
 
 					assert(allocation->memory.get() == buffer->getBoundMemory().memory);
+					return true;
 				};
 
 				build_buffer(m_device, &m_gatherAllocation, buffer, m_physicalDevice->getLimits().maxSubgroupSize, "Luma Gather Buffer");
@@ -408,176 +406,188 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				return logFail("Failed to map the Device Memory!\n");
 		}
 
-		// Allocate and Leave 1/4 for image uploads, to test image copy with small memory remaining
-		{
-			uint32_t localOffset = video::StreamingTransientDataBufferMT<>::invalid_value;
-			uint32_t maxFreeBlock = m_utils->getDefaultUpStreamingBuffer()->max_size();
-			const uint32_t allocationAlignment = 64u;
-			const uint32_t allocationSize = (maxFreeBlock / 4) * 3;
-			m_utils->getDefaultUpStreamingBuffer()->multi_allocate(std::chrono::steady_clock::now() + std::chrono::microseconds(500u), 1u, &localOffset, &allocationSize, &allocationAlignment);
-		}
-
 		// Load exr file into gpu
+		smart_refctd_ptr<IGPUImage> gpuImg;
 		{
-			IAssetLoader::SAssetLoadParams params;
-			auto imageBundle = m_assetMgr->getAsset(DefaultImagePathsFile.data(), params);
-			auto cpuImg = IAsset::castDown<ICPUImage>(imageBundle.getContents().begin()[0]);
-			auto format = cpuImg->getCreationParameters().format;
-
-			ICPUImageView::SCreationParams viewParams = {
-				.flags = ICPUImageView::E_CREATE_FLAGS::ECF_NONE,
-				.image = std::move(cpuImg),
-				.viewType = IImageView<ICPUImage>::E_TYPE::ET_2D,
-				.format = format,
-				.subresourceRange = {
-					.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT,
-					.baseMipLevel = 0u,
-					.levelCount = ICPUImageView::remaining_mip_levels,
-					.baseArrayLayer = 0u,
-					.layerCount = ICPUImageView::remaining_array_layers
-				}
-			};
+			auto convertImgCPU2GPU = [&](ICPUImage* cpuImg) {
+				auto queue = getGraphicsQueue();
+				auto cmdbuf = m_cmdBufs[0].get();
+				cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
+				std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> commandBufferInfo = { cmdbuf };
+				core::smart_refctd_ptr<ISemaphore> imgFillSemaphore = m_device->createSemaphore(0);
+				imgFillSemaphore->setObjectDebugName("Image Fill Semaphore");
+
+				auto converter = CAssetConverter::create({ .device = m_device.get() });
+				// We don't want to generate mip-maps for these images, to ensure that we must override the default callbacks.
+				struct SInputs final : CAssetConverter::SInputs
+				{
+					// we also need to override this to have concurrent sharing
+					inline std::span<const uint32_t> getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUImage* buffer, const CAssetConverter::patch_t<asset::ICPUImage>& patch) const override
+					{
+						if (familyIndices.size() > 1)
+							return familyIndices;
+						return {};
+					}
 
-			const auto cpuImgView = ICPUImageView::create(std::move(viewParams));
-			const auto& cpuImgParams = cpuImgView->getCreationParameters();
+					inline uint8_t getMipLevelCount(const size_t groupCopyID, const ICPUImage* image, const CAssetConverter::patch_t<asset::ICPUImage>& patch) const override
+					{
+						return image->getCreationParameters().mipLevels;
+					}
+					inline uint16_t needToRecomputeMips(const size_t groupCopyID, const ICPUImage* image, const CAssetConverter::patch_t<asset::ICPUImage>& patch) const override
+					{
+						return 0b0u;
+					}
 
-			// create matching size image upto dimensions
-			IGPUImage::SCreationParams imageParams = {};
-			imageParams = cpuImgParams.image->getCreationParameters();
-			imageParams.usage |= IGPUImage::EUF_TRANSFER_DST_BIT | IGPUImage::EUF_SAMPLED_BIT | IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT;
-			// promote format because RGB8 and friends don't actually exist in HW
-			{
-				const IPhysicalDevice::SImageFormatPromotionRequest request = {
-					.originalFormat = imageParams.format,
-					.usages = IPhysicalDevice::SFormatImageUsages::SUsage(imageParams.usage)
+					std::vector<uint32_t> familyIndices;
+				} inputs = {};
+				inputs.readCache = converter.get();
+				inputs.logger = m_logger.get();
+				{
+					const core::set<uint32_t> uniqueFamilyIndices = { queue->getFamilyIndex(), queue->getFamilyIndex() };
+					inputs.familyIndices = { uniqueFamilyIndices.begin(),uniqueFamilyIndices.end() };
+				}
+				// scratch command buffers for asset converter transfer commands
+				SIntendedSubmitInfo transfer = {
+					.queue = queue,
+					.waitSemaphores = {},
+					.prevCommandBuffers = {},
+					.scratchCommandBuffers = commandBufferInfo,
+					.scratchSemaphore = {
+						.semaphore = imgFillSemaphore.get(),
+						.value = 0,
+						// because of layout transitions
+						.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
+					}
 				};
-				imageParams.format = m_physicalDevice->promoteImageFormat(request, imageParams.tiling);
-			}
-			if (imageParams.type == IGPUImage::ET_3D)
-				imageParams.flags |= IGPUImage::ECF_2D_ARRAY_COMPATIBLE_BIT;
-			auto gpuImg = m_device->createImage(std::move(imageParams));
-			if (!gpuImg || !m_device->allocate(gpuImg->getMemoryReqs(), gpuImg.get()).isValid())
-				return false;
-			gpuImg->setObjectDebugName("Autoexposure Image");
-
-			imageParams = gpuImg->getCreationParameters();
-			imageParams.usage = IGPUImage::EUF_SAMPLED_BIT | IGPUImage::EUF_STORAGE_BIT;
-			auto tonemappedImg = m_device->createImage(std::move(imageParams));
-			if (!tonemappedImg || !m_device->allocate(tonemappedImg->getMemoryReqs(), tonemappedImg.get()).isValid())
-				return false;
-			tonemappedImg->setObjectDebugName("Tonemapped Image");
-
-			// Now show the window
-			m_winMgr->show(m_window.get());
-
-			// we don't want to overcomplicate the example with multi-queue
-			auto queue = getGraphicsQueue();
-			auto cmdbuf = m_graphicsCmdBufs[0].get();
-			IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = { cmdbuf };
-			m_intendedSubmit.commandBuffers = { &cmdbufInfo, 1 };
+				// as per the `SIntendedSubmitInfo` one commandbuffer must be begun
+				cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+				// Normally we'd have to inherit and override the `getFinalOwnerQueueFamily` callback to ensure that the
+				// compute queue becomes the owner of the buffers and images post-transfer, but in this example we use concurrent sharing
+				CAssetConverter::SConvertParams params = {};
+				params.transfer = &transfer;
+				params.utilities = m_utils.get();
+
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUImage>>(inputs.assets) = { &cpuImg, 1 };
+				// assert that we don't need to provide patches
+				assert(cpuImg->getImageUsageFlags().hasFlags(ICPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT));
+				auto reservation = converter->reserve(inputs);
+				// the `.value` is just a funny way to make the `smart_refctd_ptr` copyable
+				auto gpuImgs = reservation.getGPUObjects<ICPUImage>();
+				for (auto& gpuImg : gpuImgs) {
+					if (!gpuImg.value) { // the converted handle lives in `.value`, the same member read when the image is returned
+						m_logger->log("Failed to convert %s into an IGPUImage handle", ILogger::ELL_ERROR, std::string(DefaultImagePathsFile).c_str()); // `%s` needs a C string
+						std::exit(-1);
+					}
+				}
 
-			// there's no previous operation to wait for
-			const SMemoryBarrier transferBarriers[] = {
-				{
-					.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
-					.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
-				},
-				{
-					.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
-					.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+				// and launch the conversions
+				m_api->startCapture();
+				auto result = reservation.convert(params);
+				m_api->endCapture();
+				if (!result.blocking() && result.copy() != IQueue::RESULT::SUCCESS) {
+					m_logger->log("Failed to record or submit conversions", ILogger::ELL_ERROR);
+					std::exit(-1);
 				}
+
+				return gpuImgs[0].value;
 			};
 
-			// upload image and write to descriptor set
-			queue->startCapture();
+			smart_refctd_ptr<ICPUImage> cpuImg;
+			{
+				IAssetLoader::SAssetLoadParams lp;
+				SAssetBundle bundle = m_assetMgr->getAsset(DefaultImagePathsFile, lp);
+				if (bundle.getContents().empty()) {
+					m_logger->log("Couldn't load an asset.", ILogger::ELL_ERROR);
+					std::exit(-1);
+				}
 
-			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			// change the layout of the image
-			const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers1[] = {
-				{
-					.barrier = {
-						.dep = transferBarriers[0]
-						// no ownership transfers
-					},
-					.image = gpuImg.get(),
-					// transition the whole view
-					.subresourceRange = cpuImgParams.subresourceRange,
-					// a wiping transition
-					.newLayout = IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL
-				},
-				{
-					.image = tonemappedImg.get(),
-					.subresourceRange = cpuImgParams.subresourceRange,
-					.newLayout = IGPUImage::LAYOUT::GENERAL
+				cpuImg = IAsset::castDown<ICPUImage>(bundle.getContents()[0]);
+				if (!cpuImg) {
+					m_logger->log("Couldn't load an asset.", ILogger::ELL_ERROR);
+					std::exit(-1);
 				}
 			};
-			const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers2[] = {
+
+			gpuImg = convertImgCPU2GPU(cpuImg.get());
+		}
+
+		// create views for textures
+		{
+			// Creates a single-mip, single-layer 2D image of the given format and size (storage | transfer-dst | sampled usage) and allocates memory for it.
+			auto createHDRIImage = [this](const asset::E_FORMAT colorFormat, const uint32_t width, const uint32_t height) -> smart_refctd_ptr<IGPUImage> {
+				IGPUImage::SCreationParams imgInfo;
+				imgInfo.format = colorFormat;
+				imgInfo.type = IGPUImage::ET_2D;
+				imgInfo.extent.width = width;
+				imgInfo.extent.height = height;
+				imgInfo.extent.depth = 1u;
+				imgInfo.mipLevels = 1u;
+				imgInfo.arrayLayers = 1u;
+				imgInfo.samples = IGPUImage::ESCF_1_BIT;
+				imgInfo.flags = static_cast<asset::IImage::E_CREATE_FLAGS>(0u);
+				imgInfo.usage = asset::IImage::EUF_STORAGE_BIT | asset::IImage::EUF_TRANSFER_DST_BIT | asset::IImage::EUF_SAMPLED_BIT;
+				// NOTE(review): `image` is dereferenced below without a null check, and the result of `allocate` is discarded - confirm failures are acceptable to ignore here
+				auto image = m_device->createImage(std::move(imgInfo));
+				auto imageMemReqs = image->getMemoryReqs();
+				imageMemReqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); // restrict the allocation to device-local memory types
+				m_device->allocate(imageMemReqs, image.get());
+				return image;
+				};
+			auto createHDRIImageView = [this](smart_refctd_ptr<IGPUImage> img) -> smart_refctd_ptr<IGPUImageView>
 				{
-					.barrier = {
-						.dep = transferBarriers[1]
-						// no ownership transfers
-					},
-					.image = gpuImg.get(),
-					// transition the whole view
-					.subresourceRange = cpuImgParams.subresourceRange,
-					// a wiping transition
-					.oldLayout = IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL,
-					.newLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL
-				}
-			};
-			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers1 });
-			// upload contents
-			m_utils->updateImageViaStagingBuffer(
-				m_intendedSubmit,
-				cpuImgParams.image->getBuffer(),
-				cpuImgParams.image->getCreationParameters().format,
-				gpuImg.get(),
-				IGPUImage::LAYOUT::TRANSFER_DST_OPTIMAL,
-				cpuImgParams.image->getRegions()
-			);
-			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers2 });
-			m_utils->autoSubmit(m_intendedSubmit, [&](SIntendedSubmitInfo& nextSubmit) -> bool { return true; });
-
-			IGPUImageView::SCreationParams gpuImgViewParams = {
-				.image = gpuImg,
-				.viewType = IGPUImageView::ET_2D,
-				.format = gpuImg->getCreationParameters().format,
-			};
-			IGPUImageView::SCreationParams tonemappedImgViewParams = {
-				.image = tonemappedImg,
-				.viewType = IGPUImageView::ET_2D,
-				.format = tonemappedImg->getCreationParameters().format
-			};
+					auto format = img->getCreationParameters().format;
+					IGPUImageView::SCreationParams imgViewInfo;
+					imgViewInfo.image = std::move(img);
+					imgViewInfo.format = format;
+					imgViewInfo.viewType = IGPUImageView::ET_2D;
+					imgViewInfo.flags = static_cast<IGPUImageView::E_CREATE_FLAGS>(0u);
+					imgViewInfo.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+					imgViewInfo.subresourceRange.baseArrayLayer = 0u;
+					imgViewInfo.subresourceRange.baseMipLevel = 0u;
+					imgViewInfo.subresourceRange.layerCount = 1u;
+					imgViewInfo.subresourceRange.levelCount = 1u;
+
+					return m_device->createImageView(std::move(imgViewInfo));
+				};
 
-			m_gpuImgView = m_device->createImageView(std::move(gpuImgViewParams));
-			m_tonemappedImgView = m_device->createImageView(std::move(tonemappedImgViewParams));
+			auto params = gpuImg->getCreationParameters();
+			auto extent = params.extent;
+			gpuImg->setObjectDebugName("GPU Img");
+			m_gpuImgView = createHDRIImageView(gpuImg);
+			m_gpuImgView->setObjectDebugName("GPU Img View");
+			auto outImg = createHDRIImage(asset::E_FORMAT::EF_R16G16B16A16_SFLOAT, Dimensions.x, Dimensions.y);
+			outImg->setObjectDebugName("Tonemapped Image");
+			m_tonemappedImgView = createHDRIImageView(outImg);
+			m_tonemappedImgView->setObjectDebugName("Tonemapped Image View");
+		}
 
+		// Update Descriptors
+		{
 			IGPUDescriptorSet::SDescriptorInfo infos[3];
 			infos[0].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
 			infos[0].desc = m_gpuImgView;
 			infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
 			infos[1].desc = m_tonemappedImgView;
-			infos[2].info.image.imageLayout = IImage::LAYOUT::GENERAL;
+			infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
 			infos[2].desc = m_tonemappedImgView;
 
-
 			IGPUDescriptorSet::SWriteDescriptorSet writeDescriptors[] = {
 				{
-					.dstSet = m_ds[0].get(),
+					.dstSet = m_meterDS.get(),
 					.binding = 0,
 					.arrayElement = 0,
 					.count = 1,
 					.info = infos
 				},
 				{
-					.dstSet = m_ds[1].get(),
+					.dstSet = m_tonemapDS.get(),
 					.binding = 0,
 					.arrayElement = 0,
 					.count = 1,
 					.info = infos + 1
 				},
 				{
-					.dstSet = m_ds[2].get(),
+					.dstSet = m_presentDS.get(),
 					.binding = 0,
 					.arrayElement = 0,
 					.count = 1,
@@ -586,16 +596,19 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			};
 
 			m_device->updateDescriptorSets(3, writeDescriptors, 0, nullptr);
-
-			queue->endCapture();
 		}
 
+		m_winMgr->setWindowSize(m_window.get(), Dimensions.x, Dimensions.y);
+		m_surface->recreateSwapchain();
+		m_winMgr->show(m_window.get());
+
 		return true;
 	}
 
 	// We do a very simple thing, display an image and wait `DisplayImageMs` to show it
 	inline void workLoopBody() override
 	{
+#if 0
 		const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 
 		auto gpuImgExtent = m_gpuImgView->getCreationParameters().image->getCreationParameters().extent;
@@ -625,7 +638,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				1 + ((viewportSize.y / 2) - 1) / SubgroupSize
 			};
 
-			queue->startCapture();
+			m_api->startCapture();
 
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 			cmdbuf->bindComputePipeline(m_meterPipeline.get());
@@ -652,7 +665,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				submit_infos[0].signalSemaphores = signals;
 
 				queue->submit(submit_infos);
-				queue->endCapture();
+				m_api->endCapture();
 			}
 
 			const ISemaphore::SWaitInfo wait_infos[] = {
@@ -686,7 +699,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				1 + ((viewportSize.y) - 1) / SubgroupSize
 			};
 
-			queue->startCapture();
+			m_api->startCapture();
 
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 			cmdbuf->bindComputePipeline(m_gatherPipeline.get());
@@ -714,7 +727,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				submit_infos[0].signalSemaphores = signals;
 
 				queue->submit(submit_infos);
-				queue->endCapture();
+				m_api->endCapture();
 			}
 
 			const ISemaphore::SWaitInfo wait_infos[] = {
@@ -747,7 +760,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				.lumaMeterBDA = m_gatherBDA
 			};
 
-			queue->startCapture();
+			m_api->startCapture();
 
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 
@@ -815,7 +828,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			// Present
 			m_surface->present(acquire.imageIndex, rendered);
-			queue->endCapture();
+			m_api->endCapture();
 
 			// Wait for completion
 			{
@@ -830,6 +843,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			}
 		}
 
+#endif
+
 		m_submitIx++;
 	}
 
@@ -848,32 +863,29 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	}
 
 protected:
-	nbl::video::IDeviceMemoryAllocator::SAllocation m_gatherAllocation;
-	uint64_t m_gatherBDA;
-	smart_refctd_ptr<IGPUImageView> m_gpuImgView, m_tonemappedImgView;
-
-	// for image uploads
-	smart_refctd_ptr<ISemaphore> m_scratchSemaphore;
-	SIntendedSubmitInfo m_intendedSubmit;
+	// window
+	smart_refctd_ptr<IWindow> m_window;
+	smart_refctd_ptr<CSimpleResizeSurface<CDefaultSwapchainFramebuffers>> m_surface;
 
 	// Pipelines
-	smart_refctd_ptr<IGPUComputePipeline> m_meterPipeline, m_gatherPipeline;
+	smart_refctd_ptr<IGPUComputePipeline> m_meterPipeline, m_tonemapPipeline;
 	smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
 
 	// Descriptor Sets
-	std::array<smart_refctd_ptr<IGPUDescriptorSet>, 3> m_ds;
+	smart_refctd_ptr<IGPUDescriptorSet> m_meterDS, m_tonemapDS, m_presentDS;
 
 	// Command Buffers
-	smart_refctd_ptr<IGPUCommandPool> m_graphicsCmdPool, m_computeCmdPool;
-	std::array<smart_refctd_ptr<IGPUCommandBuffer>, 2> m_graphicsCmdBufs, m_computeCmdBufs;
+	smart_refctd_ptr<IGPUCommandPool> m_cmdPool;
+	std::array<smart_refctd_ptr<IGPUCommandBuffer>, 1> m_cmdBufs;
 
 	// Semaphores
-	smart_refctd_ptr<ISemaphore> m_meterSemaphore, m_gatherSemaphore, m_presentSemaphore;
+	smart_refctd_ptr<ISemaphore> m_meterSemaphore, m_tonemapSemaphore, m_presentSemaphore;
 	uint64_t m_submitIx = 0;
 
-	// window
-	smart_refctd_ptr<IWindow> m_window;
-	smart_refctd_ptr<CSimpleResizeSurface<CDefaultSwapchainFramebuffers>> m_surface;
+	// example resources
+	nbl::video::IDeviceMemoryAllocator::SAllocation m_gatherAllocation;
+	uint64_t m_gatherBDA;
+	smart_refctd_ptr<IGPUImageView> m_gpuImgView, m_tonemappedImgView;
 };
 
 NBL_MAIN_FUNC(AutoexposureApp)

From 3a94cd4abb3448594d7491bbd13f5c1c5fa5335b Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 15 Jan 2025 05:41:37 +0000
Subject: [PATCH 44/50] Rewrite descriptor set logic

---
 26_Autoexposure/main.cpp | 159 ++++++++++++++++++++-------------------
 1 file changed, 80 insertions(+), 79 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index f6d690a00..90300047d 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -203,11 +203,10 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			};
 			auto defaultSampler = make_smart_refctd_ptr<ICPUSampler>(samplerParams);
 
-			std::array<ICPUDescriptorSetLayout::SBinding, 1> meterBindings = {};
-			std::array<ICPUDescriptorSetLayout::SBinding, 1> tonemapBindings = {};
-			std::array<ICPUDescriptorSetLayout::SBinding, 1> presentBindings = {};
+			std::array<ICPUDescriptorSetLayout::SBinding, 1> imgSamplerbindings = {};
+			std::array<ICPUDescriptorSetLayout::SBinding, 1> rwImgbindings = {};
 
-			meterBindings[0] = {
+			imgSamplerbindings[0] = {
 				.binding = 0u,
 				.type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
 				.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
@@ -215,49 +214,36 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				.count = 1u,
 				.immutableSamplers = &defaultSampler
 			};
-			tonemapBindings[0] = {
+			rwImgbindings[0] = {
 				.binding = 0u,
 				.type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
 				.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-				.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+				.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE | IShader::E_SHADER_STAGE::ESS_FRAGMENT,
 				.count = 1u,
 				.immutableSamplers = nullptr
 			};
-			presentBindings[0] = {
-				.binding = 0u,
-				.type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
-				.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-				.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-				.count = 1u,
-				.immutableSamplers = &defaultSampler
-			};
 
-			auto cpuMeterLayout = make_smart_refctd_ptr<ICPUDescriptorSetLayout>(meterBindings);
-			auto cpuTonemapLayout = make_smart_refctd_ptr<ICPUDescriptorSetLayout>(tonemapBindings);
-			auto cpuPresentLayout = make_smart_refctd_ptr<ICPUDescriptorSetLayout>(presentBindings);
+			auto cpuImgSamplerLayout = make_smart_refctd_ptr<ICPUDescriptorSetLayout>(imgSamplerbindings);
+			auto cpuRWImgLayout = make_smart_refctd_ptr<ICPUDescriptorSetLayout>(rwImgbindings);
 
-			std::array<ICPUDescriptorSetLayout*, 3> cpuLayouts = {
-				cpuMeterLayout.get(),
-				cpuTonemapLayout.get(),
-				cpuPresentLayout.get()
+			std::array<ICPUDescriptorSetLayout*, 2> cpuLayouts = {
+				cpuImgSamplerLayout.get(),
+				cpuRWImgLayout.get()
 			};
 
 			auto gpuLayouts = convertDSLayoutCPU2GPU(cpuLayouts);
 
-			auto cpuMeterDS = make_smart_refctd_ptr<ICPUDescriptorSet>(std::move(cpuMeterLayout));
-			auto cpuTonemapDS = make_smart_refctd_ptr<ICPUDescriptorSet>(std::move(cpuTonemapLayout));
-			auto cpuPresentDS = make_smart_refctd_ptr<ICPUDescriptorSet>(std::move(cpuPresentLayout));
+			auto cpuImgSamplerDS = make_smart_refctd_ptr<ICPUDescriptorSet>(std::move(cpuImgSamplerLayout));
+			auto cpuRWImgDS = make_smart_refctd_ptr<ICPUDescriptorSet>(std::move(cpuRWImgLayout));
 
-			std::array<ICPUDescriptorSet*, 3> cpuDS = {
-				cpuMeterDS.get(),
-				cpuTonemapDS.get(),
-				cpuPresentDS.get()
+			std::array<ICPUDescriptorSet*, 2> cpuDS = {
+				cpuImgSamplerDS.get(),
+				cpuRWImgDS.get()
 			};
 
 			auto gpuDS = convertDSCPU2GPU(cpuDS);
-			m_meterDS = gpuDS[0];
-			m_tonemapDS = gpuDS[1];
-			m_presentDS = gpuDS[2];
+			m_imgSamplerDS = gpuDS[0];
+			m_rwImgDS = gpuDS[1];
 
 			// Create Shaders
 			auto loadAndCompileShader = [&](std::string pathToShader) {
@@ -299,31 +285,57 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				std::array<smart_refctd_ptr<IGPUShader>, 2> shaders;
 				std::array<smart_refctd_ptr<IGPUPipelineLayout>, 2> pipelineLayouts;
 				std::array<smart_refctd_ptr<IGPUComputePipeline>, 2> pipelines;
-				for (int index = 0; index < 2; index++) {
-					shaders[index] = loadAndCompileShader(ShaderPaths[index]);
+				{
+					shaders[0] = loadAndCompileShader(ShaderPaths[0]);
 					const nbl::asset::SPushConstantRange pcRange = {
 							.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
 							.offset = 0,
 							.size = sizeof(AutoexposurePushData)
 					};
-					pipelineLayouts[index] = m_device->createPipelineLayout(
+					pipelineLayouts[0] = m_device->createPipelineLayout(
 						{ &pcRange, 1 },
+						smart_refctd_ptr(gpuLayouts[0]),
 						nullptr,
 						nullptr,
-						smart_refctd_ptr(gpuLayouts[index]),
 						nullptr
 					);
-					if (!pipelineLayouts[index]) {
+					if (!pipelineLayouts[0]) {
+						return logFail("Failed to create pipeline layout");
+					}
+
+					params[0] = {};
+					params[0].layout = pipelineLayouts[0].get();
+					params[0].shader.shader = shaders[0].get();
+					params[0].shader.entryPoint = "main";
+					params[0].shader.entries = nullptr;
+					params[0].shader.requireFullSubgroups = true;
+					params[0].shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+				}
+				{
+					shaders[1] = loadAndCompileShader(ShaderPaths[1]);
+					const nbl::asset::SPushConstantRange pcRange = {
+							.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+							.offset = 0,
+							.size = sizeof(AutoexposurePushData)
+					};
+					pipelineLayouts[1] = m_device->createPipelineLayout(
+						{ &pcRange, 1 },
+						smart_refctd_ptr(gpuLayouts[0]),
+						nullptr,
+						nullptr,
+						smart_refctd_ptr(gpuLayouts[1])
+					);
+					if (!pipelineLayouts[1]) {
 						return logFail("Failed to create pipeline layout");
 					}
 
-					params[index] = {};
-					params[index].layout = pipelineLayouts[index].get();
-					params[index].shader.shader = shaders[index].get();
-					params[index].shader.entryPoint = "main";
-					params[index].shader.entries = nullptr;
-					params[index].shader.requireFullSubgroups = true;
-					params[index].shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+					params[1] = {};
+					params[1].layout = pipelineLayouts[1].get();
+					params[1].shader.shader = shaders[1].get();
+					params[1].shader.entryPoint = "main";
+					params[1].shader.entries = nullptr;
+					params[1].shader.requireFullSubgroups = true;
+					params[1].shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
 				}
 				
 				if (!m_device->createComputePipelines(nullptr, params, pipelines.data())) {
@@ -355,8 +367,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					{},
 					nullptr,
 					nullptr,
-					std::move(gpuLayouts[2]),
-					nullptr
+					nullptr,
+					std::move(gpuLayouts[1])
 				);
 				m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass());
 				if (!m_presentPipeline)
@@ -563,39 +575,30 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 		// Update Descriptors
 		{
-			IGPUDescriptorSet::SDescriptorInfo infos[3];
+			IGPUDescriptorSet::SDescriptorInfo infos[2];
 			infos[0].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
 			infos[0].desc = m_gpuImgView;
 			infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
 			infos[1].desc = m_tonemappedImgView;
-			infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-			infos[2].desc = m_tonemappedImgView;
 
 			IGPUDescriptorSet::SWriteDescriptorSet writeDescriptors[] = {
 				{
-					.dstSet = m_meterDS.get(),
+					.dstSet = m_imgSamplerDS.get(),
 					.binding = 0,
 					.arrayElement = 0,
 					.count = 1,
 					.info = infos
 				},
 				{
-					.dstSet = m_tonemapDS.get(),
+					.dstSet = m_rwImgDS.get(),
 					.binding = 0,
 					.arrayElement = 0,
 					.count = 1,
 					.info = infos + 1
-				},
-				{
-					.dstSet = m_presentDS.get(),
-					.binding = 0,
-					.arrayElement = 0,
-					.count = 1,
-					.info = infos + 2
 				}
 			};
 
-			m_device->updateDescriptorSets(3, writeDescriptors, 0, nullptr);
+			m_device->updateDescriptorSets(2, writeDescriptors, 0, nullptr);
 		}
 
 		m_winMgr->setWindowSize(m_window.get(), Dimensions.x, Dimensions.y);
@@ -608,9 +611,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	// We do a very simple thing, display an image and wait `DisplayImageMs` to show it
 	inline void workLoopBody() override
 	{
-#if 0
 		const uint32_t SubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
-
 		auto gpuImgExtent = m_gpuImgView->getCreationParameters().image->getCreationParameters().extent;
 		uint32_t2 viewportSize = { gpuImgExtent.width, gpuImgExtent.height };
 		float32_t sampleCount = (viewportSize.x * viewportSize.y) / 4;
@@ -619,10 +620,10 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 		// Luma Meter
 		{
-			auto queue = getComputeQueue();
-			auto cmdbuf = m_computeCmdBufs[0].get();
+			auto queue = getGraphicsQueue();
+			auto cmdbuf = m_cmdBufs[0].get();
 			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
-			auto ds = m_ds[0].get();
+			auto ds = m_imgSamplerDS.get();
 
 			auto pc = AutoexposurePushData
 			{
@@ -647,6 +648,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			cmdbuf->dispatch(dispatchSize.x, dispatchSize.y);
 			cmdbuf->end();
 
+			m_api->endCapture();
+
 			{
 				IQueue::SSubmitInfo submit_infos[1];
 				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
@@ -665,7 +668,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				submit_infos[0].signalSemaphores = signals;
 
 				queue->submit(submit_infos);
-				m_api->endCapture();
 			}
 
 			const ISemaphore::SWaitInfo wait_infos[] = {
@@ -677,13 +679,14 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			m_device->blockForSemaphores(wait_infos);
 		}
 
+#if 0
 		// Luma Gather and Tonemapping
 		{
-			auto queue = getComputeQueue();
-			auto cmdbuf = m_computeCmdBufs[1].get();
+			auto queue = getGraphicsQueue();
+			auto cmdbuf = m_cmdBufs[0].get();
 			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
-			auto ds1 = m_ds[0].get();
-			auto ds2 = m_ds[1].get();
+			auto ds1 = m_imgSamplerDS.get();
+			auto ds2 = m_rwImgDS.get();
 
 			auto pc = AutoexposurePushData
 			{
@@ -702,10 +705,10 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			m_api->startCapture();
 
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			cmdbuf->bindComputePipeline(m_gatherPipeline.get());
-			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_gatherPipeline->getLayout(), 0, 1, &ds1); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
-			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_gatherPipeline->getLayout(), 3, 1, &ds2);
-			cmdbuf->pushConstants(m_gatherPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
+			cmdbuf->bindComputePipeline(m_tonemapPipeline.get());
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_tonemapPipeline->getLayout(), 0, 1, &ds1); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_tonemapPipeline->getLayout(), 3, 1, &ds2);
+			cmdbuf->pushConstants(m_tonemapPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
 			cmdbuf->dispatch(dispatchSize.x, dispatchSize.y);
 			cmdbuf->end();
 
@@ -719,7 +722,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				submit_infos[0].commandBuffers = cmdBufs;
 				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
 					{
-						.semaphore = m_gatherSemaphore.get(),
+						.semaphore = m_tonemapSemaphore.get(),
 						.value = m_submitIx + 1,
 						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
 					}
@@ -732,7 +735,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			const ISemaphore::SWaitInfo wait_infos[] = {
 				{
-					.semaphore = m_gatherSemaphore.get(),
+					.semaphore = m_tonemapSemaphore.get(),
 					.value = m_submitIx + 1
 				}
 			};
@@ -747,9 +750,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				return;
 
 			auto queue = getGraphicsQueue();
-			auto cmdbuf = m_graphicsCmdBufs[0].get();
+			auto cmdbuf = m_cmdBufs[0].get();
 			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
-			auto ds = m_ds[2].get();
+			auto ds = m_rwImgDS.get();
 
 			auto pc = AutoexposurePushData
 			{
@@ -842,9 +845,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					return;
 			}
 		}
-
 #endif
-
 		m_submitIx++;
 	}
 
@@ -872,7 +873,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
 
 	// Descriptor Sets
-	smart_refctd_ptr<IGPUDescriptorSet> m_meterDS, m_tonemapDS, m_presentDS;
+	smart_refctd_ptr<IGPUDescriptorSet> m_imgSamplerDS, m_rwImgDS;
 
 	// Command Buffers
 	smart_refctd_ptr<IGPUCommandPool> m_cmdPool;

From 462e220b2af7237d75184e24e0837517fa8b467a Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 15 Jan 2025 05:41:58 +0000
Subject: [PATCH 45/50] Replace dot with mul

---
 26_Autoexposure/app_resources/luma_meter.comp.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index f936d8d37..b998f33ae 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -39,7 +39,7 @@ struct SharedAccessor
 struct TexAccessor
 {
     static float32_t3 toXYZ(float32_t3 srgbColor) {
-        return dot(colorspace::sRGBtoXYZ[1], srgbColor);
+        return mul(colorspace::sRGBtoXYZ, srgbColor);
     }
 
     float32_t3 get(float32_t2 uv) {

From 9e26a74aa1bcbe5e26ee14a79d4f2ef9e2701e0d Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 15 Jan 2025 05:42:22 +0000
Subject: [PATCH 46/50] Replace combined image sampler with RWTexture2D

---
 26_Autoexposure/app_resources/present.frag.hlsl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/26_Autoexposure/app_resources/present.frag.hlsl b/26_Autoexposure/app_resources/present.frag.hlsl
index b436e248f..aa8febf85 100644
--- a/26_Autoexposure/app_resources/present.frag.hlsl
+++ b/26_Autoexposure/app_resources/present.frag.hlsl
@@ -11,11 +11,9 @@
 using namespace nbl::hlsl;
 using namespace ext::FullScreenTriangle;
 
-// binding 0 set 1
-[[vk::combinedImageSampler]] [[vk::binding(0, 3)]] Texture2D texture;
-[[vk::combinedImageSampler]] [[vk::binding(0, 3)]] SamplerState samplerState;
+[[vk::binding(0, 3)]] RWTexture2D<float32_t4> texture;
 
 [[vk::location(0)]] float32_t4 main(SVertexAttributes vxAttr) : SV_Target0
 {
-    return float32_t4(texture.Sample(samplerState, vxAttr.uv).rgb, 1.0f);
+    return texture[vxAttr.uv];
 }
\ No newline at end of file

From 208a58a6fbd673fbe307ae12ec16929efb45fdcf Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 21 Jan 2025 21:33:52 +0100
Subject: [PATCH 47/50] use a single asset converter throughout, always call
 `convert` to make sure its asset conversion cache is written to

---
 26_Autoexposure/main.cpp | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 90300047d..4d436a188 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -140,20 +140,27 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				return logFail("Could not create Window & Surface or initialize the Surface!");
 		}
 
+		// One asset converter to make the cache persist
+		auto converter = CAssetConverter::create({ .device = m_device.get() });
+
 		// Create descriptors and pipelines
 		{
-			auto convertDSLayoutCPU2GPU = [&](std::span<ICPUDescriptorSetLayout *> cpuLayouts) {
-				auto converter = CAssetConverter::create({ .device = m_device.get() });
+			// need to hoist
+			CAssetConverter::SConvertParams params = {};
+			params.utilities = m_utils.get();
+
+			auto convertDSLayoutCPU2GPU = [&](std::span<ICPUDescriptorSetLayout *> cpuLayouts)
+			{
 				CAssetConverter::SInputs inputs = {};
 				inputs.readCache = converter.get();
 				inputs.logger = m_logger.get();
-				CAssetConverter::SConvertParams params = {};
-				params.utilities = m_utils.get();
 
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSetLayout>>(inputs.assets) = cpuLayouts;
 				// don't need to assert that we don't need to provide patches since layouts are not patchable
 				//assert(true);
 				auto reservation = converter->reserve(inputs);
+				// even when no asset refers (directly or indirectly) to memory or needs any device operation, so that `convert` has no work to do, it must still be called so that the converter's cache gets written
+				reservation.convert(params);
 				// the `.value` is just a funny way to make the `smart_refctd_ptr` copyable
 				auto gpuLayouts = reservation.getGPUObjects<ICPUDescriptorSetLayout>();
 				std::vector<smart_refctd_ptr<IGPUDescriptorSetLayout>> result;
@@ -170,18 +177,18 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 				return result;
 			};
-			auto convertDSCPU2GPU = [&](std::span<ICPUDescriptorSet *> cpuDS) {
-				auto converter = CAssetConverter::create({ .device = m_device.get() });
+			auto convertDSCPU2GPU = [&](std::span<ICPUDescriptorSet *> cpuDS)
+			{
 				CAssetConverter::SInputs inputs = {};
 				inputs.readCache = converter.get();
 				inputs.logger = m_logger.get();
-				CAssetConverter::SConvertParams params = {};
-				params.utilities = m_utils.get();
 
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSet>>(inputs.assets) = cpuDS;
 				// don't need to assert that we don't need to provide patches since layouts are not patchable
 				//assert(true);
 				auto reservation = converter->reserve(inputs);
+				// even when no asset refers (directly or indirectly) to memory or needs any device operation, so that `convert` has no work to do, it must still be called so that the converter's cache gets written
+				reservation.convert(params);
 				// the `.value` is just a funny way to make the `smart_refctd_ptr` copyable
 				auto gpuDS = reservation.getGPUObjects<ICPUDescriptorSet>();
 				std::vector<smart_refctd_ptr<IGPUDescriptorSet>> result;
@@ -421,7 +428,8 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 		// Load exr file into gpu
 		smart_refctd_ptr<IGPUImage> gpuImg;
 		{
-			auto convertImgCPU2GPU = [&](ICPUImage* cpuImg) {
+			auto convertImgCPU2GPU = [&](ICPUImage* cpuImg)
+			{
 				auto queue = getGraphicsQueue();
 				auto cmdbuf = m_cmdBufs[0].get();
 				cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
@@ -429,7 +437,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				core::smart_refctd_ptr<ISemaphore> imgFillSemaphore = m_device->createSemaphore(0);
 				imgFillSemaphore->setObjectDebugName("Image Fill Semaphore");
 
-				auto converter = CAssetConverter::create({ .device = m_device.get() });
 				// We don't want to generate mip-maps for these images, to ensure that we must override the default callbacks.
 				struct SInputs final : CAssetConverter::SInputs
 				{

From 10b669083fb45a7882c6328435ce4f270c7f70e5 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 23 Jan 2025 14:40:44 +0000
Subject: [PATCH 48/50] Transition m_tonemappedImgView to GENERAL

---
 26_Autoexposure/main.cpp | 68 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 4d436a188..893d892b7 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -580,6 +580,74 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			m_tonemappedImgView->setObjectDebugName("Tonemapped Image View");
 		}
 
+		// transition m_tonemappedImgView to GENERAL
+		{
+			auto transitionSemaphore = m_device->createSemaphore(0);
+			auto queue = getGraphicsQueue();
+			auto cmdbuf = m_cmdBufs[0].get();
+			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
+
+			m_api->startCapture();
+
+			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+			// Transition m_tonemappedImgView to GENERAL (the compute shader writes into the image, which is bound as a storage image)
+			{
+				const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers[] = {
+					{
+						.barrier = {
+							.dep = {
+								.srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
+								.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+							}
+						},
+						.image = m_tonemappedImgView->getCreationParameters().image.get(),
+						.subresourceRange = {
+							.aspectMask = IImage::EAF_COLOR_BIT,
+							.baseMipLevel = 0u,
+							.levelCount = 1u,
+							.baseArrayLayer = 0u,
+							.layerCount = 1u
+						},
+						.oldLayout = IImage::LAYOUT::UNDEFINED,
+						.newLayout = IImage::LAYOUT::GENERAL
+					}
+				};
+				cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers });
+			}
+			cmdbuf->end();
+
+			const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
+			{
+				{
+					.semaphore = transitionSemaphore.get(),
+					.value = 1,
+					.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
+				}
+			};
+			const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+			{
+				{.cmdbuf = cmdbuf }
+			};
+			const IQueue::SSubmitInfo infos[] =
+			{
+				{
+					.waitSemaphores = {},
+					.commandBuffers = commandBuffers,
+					.signalSemaphores = rendered
+				}
+			};
+			queue->submit(infos);
+			const ISemaphore::SWaitInfo waits[] = {
+				{
+					.semaphore = transitionSemaphore.get(),
+					.value = 1
+				}
+			};
+			m_device->blockForSemaphores(waits);
+			m_api->endCapture();
+		}
+
 		// Update Descriptors
 		{
 			IGPUDescriptorSet::SDescriptorInfo infos[2];

From 21995eae26036586e8fbb42cd166252332f8994e Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 23 Jan 2025 14:41:11 +0000
Subject: [PATCH 49/50] Keep direct track of m_gatherBuffer

---
 26_Autoexposure/main.cpp | 50 +++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/26_Autoexposure/main.cpp b/26_Autoexposure/main.cpp
index 893d892b7..224324e80 100644
--- a/26_Autoexposure/main.cpp
+++ b/26_Autoexposure/main.cpp
@@ -250,7 +250,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			auto gpuDS = convertDSCPU2GPU(cpuDS);
 			m_imgSamplerDS = gpuDS[0];
+			m_imgSamplerDS->setObjectDebugName("m_imgSamplerDS");
 			m_rwImgDS = gpuDS[1];
+			m_rwImgDS->setObjectDebugName("m_rwImgDS");
 
 			// Create Shaders
 			auto loadAndCompileShader = [&](std::string pathToShader) {
@@ -387,7 +389,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 		{
 			// Allocate memory
 			m_gatherAllocation = {};
-			smart_refctd_ptr<IGPUBuffer> buffer;
 			{
 				auto build_buffer = [this](
 					smart_refctd_ptr<ILogicalDevice> m_device,
@@ -416,9 +417,9 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					return true;
 				};
 
-				build_buffer(m_device, &m_gatherAllocation, buffer, m_physicalDevice->getLimits().maxSubgroupSize, "Luma Gather Buffer");
+				build_buffer(m_device, &m_gatherAllocation, m_gatherBuffer, m_physicalDevice->getLimits().maxSubgroupSize, "Luma Gather Buffer");
 			}
-			m_gatherBDA = buffer->getDeviceAddress();
+			m_gatherBDA = m_gatherBuffer->getDeviceAddress();
 
 			auto mapped_memory = m_gatherAllocation.memory->map({ 0ull, m_gatherAllocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ);
 			if (!mapped_memory)
@@ -551,23 +552,22 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				m_device->allocate(imageMemReqs, image.get());
 
 				return image;
-				};
-			auto createHDRIImageView = [this](smart_refctd_ptr<IGPUImage> img) -> smart_refctd_ptr<IGPUImageView>
-				{
-					auto format = img->getCreationParameters().format;
-					IGPUImageView::SCreationParams imgViewInfo;
-					imgViewInfo.image = std::move(img);
-					imgViewInfo.format = format;
-					imgViewInfo.viewType = IGPUImageView::ET_2D;
-					imgViewInfo.flags = static_cast<IGPUImageView::E_CREATE_FLAGS>(0u);
-					imgViewInfo.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
-					imgViewInfo.subresourceRange.baseArrayLayer = 0u;
-					imgViewInfo.subresourceRange.baseMipLevel = 0u;
-					imgViewInfo.subresourceRange.layerCount = 1u;
-					imgViewInfo.subresourceRange.levelCount = 1u;
-
-					return m_device->createImageView(std::move(imgViewInfo));
-				};
+			};
+			auto createHDRIImageView = [this](smart_refctd_ptr<IGPUImage> img) -> smart_refctd_ptr<IGPUImageView> {
+				auto format = img->getCreationParameters().format;
+				IGPUImageView::SCreationParams imgViewInfo;
+				imgViewInfo.image = std::move(img);
+				imgViewInfo.format = format;
+				imgViewInfo.viewType = IGPUImageView::ET_2D;
+				imgViewInfo.flags = static_cast<IGPUImageView::E_CREATE_FLAGS>(0u);
+				imgViewInfo.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+				imgViewInfo.subresourceRange.baseArrayLayer = 0u;
+				imgViewInfo.subresourceRange.baseMipLevel = 0u;
+				imgViewInfo.subresourceRange.layerCount = 1u;
+				imgViewInfo.subresourceRange.levelCount = 1u;
+
+				return m_device->createImageView(std::move(imgViewInfo));
+			};
 
 			auto params = gpuImg->getCreationParameters();
 			auto extent = params.extent;
@@ -651,7 +651,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 		// Update Descriptors
 		{
 			IGPUDescriptorSet::SDescriptorInfo infos[2];
-			infos[0].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+			infos[0].info.combinedImageSampler.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
 			infos[0].desc = m_gpuImgView;
 			infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
 			infos[1].desc = m_tonemappedImgView;
@@ -723,8 +723,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			cmdbuf->dispatch(dispatchSize.x, dispatchSize.y);
 			cmdbuf->end();
 
-			m_api->endCapture();
-
 			{
 				IQueue::SSubmitInfo submit_infos[1];
 				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
@@ -743,6 +741,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 				submit_infos[0].signalSemaphores = signals;
 
 				queue->submit(submit_infos);
+				m_api->endCapture();
 			}
 
 			const ISemaphore::SWaitInfo wait_infos[] = {
@@ -754,7 +753,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 			m_device->blockForSemaphores(wait_infos);
 		}
 
-#if 0
 		// Luma Gather and Tonemapping
 		{
 			auto queue = getGraphicsQueue();
@@ -781,7 +779,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 
 			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 			cmdbuf->bindComputePipeline(m_tonemapPipeline.get());
-			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_tonemapPipeline->getLayout(), 0, 1, &ds1); // also if you created DS Set with 3th index you need to respect it here - firstSet tells you the index of set and count tells you what range from this index it should update, useful if you had 2 DS with lets say set index 2,3, then you can bind both with single call setting firstSet to 2, count to 2 and last argument would be pointet to your DS pointers
+			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_tonemapPipeline->getLayout(), 0, 1, &ds1);
 			cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_tonemapPipeline->getLayout(), 3, 1, &ds2);
 			cmdbuf->pushConstants(m_tonemapPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
 			cmdbuf->dispatch(dispatchSize.x, dispatchSize.y);
@@ -920,7 +918,6 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 					return;
 			}
 		}
-#endif
 		m_submitIx++;
 	}
 
@@ -959,6 +956,7 @@ class AutoexposureApp final : public examples::SimpleWindowedApplication, public
 	uint64_t m_submitIx = 0;
 
 	// example resources
+	smart_refctd_ptr<IGPUBuffer> m_gatherBuffer;
 	nbl::video::IDeviceMemoryAllocator::SAllocation m_gatherAllocation;
 	uint64_t m_gatherBDA;
 	smart_refctd_ptr<IGPUImageView> m_gpuImgView, m_tonemappedImgView;

From 06dad8c118027d6ebc8ee04e19340ba643079a63 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 23 Jan 2025 20:50:00 +0100
Subject: [PATCH 50/50] yay another DXC bug that was an absolute joy to debug,
 why on earth would the SPIR-V legalization pass just decide to kill implicit
 lod texture sampling operations and just warn? GLSL makes them into explicit
 lod with implied lod 0.

---
 26_Autoexposure/app_resources/luma_meter.comp.hlsl   | 2 +-
 26_Autoexposure/app_resources/luma_tonemap.comp.hlsl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/26_Autoexposure/app_resources/luma_meter.comp.hlsl b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
index b998f33ae..b15a5665a 100644
--- a/26_Autoexposure/app_resources/luma_meter.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_meter.comp.hlsl
@@ -43,7 +43,7 @@ struct TexAccessor
     }
 
     float32_t3 get(float32_t2 uv) {
-        return texture.Sample(samplerState, uv).rgb;
+        return texture.SampleLevel(samplerState, uv, 0.f).rgb;
     }
 };
 
diff --git a/26_Autoexposure/app_resources/luma_tonemap.comp.hlsl b/26_Autoexposure/app_resources/luma_tonemap.comp.hlsl
index 7b14ee5be..d7c5114d7 100644
--- a/26_Autoexposure/app_resources/luma_tonemap.comp.hlsl
+++ b/26_Autoexposure/app_resources/luma_tonemap.comp.hlsl
@@ -48,7 +48,7 @@ struct TexAccessor
     }
 
     float32_t3 get(float32_t2 uv) {
-        return textureIn.Sample(samplerStateIn, uv).rgb;
+        return textureIn.SampleLevel(samplerStateIn, uv, 0.f).rgb;
     }
 };