From 88455c63f78a045bec194697881d5076a77120da Mon Sep 17 00:00:00 2001
From: Simon Kallweit <64953474+skallweitNV@users.noreply.github.com>
Date: Thu, 26 Sep 2024 12:57:39 +0200
Subject: [PATCH] wgpu work (#54)

* use wgpuQueueWriteBuffer for uploading initial buffer data

* request device with available limits/features

* add support for uploading initial texture data

* add missing queue releases

* support wgpu sampler comparison mode
---
 src/wgpu/wgpu-buffer.cpp  | 59 ++-------------------------------
 src/wgpu/wgpu-device.cpp  | 31 ++++++++++++++++--
 src/wgpu/wgpu-device.h    |  4 +++
 src/wgpu/wgpu-sampler.cpp |  5 ++-
 src/wgpu/wgpu-texture.cpp | 69 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 107 insertions(+), 61 deletions(-)

diff --git a/src/wgpu/wgpu-buffer.cpp b/src/wgpu/wgpu-buffer.cpp
index 7d2564cb..87b326bf 100644
--- a/src/wgpu/wgpu-buffer.cpp
+++ b/src/wgpu/wgpu-buffer.cpp
@@ -115,64 +115,9 @@ Result DeviceImpl::createBuffer(const BufferDesc& desc, const void* initData, IB
 
     if (initData)
     {
-        WGPUBufferDescriptor stagingBufferDesc = {};
-        stagingBufferDesc.size = desc.size;
-        stagingBufferDesc.usage = WGPUBufferUsage_CopySrc | WGPUBufferUsage_MapWrite;
-        WGPUBuffer stagingBuffer = m_ctx.api.wgpuDeviceCreateBuffer(m_ctx.device, &stagingBufferDesc);
-        if (!stagingBuffer)
-        {
-            return SLANG_FAIL;
-        }
-        SLANG_RHI_DEFERRED({ m_ctx.api.wgpuBufferRelease(stagingBuffer); });
-
-        // Map the staging buffer
-        // TODO: we should switch to the new async API
-        {
-            WGPUBufferMapAsyncStatus status = WGPUBufferMapAsyncStatus_Unknown;
-            m_ctx.api.wgpuBufferMapAsync(
-                stagingBuffer,
-                WGPUMapMode_Write,
-                0,
-                desc.size,
-                [](WGPUBufferMapAsyncStatus status, void* userdata) { *(WGPUBufferMapAsyncStatus*)userdata = status; },
-                &status
-            );
-            while (status == WGPUBufferMapAsyncStatus_Unknown)
-            {
-                m_ctx.api.wgpuDeviceTick(m_ctx.device);
-            }
-            if (status != WGPUBufferMapAsyncStatus_Success)
-            {
-                return SLANG_FAIL;
-            }
-        }
-
-        void* data = m_ctx.api.wgpuBufferGetMappedRange(stagingBuffer, 0, desc.size);
-        if (!data)
-        {
-            m_ctx.api.wgpuBufferUnmap(stagingBuffer);
-            return SLANG_FAIL;
-        }
-        ::memcpy(data, initData, desc.size);
-        m_ctx.api.wgpuBufferUnmap(stagingBuffer);
-
-        WGPUCommandEncoder encoder = m_ctx.api.wgpuDeviceCreateCommandEncoder(m_ctx.device, nullptr);
-        if (!encoder)
-        {
-            return SLANG_FAIL;
-        }
-        SLANG_RHI_DEFERRED({ m_ctx.api.wgpuCommandEncoderRelease(encoder); });
-
-        m_ctx.api.wgpuCommandEncoderCopyBufferToBuffer(encoder, stagingBuffer, 0, buffer->m_buffer, 0, desc.size);
-        WGPUCommandBuffer commandBuffer = m_ctx.api.wgpuCommandEncoderFinish(encoder, nullptr);
-        if (!commandBuffer)
-        {
-            return SLANG_FAIL;
-        }
-        SLANG_RHI_DEFERRED({ m_ctx.api.wgpuCommandBufferRelease(commandBuffer); });
-
         WGPUQueue queue = m_ctx.api.wgpuDeviceGetQueue(m_ctx.device);
-        m_ctx.api.wgpuQueueSubmit(queue, 1, &commandBuffer);
+        m_ctx.api.wgpuQueueWriteBuffer(queue, buffer->m_buffer, 0, initData, desc.size);
+        SLANG_RHI_DEFERRED({ m_ctx.api.wgpuQueueRelease(queue); });
 
         // Wait for the command buffer to finish executing
         // TODO: we should switch to the new async API
diff --git a/src/wgpu/wgpu-device.cpp b/src/wgpu/wgpu-device.cpp
index e761f250..61305238 100644
--- a/src/wgpu/wgpu-device.cpp
+++ b/src/wgpu/wgpu-device.cpp
@@ -91,6 +91,15 @@ Result DeviceImpl::initialize(const Desc& desc)
         return SLANG_FAIL;
     }
 
+    // Query adapter limits.
+    WGPUSupportedLimits adapterLimits = {};
+    api.wgpuAdapterGetLimits(m_ctx.adapter, &adapterLimits);
+
+    // Query adapter features.
+    size_t adapterFeatureCount = api.wgpuAdapterEnumerateFeatures(m_ctx.adapter, nullptr);
+    std::vector<WGPUFeatureName> adapterFeatures(adapterFeatureCount);
+    api.wgpuAdapterEnumerateFeatures(m_ctx.adapter, adapterFeatures.data());
+
     auto requestDeviceCallback =
         [](WGPURequestDeviceStatus status, WGPUDevice device, char const* message, void* userdata)
     {
@@ -101,7 +110,13 @@ Result DeviceImpl::initialize(const Desc& desc)
         }
     };
 
+    // We request a device with the maximum available limits and feature set.
+    WGPURequiredLimits requiredLimits = {};
+    requiredLimits.limits = adapterLimits.limits;
     WGPUDeviceDescriptor deviceDesc = {};
+    deviceDesc.requiredFeatures = adapterFeatures.data();
+    deviceDesc.requiredFeatureCount = adapterFeatures.size();
+    deviceDesc.requiredLimits = &requiredLimits;
     deviceDesc.uncapturedErrorCallbackInfo.callback = errorCallback;
     deviceDesc.uncapturedErrorCallbackInfo.userdata = this;
     api.wgpuAdapterRequestDevice(m_ctx.adapter, &deviceDesc, requestDeviceCallback, &m_ctx);
@@ -110,10 +125,18 @@ Result DeviceImpl::initialize(const Desc& desc)
         return SLANG_FAIL;
     }
 
-    WGPUSupportedLimits limits = {};
-    api.wgpuDeviceGetLimits(m_ctx.device, &limits);
+    // Query the limits the device was actually created with and cache them in the context.
+    WGPUSupportedLimits supportedLimits = {};
+    api.wgpuDeviceGetLimits(m_ctx.device, &supportedLimits);
+    m_ctx.limits = supportedLimits.limits;
+    // Dispatch size is bounded by the workgroup *count* limit, not the per-workgroup X size.
+    m_info.limits.maxComputeDispatchThreadGroups[0] = m_ctx.limits.maxComputeWorkgroupsPerDimension;
 
-    m_info.limits.maxComputeDispatchThreadGroups[0] = limits.limits.maxComputeWorkgroupSizeX;
+    // Query device features.
+    size_t deviceFeatureCount = api.wgpuDeviceEnumerateFeatures(m_ctx.device, nullptr);
+    std::vector<WGPUFeatureName> deviceFeatures(deviceFeatureCount);
+    api.wgpuDeviceEnumerateFeatures(m_ctx.device, deviceFeatures.data());
+    m_ctx.features.insert(deviceFeatures.begin(), deviceFeatures.end());
 
     return SLANG_OK;
 }
@@ -196,6 +219,7 @@ Result DeviceImpl::readTexture(
     SLANG_RHI_DEFERRED({ m_ctx.api.wgpuCommandBufferRelease(commandBuffer); });
 
     WGPUQueue queue = m_ctx.api.wgpuDeviceGetQueue(m_ctx.device);
+    SLANG_RHI_DEFERRED({ m_ctx.api.wgpuQueueRelease(queue); });
     m_ctx.api.wgpuQueueSubmit(queue, 1, &commandBuffer);
 
     // Wait for the command buffer to finish executing
@@ -283,6 +307,7 @@ Result DeviceImpl::readBuffer(IBuffer* buffer, Offset offset, Size size, ISlangB
     SLANG_RHI_DEFERRED({ m_ctx.api.wgpuCommandBufferRelease(commandBuffer); });
 
     WGPUQueue queue = m_ctx.api.wgpuDeviceGetQueue(m_ctx.device);
+    SLANG_RHI_DEFERRED({ m_ctx.api.wgpuQueueRelease(queue); });
     m_ctx.api.wgpuQueueSubmit(queue, 1, &commandBuffer);
 
     // Wait for the command buffer to finish executing
diff --git a/src/wgpu/wgpu-device.h b/src/wgpu/wgpu-device.h
index 7a878aa2..f4c67400 100644
--- a/src/wgpu/wgpu-device.h
+++ b/src/wgpu/wgpu-device.h
@@ -2,6 +2,8 @@
 
 #include "wgpu-base.h"
 
+#include <unordered_set>
+
 namespace rhi::wgpu {
 
 struct Context
@@ -10,6 +12,8 @@ struct Context
     WGPUInstance instance = nullptr;
     WGPUAdapter adapter = nullptr;
     WGPUDevice device = nullptr;
+    WGPULimits limits = {};
+    std::unordered_set<WGPUFeatureName> features;
 
     ~Context();
 };
diff --git a/src/wgpu/wgpu-sampler.cpp b/src/wgpu/wgpu-sampler.cpp
index 49ed1965..a443fcc5 100644
--- a/src/wgpu/wgpu-sampler.cpp
+++ b/src/wgpu/wgpu-sampler.cpp
@@ -37,7 +37,10 @@ Result DeviceImpl::createSampler(SamplerDesc const& desc, ISampler** outSampler)
     samplerDesc.mipmapFilter = translateMipmapFilterMode(desc.mipFilter);
     samplerDesc.lodMinClamp = desc.minLOD;
     samplerDesc.lodMaxClamp = desc.maxLOD;
-    samplerDesc.compare = translateCompareFunction(desc.comparisonFunc);
+    if (desc.reductionOp == TextureReductionOp::Comparison)
+    {
+        samplerDesc.compare = translateCompareFunction(desc.comparisonFunc);
+    }
     samplerDesc.maxAnisotropy = desc.maxAnisotropy;
     samplerDesc.label = desc.label;
     sampler->m_sampler = m_ctx.api.wgpuDeviceCreateSampler(m_ctx.device, &samplerDesc);
diff --git a/src/wgpu/wgpu-texture.cpp b/src/wgpu/wgpu-texture.cpp
index 621732da..66d5104e 100644
--- a/src/wgpu/wgpu-texture.cpp
+++ b/src/wgpu/wgpu-texture.cpp
@@ -2,6 +2,8 @@
 #include "wgpu-device.h"
 #include "wgpu-util.h"
 
+#include "core/deferred.h"
+
 namespace rhi::wgpu {
 
 TextureImpl::TextureImpl(DeviceImpl* device, const TextureDesc& desc)
@@ -41,6 +43,10 @@ Result DeviceImpl::createTexture(const TextureDesc& desc_, const SubresourceData
     textureDesc.size.height = desc.size.height;
     textureDesc.size.depthOrArrayLayers = desc.size.depth;
     textureDesc.usage = translateTextureUsage(desc.usage);
+    if (initData)
+    {
+        textureDesc.usage |= WGPUTextureUsage_CopyDst;
+    }
     textureDesc.dimension = translateTextureDimension(desc.type);
     textureDesc.format = translateTextureFormat(desc.format);
     textureDesc.mipLevelCount = desc.numMipLevels;
@@ -51,6 +57,69 @@ Result DeviceImpl::createTexture(const TextureDesc& desc_, const SubresourceData
     {
         return SLANG_FAIL;
     }
+
+    if (initData)
+    {
+        FormatInfo formatInfo;
+        rhiGetFormatInfo(desc.format, &formatInfo);
+
+        WGPUQueue queue = m_ctx.api.wgpuDeviceGetQueue(m_ctx.device);
+        SLANG_RHI_DEFERRED({ m_ctx.api.wgpuQueueRelease(queue); });
+        int mipLevelCount = desc.numMipLevels;
+        int arrayLayerCount = desc.arrayLength * (desc.type == TextureType::TextureCube ? 6 : 1);
+        // initData is indexed layer-major: every mip of layer 0, then every mip of layer 1, ...
+        for (int arrayLayer = 0; arrayLayer < arrayLayerCount; ++arrayLayer)
+        {
+            for (int mipLevel = 0; mipLevel < mipLevelCount; ++mipLevel)
+            {
+                Extents mipSize = calcMipSize(desc.size, mipLevel);
+                int subresourceIndex = arrayLayer * mipLevelCount + mipLevel;
+                const SubresourceData& data = initData[subresourceIndex];
+                // Destination subresource; origin.z selects the array layer being written.
+                WGPUImageCopyTexture imageCopyTexture = {};
+                imageCopyTexture.texture = texture->m_texture;
+                imageCopyTexture.mipLevel = mipLevel;
+                imageCopyTexture.origin = {0, 0, static_cast<uint32_t>(arrayLayer)};
+                imageCopyTexture.aspect = WGPUTextureAspect_All;
+                // Round the written extent up to a whole number of format blocks.
+                WGPUExtent3D writeSize = {};
+                writeSize.width =
+                    ((mipSize.width + formatInfo.blockWidth - 1) / formatInfo.blockWidth) * formatInfo.blockWidth;
+                writeSize.height =
+                    ((mipSize.height + formatInfo.blockHeight - 1) / formatInfo.blockHeight) * formatInfo.blockHeight;
+                writeSize.depthOrArrayLayers = mipSize.depth;
+                // Source rows are strideY bytes apart; rowsPerImage is counted in block rows.
+                WGPUTextureDataLayout dataLayout = {};
+                dataLayout.offset = 0;
+                dataLayout.bytesPerRow = data.strideY;
+                dataLayout.rowsPerImage = writeSize.height / formatInfo.blockHeight;
+                // Widen to size_t before multiplying so the byte count cannot wrap at 32 bits.
+                size_t dataSize = size_t(dataLayout.bytesPerRow) * dataLayout.rowsPerImage * mipSize.depth;
+
+                m_ctx.api.wgpuQueueWriteTexture(queue, &imageCopyTexture, data.data, dataSize, &dataLayout, &writeSize);
+            }
+        }
+
+        // Block until the queue reports all submitted work done (busy-polls the device).
+        // TODO: we should switch to the new async API
+        {
+            WGPUQueueWorkDoneStatus status = WGPUQueueWorkDoneStatus_Unknown;
+            m_ctx.api.wgpuQueueOnSubmittedWorkDone(
+                queue,
+                [](WGPUQueueWorkDoneStatus status, void* userdata) { *(WGPUQueueWorkDoneStatus*)userdata = status; },
+                &status
+            );
+            while (status == WGPUQueueWorkDoneStatus_Unknown)
+            {
+                m_ctx.api.wgpuDeviceTick(m_ctx.device);
+            }
+            if (status != WGPUQueueWorkDoneStatus_Success)
+            {
+                return SLANG_FAIL;
+            }
+        }
+    }
+
     returnComPtr(outTexture, texture);
     return SLANG_OK;
 }