From 5d76b165b0fee2f397478999145ebba1f07daf09 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sat, 6 Jan 2024 13:15:12 -0800
Subject: [PATCH 01/27] WIP: First-pass implementation of Vulkan profiling.
 Currently assumes Vulkan 1.2 and/or a handful of extensions are available.

---
 lib/Remotery.c | 725 ++++++++++++++++++++++++++++++++++++++++++++++++-
 lib/Remotery.h |  72 ++++-
 2 files changed, 789 insertions(+), 8 deletions(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index d50223c9..2158dfda 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -51,6 +51,7 @@
     @D3D12:         Direct3D 12 event sampling
     @OPENGL:        OpenGL event sampling
     @METAL:         Metal event sampling
+    @VULKAN:        Vulkan event sampling
     @SAMPLEAPI:     Sample API for user callbacks
     @PROPERTYAPI:   Property API for user callbacks
     @PROPERTIES:    Property API
@@ -392,6 +393,14 @@ static void usTimer_Init(usTimer* timer)
 #endif
 }
 
+#if defined(RMT_PLATFORM_WINDOWS)   // ticks: raw QueryPerformanceCounter value, scaled to us relative to counter_start
+    #define usTimer_FromRawTicks(timer, ticks) ((rmtU64)(((ticks) - (timer)->counter_start.QuadPart) * (timer)->counter_scale))
+#elif defined(RMT_PLATFORM_MACOS)   // ticks: raw mach_absolute_time() value, scaled to us relative to counter_start
+    #define usTimer_FromRawTicks(timer, ticks) ((rmtU64)(((ticks) - (timer)->counter_start) * (timer)->counter_scale))
+#elif defined(RMT_PLATFORM_LINUX)   // ticks: already in microseconds, only rebased against counter_start
+    #define usTimer_FromRawTicks(timer, ticks) ((rmtU64)((ticks) - (timer)->counter_start))
+#endif
+
 static rmtU64 usTimer_Get(usTimer* timer)
 {
 #if defined(RMT_PLATFORM_WINDOWS)
@@ -401,18 +410,19 @@ static rmtU64 usTimer_Get(usTimer* timer)
 
     // Read counter and convert to microseconds
     QueryPerformanceCounter(&performance_count);
-    return (rmtU64)((performance_count.QuadPart - timer->counter_start.QuadPart) * timer->counter_scale);
+    return usTimer_FromRawTicks(timer, performance_count.QuadPart);
 
 #elif defined(RMT_PLATFORM_MACOS)
 
     rmtU64 curr_time = mach_absolute_time();
-    return (rmtU64)((curr_time - timer->counter_start) * timer->counter_scale);
+    return usTimer_FromRawTicks(timer, curr_time);
 
 #elif defined(RMT_PLATFORM_LINUX)
 
     struct timespec tv;
     clock_gettime(CLOCK_REALTIME, &tv);
-    return ((rmtU64)(tv.tv_sec * (rmtU64)1000000) + (rmtU64)(tv.tv_nsec * 0.001)) - timer->counter_start;
+    rmtU64 ticks = (rmtU64)(tv.tv_sec * (rmtU64)1000000) + (rmtU64)(tv.tv_nsec * 0.001);
+    return usTimer_FromRawTicks(timer, ticks);
 
 #endif
 }
@@ -5162,6 +5172,12 @@ static rmtError D3D12ThreadData_Create(D3D12ThreadData** d3d12);
 static void D3D12ThreadData_Destructor(D3D12ThreadData* d3d12);
 #endif
 
+#if RMT_USE_VULKAN
+typedef struct VulkanThreadData VulkanThreadData;
+static rmtError VulkanThreadData_Create(VulkanThreadData** vulkan);
+static void VulkanThreadData_Destructor(VulkanThreadData* vulkan);
+#endif
+
 typedef struct ThreadProfiler
 {
     // Storage for backing up initial register values when modifying a thread's context
@@ -5194,6 +5210,10 @@ typedef struct ThreadProfiler
 #if RMT_USE_D3D12
     D3D12ThreadData* d3d12ThreadData;
 #endif
+
+#if RMT_USE_VULKAN
+    VulkanThreadData* vulkanThreadData;
+#endif
 } ThreadProfiler;
 
 static rmtError ThreadProfiler_Constructor(rmtMessageQueue* mq_to_rmt, ThreadProfiler* thread_profiler, rmtThreadId thread_id)
@@ -5215,6 +5235,10 @@ static rmtError ThreadProfiler_Constructor(rmtMessageQueue* mq_to_rmt, ThreadPro
     thread_profiler->d3d12ThreadData = NULL;
 #endif
 
+#if RMT_USE_VULKAN
+    thread_profiler->vulkanThreadData = NULL;
+#endif
+
     // Pre-open the thread handle
     rmtTry(rmtOpenThreadHandle(thread_id, &thread_profiler->threadHandle));
 
@@ -5237,6 +5261,10 @@ static rmtError ThreadProfiler_Constructor(rmtMessageQueue* mq_to_rmt, ThreadPro
     rmtTry(D3D12ThreadData_Create(&thread_profiler->d3d12ThreadData));
 #endif
 
+#if RMT_USE_VULKAN
+    rmtTry(VulkanThreadData_Create(&thread_profiler->vulkanThreadData));
+#endif
+
     return RMT_ERROR_NONE;
 }
 
@@ -5244,6 +5272,10 @@ static void ThreadProfiler_Destructor(ThreadProfiler* thread_profiler)
 {
     rmtU32 index;
 
+#if RMT_USE_VULKAN
+    rmtDelete(VulkanThreadData, thread_profiler->vulkanThreadData);
+#endif
+
 #if RMT_USE_D3D12
     rmtDelete(D3D12ThreadData, thread_profiler->d3d12ThreadData);
 #endif
@@ -6222,6 +6254,12 @@ struct Remotery
     struct D3D12BindImpl* d3d12Binds;
 #endif
 
+#if RMT_USE_VULKAN
+    // Linked list of all Vulkan queue samplers
+    rmtMutex vulkanBindsMutex;
+    struct VulkanBindImpl* vulkanBinds;
+#endif
+
     ThreadProfilers* threadProfilers;
 
     // Root of all registered properties, guarded by mutex as property register can come from any thread
@@ -6410,6 +6448,10 @@ static rmtError bin_SampleTree(Buffer* buffer, Msg_SampleTree* msg)
     {
         strncat_s(thread_name, sizeof(thread_name), " (Metal)", 8);
     }
+    if (root_sample->type == RMT_SampleType_Vulkan)
+    {
+        strncat_s(thread_name, sizeof(thread_name), " (Vulkan)", 9);
+    }
 
     // Get digest hash of samples so that viewer can efficiently rebuild its tables
     PostProcessSamples(root_sample, &nb_samples);
@@ -6961,6 +7003,11 @@ static rmtError Remotery_Constructor(Remotery* rmt)
     rmt->d3d12Binds = NULL;
 #endif
 
+#if RMT_USE_VULKAN
+    mtxInit(&rmt->vulkanBindsMutex);
+    rmt->vulkanBinds = NULL;
+#endif
+
     // Kick-off the timer
     usTimer_Init(&rmt->timer);
 
@@ -7059,6 +7106,13 @@ static void Remotery_Destructor(Remotery* rmt)
 
     rmtDelete(ObjectAllocator, rmt->propertyAllocator);
 
+#if RMT_USE_VULKAN
+    while (rmt->vulkanBinds != NULL)
+    {
+        _rmt_UnbindVulkan((rmtVulkanBind*)rmt->vulkanBinds);
+    }
+    mtxDelete(&rmt->vulkanBindsMutex);
+#endif
 
 #if RMT_USE_D3D12
     while (rmt->d3d12Binds != NULL)
@@ -7446,6 +7500,10 @@ RMT_API void _rmt_EndCPUSample(void)
 static rmtError D3D12MarkFrame(struct D3D12BindImpl* bind);
 #endif
 
+#if RMT_USE_VULKAN
+static rmtError VulkanMarkFrame(struct VulkanBindImpl* bind);
+#endif
+
 RMT_API rmtError _rmt_MarkFrame(void)
 {
     if (g_Remotery == NULL)
@@ -7458,10 +7516,15 @@ RMT_API rmtError _rmt_MarkFrame(void)
         rmtTry(D3D12MarkFrame(g_Remotery->d3d12Binds));
     #endif
 
+    #if RMT_USE_VULKAN
+        // This will kick off mark frames on the complete chain of binds
+        rmtTry(VulkanMarkFrame(g_Remotery->vulkanBinds));
+    #endif
+
     return RMT_ERROR_NONE;
 }
 
-#if RMT_USE_OPENGL || RMT_USE_D3D11 || RMT_USE_D3D12
+#if RMT_USE_OPENGL || RMT_USE_D3D11 || RMT_USE_D3D12 || RMT_USE_VULKAN
 static void Remotery_DeleteSampleTree(Remotery* rmt, enum rmtSampleType sample_type)
 {
     ThreadProfiler* thread_profiler;
@@ -8738,7 +8801,7 @@ static rmtError D3D12MarkFrame(D3D12BindImpl* bind)
         gpu_timestamp_us = (rmtU64)(gpu_timestamp_ticks * gpu_ticks_to_us);
 
         // Convert CPU ticks to microseconds, offset from the global timer start
-        cpu_timestamp_us = (rmtU64)((cpu_timestamp_ticks - g_Remotery->timer.counter_start.QuadPart) * g_Remotery->timer.counter_scale);
+        cpu_timestamp_us = usTimer_FromRawTicks(&g_Remotery->timer, cpu_timestamp_ticks);
 
         // And we now have the offset from GPU microseconds to CPU microseconds
         gpu_to_cpu_timestamp_us = cpu_timestamp_us - gpu_timestamp_us;
@@ -9886,6 +9949,658 @@ RMT_API void _rmt_EndMetalSample(void)
 
 #endif // RMT_USE_METAL
 
+/*
+------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------------------------------------------------------------------
+   @VULKAN: Vulkan event sampling
+------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------------------------------------------------------------------
+*/
+
+#if RMT_USE_VULKAN
+
+#include <vulkan/vulkan.h>
+
+typedef struct VulkanThreadData
+{
+    rmtU32 lastAllocatedQueryIndex; // Query index of the last sample begun on this thread (set in _rmt_BeginVulkanSample)
+
+    // Sample trees in transit in the message queue for release on shutdown
+    Buffer* flushSamples;
+} VulkanThreadData;
+
+static rmtError VulkanThreadData_Create(VulkanThreadData** vulkan_thread_data)
+{
+    assert(vulkan_thread_data != NULL);
+
+    // Allocate the per-thread Vulkan data and hand it back through the out parameter
+    rmtTryMalloc(VulkanThreadData, *vulkan_thread_data);
+
+    // Set defaults; flushSamples stays NULL until the buffer below has been created
+    (*vulkan_thread_data)->lastAllocatedQueryIndex = 0;
+    (*vulkan_thread_data)->flushSamples = NULL;
+
+    rmtTryNew(Buffer, (*vulkan_thread_data)->flushSamples, 8 * 1024); // NOTE(review): 8 * 1024 is presumably the initial size in bytes
+
+    return RMT_ERROR_NONE;
+}
+
+static void VulkanThreadData_Destructor(VulkanThreadData* vulkan_thread_data)
+{
+    assert(vulkan_thread_data != NULL);
+    rmtDelete(Buffer, vulkan_thread_data->flushSamples); // Counterpart of the rmtTryNew in VulkanThreadData_Create
+}
+
+typedef struct VulkanSample
+{
+    // IS-A inheritance relationship
+    Sample base;
+
+    // Bind and command buffer cached by _rmt_BeginVulkanSample so that _rmt_EndVulkanSample needs no arguments
+    struct VulkanBindImpl* bind;
+    VkCommandBuffer commandBuffer;
+
+    // Unmasked index of the begin timestamp query; the end timestamp is written to the slot that follows it
+    rmtU32 queryIndex;
+
+} VulkanSample;
+
+static rmtError VulkanSample_Constructor(VulkanSample* sample)
+{
+    assert(sample != NULL);
+
+    // Chain to the base sample constructor, then tag the sample as Vulkan and clear the Vulkan-specific fields
+    Sample_Constructor((Sample*)sample);
+    sample->base.type = RMT_SampleType_Vulkan;
+    sample->bind = NULL;
+    sample->commandBuffer = NULL;
+    sample->queryIndex = 0;
+
+    return RMT_ERROR_NONE;
+}
+
+static void VulkanSample_Destructor(VulkanSample* sample)
+{
+    Sample_Destructor((Sample*)sample); // Nothing Vulkan-specific is released here; only chains to the base destructor
+}
+
+typedef struct VulkanBindImpl
+{
+    rmtVulkanBind base; // Public part; kept first because binds are cast between rmtVulkanBind* and VulkanBindImpl*
+
+    // Ring buffer of GPU timestamp destinations for all queries (indices are masked with maxNbQueries - 1)
+    rmtU32 maxNbQueries;
+    VkQueryPool gpuTimestampRingBuffer;
+
+    // CPU-accessible copy destination for all timestamps
+    rmtU64* cpuTimestampRingBuffer;
+
+    // Pointers to samples that expect the result of timestamps, one entry per begin/end query pair
+    VulkanSample** sampleRingBuffer;
+
+    // Read/write positions of the ring buffer allocator, synchronising access to all the ring buffers at once
+    // TODO(don): Separate by cache line?
+    rmtAtomicU32 ringBufferRead;
+    rmtAtomicU32 ringBufferWrite;
+    // Semaphore signalled from the queue in VulkanMarkFrame with the CPU write position
+    VkSemaphore gpuQuerySemaphore;
+
+    // Convert gpu ticks to us, retrieved from physical device properties
+    double gpu_ticks_to_us;
+
+    // Function pointers to Vulkan functions
+    PFN_vkQueueSubmit vkQueueSubmit;
+    PFN_vkGetPhysicalDeviceProperties vkGetPhysicalDeviceProperties;
+    PFN_vkCreateQueryPool vkCreateQueryPool;
+    PFN_vkDestroyQueryPool vkDestroyQueryPool;
+    PFN_vkResetQueryPool vkResetQueryPool; // VK_EXT_host_query_reset or VK_VERSION_1_2
+    PFN_vkGetQueryPoolResults vkGetQueryPoolResults;
+    PFN_vkCmdWriteTimestamp vkCmdWriteTimestamp;
+    PFN_vkCreateSemaphore vkCreateSemaphore; // Creating a timeline semaphore, so VK_KHR_timeline_semaphore or VK_VERSION_1_2
+    PFN_vkDestroySemaphore vkDestroySemaphore;
+    PFN_vkSignalSemaphore vkSignalSemaphore; // VK_KHR_timeline_semaphore or VK_VERSION_1_2
+    PFN_vkGetSemaphoreCounterValue vkGetSemaphoreCounterValue; // VK_KHR_timeline_semaphore or VK_VERSION_1_2
+    PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT; // VK_EXT_calibrated_timestamps or VK_KHR_calibrated_timestamps
+
+    // Queue to the Vulkan main update thread
+    rmtMessageQueue* mqToVulkanUpdate;
+
+    struct VulkanBindImpl* next;
+
+} VulkanBindImpl;
+
+static rmtError LoadVulkanFunctions(VulkanBindImpl* bind, VkInstance vulkan_instance, VkDevice vulkan_device)
+{
+    // Fills in the bind's Vulkan function table, returning RMT_ERROR_RESOURCE_ACCESS_FAIL if any entry point is missing.
+    // Lookups go through the loader entry points declared in vulkan.h, so the application must link the Vulkan loader.
+#define VK_DEVICE_FN(fn)                                                         \
+    bind->fn = (PFN_ ## fn)vkGetDeviceProcAddr(vulkan_device, #fn);              \
+    if (bind->fn == NULL)                                                        \
+        return RMT_ERROR_RESOURCE_ACCESS_FAIL;
+#define VK_DEVICE_FN_FALLBACK(fn, fn_fallback)                                   \
+    bind->fn = (PFN_ ## fn)vkGetDeviceProcAddr(vulkan_device, #fn);              \
+    if (bind->fn == NULL)                                                        \
+    {                                                                            \
+        bind->fn = (PFN_ ## fn)vkGetDeviceProcAddr(vulkan_device, #fn_fallback); \
+        if (bind->fn == NULL)                                                    \
+            return RMT_ERROR_RESOURCE_ACCESS_FAIL;                               \
+    }
+
+    // vkGetPhysicalDeviceProperties is an instance-level command, so it has to be resolved through the instance
+    bind->vkGetPhysicalDeviceProperties = (PFN_vkGetPhysicalDeviceProperties)vkGetInstanceProcAddr(vulkan_instance, "vkGetPhysicalDeviceProperties");
+    if (bind->vkGetPhysicalDeviceProperties == NULL)
+        return RMT_ERROR_RESOURCE_ACCESS_FAIL;
+    VK_DEVICE_FN(vkQueueSubmit);
+    VK_DEVICE_FN(vkCreateQueryPool);
+    VK_DEVICE_FN(vkDestroyQueryPool);
+    VK_DEVICE_FN_FALLBACK(vkResetQueryPool, vkResetQueryPoolEXT);
+    VK_DEVICE_FN(vkGetQueryPoolResults);
+    VK_DEVICE_FN(vkCmdWriteTimestamp);
+    VK_DEVICE_FN(vkCreateSemaphore);
+    VK_DEVICE_FN(vkDestroySemaphore);
+    VK_DEVICE_FN_FALLBACK(vkSignalSemaphore, vkSignalSemaphoreKHR);
+    VK_DEVICE_FN_FALLBACK(vkGetSemaphoreCounterValue, vkGetSemaphoreCounterValueKHR);
+    VK_DEVICE_FN(vkGetCalibratedTimestampsEXT); // TODO(valakor): Support vkGetCalibratedTimestampsKHR
+#undef VK_DEVICE_FN
+#undef VK_DEVICE_FN_FALLBACK
+
+    return RMT_ERROR_NONE;
+}
+
+static rmtError CreateQueryPool(VulkanBindImpl* bind, VkDevice vulkan_device, rmtU32 nb_queries)
+{
+    VkQueryPoolCreateInfo create_info;
+    ZeroMemory(&create_info, sizeof(create_info));
+    create_info.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
+    create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
+    create_info.queryCount = nb_queries;
+    // Create the pool of nb_queries timestamp queries that serves as the GPU-side ring buffer
+    if (bind->vkCreateQueryPool(vulkan_device, &create_info, NULL, &bind->gpuTimestampRingBuffer) != VK_SUCCESS)
+    {
+        return rmtMakeError(RMT_ERROR_RESOURCE_CREATE_FAIL, "Failed to create Vulkan Query Pool");
+    }
+    // Reset every query from the host before first use (VK_EXT_host_query_reset or Vulkan 1.2 entry point)
+    bind->vkResetQueryPool(vulkan_device, bind->gpuTimestampRingBuffer, 0, nb_queries);
+
+    return RMT_ERROR_NONE;
+}
+
+static rmtError CreateQuerySemaphore(VulkanBindImpl* bind, VkDevice vulkan_device)
+{
+    // Creates bind->gpuQuerySemaphore as a timeline semaphore starting at 0; VulkanMarkFrame signals and polls its value
+    VkSemaphoreTypeCreateInfoKHR type_info;
+    ZeroMemory(&type_info, sizeof(type_info));
+    type_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR;
+    type_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR;
+    type_info.initialValue = 0;
+    VkSemaphoreCreateInfo create_info;
+    ZeroMemory(&create_info, sizeof(create_info));
+    create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+    create_info.pNext = &type_info; // Without this chain the type info is ignored and a binary semaphore is created
+
+    if (bind->vkCreateSemaphore(vulkan_device, &create_info, NULL, &bind->gpuQuerySemaphore) != VK_SUCCESS)
+    {
+        return rmtMakeError(RMT_ERROR_RESOURCE_CREATE_FAIL, "Failed to create Vulkan Query Semaphore");
+    }
+    return RMT_ERROR_NONE;
+}
+
+static rmtError CopyTimestamps(VulkanBindImpl* bind, VkDevice vulkan_device, rmtU32 ring_pos_a, rmtU32 ring_pos_b, double gpu_ticks_to_us, rmtS64 gpu_to_cpu_timestamp_us)
+{
+    rmtU32 query_index;
+    VulkanSample** cpu_sample_buffer = bind->sampleRingBuffer;
+    rmtU64* cpu_timestamps = bind->cpuTimestampRingBuffer;
+    // Resolves the physical ring range [ring_pos_a, ring_pos_b); the caller splits ranges that wrap around
+    rmtU32 query_count = ring_pos_b - ring_pos_a;
+    rmtU64 query_size = query_count * sizeof(rmtU64);
+    // NOTE(review): the VkResult is ignored, so VK_NOT_READY or a failure would go unnoticed here - confirm intended
+    bind->vkGetQueryPoolResults(vulkan_device, bind->gpuTimestampRingBuffer, ring_pos_a, query_count, query_size, cpu_timestamps + ring_pos_a,
+                              sizeof(rmtU64), VK_QUERY_RESULT_64_BIT);
+
+    // Copy all timestamps to their expectant samples; each sample owns two consecutive queries (begin, end)
+    for (query_index = ring_pos_a; query_index < ring_pos_b; query_index += 2)
+    {
+        rmtU64 us_start = (rmtU64)(cpu_timestamps[query_index] * gpu_ticks_to_us + gpu_to_cpu_timestamp_us);
+        rmtU64 us_end = (rmtU64)(cpu_timestamps[query_index + 1] * gpu_ticks_to_us + gpu_to_cpu_timestamp_us);
+
+        VulkanSample* sample = cpu_sample_buffer[query_index >> 1];
+        sample->base.us_start = us_start;
+        Sample_Close(&sample->base, us_end);
+        sample->base.us_end = us_end;
+    }
+
+    // Reset the query pool indices so the slots just read can be written again
+    bind->vkResetQueryPool(vulkan_device, bind->gpuTimestampRingBuffer, ring_pos_a, query_count);
+
+    return RMT_ERROR_NONE;
+}
+
+static rmtError UpdateGpuTicksToUs(VulkanBindImpl* bind, VkPhysicalDevice vulkan_physical_device)
+{
+    // Refreshes bind->gpu_ticks_to_us from the device's timestampPeriod limit (nanoseconds per GPU tick).
+    // TODO(valakor): Is this slow? We could cache timestampPeriod during initialization, but on some devices (namely
+    //  some Apple devices using Vulkan via MoltenVK, potentially others) the value is dynamic and can change on every
+    //  call. For more information see: https://github.com/KhronosGroup/MoltenVK/blob/main/Docs/MoltenVK_Runtime_UserGuide.md
+
+    VkPhysicalDeviceProperties device_properties;
+    ZeroMemory(&device_properties, sizeof(device_properties));
+    bind->vkGetPhysicalDeviceProperties(vulkan_physical_device, &device_properties);
+    float gpu_ns_per_tick = device_properties.limits.timestampPeriod;
+    bind->gpu_ticks_to_us = gpu_ns_per_tick / 1000.0;
+    return RMT_ERROR_NONE;
+}
+
+static rmtError GetTimestampCalibration(VulkanBindImpl* bind, VkPhysicalDevice vulkan_physical_device, VkDevice vulkan_device, double* gpu_ticks_to_us, rmtS64* gpu_to_cpu_timestamp_us)
+{
+    // TODO(valakor): Honor RMT_GPU_CPU_SYNC_SECONDS? It's unclear to me how expensive vkGetCalibratedTimestampsEXT is
+    //  on all supported platforms.
+    // Outputs the GPU tick -> microsecond scale and the offset that maps GPU microseconds onto Remotery's CPU timer
+    rmtU64 gpu_timestamp_ticks;
+    rmtU64 cpu_timestamp_ticks;
+    rmtU64 gpu_timestamp_us;
+    rmtU64 cpu_timestamp_us;
+    float gpu_tick_period; // NOTE(review): never used in this function
+
+    // Always query a device timestamp
+    rmtU32 timestamp_count = 1;
+    rmtU64 max_deviation;
+    rmtU64 timestamps[2];
+    VkCalibratedTimestampInfoEXT timestamp_infos[2];
+    ZeroMemory(&timestamp_infos, sizeof(timestamp_infos));
+    timestamp_infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
+    timestamp_infos[0].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT;
+
+    // Potentially also query a cpu timestamp if supported
+#if defined(RMT_PLATFORM_WINDOWS)
+    timestamp_count = 2;
+    timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
+    timestamp_infos[1].timeDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT;
+#elif defined(RMT_PLATFORM_MACOS)
+    // On Apple platforms MoltenVK reports support for VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT even though under the hood
+    //  it uses mach_absolute_time(), which is actually CLOCK_BOOT. This doesn't matter though as Remotery also uses
+    //  mach_absolute_time() for time measurements so the results are comparable. For more information see:
+    //  <INSERT LINK HERE>
+    timestamp_count = 2;
+    timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
+    timestamp_infos[1].timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT;
+#else
+    // On Linux Remotery uses CLOCK_REALTIME (though it probably shouldn't), but Vulkan only provides time domains for
+    //  CLOCK_MONOTONIC and CLOCK_MONOTONIC_RAW. For now we'll just query the CPU here manually and hope it's close enough.
+    timestamp_count = 1;
+#endif
+
+    // TODO(valakor): Consider taking max_deviation into account. Docs state that users may want to call vkGetCalibratedTimestamps
+    //  multiple times in a row until retrieving a max deviation that is "acceptable". We could just call it a set number of
+    //  times and take the min, or determine a reasonable average during init and ensure we get something close to that here.
+
+    if (bind->vkGetCalibratedTimestampsEXT(vulkan_device, timestamp_count, timestamp_infos, timestamps, &max_deviation) != VK_SUCCESS)
+    {
+        return rmtMakeError(RMT_ERROR_RESOURCE_ACCESS_FAIL, "Failed to get Vulkan calibrated timestamps");
+    }
+
+    // Convert CPU ticks to microseconds, offset from the global timer start
+#if defined(RMT_PLATFORM_WINDOWS) || defined(RMT_PLATFORM_MACOS)
+    cpu_timestamp_ticks = timestamps[1];
+    cpu_timestamp_us = usTimer_FromRawTicks(&g_Remotery->timer, cpu_timestamp_ticks);
+#else
+    cpu_timestamp_us = usTimer_Get(&g_Remotery->timer);
+#endif
+
+    UpdateGpuTicksToUs(bind, vulkan_physical_device);
+    *gpu_ticks_to_us = bind->gpu_ticks_to_us;
+
+    // Convert GPU ticks to microseconds
+    gpu_timestamp_ticks = timestamps[0];
+    gpu_timestamp_us = (rmtU64)(gpu_timestamp_ticks * bind->gpu_ticks_to_us);
+
+    // And we now have the offset from GPU microseconds to CPU microseconds
+    *gpu_to_cpu_timestamp_us = cpu_timestamp_us - gpu_timestamp_us;
+
+    return RMT_ERROR_NONE;
+}
+
+static rmtError VulkanMarkFrame(VulkanBindImpl* bind)
+{
+    if (bind == NULL)
+    {
+        return RMT_ERROR_NONE;
+    }
+    // Per-frame update: signal the CPU write position, resolve GPU-completed timestamps, forward finished sample trees
+    VkPhysicalDevice vulkan_physical_device = (VkPhysicalDevice)bind->base.physical_device;
+    VkDevice vulkan_device = (VkDevice)bind->base.device;
+    VkQueue vulkan_queue = (VkQueue)bind->base.queue;
+
+    rmtU32 index_mask = bind->maxNbQueries - 1;
+    rmtU32 current_read_cpu = LoadAcquire(&bind->ringBufferRead);
+    rmtU64 current_write_cpu = LoadAcquire(&bind->ringBufferWrite);
+
+    // Tell the GPU where the CPU write position is (timeline semaphore payloads are 64-bit, hence the rmtU64 above)
+    VkTimelineSemaphoreSubmitInfoKHR semaphore_submit_info;
+    ZeroMemory(&semaphore_submit_info, sizeof(semaphore_submit_info));
+    semaphore_submit_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR;
+    semaphore_submit_info.signalSemaphoreValueCount = 1;
+    semaphore_submit_info.pSignalSemaphoreValues = &current_write_cpu;
+
+    VkSubmitInfo submit_info;
+    ZeroMemory(&submit_info, sizeof(submit_info));
+    submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    submit_info.pNext = &semaphore_submit_info;
+    submit_info.signalSemaphoreCount = 1;
+    submit_info.pSignalSemaphores = &bind->gpuQuerySemaphore;
+    bind->vkQueueSubmit(vulkan_queue, 1, &submit_info, VK_NULL_HANDLE);
+    // NOTE(review): the submit result is ignored, and timeline signal values must strictly increase between submits - confirm
+    // Has the GPU processed any writes?
+    rmtU64 current_write_gpu = 0;
+    if (bind->vkGetSemaphoreCounterValue(vulkan_device, bind->gpuQuerySemaphore, &current_write_gpu) != VK_SUCCESS)
+    {
+        return rmtMakeError(RMT_ERROR_RESOURCE_ACCESS_FAIL, "Failed to get Vulkan Semaphore value");
+    }
+
+    if (current_write_gpu > current_read_cpu)
+    {
+        double gpu_ticks_to_us;
+        rmtS64 gpu_to_cpu_timestamp_us;
+
+        // Physical ring buffer positions
+        rmtU32 ring_pos_a = current_read_cpu & index_mask;
+        rmtU32 ring_pos_b = current_write_gpu & index_mask;
+
+        rmtTry(GetTimestampCalibration(bind, vulkan_physical_device, vulkan_device, &gpu_ticks_to_us, &gpu_to_cpu_timestamp_us));
+
+        // Copy resulting timestamps to their samples
+        // Will have to split the copies into two passes if they cross the ring buffer wrap around
+        if (ring_pos_b < ring_pos_a)
+        {
+            rmtTry(CopyTimestamps(bind, vulkan_device, ring_pos_a, bind->maxNbQueries, gpu_ticks_to_us, gpu_to_cpu_timestamp_us));
+            rmtTry(CopyTimestamps(bind, vulkan_device, 0, ring_pos_b, gpu_ticks_to_us, gpu_to_cpu_timestamp_us));
+        }
+        else
+        {
+            rmtTry(CopyTimestamps(bind, vulkan_device, ring_pos_a, ring_pos_b, gpu_ticks_to_us, gpu_to_cpu_timestamp_us));
+        }
+
+        // Release the ring buffer entries just processed
+        StoreRelease(&bind->ringBufferRead, current_write_gpu);
+    }
+
+    // Attempt to empty the queue of complete message trees
+    Message* message;
+    while ((message = rmtMessageQueue_PeekNextMessage(bind->mqToVulkanUpdate)))
+    {
+        Msg_SampleTree* msg_sample_tree;
+        Sample* root_sample;
+
+        // Ensure only Vulkan sample tree messages come through here
+        assert(message->id == MsgID_SampleTree);
+        msg_sample_tree = (Msg_SampleTree*)message->payload;
+        root_sample = msg_sample_tree->rootSample;
+        assert(root_sample->type == RMT_SampleType_Vulkan);
+
+        // If the last-allocated query in this tree has been GPU-processed it's safe to now send the tree to Remotery thread
+        if (current_write_gpu > msg_sample_tree->userData)
+        {
+            QueueSampleTree(g_Remotery->mq_to_rmt_thread, root_sample, msg_sample_tree->allocator, msg_sample_tree->threadName,
+                                0, message->threadProfiler, RMT_FALSE);
+            rmtMessageQueue_ConsumeNextMessage(bind->mqToVulkanUpdate, message);
+        }
+        else
+        {
+            break;
+        }
+    }
+
+    // Chain to the next bind here so that root calling code doesn't need to know the definition of VulkanBindImpl
+    rmtTry(VulkanMarkFrame(bind->next));
+
+    return RMT_ERROR_NONE;
+}
+
+RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, rmtVulkanBind** out_bind)
+{
+    VulkanBindImpl* bind;
+    VkInstance vulkan_instance = (VkInstance)instance;
+    VkPhysicalDevice vulkan_physical_device = (VkPhysicalDevice)physical_device;
+    VkDevice vulkan_device = (VkDevice)device;
+    VkQueue vulkan_queue = (VkQueue)queue;
+    // Creates a bind for one Vulkan queue: loads entry points, creates the query ring buffers, links it into g_Remotery
+    if (g_Remotery == NULL)
+    {
+        return RMT_ERROR_REMOTERY_NOT_CREATED;
+    }
+
+    assert(vulkan_instance != NULL);
+    assert(physical_device != NULL);
+    assert(device != NULL);
+    assert(queue != NULL);
+    assert(out_bind != NULL);
+
+    // Allocate the bind container
+    rmtTryMalloc(VulkanBindImpl, bind);
+
+    // Set default state
+    bind->base.physical_device = physical_device;
+    bind->base.device = device;
+    bind->base.queue = queue;
+    bind->maxNbQueries = 32 * 1024;
+    bind->gpuTimestampRingBuffer = NULL;
+    bind->cpuTimestampRingBuffer = NULL;
+    bind->sampleRingBuffer = NULL;
+    bind->ringBufferRead = 0;
+    bind->ringBufferWrite = 0;
+    bind->gpuQuerySemaphore = NULL;
+    bind->gpu_ticks_to_us = 1.0;
+    bind->vkQueueSubmit = NULL;
+    bind->vkGetPhysicalDeviceProperties = NULL;
+    bind->vkCreateQueryPool = NULL;
+    bind->vkDestroyQueryPool = NULL;
+    bind->vkResetQueryPool = NULL;
+    bind->vkGetQueryPoolResults = NULL;
+    bind->vkCmdWriteTimestamp = NULL;
+    bind->vkCreateSemaphore = NULL;
+    bind->vkDestroySemaphore = NULL;
+    bind->vkSignalSemaphore = NULL;
+    bind->vkGetSemaphoreCounterValue = NULL;
+    bind->vkGetCalibratedTimestampsEXT = NULL;
+    bind->mqToVulkanUpdate = NULL;
+    bind->next = NULL;
+    // NOTE(review): rmtTry presumably returns early on failure; from here on that would leak the bind and anything created so far
+    rmtTry(LoadVulkanFunctions(bind, vulkan_instance, vulkan_device));
+
+    // Create the independent ring buffer storage items
+    // TODO(don): Leave space between start and end to stop invalidating cache lines?
+    // NOTE(don): ABA impossible due to non-wrapping ring buffer indices
+    rmtTry(CreateQueryPool(bind, vulkan_device, bind->maxNbQueries));
+    rmtTryMallocArray(VulkanSample*, bind->sampleRingBuffer, bind->maxNbQueries / 2);
+    rmtTryMallocArray(rmtU64, bind->cpuTimestampRingBuffer, bind->maxNbQueries);
+    rmtTry(CreateQuerySemaphore(bind, vulkan_device));
+
+    rmtTryNew(rmtMessageQueue, bind->mqToVulkanUpdate, g_Settings.messageQueueSizeInBytes);
+
+    // Add to the head of the global linked list of binds
+    {
+        mtxLock(&g_Remotery->vulkanBindsMutex);
+        bind->next = g_Remotery->vulkanBinds;
+        g_Remotery->vulkanBinds = bind;
+        mtxUnlock(&g_Remotery->vulkanBindsMutex);
+    }
+
+    *out_bind = &bind->base;
+
+    return RMT_ERROR_NONE;
+}
+
+RMT_API void _rmt_UnbindVulkan(rmtVulkanBind* bind)
+{
+    // Unlinks the bind from g_Remotery->vulkanBinds and destroys the Vulkan objects and ring buffers it owns
+    VulkanBindImpl* vulkan_bind = (VulkanBindImpl*)bind;
+    assert(bind != NULL);
+    VkDevice vulkan_device = (VkDevice)vulkan_bind->base.device;
+
+    // Remove from the linked list
+    {
+        mtxLock(&g_Remotery->vulkanBindsMutex);
+        VulkanBindImpl* cur = g_Remotery->vulkanBinds;
+        VulkanBindImpl* prev = NULL;
+        for ( ; cur != NULL; prev = cur, cur = cur->next) // prev must trail cur or a non-head removal resets the list head
+        {
+            if (cur == vulkan_bind)
+            {
+                if (prev != NULL)
+                {
+                    prev->next = cur->next;
+                }
+                else
+                {
+                    g_Remotery->vulkanBinds = cur->next;
+                }
+
+                break;
+            }
+        }
+        mtxUnlock(&g_Remotery->vulkanBindsMutex);
+    }
+
+    // TODO: Clean up resources (mqToVulkanUpdate and the bind allocation itself are not released here)
+
+    if (vulkan_bind->gpuQuerySemaphore != NULL)
+    {
+        vulkan_bind->vkDestroySemaphore(vulkan_device, vulkan_bind->gpuQuerySemaphore, NULL);
+    }
+
+    rmtFree(vulkan_bind->sampleRingBuffer);
+    rmtFree(vulkan_bind->cpuTimestampRingBuffer);
+
+    if (vulkan_bind->gpuTimestampRingBuffer != NULL)
+    {
+        vulkan_bind->vkDestroyQueryPool(vulkan_device, vulkan_bind->gpuTimestampRingBuffer, NULL);
+    }
+}
+
+static rmtError AllocateVulkanSampleTree(SampleTree** vulkan_tree)
+{
+    rmtTryNew(SampleTree, *vulkan_tree, sizeof(VulkanSample), (ObjConstructor)VulkanSample_Constructor,
+            (ObjDestructor)VulkanSample_Destructor); // Tree whose samples are VulkanSample-sized and use its ctor/dtor
+    return RMT_ERROR_NONE;
+}
+
+static rmtError AllocQueryPair(VulkanBindImpl* vulkan_bind, rmtAtomicU32* out_allocation_index)
+{
+    // Check for overflow against a tail which is only ever written by one thread
+    rmtU32 read = LoadAcquire(&vulkan_bind->ringBufferRead);
+    rmtU32 write = LoadAcquire(&vulkan_bind->ringBufferWrite);
+    rmtU32 nb_queries = (write - read); // Queries currently in flight between the read and write positions
+    rmtU32 queries_left = vulkan_bind->maxNbQueries - nb_queries;
+    if (queries_left < 2)
+    {
+        return rmtMakeError(RMT_ERROR_RESOURCE_CREATE_FAIL, "Vulkan query ring buffer overflow");
+    }
+    // Reserve two consecutive slots (begin/end); NOTE(review): presumably AtomicAddU32 returns the pre-add value - confirm
+    *out_allocation_index = AtomicAddU32(&vulkan_bind->ringBufferWrite, 2);
+    return RMT_ERROR_NONE;
+}
+
+RMT_API void _rmt_BeginVulkanSample(rmtVulkanBind* bind, void* command_buffer, rmtPStr name, rmtU32* hash_cache)
+{
+    ThreadProfiler* thread_profiler;
+    // Opens a Vulkan sample on the calling thread and records its begin timestamp into command_buffer
+    if (g_Remotery == NULL || bind == NULL)
+        return;
+
+    assert(command_buffer != NULL);
+
+    if (ThreadProfilers_GetCurrentThreadProfiler(g_Remotery->threadProfilers, &thread_profiler) == RMT_ERROR_NONE)
+    {
+        Sample* sample;
+        rmtU32 name_hash;
+        SampleTree** vulkan_tree;
+
+        name_hash = ThreadProfiler_GetNameHash(thread_profiler, g_Remotery->mq_to_rmt_thread, name, hash_cache);
+
+        // Create the Vulkan tree on-demand as the tree needs an up-front-created root.
+        // This is not possible to create on initialisation as a Vulkan binding is not yet available.
+        vulkan_tree = &thread_profiler->sampleTrees[RMT_SampleType_Vulkan];
+        if (*vulkan_tree == NULL && AllocateVulkanSampleTree(vulkan_tree) != RMT_ERROR_NONE)
+        {
+            return; // Without a tree there is nothing to push the sample onto
+        }
+
+        // Push the sample and activate the timestamp
+        if (ThreadProfiler_Push(*vulkan_tree, name_hash, 0, &sample) == RMT_ERROR_NONE)
+        {
+            rmtError error;
+
+            VulkanBindImpl* vulkan_bind = (VulkanBindImpl*)bind;
+            VkCommandBuffer vulkan_command_buffer = (VkCommandBuffer)command_buffer;
+
+            VulkanSample* vulkan_sample = (VulkanSample*)sample;
+            vulkan_sample->bind = vulkan_bind;
+            vulkan_sample->commandBuffer = vulkan_command_buffer;
+            vulkan_sample->base.usGpuIssueOnCpu = usTimer_Get(&g_Remotery->timer);
+
+            error = AllocQueryPair(vulkan_bind, &vulkan_sample->queryIndex);
+            if (error == RMT_ERROR_NONE)
+            {
+                rmtU32 physical_query_index = vulkan_sample->queryIndex & (vulkan_bind->maxNbQueries - 1);
+                vulkan_bind->vkCmdWriteTimestamp(vulkan_command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, vulkan_bind->gpuTimestampRingBuffer, physical_query_index);
+
+                // Track which Vulkan sample expects the timestamp results
+                vulkan_bind->sampleRingBuffer[physical_query_index / 2] = vulkan_sample;
+
+                // Keep track of the last allocated query so we can check when the GPU has finished with them all
+                thread_profiler->vulkanThreadData->lastAllocatedQueryIndex = vulkan_sample->queryIndex;
+            }
+            else
+            {
+                // SET QUERY INDEX TO INVALID so that pop doesn't release it
+            }
+        }
+    }
+}
+
+RMT_API void _rmt_EndVulkanSample(void)
+{
+    ThreadProfiler* thread_profiler;
+    // Closes the calling thread's current Vulkan sample, using the bind and command buffer cached at begin time
+    if (g_Remotery == NULL)
+        return;
+
+    if (ThreadProfilers_GetCurrentThreadProfiler(g_Remotery->threadProfilers, &thread_profiler) == RMT_ERROR_NONE)
+    {
+        VulkanThreadData* vulkan_thread_data = thread_profiler->vulkanThreadData;
+        VulkanSample* vulkan_sample;
+
+        // Sample tree isn't there if Vulkan hasn't been initialised
+        SampleTree* vulkan_tree = thread_profiler->sampleTrees[RMT_SampleType_Vulkan];
+        if (vulkan_tree == NULL)
+        {
+            return;
+        }
+
+        // Close the timestamp
+        vulkan_sample = (VulkanSample*)vulkan_tree->currentParent;
+        if (vulkan_sample->base.recurse_depth > 0)
+        {
+            vulkan_sample->base.recurse_depth--;
+        }
+        else
+        {
+            // Issue the timestamp query for the end of the sample, in the slot right after the begin query
+            VulkanBindImpl* vulkan_bind = vulkan_sample->bind;
+            VkCommandBuffer vulkan_command_buffer = vulkan_sample->commandBuffer;
+            rmtU32 query_index = vulkan_sample->queryIndex & (vulkan_bind->maxNbQueries - 1);
+            vulkan_bind->vkCmdWriteTimestamp(vulkan_command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
+                                           vulkan_bind->gpuTimestampRingBuffer, query_index + 1);
+
+            if (ThreadProfiler_Pop(thread_profiler, vulkan_bind->mqToVulkanUpdate, (Sample*)vulkan_sample,
+                                   vulkan_thread_data->lastAllocatedQueryIndex))
+            {
+            }
+        }
+    }
+}
+
+#endif // RMT_USE_VULKAN
+
 /*
 ------------------------------------------------------------------------------------------------------------------------
 ------------------------------------------------------------------------------------------------------------------------
diff --git a/lib/Remotery.h b/lib/Remotery.h
index 2ad05b31..1516e556 100644
--- a/lib/Remotery.h
+++ b/lib/Remotery.h
@@ -80,6 +80,12 @@ documented just below this comment.
 #define RMT_USE_METAL 0
 #endif
 
+// Allow Vulkan profiling
+#ifndef RMT_USE_VULKAN
+// TODO: Set back to 0 when checking in!
+#define RMT_USE_VULKAN 1
+#endif
+
 // Initially use POSIX thread names to name threads instead of Thread0, 1, ...
 #ifndef RMT_USE_POSIX_THREADNAMES
 #define RMT_USE_POSIX_THREADNAMES 0
@@ -148,7 +154,7 @@ documented just below this comment.
 #endif
 
 #if __GNUC__ || __clang__
-#if __x86_64__ || __ppc64__ || __amd64__
+#if __x86_64__ || __ppc64__ || __amd64__ || __arm64__
 #define RMT_ARCH_64BIT
 #else
 #define RMT_ARCH_32BIT
@@ -206,6 +212,11 @@ documented just below this comment.
 #else
     #define IFDEF_RMT_USE_METAL(t, f) f
 #endif
+#if RMT_ENABLED && RMT_USE_VULKAN
+    #define IFDEF_RMT_USE_VULKAN(t, f) t
+#else
+    #define IFDEF_RMT_USE_VULKAN(t, f) f
+#endif
 
 
 // Public interface is written in terms of these macros to easily enable/disable itself
@@ -262,6 +273,7 @@ typedef enum rmtSampleType
     RMT_SampleType_D3D12,
     RMT_SampleType_OpenGL,
     RMT_SampleType_Metal,
+    RMT_SampleType_Vulkan,
     RMT_SampleType_Count,
 } rmtSampleType;
 
@@ -464,8 +476,8 @@ typedef struct rmtSettings
     RMT_OPTIONAL(RMT_ENABLED, _rmt_EndCPUSample())
 
 // Used for both CPU and GPU profiling
-// Essential to call this every frame, ever since D3D12 support was added
-// D3D12 Requirements: Don't sample any command lists that begin before this call and end after it
+// Essential to call this every frame, ever since D3D12/Vulkan support was added
+// D3D12/Vulkan Requirements: Don't sample any command lists that begin before this call and end after it
 #define rmt_MarkFrame()                                                             \
     RMT_OPTIONAL_RET(RMT_ENABLED, _rmt_MarkFrame(), RMT_ERROR_NONE)
 
@@ -599,6 +611,39 @@ typedef struct rmtD3D12Bind
     RMT_OPTIONAL(RMT_USE_METAL, _rmt_EndMetalSample())
 
 
+typedef struct rmtVulkanBind
+{
+    // The physical Vulkan device (a VkPhysicalDevice handle, held as an opaque pointer)
+    void* physical_device;
+
+    // The main device shared by all threads (a VkDevice handle, held as an opaque pointer)
+    void* device;
+
+    // The queue command buffers are executed on for profiling (a VkQueue handle, held as an opaque pointer)
+    void* queue;
+
+} rmtVulkanBind;
+
+// Create a Vulkan binding for the given device/queue pair
+#define rmt_BindVulkan(instance, physical_device, device, queue, out_bind)  \
+    RMT_OPTIONAL_RET(RMT_USE_VULKAN, _rmt_BindVulkan(instance, physical_device, device, queue, out_bind), NULL)
+
+#define rmt_UnbindVulkan(bind)                                              \
+    RMT_OPTIONAL(RMT_USE_VULKAN, _rmt_UnbindVulkan(bind))
+
+#define rmt_BeginVulkanSample(bind, command_buffer, name)                   \
+    RMT_OPTIONAL(RMT_USE_VULKAN, {                                          \
+        static rmtU32 rmt_sample_hash_##name = 0;                           \
+        _rmt_BeginVulkanSample(bind, command_buffer, #name, &rmt_sample_hash_##name);     \
+    })
+
+#define rmt_BeginVulkanSampleDynamic(bind, command_buffer, namestr)         \
+    RMT_OPTIONAL(RMT_USE_VULKAN, _rmt_BeginVulkanSample(bind, command_buffer, namestr, NULL))
+
+#define rmt_EndVulkanSample()                                               \
+    RMT_OPTIONAL(RMT_USE_VULKAN, _rmt_EndVulkanSample())
+
+
 /*--------------------------------------------------------------------------------------------------------------------------------
    Runtime Properties
 --------------------------------------------------------------------------------------------------------------------------------*/
@@ -983,6 +1028,17 @@ struct rmt_EndMetalSampleOnScopeExit
 };
 #endif
 
+#if RMT_USE_VULKAN
+extern "C" RMT_API void _rmt_EndVulkanSample();
+struct rmt_EndVulkanSampleOnScopeExit // Calls _rmt_EndVulkanSample() on destruction; instantiated by rmt_ScopedVulkanSample
+{
+    ~rmt_EndVulkanSampleOnScopeExit()
+    {
+        _rmt_EndVulkanSample();
+    }
+};
+#endif
+
 #endif
 
 
@@ -1005,6 +1061,9 @@ struct rmt_EndMetalSampleOnScopeExit
 #define rmt_ScopedMetalSample(name)                                                                     \
         RMT_OPTIONAL(RMT_USE_METAL, rmt_BeginMetalSample(name));                                        \
         RMT_OPTIONAL(RMT_USE_METAL, rmt_EndMetalSampleOnScopeExit rmt_ScopedMetalSample##name);
+#define rmt_ScopedVulkanSample(bind, command_buffer, name)                                              \
+        RMT_OPTIONAL(RMT_USE_VULKAN, rmt_BeginVulkanSample(bind, command_buffer, name));                \
+        RMT_OPTIONAL(RMT_USE_VULKAN, rmt_EndVulkanSampleOnScopeExit rmt_ScopedVulkanSample##name());
 
 #endif
 
@@ -1063,6 +1122,13 @@ RMT_API void _rmt_BeginMetalSample(rmtPStr name, rmtU32* hash_cache);
 RMT_API void _rmt_EndMetalSample(void);
 #endif
 
+#if RMT_USE_VULKAN
+RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, rmtVulkanBind** out_bind);
+RMT_API void _rmt_UnbindVulkan(rmtVulkanBind* bind);
+RMT_API void _rmt_BeginVulkanSample(rmtVulkanBind* bind, void* command_buffer, rmtPStr name, rmtU32* hash_cache);
+RMT_API void _rmt_EndVulkanSample();
+#endif
+
 // Sample iterator
 RMT_API void                _rmt_IterateChildren(rmtSampleIterator* iter, rmtSample* sample);
 RMT_API rmtBool             _rmt_IterateNext(rmtSampleIterator* iter);

From dff750bde2ceedf65b462224c7debc29a127efdd Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sat, 6 Jan 2024 14:24:07 -0800
Subject: [PATCH 02/27] Update comment describing compilation requirements for
 Vulkan profiling

---
 lib/Remotery.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/Remotery.h b/lib/Remotery.h
index 1516e556..1aef3b49 100644
--- a/lib/Remotery.h
+++ b/lib/Remotery.h
@@ -30,6 +30,9 @@ Compiling
   library linkage. For example to compile the same run: cc lib/Remotery.c sample/sample.c
   -I lib -pthread -lm
 
+* Vulkan - Ensure your include directories are set such that the Vulkan headers can be
+  included with the statement: #include <vulkan/vulkan.h>.
+
 You can define some extra macros to modify what features are compiled into Remotery. These are
 documented just below this comment.
 

From 11314f7761bdd51e465d559fff95315ad8b7a602 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sat, 6 Jan 2024 17:40:10 -0800
Subject: [PATCH 03/27] Fix calling rmt_GetLastErrorMessage from C++

---
 lib/Remotery.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/lib/Remotery.h b/lib/Remotery.h
index 1aef3b49..15e1e73b 100644
--- a/lib/Remotery.h
+++ b/lib/Remotery.h
@@ -345,9 +345,19 @@ typedef enum rmtError
 } rmtError;
 // clang-format on
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 // Gets the last error message issued on the calling thread
 RMT_API rmtPStr rmt_GetLastErrorMessage();
 
+#ifdef __cplusplus
+}
+#endif
+
+
+
 
 /*--------------------------------------------------------------------------------------------------------------------------------
    Runtime Settings

From 45f48873d34639c4334fc1ee0e49512db112dce1 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sat, 6 Jan 2024 17:41:34 -0800
Subject: [PATCH 04/27] Fix rmt_ScopedD3D12Sample and rmt_ScopedVulkanSample.
 Extra parens confuse some compilers that think the end scope objects are
 function declarations

---
 lib/Remotery.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Remotery.h b/lib/Remotery.h
index 15e1e73b..cf950c5d 100644
--- a/lib/Remotery.h
+++ b/lib/Remotery.h
@@ -1067,7 +1067,7 @@ struct rmt_EndVulkanSampleOnScopeExit
         RMT_OPTIONAL(RMT_USE_D3D11, rmt_EndD3D11SampleOnScopeExit rmt_ScopedD3D11Sample##name);
 #define rmt_ScopedD3D12Sample(bind, command_list, name)                                                 \
         RMT_OPTIONAL(RMT_USE_D3D12, rmt_BeginD3D12Sample(bind, command_list, name));                    \
-        RMT_OPTIONAL(RMT_USE_D3D12, rmt_EndD3D12SampleOnScopeExit rmt_ScopedD3D12Sample##name());
+        RMT_OPTIONAL(RMT_USE_D3D12, rmt_EndD3D12SampleOnScopeExit rmt_ScopedD3D12Sample##name);
 #define rmt_ScopedOpenGLSample(name)                                                                    \
         RMT_OPTIONAL(RMT_USE_OPENGL, rmt_BeginOpenGLSample(name));                                      \
         RMT_OPTIONAL(RMT_USE_OPENGL, rmt_EndOpenGLSampleOnScopeExit rmt_ScopedOpenGLSample##name);
@@ -1076,7 +1076,7 @@ struct rmt_EndVulkanSampleOnScopeExit
         RMT_OPTIONAL(RMT_USE_METAL, rmt_EndMetalSampleOnScopeExit rmt_ScopedMetalSample##name);
 #define rmt_ScopedVulkanSample(bind, command_buffer, name)                                              \
         RMT_OPTIONAL(RMT_USE_VULKAN, rmt_BeginVulkanSample(bind, command_buffer, name));                \
-        RMT_OPTIONAL(RMT_USE_VULKAN, rmt_EndVulkanSampleOnScopeExit rmt_ScopedVulkanSample##name());
+        RMT_OPTIONAL(RMT_USE_VULKAN, rmt_EndVulkanSampleOnScopeExit rmt_ScopedVulkanSample##name);
 
 #endif
 

From e6085500c856b5f4560f92d9757522b15d01b1a5 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sat, 6 Jan 2024 17:42:56 -0800
Subject: [PATCH 05/27] Fix Remotery_Destructor when D3D12 or Vulkan sampling
 is enabled. Need to clear g_Remotery only after calling _rmt_UnbindVulkan or
 _rmt_UnbindD3D12 since both functions access g_Remotery

---
 lib/Remotery.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index 2158dfda..d91d6517 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -7098,12 +7098,6 @@ static void Remotery_Destructor(Remotery* rmt)
 
     rmtDelete(ThreadProfilers, rmt->threadProfilers);
 
-    if (g_RemoteryCreated)
-    {
-        g_Remotery = NULL;
-        g_RemoteryCreated = RMT_FALSE;
-    }
-
     rmtDelete(ObjectAllocator, rmt->propertyAllocator);
 
 #if RMT_USE_VULKAN
@@ -7130,6 +7124,12 @@ static void Remotery_Destructor(Remotery* rmt)
     rmtDelete(Metal, rmt->metal);
 #endif
 
+    if (g_RemoteryCreated)
+    {
+        g_Remotery = NULL;
+        g_RemoteryCreated = RMT_FALSE;
+    }
+
     rmtCloseFile(rmt->logfile);
 
     rmtDelete(StringTable, rmt->string_table);

From 067ba824a0d39a5926119e70614abb7be707ae5b Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sat, 6 Jan 2024 17:43:35 -0800
Subject: [PATCH 06/27] WIP: Fix getting Vulkan function pointers

---
 lib/Remotery.c | 48 ++++++++++++++++++++++++++----------------------
 lib/Remotery.h | 11 ++++++-----
 2 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index d91d6517..d1ad4561 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -10040,6 +10040,7 @@ typedef struct VulkanBindImpl
 
     // Read/write positions of the ring buffer allocator, synchronising access to all the ring buffers at once
     // TODO(don): Separate by cache line?
+    // TODO(valakor): These should REALLY be 64-bit
     rmtAtomicU32 ringBufferRead;
     rmtAtomicU32 ringBufferWrite;
 
@@ -10069,23 +10070,20 @@ typedef struct VulkanBindImpl
 
 } VulkanBindImpl;
 
-static rmtError LoadVulkanFunctions(VulkanBindImpl* bind, VkInstance vulkan_instance, VkDevice vulkan_device)
+static rmtError LoadVulkanFunctions(VulkanBindImpl* bind, VkInstance vulkan_instance, PFN_vkGetInstanceProcAddr pfn_vkGetInstanceProcAddr)
 {
-    PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr;
-    PFN_vkGetDeviceProcAddr vkGetDeviceProcAddr;
-
-#define VK_DEVICE_FN(fn)                                                         \
-    bind->fn = (PFN_ ## fn)vkGetDeviceProcAddr(vulkan_device, #fn);              \
-    if (bind->fn == NULL)                                                        \
+#define VK_DEVICE_FN(fn)                                                             \
+    bind->fn = (PFN_ ## fn)pfn_vkGetInstanceProcAddr(vulkan_instance, #fn);              \
+    if (bind->fn == NULL)                                                            \
         return RMT_ERROR_RESOURCE_ACCESS_FAIL;
 
-#define VK_DEVICE_FN_FALLBACK(fn, fn_fallback)                                   \
-    bind->fn = (PFN_ ## fn)vkGetDeviceProcAddr(vulkan_device, #fn);              \
-    if (bind->fn == NULL)                                                        \
-    {                                                                            \
-        bind->fn = (PFN_ ## fn)vkGetDeviceProcAddr(vulkan_device, #fn_fallback); \
-        if (bind->fn == NULL)                                                    \
-            return RMT_ERROR_RESOURCE_ACCESS_FAIL;                               \
+#define VK_DEVICE_FN_FALLBACK(fn, fn_fallback)                                       \
+    bind->fn = (PFN_ ## fn)pfn_vkGetInstanceProcAddr(vulkan_instance, #fn);              \
+    if (bind->fn == NULL)                                                            \
+    {                                                                                \
+        bind->fn = (PFN_ ## fn)pfn_vkGetInstanceProcAddr(vulkan_instance, #fn_fallback); \
+        if (bind->fn == NULL)                                                        \
+            return RMT_ERROR_RESOURCE_ACCESS_FAIL;                                   \
     }
 
     VK_DEVICE_FN(vkQueueSubmit);
@@ -10136,6 +10134,7 @@ static rmtError CreateQuerySemaphore(VulkanBindImpl* bind, VkDevice vulkan_devic
     VkSemaphoreCreateInfo create_info;
     ZeroMemory(&create_info, sizeof(create_info));
     create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+    create_info.pNext = &type_info;
 
     if (bind->vkCreateSemaphore(vulkan_device, &create_info, NULL, &bind->gpuQuerySemaphore) != VK_SUCCESS)
     {
@@ -10188,6 +10187,8 @@ static rmtError UpdateGpuTicksToUs(VulkanBindImpl* bind, VkPhysicalDevice vulkan
 
     float gpu_ns_per_tick = device_properties.limits.timestampPeriod;
     bind->gpu_ticks_to_us = gpu_ns_per_tick / 1000.0;
+
+    return RMT_ERROR_NONE;
 }
 
 static rmtError GetTimestampCalibration(VulkanBindImpl* bind, VkPhysicalDevice vulkan_physical_device, VkDevice vulkan_device, double* gpu_ticks_to_us, rmtS64* gpu_to_cpu_timestamp_us)
@@ -10271,8 +10272,8 @@ static rmtError VulkanMarkFrame(VulkanBindImpl* bind)
     VkQueue vulkan_queue = (VkQueue)bind->base.queue;
 
     rmtU32 index_mask = bind->maxNbQueries - 1;
-    rmtU32 current_read_cpu = LoadAcquire(&bind->ringBufferRead);
-    rmtU32 current_write_cpu = LoadAcquire(&bind->ringBufferWrite);
+    rmtU64 current_read_cpu = LoadAcquire(&bind->ringBufferRead);
+    rmtU64 current_write_cpu = LoadAcquire(&bind->ringBufferWrite);
 
     // Tell the GPU where the CPU write position is
     VkTimelineSemaphoreSubmitInfoKHR semaphore_submit_info;
@@ -10290,20 +10291,21 @@ static rmtError VulkanMarkFrame(VulkanBindImpl* bind)
     bind->vkQueueSubmit(vulkan_queue, 1, &submit_info, NULL);
 
     // Has the GPU processed any writes?
-    rmtU64 current_write_gpu = 0;
-    if (bind->vkGetSemaphoreCounterValue(vulkan_device, bind->gpuQuerySemaphore, &current_write_gpu) != VK_SUCCESS)
+    rmtU64 current_write_gpu64 = 0;
+    if (bind->vkGetSemaphoreCounterValue(vulkan_device, bind->gpuQuerySemaphore, &current_write_gpu64) != VK_SUCCESS)
     {
         return rmtMakeError(RMT_ERROR_RESOURCE_ACCESS_FAIL, "Failed to get Vulkan Semaphore value");
     }
 
+    rmtU32 current_write_gpu = (rmtU32)current_write_gpu64;
     if (current_write_gpu > current_read_cpu)
     {
         double gpu_ticks_to_us;
         rmtS64 gpu_to_cpu_timestamp_us;
 
         // Physical ring buffer positions
-        rmtU32 ring_pos_a = current_read_cpu & index_mask;
-        rmtU32 ring_pos_b = current_write_gpu & index_mask;
+        rmtU32 ring_pos_a = (rmtU32)current_read_cpu & index_mask;
+        rmtU32 ring_pos_b = (rmtU32)current_write_gpu & index_mask;
 
         rmtTry(GetTimestampCalibration(bind, vulkan_physical_device, vulkan_device, &gpu_ticks_to_us, &gpu_to_cpu_timestamp_us));
 
@@ -10355,13 +10357,14 @@ static rmtError VulkanMarkFrame(VulkanBindImpl* bind)
     return RMT_ERROR_NONE;
 }
 
-RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, rmtVulkanBind** out_bind)
+RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, VulkanGetInstanceProcAddr get_proc_addr, rmtVulkanBind** out_bind)
 {
     VulkanBindImpl* bind;
     VkInstance vulkan_instance = (VkInstance)instance;
     VkPhysicalDevice vulkan_physical_device = (VkPhysicalDevice)physical_device;
     VkDevice vulkan_device = (VkDevice)device;
     VkQueue vulkan_queue = (VkQueue)queue;
+    PFN_vkGetInstanceProcAddr pfn_vkGetInstanceProcAddr = (PFN_vkGetInstanceProcAddr)get_proc_addr;
 
     if (g_Remotery == NULL)
     {
@@ -10373,6 +10376,7 @@ RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* de
     assert(device != NULL);
     assert(queue != NULL);
     assert(out_bind != NULL);
+    assert(get_proc_addr != NULL);
 
     // Allocate the bind container
     rmtTryMalloc(VulkanBindImpl, bind);
@@ -10404,7 +10408,7 @@ RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* de
     bind->mqToVulkanUpdate = NULL;
     bind->next = NULL;
 
-    rmtTry(LoadVulkanFunctions(bind, vulkan_instance, vulkan_device));
+    rmtTry(LoadVulkanFunctions(bind, vulkan_instance, pfn_vkGetInstanceProcAddr));
 
     // Create the independent ring buffer storage items
     // TODO(don): Leave space beetween start and end to stop invalidating cache lines?
diff --git a/lib/Remotery.h b/lib/Remotery.h
index cf950c5d..8337d846 100644
--- a/lib/Remotery.h
+++ b/lib/Remotery.h
@@ -349,8 +349,8 @@ typedef enum rmtError
 extern "C" {
 #endif
 
-// Gets the last error message issued on the calling thread
-RMT_API rmtPStr rmt_GetLastErrorMessage();
+    // Gets the last error message issued on the calling thread
+    RMT_API rmtPStr rmt_GetLastErrorMessage();
 
 #ifdef __cplusplus
 }
@@ -638,8 +638,8 @@ typedef struct rmtVulkanBind
 } rmtVulkanBind;
 
 // Create a Vulkan binding for the given device/queue pair
-#define rmt_BindVulkan(instance, physical_device, device, queue, out_bind)  \
-    RMT_OPTIONAL_RET(RMT_USE_VULKAN, _rmt_BindVulkan(instance, physical_device, device, queue, out_bind), NULL)
+#define rmt_BindVulkan(instance, physical_device, device, queue, get_proc_addr, out_bind) \
+    RMT_OPTIONAL_RET(RMT_USE_VULKAN, _rmt_BindVulkan(instance, physical_device, device, queue, get_proc_addr, out_bind), RMT_ERROR_NONE)
 
 #define rmt_UnbindVulkan(bind)                                              \
     RMT_OPTIONAL(RMT_USE_VULKAN, _rmt_UnbindVulkan(bind))
@@ -1136,7 +1136,8 @@ RMT_API void _rmt_EndMetalSample(void);
 #endif
 
 #if RMT_USE_VULKAN
-RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, rmtVulkanBind** out_bind);
+typedef void*(*VulkanGetInstanceProcAddr)(void*, const char*);
+RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, VulkanGetInstanceProcAddr get_proc_addr, rmtVulkanBind** out_bind);
 RMT_API void _rmt_UnbindVulkan(rmtVulkanBind* bind);
 RMT_API void _rmt_BeginVulkanSample(rmtVulkanBind* bind, void* command_buffer, rmtPStr name, rmtU32* hash_cache);
 RMT_API void _rmt_EndVulkanSample();

From 433a90cc9ad13c2292be948f13029c68b7ddb35a Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sat, 6 Jan 2024 17:43:59 -0800
Subject: [PATCH 07/27] Update readme.md

---
 readme.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/readme.md b/readme.md
index d77e4300..613bf817 100644
--- a/readme.md
+++ b/readme.md
@@ -25,7 +25,7 @@ Supported Profiling Platforms:
 
 Supported GPU Profiling APIS:
 
-* D3D 11/12, OpenGL, CUDA, Metal.
+* D3D 11/12, OpenGL, CUDA, Metal, Vulkan.
 
 Compiling
 ---------
@@ -58,6 +58,7 @@ You can define some extra macros to modify what features are compiled into Remot
     RMT_USE_D3D12       0           Allow D3D12 GPU profiling
     RMT_USE_OPENGL      0           Allow OpenGL GPU profiling (dynamically links OpenGL libraries on available platforms)
     RMT_USE_METAL       0           Allow Metal profiling of command buffers
+    RMT_USE_VULKAN      0           Allow Vulkan GPU profiling
 
 
 Basic Use

From bf90e0db05ce639baabba7a63b72dbdc4d9cc4e8 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sat, 6 Jan 2024 22:26:14 -0800
Subject: [PATCH 08/27] Fix visualization of uint64 and sint64 properties

---
 vis/Code/Remotery.js | 31 +++++++------------------------
 1 file changed, 7 insertions(+), 24 deletions(-)

diff --git a/vis/Code/Remotery.js b/vis/Code/Remotery.js
index 4b373fe0..c8036454 100644
--- a/vis/Code/Remotery.js
+++ b/vis/Code/Remotery.js
@@ -478,39 +478,21 @@ Remotery = (function()
 
     function UInt64ToFloat32(view, offset)
     {
-        // Unpack as two 32-bit integers so we have a vague attempt at reconstructing the value
-        const a = view.getUint32(offset + 0, true);
-        const b = view.getUint32(offset + 4, true);
-
-        // Can't do bit arithmetic above 32-bits in JS so combine using power-of-two math
-        const v = a + (b * Math.pow(2, 32));
+        // Read as a double to match Buffer_WriteU64
+        const v = view.getFloat64(offset, true);
 
         // TODO(don): Potentially massive data loss!
-        snapshots_view.setFloat32(offset, v);
+        view.setFloat32(offset, v, true);
     }
 
 
     function SInt64ToFloat32(view, offset)
     {
-        // Unpack as two 32-bit integers so we have a vague attempt at reconstructing the value
-        const a = view.getUint32(offset + 0, true);
-        const b = view.getUint32(offset + 4, true);
-
-        // Is this negative?
-        if (b & 0x80000000)
-        {
-            // Can only convert from twos-complement with 32-bit arithmetic so shave off the upper 32-bits
-            // TODO(don): Crazy data loss here
-            const v = -(~(a - 1));
-        }
-        else
-        {
-            // Can't do bit arithmetic above 32-bits in JS so combine using power-of-two math
-            const v = a + (b * Math.pow(2, 32));
-        }
+        // Read as a double to match Buffer_WriteU64
+        const v = view.getFloat64(offset, true);
 
         // TODO(don): Potentially massive data loss!
-        snapshots_view.setFloat32(offset, v);
+        view.setFloat32(offset, v, true);
     }
 
 
@@ -591,6 +573,7 @@ Remotery = (function()
                 case 5:
                     SInt64ToFloat32(snapshots_view, offset + 16);
                     SInt64ToFloat32(snapshots_view, offset + 24);
+                    break;
                 case 6:
                     UInt64ToFloat32(snapshots_view, offset + 16);
                     UInt64ToFloat32(snapshots_view, offset + 24);

From fce6e3b8be8fe098ecd57a22c08d78d6888b11bf Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sat, 6 Jan 2024 22:26:40 -0800
Subject: [PATCH 09/27] WIP: Vulkan spec states that you can't call
 vkGetQueryPoolResults with a dataSize of 0

---
 lib/Remotery.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index d1ad4561..55972337 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -10153,6 +10153,9 @@ static rmtError CopyTimestamps(VulkanBindImpl* bind, VkDevice vulkan_device, rmt
     rmtU32 query_count = ring_pos_b - ring_pos_a;
     rmtU64 query_size = query_count * sizeof(rmtU64);
 
+    if (query_count == 0)
+        return RMT_ERROR_NONE;
+
     bind->vkGetQueryPoolResults(vulkan_device, bind->gpuTimestampRingBuffer, ring_pos_a, query_count, query_size, cpu_timestamps + ring_pos_a,
                               sizeof(rmtU64), VK_QUERY_RESULT_64_BIT);
 

From 533f7cfa5e4290bea185406634794ebe81204bad Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sun, 7 Jan 2024 15:49:28 -0800
Subject: [PATCH 10/27] WIP: First-pass Vulkan profiling implementation
 complete

---
 lib/Remotery.c | 133 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 97 insertions(+), 36 deletions(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index 55972337..267eff21 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -754,6 +754,19 @@ static rmtU32 AtomicAddU32(rmtAtomicU32* value, rmtU32 add)
 #endif
 }
 
+static rmtU64 AtomicAddU64(rmtAtomicU64* value, rmtU64 add) // Atomically adds `add` to *value and returns the value from before the addition
+{
+#if defined(RMT_USE_C11_ATOMICS)
+    return atomic_fetch_add(value, add);
+#elif defined(RMT_USE_CPP_ATOMICS)
+    return value->fetch_add(add);
+#elif defined(RMT_PLATFORM_WINDOWS) && !defined(__MINGW32__)
+    return (rmtU64)_InterlockedExchangeAdd64((long long volatile*)value, (long long)add);
+#elif defined(RMT_PLATFORM_POSIX) || defined(__MINGW32__)
+    return (rmtU64)__sync_fetch_and_add(value, add);
+#endif
+}
+
 static void AtomicSubS32(rmtAtomicS32* value, rmtS32 sub)
 {
     // Not all platforms have an implementation so just negate and add
@@ -773,6 +786,19 @@ static rmtU32 AtomicStoreU32(rmtAtomicU32* value, rmtU32 set)
 #endif
 }
 
+static rmtU64 AtomicStoreU64(rmtAtomicU64* value, rmtU64 set) // Atomic exchange: writes `set` to *value and returns the previous value
+{
+#if defined(RMT_USE_C11_ATOMICS)
+    return atomic_exchange(value, set);
+#elif defined(RMT_USE_CPP_ATOMICS)
+    return value->exchange(set);
+#elif defined(RMT_PLATFORM_WINDOWS) && !defined(__MINGW32__)
+    return (rmtU64)_InterlockedExchange64((long long volatile*)value, (long long)set);
+#elif defined(RMT_PLATFORM_POSIX) || defined(__MINGW32__)
+    return (rmtU64)__sync_lock_test_and_set(value, set); // NOTE: GCC documents this builtin as an acquire barrier, not a full barrier
+#endif
+}
+
 static rmtU32 AtomicLoadU32(rmtAtomicU32* value)
 {
 #if defined(RMT_USE_C11_ATOMICS)
@@ -786,6 +812,19 @@ static rmtU32 AtomicLoadU32(rmtAtomicU32* value)
 #endif
 }
 
+static rmtU64 AtomicLoadU64(rmtAtomicU64* value) // Atomically reads *value
+{
+#if defined(RMT_USE_C11_ATOMICS)
+    return atomic_load(value);
+#elif defined(RMT_USE_CPP_ATOMICS)
+    return value->load();
+#elif defined(RMT_PLATFORM_WINDOWS) && !defined(__MINGW32__)
+    return (rmtU64)_InterlockedExchangeAdd64((long long volatile*)value, (long long)0); // read implemented as an atomic add of zero
+#elif defined(RMT_PLATFORM_POSIX) || defined(__MINGW32__)
+    return (rmtU64)__sync_fetch_and_add(value, 0); // read implemented as an atomic add of zero
+#endif
+}
+
 static void CompilerWriteFence()
 {
 #if defined(__clang__)
@@ -815,6 +854,13 @@ static rmtU32 LoadAcquire(rmtAtomicU32* address)
     return value;
 }
 
+static rmtU64 LoadAcquire64(rmtAtomicU64* address) // 64-bit counterpart of LoadAcquire
+{
+    rmtU64 value = *address; // NOTE(review): ordinary read - confirm rmtAtomicU64 cannot tear on RMT_ARCH_32BIT targets
+    CompilerReadFence(); // fence is issued after the read and before the value is returned
+    return value;
+}
+
 static long* LoadAcquirePointer(long* volatile* ptr)
 {
     long* value = *ptr;
@@ -828,6 +874,12 @@ static void StoreRelease(rmtAtomicU32* address, rmtU32 value)
     *address = value;
 }
 
+static void StoreRelease64(rmtAtomicU64* address, rmtU64 value) // 64-bit counterpart of StoreRelease
+{
+    CompilerWriteFence(); // fence is issued before the new value is written
+    *address = value; // NOTE(review): ordinary write - confirm rmtAtomicU64 cannot tear on RMT_ARCH_32BIT targets
+}
+
 static void StoreReleasePointer(long* volatile* ptr, long* value)
 {
     CompilerWriteFence();
@@ -10040,9 +10092,8 @@ typedef struct VulkanBindImpl
 
     // Read/write positions of the ring buffer allocator, synchronising access to all the ring buffers at once
     // TODO(don): Separate by cache line?
-    // TODO(valakor): These should REALLY be 64-bit
-    rmtAtomicU32 ringBufferRead;
-    rmtAtomicU32 ringBufferWrite;
+    rmtAtomicU64 ringBufferRead;
+    rmtAtomicU64 ringBufferWrite;
 
     VkSemaphore gpuQuerySemaphore;
 
@@ -10197,7 +10248,7 @@ static rmtError UpdateGpuTicksToUs(VulkanBindImpl* bind, VkPhysicalDevice vulkan
 static rmtError GetTimestampCalibration(VulkanBindImpl* bind, VkPhysicalDevice vulkan_physical_device, VkDevice vulkan_device, double* gpu_ticks_to_us, rmtS64* gpu_to_cpu_timestamp_us)
 {
     // TODO(valakor): Honor RMT_GPU_CPU_SYNC_SECONDS? It's unclear to me how expensive vkGetCalibratedTimestampsEXT is
-    //  on all supported platforms.
+    //  on all supported platforms, but at least on Windows on my machine it was on the order of 100-150us.
 
     rmtU64 gpu_timestamp_ticks;
     rmtU64 cpu_timestamp_ticks;
@@ -10221,9 +10272,9 @@ static rmtError GetTimestampCalibration(VulkanBindImpl* bind, VkPhysicalDevice v
     timestamp_infos[1].timeDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT;
 #elif defined(RMT_PLATFORM_MACOS)
     // On Apple platforms MoltenVK reports support for VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT even though under the hood
-    //  it uses mach_absolute_time(), which is actually CLOCK_BOOT. This doesn't matter though as Remotery also uses
+    //  it uses mach_absolute_time(), which is actually CLOCK_UPTIME_RAW. This doesn't matter though as Remotery also uses
     //  mach_absolute_time() for time measurements so the results are comparable. For more information see:
-    //  <INSERT LINK HERE>
+    //  https://github.com/KhronosGroup/MoltenVK/blob/main/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
     timestamp_count = 2;
     timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
     timestamp_infos[1].timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT;
@@ -10274,41 +10325,48 @@ static rmtError VulkanMarkFrame(VulkanBindImpl* bind)
     VkDevice vulkan_device = (VkDevice)bind->base.device;
     VkQueue vulkan_queue = (VkQueue)bind->base.queue;
 
-    rmtU32 index_mask = bind->maxNbQueries - 1;
-    rmtU64 current_read_cpu = LoadAcquire(&bind->ringBufferRead);
-    rmtU64 current_write_cpu = LoadAcquire(&bind->ringBufferWrite);
-
-    // Tell the GPU where the CPU write position is
-    VkTimelineSemaphoreSubmitInfoKHR semaphore_submit_info;
-    ZeroMemory(&semaphore_submit_info, sizeof(semaphore_submit_info));
-    semaphore_submit_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR;
-    semaphore_submit_info.signalSemaphoreValueCount = 1;
-    semaphore_submit_info.pSignalSemaphoreValues = &current_write_cpu;
-
-    VkSubmitInfo submit_info;
-    ZeroMemory(&submit_info, sizeof(submit_info));
-    submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
-    submit_info.pNext = &semaphore_submit_info;
-    submit_info.signalSemaphoreCount = 1;
-    submit_info.pSignalSemaphores = &bind->gpuQuerySemaphore;
-    bind->vkQueueSubmit(vulkan_queue, 1, &submit_info, NULL);
+    rmtU64 index_mask = (rmtU64)bind->maxNbQueries - 1;
+    rmtU64 current_read_cpu = LoadAcquire64(&bind->ringBufferRead);
+    rmtU64 current_write_cpu = LoadAcquire64(&bind->ringBufferWrite);
+    rmtU32 current_read_cpu_index = (rmtU32)(current_read_cpu & index_mask);
 
     // Has the GPU processed any writes?
-    rmtU64 current_write_gpu64 = 0;
-    if (bind->vkGetSemaphoreCounterValue(vulkan_device, bind->gpuQuerySemaphore, &current_write_gpu64) != VK_SUCCESS)
+    rmtU64 current_write_gpu = 0;
+    if (bind->vkGetSemaphoreCounterValue(vulkan_device, bind->gpuQuerySemaphore, &current_write_gpu) != VK_SUCCESS)
     {
         return rmtMakeError(RMT_ERROR_RESOURCE_ACCESS_FAIL, "Failed to get Vulkan Semaphore value");
     }
 
-    rmtU32 current_write_gpu = (rmtU32)current_write_gpu64;
+    if (current_write_cpu > current_write_gpu)
+    {
+        // Tell the GPU where the CPU write position is
+        // NOTE(valakor): Vulkan spec states that signalling a timeline semaphore must strictly increase its value
+        VkTimelineSemaphoreSubmitInfoKHR semaphore_submit_info;
+        ZeroMemory(&semaphore_submit_info, sizeof(semaphore_submit_info));
+        semaphore_submit_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR;
+        semaphore_submit_info.signalSemaphoreValueCount = 1;
+        semaphore_submit_info.pSignalSemaphoreValues = &current_write_cpu;
+
+        VkSubmitInfo submit_info;
+        ZeroMemory(&submit_info, sizeof(submit_info));
+        submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+        submit_info.pNext = &semaphore_submit_info;
+        submit_info.signalSemaphoreCount = 1;
+        submit_info.pSignalSemaphores = &bind->gpuQuerySemaphore;
+        if (bind->vkQueueSubmit(vulkan_queue, 1, &submit_info, NULL) != VK_SUCCESS)
+        {
+            return rmtMakeError(RMT_ERROR_RESOURCE_ACCESS_FAIL, "Failed to submit Vulkan Semaphore update to queue");
+        }
+    }
+
     if (current_write_gpu > current_read_cpu)
     {
         double gpu_ticks_to_us;
         rmtS64 gpu_to_cpu_timestamp_us;
 
         // Physical ring buffer positions
-        rmtU32 ring_pos_a = (rmtU32)current_read_cpu & index_mask;
-        rmtU32 ring_pos_b = (rmtU32)current_write_gpu & index_mask;
+        rmtU32 ring_pos_a = current_read_cpu_index;
+        rmtU32 ring_pos_b = (rmtU32)(current_write_gpu & index_mask);
 
         rmtTry(GetTimestampCalibration(bind, vulkan_physical_device, vulkan_device, &gpu_ticks_to_us, &gpu_to_cpu_timestamp_us));
 
@@ -10325,7 +10383,7 @@ static rmtError VulkanMarkFrame(VulkanBindImpl* bind)
         }
 
         // Release the ring buffer entries just processed
-        StoreRelease(&bind->ringBufferRead, current_write_gpu);
+        StoreRelease64(&bind->ringBufferRead, current_write_gpu);
     }
 
     // Attempt to empty the queue of complete message trees
@@ -10342,7 +10400,9 @@ static rmtError VulkanMarkFrame(VulkanBindImpl* bind)
         assert(root_sample->type == RMT_SampleType_Vulkan);
 
         // If the last-allocated query in this tree has been GPU-processed it's safe to now send the tree to Remotery thread
-        if (current_write_gpu > msg_sample_tree->userData)
+        rmtU32 sample_tree_write_index = msg_sample_tree->userData;
+        rmtU64 sample_tree_write = (rmtU64)(sample_tree_write_index - current_read_cpu_index) + current_read_cpu;
+        if (current_write_gpu > sample_tree_write)
         {
             QueueSampleTree(g_Remotery->mq_to_rmt_thread, root_sample, msg_sample_tree->allocator, msg_sample_tree->threadName,
                                 0, message->threadProfiler, RMT_FALSE);
@@ -10490,19 +10550,20 @@ static rmtError AllocateVulkanSampleTree(SampleTree** vulkan_tree)
     return RMT_ERROR_NONE;
 }
 
-static rmtError AllocQueryPair(VulkanBindImpl* vulkan_bind, rmtAtomicU32* out_allocation_index)
+static rmtError AllocQueryPair(VulkanBindImpl* vulkan_bind, rmtU32* out_allocation_index)
 {
     // Check for overflow against a tail which is only ever written by one thread
-    rmtU32 read = LoadAcquire(&vulkan_bind->ringBufferRead);
-    rmtU32 write = LoadAcquire(&vulkan_bind->ringBufferWrite);
-    rmtU32 nb_queries = (write - read);
+    rmtU64 read = LoadAcquire64(&vulkan_bind->ringBufferRead);
+    rmtU64 write = LoadAcquire64(&vulkan_bind->ringBufferWrite);
+    rmtU32 nb_queries = (rmtU32)(write - read);
     rmtU32 queries_left = vulkan_bind->maxNbQueries - nb_queries;
     if (queries_left < 2)
     {
         return rmtMakeError(RMT_ERROR_RESOURCE_CREATE_FAIL, "Vulkan query ring buffer overflow");
     }
 
-    *out_allocation_index = AtomicAddU32(&vulkan_bind->ringBufferWrite, 2);
+    rmtU64 index_mask = (rmtU64)vulkan_bind->maxNbQueries - 1;
+    *out_allocation_index = (rmtU32)(AtomicAddU64(&vulkan_bind->ringBufferWrite, 2) & index_mask);
     return RMT_ERROR_NONE;
 }
 

From 083be5877980a05d9663528b76235600f96f0783 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sun, 7 Jan 2024 16:01:00 -0800
Subject: [PATCH 11/27] Default RMT_USE_VULKAN back to 0

---
 lib/Remotery.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/Remotery.h b/lib/Remotery.h
index 8337d846..33cb0b0e 100644
--- a/lib/Remotery.h
+++ b/lib/Remotery.h
@@ -85,8 +85,7 @@ documented just below this comment.
 
 // Allow Vulkan profiling
 #ifndef RMT_USE_VULKAN
-// TODO: Set back to 0 when checking in!
-#define RMT_USE_VULKAN 1
+#define RMT_USE_VULKAN 0
 #endif
 
 // Initially use POSIX thread names to name threads instead of Thread0, 1, ...

From bebb5a295cbe389f3083b0acebd23aba82a9392b Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sun, 7 Jan 2024 16:29:54 -0800
Subject: [PATCH 12/27] Update a handful of comments

---
 lib/Remotery.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index 267eff21..b66a1bdf 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -10091,7 +10091,8 @@ typedef struct VulkanBindImpl
     VulkanSample** sampleRingBuffer;
 
     // Read/write positions of the ring buffer allocator, synchronising access to all the ring buffers at once
-    // TODO(don): Separate by cache line?
+    // NOTE(valakor): These are 64-bit instead of 32-bit so that we can reasonably assume they never wrap.
+    // TODO(valakor): Separate by cache line?
     rmtAtomicU64 ringBufferRead;
     rmtAtomicU64 ringBufferWrite;
 
@@ -10248,7 +10249,7 @@ static rmtError UpdateGpuTicksToUs(VulkanBindImpl* bind, VkPhysicalDevice vulkan
 static rmtError GetTimestampCalibration(VulkanBindImpl* bind, VkPhysicalDevice vulkan_physical_device, VkDevice vulkan_device, double* gpu_ticks_to_us, rmtS64* gpu_to_cpu_timestamp_us)
 {
     // TODO(valakor): Honor RMT_GPU_CPU_SYNC_SECONDS? It's unclear to me how expensive vkGetCalibratedTimestampsEXT is
-    //  on all supported platforms, but at least on Windows on my machine it was on the order of 100-150us.
+    //  on all supported platforms, but at least on my Windows/NVIDIA machine it was on the order of 100-150us.
 
     rmtU64 gpu_timestamp_ticks;
     rmtU64 cpu_timestamp_ticks;
@@ -10474,8 +10475,8 @@ RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* de
     rmtTry(LoadVulkanFunctions(bind, vulkan_instance, pfn_vkGetInstanceProcAddr));
 
     // Create the independent ring buffer storage items
-    // TODO(don): Leave space beetween start and end to stop invalidating cache lines?
-    // NOTE(don): ABA impossible due to non-wrapping ring buffer indices
+    // TODO(valakor): Leave space between start and end to stop invalidating cache lines?
+    // NOTE(valakor): ABA impossible due to non-wrapping ring buffer indices
     rmtTry(CreateQueryPool(bind, vulkan_device, bind->maxNbQueries));
     rmtTryMallocArray(VulkanSample*, bind->sampleRingBuffer, bind->maxNbQueries / 2);
     rmtTryMallocArray(rmtU64, bind->cpuTimestampRingBuffer, bind->maxNbQueries);

From 5502716525af5d2b0e7bde6db0c7bf4f740c6e9a Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sun, 7 Jan 2024 22:50:35 -0800
Subject: [PATCH 13/27] Add missing ZeroMemory macro on non-Windows platforms

---
 lib/Remotery.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index b66a1bdf..98c5659c 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -189,6 +189,10 @@ static rmtBool g_SettingsInitialized = RMT_FALSE;
     #include <atomic>
 #endif
 
+#ifndef RMT_PLATFORM_WINDOWS
+    #define ZeroMemory(dest, len) memset(dest, 0, len)
+#endif
+
 // clang-format on
 
 #if defined(_MSC_VER) && !defined(__clang__)

From 90c8e2e05f0c34f87446429b9d02e757f98a587c Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sun, 7 Jan 2024 22:51:35 -0800
Subject: [PATCH 14/27] Support thread naming on MacOS

---
 lib/Remotery.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index 98c5659c..4c06b1a9 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -2246,6 +2246,14 @@ static void rmtGetThreadName(rmtThreadId thread_id, rmtThreadHandle thread_handl
     len = strnlen_s(out_thread_name, thread_name_size);
     itoahex_s(out_thread_name + len, thread_name_size - len, thread_id);
 
+#elif defined(RMT_PLATFORM_MACOS)
+
+    int ret = pthread_getname_np(pthread_self(), out_thread_name, thread_name_size);
+    if (ret != 0 || out_thread_name[0] == '\0')
+    {
+        rmtGetThreadNameFallback(out_thread_name, thread_name_size);
+    }
+
 #elif defined(RMT_PLATFORM_LINUX) && RMT_USE_POSIX_THREADNAMES && !defined(__FreeBSD__) && !defined(__OpenBSD__)
 
     prctl(PR_GET_NAME, out_thread_name, 0, 0, 0);
@@ -7374,6 +7382,8 @@ static void SetDebuggerThreadName(const char* name)
     {
     }
 #endif
+#elif defined(RMT_PLATFORM_MACOS)
+    pthread_setname_np(name);
 #else
     RMT_UNREFERENCED_PARAMETER(name);
 #endif
@@ -7413,7 +7423,7 @@ RMT_API void _rmt_SetCurrentThreadName(rmtPStr thread_name)
     SetDebuggerThreadName(thread_name);
 
     // Send the thread name for lookup
-#ifdef RMT_PLATFORM_WINDOWS
+#if defined(RMT_PLATFORM_WINDOWS) || defined(RMT_PLATFORM_MACOS)
     name_length = strnlen_s(thread_profiler->threadName, 64);
     QueueAddToStringTable(g_Remotery->mq_to_rmt_thread, thread_profiler->threadNameHash, thread_name, name_length, NULL);
 #endif

From d41b2bfab6aa8747aea7192656db90d9f2fe426f Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sun, 7 Jan 2024 23:03:27 -0800
Subject: [PATCH 15/27] Vulkan on MacOS (via MoltenVK) only allows query pools
 of up to size 32k bytes, or 4k queries

---
 lib/Remotery.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index 4c06b1a9..f4edf946 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -10463,7 +10463,11 @@ RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* de
     bind->base.physical_device = physical_device;
     bind->base.device = device;
     bind->base.queue = queue;
-    bind->maxNbQueries = 32 * 1024;
+#ifdef RMT_PLATFORM_MACOS
+    bind->maxNbQueries = 4 * 1024;
+#else
+	bind->maxNbQueries = 32 * 1024;
+#endif
     bind->gpuTimestampRingBuffer = NULL;
     bind->cpuTimestampRingBuffer = NULL;
     bind->sampleRingBuffer = NULL;

From 6e3cbbf8edd85e2b378fef757ec918fd65fa4225 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sun, 7 Jan 2024 23:06:50 -0800
Subject: [PATCH 16/27] Add missing comment about query pool size limits on
 MacOS

---
 lib/Remotery.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index f4edf946..816f3dd3 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -10464,6 +10464,8 @@ RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* de
     bind->base.device = device;
     bind->base.queue = queue;
 #ifdef RMT_PLATFORM_MACOS
+	// NOTE(valakor): Vulkan on MacOS via MoltenVK only supports timestamp query pools of up to 4k 64-bit queries. See
+	//  https://github.com/KhronosGroup/MoltenVK/blob/main/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.mm
     bind->maxNbQueries = 4 * 1024;
 #else
 	bind->maxNbQueries = 32 * 1024;

From af38067c6a4fbea9e82b947313c11837739d824c Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sun, 7 Jan 2024 23:34:31 -0800
Subject: [PATCH 17/27] Fix timestamp calibration on MacOS. Turns out MoltenVK
 actually returns a timestamp in the mach_continuous_time() time domain
 instead of mach_absolute_time(), which I think is a bug

---
 lib/Remotery.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index 816f3dd3..bd68ae0d 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -10285,7 +10285,9 @@ static rmtError GetTimestampCalibration(VulkanBindImpl* bind, VkPhysicalDevice v
     timestamp_count = 2;
     timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
     timestamp_infos[1].timeDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT;
-#elif defined(RMT_PLATFORM_MACOS)
+#elif 0 // defined(RMT_PLATFORM_MACOS)
+	// TODO(valakor) The comment below would be correct if not for a bug in MoltenVK that returns the wrong timestamp
+	//  value for VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT. For now we'll just sample the CPU timestamp manually.
     // On Apple platforms MoltenVK reports support for VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT even though under the hood
     //  it uses mach_absolute_time(), which is actually CLOCK_UPTIME_RAW. This doesn't matter though as Remotery also uses
     //  mach_absolute_time() for time measurements so the results are comparable. For more information see:
@@ -10309,7 +10311,7 @@ static rmtError GetTimestampCalibration(VulkanBindImpl* bind, VkPhysicalDevice v
     }
 
     // Convert CPU ticks to microseconds, offset from the global timer start
-#if defined(RMT_PLATFORM_WINDOWS) || defined(RMT_PLATFORM_MACOS)
+#if defined(RMT_PLATFORM_WINDOWS) // || defined(RMT_PLATFORM_MACOS)
     cpu_timestamp_ticks = timestamps[1];
     cpu_timestamp_us = usTimer_FromRawTicks(&g_Remotery->timer, cpu_timestamp_ticks);
 #else

From 6f957e8453dc327a7d52a5b01b76f9348a53fa6c Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Mon, 8 Jan 2024 12:10:17 -0800
Subject: [PATCH 18/27] Update some comments in GetTimestampCalibration to
 reflect correct information w.r.t. MoltenVK/Vulkan. I'm close to convincing
 myself that I should drop querying the CPU timestamp entirely from Vulkan and
 just query it myself (like the Linux fallback does).

---
 lib/Remotery.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index bd68ae0d..cce16c99 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -10280,18 +10280,26 @@ static rmtError GetTimestampCalibration(VulkanBindImpl* bind, VkPhysicalDevice v
     timestamp_infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
     timestamp_infos[0].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT;
 
+	// TODO(valakor): Reconsider whether we bother asking Vulkan to give us a CPU timestamp at all. It'd be much
+	//  simpler to just query the device timestamp (supported by all platforms) and manually query our timer instead
+	//  of all this platform-specific code. All we need is something "close enough".
+
     // Potentially also query a cpu timestamp if supported
 #if defined(RMT_PLATFORM_WINDOWS)
     timestamp_count = 2;
     timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
     timestamp_infos[1].timeDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT;
 #elif 0 // defined(RMT_PLATFORM_MACOS)
-	// TODO(valakor) The comment below would be correct if not for a bug in MoltenVK that returns the wrong timestamp
-	//  value for VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT. For now we'll just sample the CPU timestamp manually.
-    // On Apple platforms MoltenVK reports support for VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT even though under the hood
-    //  it uses mach_absolute_time(), which is actually CLOCK_UPTIME_RAW. This doesn't matter though as Remotery also uses
-    //  mach_absolute_time() for time measurements so the results are comparable. For more information see:
-    //  https://github.com/KhronosGroup/MoltenVK/blob/main/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+	// TODO(valakor): We have to fall back to manually querying CPU time due to the following issue:
+    //  On Apple platforms MoltenVK reports support for VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT, which matches the time
+	//  domain of mach_continuous_time(). To support mach_absolute_time() Vulkan would have to extend the available
+	//  time domains to include something like "VK_TIME_DOMAIN_CLOCK_UPTIME_RAW_EXT". See the comments here:
+	//  https://github.com/KhronosGroup/MoltenVK/blob/main/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+	//
+	//  Alternatively, Remotery could switch to using mach_continuous_time(). The difference between the two is that
+    //  mach_continuous_time() (CLOCK_MONOTONIC_RAW) includes system sleep time, whereas mach_absolute_time()
+	//  (CLOCK_UPTIME_RAW) does not. I'm not 100% convinced that's what we would want, but I think it is technically
+	//  more secure.
     timestamp_count = 2;
     timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
     timestamp_infos[1].timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT;

From 8ceedd36c67cd5b83b9b0d821b56b72a1157b2f8 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Tue, 9 Jan 2024 22:03:23 -0800
Subject: [PATCH 19/27] Remove ZeroMemory macro hack, just use memset

---
 lib/Remotery.c | 46 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index cce16c99..72bce748 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -189,10 +189,6 @@ static rmtBool g_SettingsInitialized = RMT_FALSE;
     #include <atomic>
 #endif
 
-#ifndef RMT_PLATFORM_WINDOWS
-    #define ZeroMemory(dest, len) memset(dest, 0, len)
-#endif
-
 // clang-format on
 
 #if defined(_MSC_VER) && !defined(__clang__)
@@ -10174,7 +10170,7 @@ static rmtError LoadVulkanFunctions(VulkanBindImpl* bind, VkInstance vulkan_inst
 static rmtError CreateQueryPool(VulkanBindImpl* bind, VkDevice vulkan_device, rmtU32 nb_queries)
 {
     VkQueryPoolCreateInfo create_info;
-    ZeroMemory(&create_info, sizeof(create_info));
+    memset(&create_info, 0, sizeof(create_info));
     create_info.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
     create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
     create_info.queryCount = nb_queries;
@@ -10192,13 +10188,13 @@ static rmtError CreateQueryPool(VulkanBindImpl* bind, VkDevice vulkan_device, rm
 static rmtError CreateQuerySemaphore(VulkanBindImpl* bind, VkDevice vulkan_device)
 {
     VkSemaphoreTypeCreateInfoKHR type_info;
-    ZeroMemory(&type_info, sizeof(type_info));
+    memset(&type_info, 0, sizeof(type_info));
     type_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR;
     type_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR;
     type_info.initialValue = 0;
 
     VkSemaphoreCreateInfo create_info;
-    ZeroMemory(&create_info, sizeof(create_info));
+    memset(&create_info, 0, sizeof(create_info));
     create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
     create_info.pNext = &type_info;
 
@@ -10251,7 +10247,7 @@ static rmtError UpdateGpuTicksToUs(VulkanBindImpl* bind, VkPhysicalDevice vulkan
     //  https://github.com/KhronosGroup/MoltenVK/blob/main/Docs/MoltenVK_Runtime_UserGuide.md
 
     VkPhysicalDeviceProperties device_properties;
-    ZeroMemory(&device_properties, sizeof(device_properties));
+    memset(&device_properties, 0, sizeof(device_properties));
     bind->vkGetPhysicalDeviceProperties(vulkan_physical_device, &device_properties);
 
     float gpu_ns_per_tick = device_properties.limits.timestampPeriod;
@@ -10276,13 +10272,13 @@ static rmtError GetTimestampCalibration(VulkanBindImpl* bind, VkPhysicalDevice v
     rmtU64 max_deviation;
     rmtU64 timestamps[2];
     VkCalibratedTimestampInfoEXT timestamp_infos[2];
-    ZeroMemory(&timestamp_infos, sizeof(timestamp_infos));
+    memset(timestamp_infos, 0, sizeof(timestamp_infos));
     timestamp_infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
     timestamp_infos[0].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT;
 
-	// TODO(valakor): Reconsider whether we bother asking Vulkan to give us a CPU timestamp at all. It'd be much
-	//  simpler to just query the device timestamp (supported by all platforms) and manually query our timer instead
-	//  of all this platform-specific code. All we need is something "close enough".
+    // TODO(valakor): Reconsider whether we bother asking Vulkan to give us a CPU timestamp at all. It'd be much
+    //  simpler to just query the device timestamp (supported by all platforms) and manually query our timer instead
+    //  of all this platform-specific code. All we need is something "close enough".
 
     // Potentially also query a cpu timestamp if supported
 #if defined(RMT_PLATFORM_WINDOWS)
@@ -10290,16 +10286,16 @@ static rmtError GetTimestampCalibration(VulkanBindImpl* bind, VkPhysicalDevice v
     timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
     timestamp_infos[1].timeDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT;
 #elif 0 // defined(RMT_PLATFORM_MACOS)
-	// TODO(valakor): We have to fall back to manually querying CPU time due to the following issue:
+    // TODO(valakor): We have to fall back to manually querying CPU time due to the following issue:
     //  On Apple platforms MoltenVK reports support for VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT, which matches the time
-	//  domain of mach_continuous_time(). To support mach_absolute_time() Vulkan would have to extend the available
-	//  time domains to include something like "VK_TIME_DOMAIN_CLOCK_UPTIME_RAW_EXT". See the comments here:
-	//  https://github.com/KhronosGroup/MoltenVK/blob/main/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
-	//
-	//  Alternatively, Remotery could switch to using mach_continuous_time(). The difference between the two is that
+    //  domain of mach_continuous_time(). To support mach_absolute_time() Vulkan would have to extend the available
+    //  time domains to include something like "VK_TIME_DOMAIN_CLOCK_UPTIME_RAW_EXT". See the comments here:
+    //  https://github.com/KhronosGroup/MoltenVK/blob/main/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+    //
+    //  Alternatively, Remotery could switch to using mach_continuous_time(). The difference between the two is that
     //  mach_continuous_time() (CLOCK_MONOTONIC_RAW) includes system sleep time, whereas mach_absolute_time()
-	//  (CLOCK_UPTIME_RAW) does not. I'm not 100% convinced that's what we would want, but I think it is technically
-	//  more secure.
+    //  (CLOCK_UPTIME_RAW) does not. I'm not 100% convinced that's what we would want, but I think it is technically
+    //  more secure.
     timestamp_count = 2;
     timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
     timestamp_infos[1].timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT;
@@ -10367,13 +10363,13 @@ static rmtError VulkanMarkFrame(VulkanBindImpl* bind)
         // Tell the GPU where the CPU write position is
         // NOTE(valakor): Vulkan spec states that signalling a timeline semaphore must strictly increase its value
         VkTimelineSemaphoreSubmitInfoKHR semaphore_submit_info;
-        ZeroMemory(&semaphore_submit_info, sizeof(semaphore_submit_info));
+        memset(&semaphore_submit_info, 0, sizeof(semaphore_submit_info));
         semaphore_submit_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR;
         semaphore_submit_info.signalSemaphoreValueCount = 1;
         semaphore_submit_info.pSignalSemaphoreValues = &current_write_cpu;
 
         VkSubmitInfo submit_info;
-        ZeroMemory(&submit_info, sizeof(submit_info));
+        memset(&submit_info, 0, sizeof(submit_info));
         submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
         submit_info.pNext = &semaphore_submit_info;
         submit_info.signalSemaphoreCount = 1;
@@ -10474,11 +10470,11 @@ RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* de
     bind->base.device = device;
     bind->base.queue = queue;
 #ifdef RMT_PLATFORM_MACOS
-	// NOTE(valakor): Vulkan on MacOS via MoltenVK only supports timestamp query pools of up to 4k 64-bit queries. See
-	//  https://github.com/KhronosGroup/MoltenVK/blob/main/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.mm
+    // NOTE(valakor): Vulkan on MacOS via MoltenVK only supports timestamp query pools of up to 4k 64-bit queries. See
+    //  https://github.com/KhronosGroup/MoltenVK/blob/main/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.mm
     bind->maxNbQueries = 4 * 1024;
 #else
-	bind->maxNbQueries = 32 * 1024;
+    bind->maxNbQueries = 32 * 1024;
 #endif
     bind->gpuTimestampRingBuffer = NULL;
     bind->cpuTimestampRingBuffer = NULL;

From f38c000cac3e808b3d4f9907f91830fcfc817175 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Tue, 9 Jan 2024 22:05:57 -0800
Subject: [PATCH 20/27] Clarify that the function pointer passed to
 rmt_BindVulkan is the vkGetInstanceProcAddr, not vkGetDeviceProcAddr

---
 lib/Remotery.c | 6 +++---
 lib/Remotery.h | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index 72bce748..34116eaa 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -10441,14 +10441,14 @@ static rmtError VulkanMarkFrame(VulkanBindImpl* bind)
     return RMT_ERROR_NONE;
 }
 
-RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, VulkanGetInstanceProcAddr get_proc_addr, rmtVulkanBind** out_bind)
+RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, VulkanGetInstanceProcAddr get_instance_proc_addr, rmtVulkanBind** out_bind)
 {
     VulkanBindImpl* bind;
     VkInstance vulkan_instance = (VkInstance)instance;
     VkPhysicalDevice vulkan_physical_device = (VkPhysicalDevice)physical_device;
     VkDevice vulkan_device = (VkDevice)device;
     VkQueue vulkan_queue = (VkQueue)queue;
-    PFN_vkGetInstanceProcAddr pfn_vkGetInstanceProcAddr = (PFN_vkGetInstanceProcAddr)get_proc_addr;
+    PFN_vkGetInstanceProcAddr pfn_vkGetInstanceProcAddr = (PFN_vkGetInstanceProcAddr)get_instance_proc_addr;
 
     if (g_Remotery == NULL)
     {
@@ -10460,7 +10460,7 @@ RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* de
     assert(device != NULL);
     assert(queue != NULL);
     assert(out_bind != NULL);
-    assert(get_proc_addr != NULL);
+    assert(get_instance_proc_addr != NULL);
 
     // Allocate the bind container
     rmtTryMalloc(VulkanBindImpl, bind);
diff --git a/lib/Remotery.h b/lib/Remotery.h
index 3a126690..2713dfe3 100644
--- a/lib/Remotery.h
+++ b/lib/Remotery.h
@@ -637,8 +637,8 @@ typedef struct rmtVulkanBind
 } rmtVulkanBind;
 
 // Create a Vulkan binding for the given device/queue pair
-#define rmt_BindVulkan(instance, physical_device, device, queue, get_proc_addr, out_bind) \
-    RMT_OPTIONAL_RET(RMT_USE_VULKAN, _rmt_BindVulkan(instance, physical_device, device, queue, get_proc_addr, out_bind), NULL)
+#define rmt_BindVulkan(instance, physical_device, device, queue, get_instance_proc_addr, out_bind) \
+    RMT_OPTIONAL_RET(RMT_USE_VULKAN, _rmt_BindVulkan(instance, physical_device, device, queue, get_instance_proc_addr, out_bind), NULL)
 
 #define rmt_UnbindVulkan(bind)                                              \
     RMT_OPTIONAL(RMT_USE_VULKAN, _rmt_UnbindVulkan(bind))
@@ -1136,7 +1136,7 @@ RMT_API void _rmt_EndMetalSample(void);
 
 #if RMT_USE_VULKAN
 typedef void*(*VulkanGetInstanceProcAddr)(void*, const char*);
-RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, VulkanGetInstanceProcAddr get_proc_addr, rmtVulkanBind** out_bind);
+RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, VulkanGetInstanceProcAddr get_instance_proc_addr, rmtVulkanBind** out_bind);
 RMT_API void _rmt_UnbindVulkan(rmtVulkanBind* bind);
 RMT_API void _rmt_BeginVulkanSample(rmtVulkanBind* bind, void* command_buffer, rmtPStr name, rmtU32* hash_cache);
 RMT_API void _rmt_EndVulkanSample();

From 0c96a6fcd7981a736e99d4c3c2a60ccf3994d306 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Tue, 9 Jan 2024 22:24:48 -0800
Subject: [PATCH 21/27] Rename VulkanGetInstanceProcAddr to
 rmtVulkanGetInstanceProcAddr

---
 lib/Remotery.c | 2 +-
 lib/Remotery.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index 34116eaa..e64773ec 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -10441,7 +10441,7 @@ static rmtError VulkanMarkFrame(VulkanBindImpl* bind)
     return RMT_ERROR_NONE;
 }
 
-RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, VulkanGetInstanceProcAddr get_instance_proc_addr, rmtVulkanBind** out_bind)
+RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, rmtVulkanGetInstanceProcAddr get_instance_proc_addr, rmtVulkanBind** out_bind)
 {
     VulkanBindImpl* bind;
     VkInstance vulkan_instance = (VkInstance)instance;
diff --git a/lib/Remotery.h b/lib/Remotery.h
index 2713dfe3..e6d52a8e 100644
--- a/lib/Remotery.h
+++ b/lib/Remotery.h
@@ -1135,8 +1135,8 @@ RMT_API void _rmt_EndMetalSample(void);
 #endif
 
 #if RMT_USE_VULKAN
-typedef void*(*VulkanGetInstanceProcAddr)(void*, const char*);
-RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, VulkanGetInstanceProcAddr get_instance_proc_addr, rmtVulkanBind** out_bind);
+typedef void*(*rmtVulkanGetInstanceProcAddr)(void*, const char*);
+RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, rmtVulkanGetInstanceProcAddr get_instance_proc_addr, rmtVulkanBind** out_bind);
 RMT_API void _rmt_UnbindVulkan(rmtVulkanBind* bind);
 RMT_API void _rmt_BeginVulkanSample(rmtVulkanBind* bind, void* command_buffer, rmtPStr name, rmtU32* hash_cache);
 RMT_API void _rmt_EndVulkanSample();

From 53897b475d00d60c5b57d173a13508350943b5c4 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Tue, 9 Jan 2024 22:28:24 -0800
Subject: [PATCH 22/27] Update readme.md

---
 readme.md | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/readme.md b/readme.md
index 613bf817..f9091540 100644
--- a/readme.md
+++ b/readme.md
@@ -213,6 +213,50 @@ The C API supports begin/end also:
     rmt_EndMetalSample();
 
 
+Sampling Vulkan GPU activity
+---------------------------
+
+Remotery can sample Vulkan command buffers issued to the GPU on multiple queues from multiple threads. Command buffers
+must be submitted to the same queue as the samples are issued to. Multiple queues can be profiled by creating multiple
+Vulkan bind objects.
+
+    // Parameters are VkInstance, VkPhysicalDevice, VkDevice, VkQueue, vkGetInstanceProcAddr, rmtVulkanBind**
+    // NOTE: The get_instance_proc_addr parameter doesn't match vkGetInstanceProcAddr exactly in order to avoid
+    // including Vulkan.h in Remotery.h, so the actual function pointer must be cast when passed to rmt_BindVulkan.
+    rmtVulkanBind* vulkan_bind = NULL;
+    rmt_BindVulkan(instance, physical_device, device, queue, (rmtVulkanGetInstanceProcAddr)get_instance_proc_addr, &vulkan_bind);
+
+Sampling is then a simple case of:
+
+    // Explicit begin/end for C
+    {
+        rmt_BeginVulkanSample(vulkan_bind, command_buffer, UnscopedSample);
+        // ... Vulkan code ...
+        rmt_EndVulkanSample();
+    }
+
+    // Scoped begin/end for C++
+    {
+        rmt_ScopedVulkanSample(vulkan_bind, command_buffer, ScopedSample);
+        // ... Vulkan code ...
+    }
+
+NOTE: Vulkan sampling on Apple platforms via MoltenVK must be done with caution. Metal doesn't natively support timestamps
+inside of render or compute passes, so MoltenVK simply reports all timestamps inside those scopes as the begin/end time of
+the entire render pass!
+
+Subsequent sampling calls from the same thread will use that device/queue combination. Once per frame you must call `rmt_MarkFrame()`
+to gather GPU timestamps on the CPU.
+
+    // End of frame, possibly after calling vkPresentKHR or at the very beginning of the frame
+    rmt_MarkFrame();
+
+When you destroy your Vulkan device and queue you can manually clean up resources by calling `rmt_UnbindVulkan`, though this is
+done automatically by `rmt_DestroyGlobalInstance` as well for all rmt_BindVulkan objects:
+
+    rmt_UnbindVulkan(vulkan_bind);
+
+
 Applying Configuration Settings
 -------------------------------
 

From b0bf7c51784790c75f566ae3f488d79c23e5eef5 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Tue, 9 Jan 2024 22:51:56 -0800
Subject: [PATCH 23/27] Ensure we delete VulkanBindImpl::mqToVulkanUpdate

---
 lib/Remotery.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index e64773ec..46c27a58 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -10463,6 +10463,7 @@ RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* de
     assert(get_instance_proc_addr != NULL);
 
     // Allocate the bind container
+    // TODO(valakor): If anything after this fails we'll leak this bind instance
     rmtTryMalloc(VulkanBindImpl, bind);
 
     // Set default state
@@ -10554,7 +10555,9 @@ RMT_API void _rmt_UnbindVulkan(rmtVulkanBind* bind)
         mtxUnlock(&g_Remotery->vulkanBindsMutex);
     }
 
-    // TODO: Clean up resources
+    // Clean up bind resources
+
+    rmtDelete(rmtMessageQueue, vulkan_bind->mqToVulkanUpdate);
 
     if (vulkan_bind->gpuQuerySemaphore != NULL)
     {

From 70a13a9ea5a36753f74df8e68383289d444208f3 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Tue, 9 Jan 2024 23:00:11 -0800
Subject: [PATCH 24/27] Cleaner shutdown by automatically consuming all pending
 GPU samples. Means the user doesn't have to do this themselves

---
 lib/Remotery.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index 46c27a58..690982d6 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -7153,13 +7153,6 @@ static void Remotery_Destructor(Remotery* rmt)
 {
     assert(rmt != NULL);
 
-    // Join the remotery thread before clearing the global object as the thread is profiling itself
-    rmtDelete(rmtThread, rmt->thread);
-
-    rmtDelete(ThreadProfilers, rmt->threadProfilers);
-
-    rmtDelete(ObjectAllocator, rmt->propertyAllocator);
-
 #if RMT_USE_VULKAN
     while (rmt->vulkanBinds != NULL)
     {
@@ -7168,6 +7161,13 @@ static void Remotery_Destructor(Remotery* rmt)
     mtxDelete(&rmt->vulkanBindsMutex);
 #endif
 
+    // Join the remotery thread before clearing the global object as the thread is profiling itself
+    rmtDelete(rmtThread, rmt->thread);
+
+    rmtDelete(ThreadProfilers, rmt->threadProfilers);
+
+    rmtDelete(ObjectAllocator, rmt->propertyAllocator);
+
 #if RMT_USE_D3D12
     while (rmt->d3d12Binds != NULL)
     {
@@ -7563,7 +7563,7 @@ static rmtError D3D12MarkFrame(struct D3D12BindImpl* bind);
 #endif
 
 #if RMT_USE_VULKAN
-static rmtError VulkanMarkFrame(struct VulkanBindImpl* bind);
+static rmtError VulkanMarkFrame(struct VulkanBindImpl* bind, rmtBool recurse);
 #endif
 
 RMT_API rmtError _rmt_MarkFrame(void)
@@ -7580,7 +7580,7 @@ RMT_API rmtError _rmt_MarkFrame(void)
 
     #if RMT_USE_VULKAN
         // This will kick off mark frames on the complete chain of binds
-        rmtTry(VulkanMarkFrame(g_Remotery->vulkanBinds));
+        rmtTry(VulkanMarkFrame(g_Remotery->vulkanBinds, RMT_TRUE));
     #endif
 
     return RMT_ERROR_NONE;
@@ -10113,6 +10113,7 @@ typedef struct VulkanBindImpl
 
     // Function pointers to Vulkan functions
     PFN_vkQueueSubmit vkQueueSubmit;
+    PFN_vkQueueWaitIdle vkQueueWaitIdle;
     PFN_vkGetPhysicalDeviceProperties vkGetPhysicalDeviceProperties;
     PFN_vkCreateQueryPool vkCreateQueryPool;
     PFN_vkDestroyQueryPool vkDestroyQueryPool;
@@ -10149,6 +10150,7 @@ static rmtError LoadVulkanFunctions(VulkanBindImpl* bind, VkInstance vulkan_inst
     }
 
     VK_DEVICE_FN(vkQueueSubmit);
+    VK_DEVICE_FN(vkQueueWaitIdle);
     VK_DEVICE_FN(vkGetPhysicalDeviceProperties);
     VK_DEVICE_FN(vkCreateQueryPool);
     VK_DEVICE_FN(vkDestroyQueryPool);
@@ -10335,7 +10337,7 @@ static rmtError GetTimestampCalibration(VulkanBindImpl* bind, VkPhysicalDevice v
     return RMT_ERROR_NONE;
 }
 
-static rmtError VulkanMarkFrame(VulkanBindImpl* bind)
+static rmtError VulkanMarkFrame(VulkanBindImpl* bind, rmtBool recurse)
 {
     if (bind == NULL)
     {
@@ -10436,7 +10438,10 @@ static rmtError VulkanMarkFrame(VulkanBindImpl* bind)
     }
 
     // Chain to the next bind here so that root calling code doesn't need to know the definition of VulkanBindImpl
-    rmtTry(VulkanMarkFrame(bind->next));
+    if (recurse)
+    {
+        rmtTry(VulkanMarkFrame(bind->next, recurse));
+    }
 
     return RMT_ERROR_NONE;
 }
@@ -10485,6 +10490,7 @@ RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* de
     bind->gpuQuerySemaphore = NULL;
     bind->gpu_ticks_to_us = 1.0;
     bind->vkQueueSubmit = NULL;
+    bind->vkQueueWaitIdle = NULL;
     bind->vkGetPhysicalDeviceProperties = NULL;
     bind->vkCreateQueryPool = NULL;
     bind->vkDestroyQueryPool = NULL;
@@ -10528,6 +10534,7 @@ RMT_API void _rmt_UnbindVulkan(rmtVulkanBind* bind)
 {
     VulkanBindImpl* vulkan_bind = (VulkanBindImpl*)bind;
     VkDevice vulkan_device = (VkDevice)vulkan_bind->base.device;
+    VkQueue vulkan_queue = (VkQueue)vulkan_bind->base.queue;
 
     assert(bind != NULL);
 
@@ -10555,6 +10562,14 @@ RMT_API void _rmt_UnbindVulkan(rmtVulkanBind* bind)
         mtxUnlock(&g_Remotery->vulkanBindsMutex);
     }
 
+    // Ensure all samples submitted to the GPU are consumed for clean shutdown
+    if (LoadAcquire64(&vulkan_bind->ringBufferWrite) > LoadAcquire64(&vulkan_bind->ringBufferRead))
+    {
+        VulkanMarkFrame(vulkan_bind, RMT_FALSE);
+        vulkan_bind->vkQueueWaitIdle(vulkan_queue);
+        VulkanMarkFrame(vulkan_bind, RMT_FALSE);
+    }
+
     // Clean up bind resources
 
     rmtDelete(rmtMessageQueue, vulkan_bind->mqToVulkanUpdate);

From e7141d66b6a02ea6fd3aee8d4244134284b825c3 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Sat, 13 Jan 2024 20:21:37 -0800
Subject: [PATCH 25/27] Clarify Vulkan compilation and extension/version
 requirements in Remotery.h and README

---
 lib/Remotery.h | 5 ++++-
 readme.md      | 6 ++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/lib/Remotery.h b/lib/Remotery.h
index e6d52a8e..d014c056 100644
--- a/lib/Remotery.h
+++ b/lib/Remotery.h
@@ -31,7 +31,10 @@ Compiling
   -I lib -pthread -lm
 
 * Vulkan - Ensure your include directories are set such that the Vulkan headers can be
-  included with the statement: #include <vulkan/vulkan.h>.
+  included with the statement: #include <vulkan/vulkan.h>. Currently the Vulkan implementation
+  requires either Vulkan 1.2+ with the "hostQueryReset" and "timelineSemaphore" features enabled,
+  or < 1.2 with the "VK_EXT_host_query_reset" and "VK_KHR_timeline_semaphore" extensions. The
+  extension "VK_EXT_calibrated_timestamps" is also always required.
 
 You can define some extra macros to modify what features are compiled into Remotery. These are
 documented just below this comment.
diff --git a/readme.md b/readme.md
index f9091540..05791652 100644
--- a/readme.md
+++ b/readme.md
@@ -47,6 +47,12 @@ Compiling
   ([devel/remotery](https://www.freshports.org/devel/remotery/)) and modify the port's
   Makefile if needed. There is also a package available via `pkg install remotery`.
 
+* Vulkan - Ensure your include directories are set such that the Vulkan headers can be
+  included with the statement: `#include <vulkan/vulkan.h>`. Currently the Vulkan implementation
+  requires either Vulkan 1.2+ with the `hostQueryReset` and `timelineSemaphore` features enabled,
+  or < 1.1 with the `VK_EXT_host_query_reset` and `VK_KHR_timeline_semaphore` extensions. The
+  extension `VK_EXT_calibrated_timestamps` is also always required.
+
 You can define some extra macros to modify what features are compiled into Remotery:
 
     Macro               Default     Description

From 2e56a7b9243b94011afc0e6995ab4b91d794011a Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Wed, 17 Jan 2024 14:43:57 -0800
Subject: [PATCH 26/27] Have the user specify Vulkan function pointers instead
 of loading them. Fix and update documentation

---
 lib/Remotery.c | 146 +++++++++++++++++++------------------------------
 lib/Remotery.h |  37 ++++++++++---
 readme.md      |  24 ++++----
 3 files changed, 100 insertions(+), 107 deletions(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index 690982d6..f9d53e6b 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -10023,6 +10023,8 @@ RMT_API void _rmt_EndMetalSample(void)
 
 #include <vulkan/vulkan.h>
 
+#define VULKAN_CALL(bind, fn) ((PFN_ ## fn)bind->funcs.fn)
+
 typedef struct VulkanThreadData
 {
     rmtU32 lastAllocatedQueryIndex;
@@ -10089,6 +10091,7 @@ static void VulkanSample_Destructor(VulkanSample* sample)
 typedef struct VulkanBindImpl
 {
     rmtVulkanBind base;
+    rmtVulkanFunctions funcs;
 
     // Ring buffer of GPU timestamp destinations for all queries
     rmtU32 maxNbQueries;
@@ -10111,21 +10114,6 @@ typedef struct VulkanBindImpl
     // Convert gpu ticks to us, retrieved from physical device properties
     double gpu_ticks_to_us;
 
-    // Function pointers to Vulkan functions
-    PFN_vkQueueSubmit vkQueueSubmit;
-    PFN_vkQueueWaitIdle vkQueueWaitIdle;
-    PFN_vkGetPhysicalDeviceProperties vkGetPhysicalDeviceProperties;
-    PFN_vkCreateQueryPool vkCreateQueryPool;
-    PFN_vkDestroyQueryPool vkDestroyQueryPool;
-    PFN_vkResetQueryPool vkResetQueryPool; // VK_EXT_host_query_reset or VK_VERSION_1_2
-    PFN_vkGetQueryPoolResults vkGetQueryPoolResults;
-    PFN_vkCmdWriteTimestamp vkCmdWriteTimestamp;
-    PFN_vkCreateSemaphore vkCreateSemaphore; // Creating a timeline semaphore, so VK_KHR_timeline_semaphore or VK_VERSION_1_2
-    PFN_vkDestroySemaphore vkDestroySemaphore;
-    PFN_vkSignalSemaphore vkSignalSemaphore; // VK_KHR_timeline_semaphore or VK_VERSION_1_2
-    PFN_vkGetSemaphoreCounterValue vkGetSemaphoreCounterValue; // VK_KHR_timeline_semaphore or VK_VERSION_1_2
-    PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT; // VK_EXT_calibrated_timestamps or VK_KHR_calibrated_timestamps
-
     // Queue to the Vulkan main update thread
     rmtMessageQueue* mqToVulkanUpdate;
 
@@ -10133,42 +10121,6 @@ typedef struct VulkanBindImpl
 
 } VulkanBindImpl;
 
-static rmtError LoadVulkanFunctions(VulkanBindImpl* bind, VkInstance vulkan_instance, PFN_vkGetInstanceProcAddr pfn_vkGetInstanceProcAddr)
-{
-#define VK_DEVICE_FN(fn)                                                             \
-    bind->fn = (PFN_ ## fn)pfn_vkGetInstanceProcAddr(vulkan_instance, #fn);              \
-    if (bind->fn == NULL)                                                            \
-        return RMT_ERROR_RESOURCE_ACCESS_FAIL;
-
-#define VK_DEVICE_FN_FALLBACK(fn, fn_fallback)                                       \
-    bind->fn = (PFN_ ## fn)pfn_vkGetInstanceProcAddr(vulkan_instance, #fn);              \
-    if (bind->fn == NULL)                                                            \
-    {                                                                                \
-        bind->fn = (PFN_ ## fn)pfn_vkGetInstanceProcAddr(vulkan_instance, #fn_fallback); \
-        if (bind->fn == NULL)                                                        \
-            return RMT_ERROR_RESOURCE_ACCESS_FAIL;                                   \
-    }
-
-    VK_DEVICE_FN(vkQueueSubmit);
-    VK_DEVICE_FN(vkQueueWaitIdle);
-    VK_DEVICE_FN(vkGetPhysicalDeviceProperties);
-    VK_DEVICE_FN(vkCreateQueryPool);
-    VK_DEVICE_FN(vkDestroyQueryPool);
-    VK_DEVICE_FN_FALLBACK(vkResetQueryPool, vkResetQueryPoolEXT);
-    VK_DEVICE_FN(vkGetQueryPoolResults);
-    VK_DEVICE_FN(vkCmdWriteTimestamp);
-    VK_DEVICE_FN(vkCreateSemaphore);
-    VK_DEVICE_FN(vkDestroySemaphore);
-    VK_DEVICE_FN_FALLBACK(vkSignalSemaphore, vkSignalSemaphoreKHR);
-    VK_DEVICE_FN_FALLBACK(vkGetSemaphoreCounterValue, vkGetSemaphoreCounterValueKHR);
-    VK_DEVICE_FN(vkGetCalibratedTimestampsEXT); // TODO(valakor): Support vkGetCalibratedTimestampsKHR
-
-#undef VK_DEVICE_FN
-#undef VK_DEVICE_FN_FALLBACK
-
-    return RMT_ERROR_NONE;
-}
-
 static rmtError CreateQueryPool(VulkanBindImpl* bind, VkDevice vulkan_device, rmtU32 nb_queries)
 {
     VkQueryPoolCreateInfo create_info;
@@ -10177,12 +10129,12 @@ static rmtError CreateQueryPool(VulkanBindImpl* bind, VkDevice vulkan_device, rm
     create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
     create_info.queryCount = nb_queries;
 
-    if (bind->vkCreateQueryPool(vulkan_device, &create_info, NULL, &bind->gpuTimestampRingBuffer) != VK_SUCCESS)
+    if (VULKAN_CALL(bind, vkCreateQueryPool)(vulkan_device, &create_info, NULL, &bind->gpuTimestampRingBuffer) != VK_SUCCESS)
     {
         return rmtMakeError(RMT_ERROR_RESOURCE_CREATE_FAIL, "Failed to create Vulkan Query Pool");
     }
 
-    bind->vkResetQueryPool(vulkan_device, bind->gpuTimestampRingBuffer, 0, nb_queries);
+    VULKAN_CALL(bind, vkResetQueryPool)(vulkan_device, bind->gpuTimestampRingBuffer, 0, nb_queries);
 
     return RMT_ERROR_NONE;
 }
@@ -10200,7 +10152,7 @@ static rmtError CreateQuerySemaphore(VulkanBindImpl* bind, VkDevice vulkan_devic
     create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
     create_info.pNext = &type_info;
 
-    if (bind->vkCreateSemaphore(vulkan_device, &create_info, NULL, &bind->gpuQuerySemaphore) != VK_SUCCESS)
+    if (VULKAN_CALL(bind, vkCreateSemaphore)(vulkan_device, &create_info, NULL, &bind->gpuQuerySemaphore) != VK_SUCCESS)
     {
         return rmtMakeError(RMT_ERROR_RESOURCE_CREATE_FAIL, "Failed to create Vulkan Query Semaphore");
     }
@@ -10220,7 +10172,7 @@ static rmtError CopyTimestamps(VulkanBindImpl* bind, VkDevice vulkan_device, rmt
     if (query_count == 0)
         return RMT_ERROR_NONE;
 
-    bind->vkGetQueryPoolResults(vulkan_device, bind->gpuTimestampRingBuffer, ring_pos_a, query_count, query_size, cpu_timestamps + ring_pos_a,
+    VULKAN_CALL(bind, vkGetQueryPoolResults)(vulkan_device, bind->gpuTimestampRingBuffer, ring_pos_a, query_count, query_size, cpu_timestamps + ring_pos_a,
                               sizeof(rmtU64), VK_QUERY_RESULT_64_BIT);
 
     // Copy all timestamps to their expectant samples
@@ -10236,7 +10188,7 @@ static rmtError CopyTimestamps(VulkanBindImpl* bind, VkDevice vulkan_device, rmt
     }
 
     // Reset the query pool indices
-    bind->vkResetQueryPool(vulkan_device, bind->gpuTimestampRingBuffer, ring_pos_a, query_count);
+    VULKAN_CALL(bind, vkResetQueryPool)(vulkan_device, bind->gpuTimestampRingBuffer, ring_pos_a, query_count);
 
     return RMT_ERROR_NONE;
 }
@@ -10250,7 +10202,7 @@ static rmtError UpdateGpuTicksToUs(VulkanBindImpl* bind, VkPhysicalDevice vulkan
 
     VkPhysicalDeviceProperties device_properties;
     memset(&device_properties, 0, sizeof(device_properties));
-    bind->vkGetPhysicalDeviceProperties(vulkan_physical_device, &device_properties);
+    VULKAN_CALL(bind, vkGetPhysicalDeviceProperties)(vulkan_physical_device, &device_properties);
 
     float gpu_ns_per_tick = device_properties.limits.timestampPeriod;
     bind->gpu_ticks_to_us = gpu_ns_per_tick / 1000.0;
@@ -10311,7 +10263,7 @@ static rmtError GetTimestampCalibration(VulkanBindImpl* bind, VkPhysicalDevice v
     //  multiple times in a row until retrieving a max deviation that is "acceptable". We could just call it a set number of
     //  times and take the min, or determine a reasonable average during init and ensure we get something close to that here.
 
-    if (bind->vkGetCalibratedTimestampsEXT(vulkan_device, timestamp_count, timestamp_infos, timestamps, &max_deviation) != VK_SUCCESS)
+    if (VULKAN_CALL(bind, vkGetCalibratedTimestampsEXT)(vulkan_device, timestamp_count, timestamp_infos, timestamps, &max_deviation) != VK_SUCCESS)
     {
         return rmtMakeError(RMT_ERROR_RESOURCE_ACCESS_FAIL, "Failed to get Vulkan calibrated timestamps");
     }
@@ -10355,7 +10307,7 @@ static rmtError VulkanMarkFrame(VulkanBindImpl* bind, rmtBool recurse)
 
     // Has the GPU processed any writes?
     rmtU64 current_write_gpu = 0;
-    if (bind->vkGetSemaphoreCounterValue(vulkan_device, bind->gpuQuerySemaphore, &current_write_gpu) != VK_SUCCESS)
+    if (VULKAN_CALL(bind, vkGetSemaphoreCounterValue)(vulkan_device, bind->gpuQuerySemaphore, &current_write_gpu) != VK_SUCCESS)
     {
         return rmtMakeError(RMT_ERROR_RESOURCE_ACCESS_FAIL, "Failed to get Vulkan Semaphore value");
     }
@@ -10376,7 +10328,7 @@ static rmtError VulkanMarkFrame(VulkanBindImpl* bind, rmtBool recurse)
         submit_info.pNext = &semaphore_submit_info;
         submit_info.signalSemaphoreCount = 1;
         submit_info.pSignalSemaphores = &bind->gpuQuerySemaphore;
-        if (bind->vkQueueSubmit(vulkan_queue, 1, &submit_info, NULL) != VK_SUCCESS)
+        if (VULKAN_CALL(bind, vkQueueSubmit)(vulkan_queue, 1, &submit_info, NULL) != VK_SUCCESS)
         {
             return rmtMakeError(RMT_ERROR_RESOURCE_ACCESS_FAIL, "Failed to submit Vulkan Semaphore update to queue");
         }
@@ -10446,26 +10398,54 @@ static rmtError VulkanMarkFrame(VulkanBindImpl* bind, rmtBool recurse)
     return RMT_ERROR_NONE;
 }
 
-RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, rmtVulkanGetInstanceProcAddr get_instance_proc_addr, rmtVulkanBind** out_bind)
+RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, const rmtVulkanFunctions* funcs, rmtVulkanBind** out_bind)
 {
     VulkanBindImpl* bind;
     VkInstance vulkan_instance = (VkInstance)instance;
     VkPhysicalDevice vulkan_physical_device = (VkPhysicalDevice)physical_device;
     VkDevice vulkan_device = (VkDevice)device;
     VkQueue vulkan_queue = (VkQueue)queue;
-    PFN_vkGetInstanceProcAddr pfn_vkGetInstanceProcAddr = (PFN_vkGetInstanceProcAddr)get_instance_proc_addr;
 
     if (g_Remotery == NULL)
-    {
         return RMT_ERROR_REMOTERY_NOT_CREATED;
-    }
 
-    assert(vulkan_instance != NULL);
-    assert(physical_device != NULL);
-    assert(device != NULL);
-    assert(queue != NULL);
-    assert(out_bind != NULL);
-    assert(get_instance_proc_addr != NULL);
+    if (instance == NULL)
+        return RMT_ERROR_INVALID_INPUT;
+
+    if (physical_device == NULL)
+        return RMT_ERROR_INVALID_INPUT;
+
+    if (device == NULL)
+        return RMT_ERROR_INVALID_INPUT;
+
+    if (queue == NULL)
+        return RMT_ERROR_INVALID_INPUT;
+
+    if (funcs == NULL)
+        return RMT_ERROR_INVALID_INPUT;
+
+    if (out_bind == NULL)
+        return RMT_ERROR_INVALID_INPUT;
+
+ #define CHECK_VK_FUNC(fn)                                               \
+    if (funcs->fn == NULL)                                               \
+        return RMT_ERROR_INVALID_INPUT;
+
+    CHECK_VK_FUNC(vkGetPhysicalDeviceProperties);
+    CHECK_VK_FUNC(vkQueueSubmit);
+    CHECK_VK_FUNC(vkQueueWaitIdle);
+    CHECK_VK_FUNC(vkCreateQueryPool);
+    CHECK_VK_FUNC(vkDestroyQueryPool);
+    CHECK_VK_FUNC(vkResetQueryPool);
+    CHECK_VK_FUNC(vkGetQueryPoolResults);
+    CHECK_VK_FUNC(vkCmdWriteTimestamp);
+    CHECK_VK_FUNC(vkCreateSemaphore);
+    CHECK_VK_FUNC(vkDestroySemaphore);
+    CHECK_VK_FUNC(vkSignalSemaphore);
+    CHECK_VK_FUNC(vkGetSemaphoreCounterValue);
+    CHECK_VK_FUNC(vkGetCalibratedTimestampsEXT);
+
+#undef CHECK_VK_FUNC
 
     // Allocate the bind container
     // TODO(valakor): If anything after this fails we'll leak this bind instance
@@ -10475,6 +10455,7 @@ RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* de
     bind->base.physical_device = physical_device;
     bind->base.device = device;
     bind->base.queue = queue;
+    bind->funcs = *funcs;
 #ifdef RMT_PLATFORM_MACOS
     // NOTE(valakor): Vulkan on MacOS via MoltenVK only supports timestamp query pools of up to 4k 64-bit queries. See
     //  https://github.com/KhronosGroup/MoltenVK/blob/main/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.mm
@@ -10489,24 +10470,9 @@ RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* de
     bind->ringBufferWrite = 0;
     bind->gpuQuerySemaphore = NULL;
     bind->gpu_ticks_to_us = 1.0;
-    bind->vkQueueSubmit = NULL;
-    bind->vkQueueWaitIdle = NULL;
-    bind->vkGetPhysicalDeviceProperties = NULL;
-    bind->vkCreateQueryPool = NULL;
-    bind->vkDestroyQueryPool = NULL;
-    bind->vkResetQueryPool = NULL;
-    bind->vkGetQueryPoolResults = NULL;
-    bind->vkCmdWriteTimestamp = NULL;
-    bind->vkCreateSemaphore = NULL;
-    bind->vkDestroySemaphore = NULL;
-    bind->vkSignalSemaphore = NULL;
-    bind->vkGetSemaphoreCounterValue = NULL;
-    bind->vkGetCalibratedTimestampsEXT = NULL;
     bind->mqToVulkanUpdate = NULL;
     bind->next = NULL;
 
-    rmtTry(LoadVulkanFunctions(bind, vulkan_instance, pfn_vkGetInstanceProcAddr));
-
     // Create the independent ring buffer storage items
     // TODO(valakor): Leave space beetween start and end to stop invalidating cache lines?
     // NOTE(valakor): ABA impossible due to non-wrapping ring buffer indices
@@ -10566,7 +10532,7 @@ RMT_API void _rmt_UnbindVulkan(rmtVulkanBind* bind)
     if (LoadAcquire64(&vulkan_bind->ringBufferWrite) > LoadAcquire64(&vulkan_bind->ringBufferRead))
     {
         VulkanMarkFrame(vulkan_bind, RMT_FALSE);
-        vulkan_bind->vkQueueWaitIdle(vulkan_queue);
+        VULKAN_CALL(vulkan_bind, vkQueueWaitIdle)(vulkan_queue);
         VulkanMarkFrame(vulkan_bind, RMT_FALSE);
     }
 
@@ -10576,7 +10542,7 @@ RMT_API void _rmt_UnbindVulkan(rmtVulkanBind* bind)
 
     if (vulkan_bind->gpuQuerySemaphore != NULL)
     {
-        vulkan_bind->vkDestroySemaphore(vulkan_device, vulkan_bind->gpuQuerySemaphore, NULL);
+        VULKAN_CALL(vulkan_bind, vkDestroySemaphore)(vulkan_device, vulkan_bind->gpuQuerySemaphore, NULL);
     }
 
     rmtFree(vulkan_bind->sampleRingBuffer);
@@ -10584,7 +10550,7 @@ RMT_API void _rmt_UnbindVulkan(rmtVulkanBind* bind)
 
     if (vulkan_bind->gpuTimestampRingBuffer != NULL)
     {
-        vulkan_bind->vkDestroyQueryPool(vulkan_device, vulkan_bind->gpuTimestampRingBuffer, NULL);
+        VULKAN_CALL(vulkan_bind, vkDestroyQueryPool)(vulkan_device, vulkan_bind->gpuTimestampRingBuffer, NULL);
     }
 }
 
@@ -10654,7 +10620,7 @@ RMT_API void _rmt_BeginVulkanSample(rmtVulkanBind* bind, void* command_buffer, r
             if (error == RMT_ERROR_NONE)
             {
                 rmtU32 physical_query_index = vulkan_sample->queryIndex & (vulkan_bind->maxNbQueries - 1);
-                vulkan_bind->vkCmdWriteTimestamp(vulkan_command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, vulkan_bind->gpuTimestampRingBuffer, physical_query_index);
+                VULKAN_CALL(vulkan_bind, vkCmdWriteTimestamp)(vulkan_command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, vulkan_bind->gpuTimestampRingBuffer, physical_query_index);
 
                 // Track which Vulkan sample expects the timestamp results
                 vulkan_bind->sampleRingBuffer[physical_query_index / 2] = vulkan_sample;
@@ -10701,7 +10667,7 @@ RMT_API void _rmt_EndVulkanSample()
             VulkanBindImpl* vulkan_bind = vulkan_sample->bind;
             VkCommandBuffer vulkan_command_buffer = vulkan_sample->commandBuffer;
             rmtU32 query_index = vulkan_sample->queryIndex & (vulkan_bind->maxNbQueries - 1);
-            vulkan_bind->vkCmdWriteTimestamp(vulkan_command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
+            VULKAN_CALL(vulkan_bind, vkCmdWriteTimestamp)(vulkan_command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                                            vulkan_bind->gpuTimestampRingBuffer, query_index + 1);
 
             if (ThreadProfiler_Pop(thread_profiler, vulkan_bind->mqToVulkanUpdate, (Sample*)vulkan_sample,
diff --git a/lib/Remotery.h b/lib/Remotery.h
index d014c056..98274d7c 100644
--- a/lib/Remotery.h
+++ b/lib/Remotery.h
@@ -626,22 +626,46 @@ typedef struct rmtD3D12Bind
     RMT_OPTIONAL(RMT_USE_METAL, _rmt_EndMetalSample())
 
 
+typedef struct rmtVulkanFunctions
+{
+    // Function pointers to Vulkan functions
+    // Untyped so that the Vulkan headers are not required in this file
+
+    // Instance functions
+    void* vkGetPhysicalDeviceProperties;
+
+    // Device functions
+    void* vkQueueSubmit;
+    void* vkQueueWaitIdle;
+    void* vkCreateQueryPool;
+    void* vkDestroyQueryPool;
+    void* vkResetQueryPool; // vkResetQueryPool (Vulkan 1.2+ with hostQueryReset) or vkResetQueryPoolEXT (VK_EXT_host_query_reset)
+    void* vkGetQueryPoolResults;
+    void* vkCmdWriteTimestamp;
+    void* vkCreateSemaphore;
+    void* vkDestroySemaphore;
+    void* vkSignalSemaphore; // vkSignalSemaphore (Vulkan 1.2+ with timelineSemaphore) or vkSignalSemaphoreKHR (VK_KHR_timeline_semaphore)
+    void* vkGetSemaphoreCounterValue; // vkGetSemaphoreCounterValue (Vulkan 1.2+ with timelineSemaphore) or vkGetSemaphoreCounterValueKHR (VK_KHR_timeline_semaphore)
+    void* vkGetCalibratedTimestampsEXT; // vkGetCalibratedTimestampsKHR (VK_KHR_calibrated_timestamps) or vkGetCalibratedTimestampsEXT (VK_EXT_calibrated_timestamps)
+
+} rmtVulkanFunctions;
+
 typedef struct rmtVulkanBind
 {
-    // The physical vulkan device
+    // The physical Vulkan device, of type VkPhysicalDevice
     void* physical_device;
 
-    // The main device shared by all threads
+    // The logical Vulkan device, of type VkDevice
     void* device;
 
-    // The queue command buffers are executed on for profiling
+    // The queue command buffers are executed on for profiling, of type VkQueue
     void* queue;
 
 } rmtVulkanBind;
 
 // Create a Vulkan binding for the given device/queue pair
-#define rmt_BindVulkan(instance, physical_device, device, queue, get_instance_proc_addr, out_bind) \
-    RMT_OPTIONAL_RET(RMT_USE_VULKAN, _rmt_BindVulkan(instance, physical_device, device, queue, get_instance_proc_addr, out_bind), NULL)
+#define rmt_BindVulkan(instance, physical_device, device, queue, funcs, out_bind)         \
+    RMT_OPTIONAL_RET(RMT_USE_VULKAN, _rmt_BindVulkan(instance, physical_device, device, queue, funcs, out_bind), NULL)
 
 #define rmt_UnbindVulkan(bind)                                              \
     RMT_OPTIONAL(RMT_USE_VULKAN, _rmt_UnbindVulkan(bind))
@@ -1138,8 +1162,7 @@ RMT_API void _rmt_EndMetalSample(void);
 #endif
 
 #if RMT_USE_VULKAN
-typedef void*(*rmtVulkanGetInstanceProcAddr)(void*, const char*);
-RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, rmtVulkanGetInstanceProcAddr get_instance_proc_addr, rmtVulkanBind** out_bind);
+RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* device, void* queue, const rmtVulkanFunctions* funcs, rmtVulkanBind** out_bind);
 RMT_API void _rmt_UnbindVulkan(rmtVulkanBind* bind);
 RMT_API void _rmt_BeginVulkanSample(rmtVulkanBind* bind, void* command_buffer, rmtPStr name, rmtU32* hash_cache);
 RMT_API void _rmt_EndVulkanSample();
diff --git a/readme.md b/readme.md
index 05791652..3712897f 100644
--- a/readme.md
+++ b/readme.md
@@ -50,8 +50,8 @@ Compiling
 * Vulkan - Ensure your include directories are set such that the Vulkan headers can be
   included with the statement: `#include <vulkan/vulkan.h>`. Currently the Vulkan implementation
   requires either Vulkan 1.2+ with the `hostQueryReset` and `timelineSemaphore` features enabled,
-  or < 1.1 with the `VK_EXT_host_query_reset` and `VK_KHR_timeline_semaphore` extensions. The
-  extension `VK_EXT_calibrated_timestamps` is also always required.
+  or < 1.2 with the `VK_EXT_host_query_reset` and `VK_KHR_timeline_semaphore` extensions. The
+  extension `VK_EXT_calibrated_timestamps` (or `VK_KHR_calibrated_timestamps`) is also always required.
 
 You can define some extra macros to modify what features are compiled into Remotery:
 
@@ -226,11 +226,15 @@ Remotery can sample Vulkan command buffers issued to the GPU on multiple queues
 must be submitted to the same queue as the samples are issued to. Multiple queues can be profiled by creating multiple
 Vulkan bind objects.
 
-    // Parameters are VkInstance, VkPhysicalDevice, VkDevice, VkQueue, vkGetInstanceProcAddr, rmtVulkanBind**
-    // NOTE: The get_instance_proc_addr parameter doesn't match vkGetInstanceProcAddr exactly in order to avoid
-    // including Vulkan.h in Remotery.h, so the actual function pointer must be cast when passed to rmt_BindVulkan.
+    rmtVulkanFunctions vulkan_funcs;
+    vulkan_funcs.vkGetPhysicalDeviceProperties = my_vulkan_instance_table->vkGetPhysicalDeviceProperties;
+    vulkan_funcs.vkQueueSubmit = my_vulkan_device_table->vkQueueSubmit;
+    // ... All other function pointers
+
+    // Parameters are VkInstance, VkPhysicalDevice, VkDevice, VkQueue, rmtVulkanFunctions*, rmtVulkanBind**
+    // NOTE: The Vulkan functions are copied internally and so do not have to be kept alive after this call.
     rmtVulkanBind* vulkan_bind = NULL;
-    rmt_BindVulkan(instance, physical_device, device, queue, (rmtVulkanGetInstanceProcAddr)get_instance_proc_addr, &vulkan_bind);
+    rmt_BindVulkan(instance, physical_device, device, queue, &vulkan_funcs, &vulkan_bind);
 
 Sampling is then a simple case of:
 
@@ -251,14 +255,14 @@ NOTE: Vulkan sampling on Apple platforms via MoltenVK must be done with caution.
 inside of render or compute passes, so MoltenVK simply reports all timestamps inside those scopes as the begin/end time of
 the entire render pass!
 
-Subsequent sampling calls from the same thread will use that device/queue combination. Once per frame you must call `rmt_MarkFrame()`
-to gather GPU timestamps on the CPU.
+Sampling calls using the same `vulkan_bind` object use the device and queue specified when the bind was created.
+Once per frame you must call `rmt_MarkFrame()` to gather GPU timestamps on the CPU.
 
     // End of frame, possibly after calling vkPresentKHR or at the very beginning of the frame
     rmt_MarkFrame();
 
-When you destroy your Vulkan device and queue you can manually clean up resources by calling `rmt_UnbindVulkan`, though this is
-dont automatically by `rmt_DestroyGlobalInstance` as well for all rmt_BindVulkan objects:
+Before you destroy your Vulkan device and queue you can manually clean up resources by calling `rmt_UnbindVulkan`, though this is
+done automatically by `rmt_DestroyGlobalInstance` as well for all `rmt_BindVulkan` objects:
 
     rmt_UnbindVulkan(vulkan_bind);
 

From 2f28ede99c5df27c74ac06e498b9639066042b02 Mon Sep 17 00:00:00 2001
From: Matthew Pohlmann <matthew.pohlmann@gmail.com>
Date: Wed, 17 Jan 2024 14:58:59 -0800
Subject: [PATCH 27/27] Fix a handful of issues: 1. Return RMT_ERROR_NONE
 instead of NULL from rmt_BindVulkan if Vulkan is not enabled 2. Explicitly
 cast function pointers to void* in README since this is potentially required
 by some compilers 3. Use rmtMakeError in rmt_BindVulkan to describe what
 parameter/function pointer is missing

---
 lib/Remotery.c | 14 +++++++-------
 lib/Remotery.h |  2 +-
 readme.md      |  4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/lib/Remotery.c b/lib/Remotery.c
index f9d53e6b..905f5a48 100644
--- a/lib/Remotery.c
+++ b/lib/Remotery.c
@@ -10410,26 +10410,26 @@ RMT_API rmtError _rmt_BindVulkan(void* instance, void* physical_device, void* de
         return RMT_ERROR_REMOTERY_NOT_CREATED;
 
     if (instance == NULL)
-        return RMT_ERROR_INVALID_INPUT;
+        return rmtMakeError(RMT_ERROR_INVALID_INPUT, "Missing instance");
 
     if (physical_device == NULL)
-        return RMT_ERROR_INVALID_INPUT;
+        return rmtMakeError(RMT_ERROR_INVALID_INPUT, "Missing physical_device");
 
     if (device == NULL)
-        return RMT_ERROR_INVALID_INPUT;
+        return rmtMakeError(RMT_ERROR_INVALID_INPUT, "Missing device");
 
     if (queue == NULL)
-        return RMT_ERROR_INVALID_INPUT;
+        return rmtMakeError(RMT_ERROR_INVALID_INPUT, "Missing queue");
 
     if (funcs == NULL)
-        return RMT_ERROR_INVALID_INPUT;
+        return rmtMakeError(RMT_ERROR_INVALID_INPUT, "Missing funcs");
 
     if (out_bind == NULL)
-        return RMT_ERROR_INVALID_INPUT;
+        return rmtMakeError(RMT_ERROR_INVALID_INPUT, "Missing out_bind");
 
  #define CHECK_VK_FUNC(fn)                                               \
     if (funcs->fn == NULL)                                               \
-        return RMT_ERROR_INVALID_INPUT;
+        return rmtMakeError(RMT_ERROR_INVALID_INPUT, "Missing " #fn)
 
     CHECK_VK_FUNC(vkGetPhysicalDeviceProperties);
     CHECK_VK_FUNC(vkQueueSubmit);
diff --git a/lib/Remotery.h b/lib/Remotery.h
index 98274d7c..b0d89b51 100644
--- a/lib/Remotery.h
+++ b/lib/Remotery.h
@@ -665,7 +665,7 @@ typedef struct rmtVulkanBind
 
 // Create a Vulkan binding for the given device/queue pair
 #define rmt_BindVulkan(instance, physical_device, device, queue, funcs, out_bind)         \
-    RMT_OPTIONAL_RET(RMT_USE_VULKAN, _rmt_BindVulkan(instance, physical_device, device, queue, funcs, out_bind), NULL)
+    RMT_OPTIONAL_RET(RMT_USE_VULKAN, _rmt_BindVulkan(instance, physical_device, device, queue, funcs, out_bind), RMT_ERROR_NONE)
 
 #define rmt_UnbindVulkan(bind)                                              \
     RMT_OPTIONAL(RMT_USE_VULKAN, _rmt_UnbindVulkan(bind))
diff --git a/readme.md b/readme.md
index 3712897f..5fd92790 100644
--- a/readme.md
+++ b/readme.md
@@ -227,8 +227,8 @@ must be submitted to the same queue as the samples are issued to. Multiple queue
 Vulkan bind objects.
 
     rmtVulkanFunctions vulkan_funcs;
-    vulkan_funcs.vkGetPhysicalDeviceProperties = my_vulkan_instance_table->vkGetPhysicalDeviceProperties;
-    vulkan_funcs.vkQueueSubmit = my_vulkan_device_table->vkQueueSubmit;
+    vulkan_funcs.vkGetPhysicalDeviceProperties = (void*)my_vulkan_instance_table->vkGetPhysicalDeviceProperties;
+    vulkan_funcs.vkQueueSubmit = (void*)my_vulkan_device_table->vkQueueSubmit;
     // ... All other function pointers
 
     // Parameters are VkInstance, VkPhysicalDevice, VkDevice, VkQueue, rmtVulkanFunctions*, rmtVulkanBind**