From 86ecf39bb72a7e857e69e4753d08f018031fc1f3 Mon Sep 17 00:00:00 2001 From: Andrew Woloszyn Date: Thu, 22 Aug 2024 14:11:56 -0400 Subject: [PATCH] [hip][cuda] Merge the tracing implementations. (#18299) These were entirely copy-pasted off one another and it's not likely they will have to diverge in the future. --------- Signed-off-by: Andrew Woloszyn --- runtime/src/iree/hal/drivers/cuda/BUILD.bazel | 3 +- .../src/iree/hal/drivers/cuda/CMakeLists.txt | 3 +- .../src/iree/hal/drivers/cuda/cuda_device.c | 165 +++++- .../hal/drivers/cuda/graph_command_buffer.c | 67 ++- .../hal/drivers/cuda/graph_command_buffer.h | 5 +- .../src/iree/hal/drivers/cuda/nccl_channel.c | 17 +- .../src/iree/hal/drivers/cuda/nccl_channel.h | 6 +- .../hal/drivers/cuda/stream_command_buffer.c | 68 ++- .../hal/drivers/cuda/stream_command_buffer.h | 4 +- runtime/src/iree/hal/drivers/cuda/tracing.h | 188 ------- .../src/iree/hal/drivers/hip/CMakeLists.txt | 3 +- .../hal/drivers/hip/graph_command_buffer.c | 81 +-- .../hal/drivers/hip/graph_command_buffer.h | 5 +- runtime/src/iree/hal/drivers/hip/hip_device.c | 165 +++++- .../src/iree/hal/drivers/hip/rccl_channel.c | 18 +- .../src/iree/hal/drivers/hip/rccl_channel.h | 6 +- .../hal/drivers/hip/stream_command_buffer.c | 70 ++- .../hal/drivers/hip/stream_command_buffer.h | 4 +- runtime/src/iree/hal/drivers/hip/tracing.c | 526 ------------------ runtime/src/iree/hal/drivers/hip/tracing.h | 190 ------- runtime/src/iree/hal/utils/BUILD.bazel | 12 + runtime/src/iree/hal/utils/CMakeLists.txt | 15 + .../cuda/tracing.c => utils/stream_tracing.c} | 330 +++++------ runtime/src/iree/hal/utils/stream_tracing.h | 249 +++++++++ 24 files changed, 930 insertions(+), 1270 deletions(-) delete mode 100644 runtime/src/iree/hal/drivers/cuda/tracing.h delete mode 100644 runtime/src/iree/hal/drivers/hip/tracing.c delete mode 100644 runtime/src/iree/hal/drivers/hip/tracing.h rename runtime/src/iree/hal/{drivers/cuda/tracing.c => utils/stream_tracing.c} (58%) create mode 
100644 runtime/src/iree/hal/utils/stream_tracing.h diff --git a/runtime/src/iree/hal/drivers/cuda/BUILD.bazel b/runtime/src/iree/hal/drivers/cuda/BUILD.bazel index 142d0e18bd9e..c0e70698b999 100644 --- a/runtime/src/iree/hal/drivers/cuda/BUILD.bazel +++ b/runtime/src/iree/hal/drivers/cuda/BUILD.bazel @@ -43,8 +43,6 @@ iree_runtime_cc_library( "stream_command_buffer.h", "timepoint_pool.c", "timepoint_pool.h", - "tracing.c", - "tracing.h", ], hdrs = [ "api.h", @@ -69,6 +67,7 @@ iree_runtime_cc_library( "//runtime/src/iree/hal/utils:memory_file", "//runtime/src/iree/hal/utils:resource_set", "//runtime/src/iree/hal/utils:semaphore_base", + "//runtime/src/iree/hal/utils:stream_tracing", "//runtime/src/iree/schemas:cuda_executable_def_c_fbs", ], ) diff --git a/runtime/src/iree/hal/drivers/cuda/CMakeLists.txt b/runtime/src/iree/hal/drivers/cuda/CMakeLists.txt index f7c9afd564ab..e5f4c6769db6 100644 --- a/runtime/src/iree/hal/drivers/cuda/CMakeLists.txt +++ b/runtime/src/iree/hal/drivers/cuda/CMakeLists.txt @@ -44,8 +44,6 @@ iree_cc_library( "stream_command_buffer.h" "timepoint_pool.c" "timepoint_pool.h" - "tracing.c" - "tracing.h" DEPS ::dynamic_symbols iree::base @@ -66,6 +64,7 @@ iree_cc_library( iree::hal::utils::memory_file iree::hal::utils::resource_set iree::hal::utils::semaphore_base + iree::hal::utils::stream_tracing iree::schemas::cuda_executable_def_c_fbs PUBLIC ) diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c index 37d421209439..0018c103733f 100644 --- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c +++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c @@ -26,11 +26,11 @@ #include "iree/hal/drivers/cuda/pipeline_layout.h" #include "iree/hal/drivers/cuda/stream_command_buffer.h" #include "iree/hal/drivers/cuda/timepoint_pool.h" -#include "iree/hal/drivers/cuda/tracing.h" #include "iree/hal/utils/deferred_command_buffer.h" #include "iree/hal/utils/deferred_work_queue.h" #include 
"iree/hal/utils/file_transfer.h" #include "iree/hal/utils/memory_file.h" +#include "iree/hal/utils/stream_tracing.h" //===----------------------------------------------------------------------===// // iree_hal_cuda_device_t @@ -62,7 +62,7 @@ typedef struct iree_hal_cuda_device_t { // The CUstream used to issue device kernels and allocations. CUstream dispatch_cu_stream; - iree_hal_cuda_tracing_context_t* tracing_context; + iree_hal_stream_tracing_context_t* tracing_context; iree_allocator_t host_allocator; @@ -259,6 +259,108 @@ iree_hal_cuda_deferred_work_queue_device_interface_submit_command_buffer( return status; } +typedef struct iree_hal_cuda_tracing_device_interface_t { + iree_hal_stream_tracing_device_interface_t base; + CUdevice cu_device; + CUcontext cu_context; + CUstream dispatch_cu_stream; + iree_allocator_t host_allocator; + const iree_hal_cuda_dynamic_symbols_t* cuda_symbols; +} iree_hal_cuda_tracing_device_interface_t; +static const iree_hal_stream_tracing_device_interface_vtable_t + iree_hal_cuda_tracing_device_interface_vtable_t; + +void iree_hal_cuda_tracing_device_interface_destroy( + iree_hal_stream_tracing_device_interface_t* base_device_interface) { + iree_hal_cuda_tracing_device_interface_t* device_interface = + (iree_hal_cuda_tracing_device_interface_t*)base_device_interface; + + iree_allocator_free(device_interface->host_allocator, device_interface); +} + +iree_status_t iree_hal_cuda_tracing_device_interface_synchronize_native_event( + iree_hal_stream_tracing_device_interface_t* base_device_interface, + iree_hal_stream_tracing_native_event_t base_event) { + iree_hal_cuda_tracing_device_interface_t* device_interface = + (iree_hal_cuda_tracing_device_interface_t*)base_device_interface; + + return IREE_CURESULT_TO_STATUS(device_interface->cuda_symbols, + cuEventSynchronize((CUevent)base_event)); +} + +iree_status_t iree_hal_cuda_tracing_device_interface_create_native_event( + iree_hal_stream_tracing_device_interface_t* base_device_interface, + 
iree_hal_stream_tracing_native_event_t* base_event) { + iree_hal_cuda_tracing_device_interface_t* device_interface = + (iree_hal_cuda_tracing_device_interface_t*)base_device_interface; + + return IREE_CURESULT_TO_STATUS( + device_interface->cuda_symbols, + cuEventCreate((CUevent*)base_event, CU_EVENT_DEFAULT)); +} + +iree_status_t iree_hal_cuda_tracing_device_interface_query_native_event( + iree_hal_stream_tracing_device_interface_t* base_device_interface, + iree_hal_stream_tracing_native_event_t base_event) { + iree_hal_cuda_tracing_device_interface_t* device_interface = + (iree_hal_cuda_tracing_device_interface_t*)base_device_interface; + + return IREE_CURESULT_TO_STATUS(device_interface->cuda_symbols, + cuEventQuery((CUevent)base_event)); +} + +void iree_hal_cuda_tracing_device_interface_event_elapsed_time( + iree_hal_stream_tracing_device_interface_t* base_device_interface, + float* relative_millis, iree_hal_stream_tracing_native_event_t start_event, + iree_hal_stream_tracing_native_event_t end_event) { + iree_hal_cuda_tracing_device_interface_t* device_interface = + (iree_hal_cuda_tracing_device_interface_t*)base_device_interface; + + IREE_CUDA_IGNORE_ERROR( + device_interface->cuda_symbols, + cuEventElapsedTime(relative_millis, (CUevent)start_event, + (CUevent)end_event)); +} + +void iree_hal_cuda_tracing_device_interface_destroy_native_event( + iree_hal_stream_tracing_device_interface_t* base_device_interface, + iree_hal_stream_tracing_native_event_t base_event) { + iree_hal_cuda_tracing_device_interface_t* device_interface = + (iree_hal_cuda_tracing_device_interface_t*)base_device_interface; + + IREE_CUDA_IGNORE_ERROR(device_interface->cuda_symbols, + cuEventDestroy((CUevent)base_event)); +} + +iree_status_t iree_hal_cuda_tracing_device_interface_record_native_event( + iree_hal_stream_tracing_device_interface_t* base_device_interface, + iree_hal_stream_tracing_native_event_t base_event) { + iree_hal_cuda_tracing_device_interface_t* device_interface = + 
(iree_hal_cuda_tracing_device_interface_t*)base_device_interface; + + return IREE_CURESULT_TO_STATUS( + device_interface->cuda_symbols, + cuEventRecord((CUevent)base_event, device_interface->dispatch_cu_stream)); +} + +iree_status_t +iree_hal_cuda_tracing_device_interface_add_graph_event_record_node( + iree_hal_stream_tracing_device_interface_t* base_device_interface, + iree_hal_stream_tracing_native_graph_node_t* out_node, + iree_hal_stream_tracing_native_graph_t graph, + iree_hal_stream_tracing_native_graph_node_t* dependency_nodes, + size_t dependency_nodes_count, + iree_hal_stream_tracing_native_event_t event) { + iree_hal_cuda_tracing_device_interface_t* device_interface = + (iree_hal_cuda_tracing_device_interface_t*)base_device_interface; + + return IREE_CURESULT_TO_STATUS( + device_interface->cuda_symbols, + cuGraphAddEventRecordNode((CUgraphNode*)out_node, (CUgraph)graph, + (CUgraphNode*)dependency_nodes, + dependency_nodes_count, (CUevent)event)); +} + static iree_hal_cuda_device_t* iree_hal_cuda_device_cast( iree_hal_device_t* base_value) { IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_device_vtable); @@ -346,18 +448,36 @@ static iree_status_t iree_hal_cuda_device_create_internal( // Enable tracing for the (currently only) stream - no-op if disabled. 
if (iree_status_is_ok(status) && device->params.stream_tracing) { - if (device->params.stream_tracing >= IREE_HAL_CUDA_TRACING_VERBOSITY_MAX || - device->params.stream_tracing < IREE_HAL_CUDA_TRACING_VERBOSITY_OFF) { + if (device->params.stream_tracing >= IREE_HAL_TRACING_VERBOSITY_MAX || + device->params.stream_tracing < IREE_HAL_TRACING_VERBOSITY_OFF) { return iree_make_status( IREE_STATUS_INVALID_ARGUMENT, "invalid stream_tracing argument: expected to be between %d and %d", - IREE_HAL_CUDA_TRACING_VERBOSITY_OFF, - IREE_HAL_CUDA_TRACING_VERBOSITY_MAX); + IREE_HAL_TRACING_VERBOSITY_OFF, IREE_HAL_TRACING_VERBOSITY_MAX); + } + + iree_hal_cuda_tracing_device_interface_t* tracing_device_interface = NULL; + status = iree_allocator_malloc( + host_allocator, sizeof(iree_hal_cuda_tracing_device_interface_t), + (void**)&tracing_device_interface); + + if (IREE_UNLIKELY(!iree_status_is_ok(status))) { + iree_hal_device_release((iree_hal_device_t*)device); + return status; } - status = iree_hal_cuda_tracing_context_allocate( - device->cuda_symbols, device->identifier, dispatch_stream, - device->params.stream_tracing, &device->block_pool, host_allocator, - &device->tracing_context); + + tracing_device_interface->base.vtable = + &iree_hal_cuda_tracing_device_interface_vtable_t; + tracing_device_interface->cu_context = context; + tracing_device_interface->cu_device = cu_device; + tracing_device_interface->dispatch_cu_stream = dispatch_stream; + tracing_device_interface->host_allocator = host_allocator; + tracing_device_interface->cuda_symbols = cuda_symbols; + + status = iree_hal_stream_tracing_context_allocate( + (iree_hal_stream_tracing_device_interface_t*)tracing_device_interface, + device->identifier, device->params.stream_tracing, &device->block_pool, + host_allocator, &device->tracing_context); } // Memory pool support is conditional. 
@@ -505,7 +625,7 @@ static void iree_hal_cuda_device_destroy(iree_hal_device_t* base_device) { // Destroy memory pools that hold on to reserved memory. iree_hal_cuda_memory_pools_deinitialize(&device->memory_pools); - iree_hal_cuda_tracing_context_free(device->tracing_context); + iree_hal_stream_tracing_context_free(device->tracing_context); // Destroy various pools for synchronization. if (device->timepoint_pool) { @@ -947,8 +1067,8 @@ static iree_status_t iree_hal_cuda_device_queue_write( } static void iree_hal_cuda_device_collect_tracing_context(void* user_data) { - iree_hal_cuda_tracing_context_collect( - (iree_hal_cuda_tracing_context_t*)user_data); + iree_hal_stream_tracing_context_collect( + (iree_hal_stream_tracing_context_t*)user_data); } static iree_status_t iree_hal_cuda_device_queue_execute( @@ -1074,3 +1194,22 @@ static const iree_hal_deferred_work_queue_device_interface_vtable_t .submit_command_buffer = iree_hal_cuda_deferred_work_queue_device_interface_submit_command_buffer, }; + +static const iree_hal_stream_tracing_device_interface_vtable_t + iree_hal_cuda_tracing_device_interface_vtable_t = { + .destroy = iree_hal_cuda_tracing_device_interface_destroy, + .synchronize_native_event = + iree_hal_cuda_tracing_device_interface_synchronize_native_event, + .create_native_event = + iree_hal_cuda_tracing_device_interface_create_native_event, + .query_native_event = + iree_hal_cuda_tracing_device_interface_query_native_event, + .event_elapsed_time = + iree_hal_cuda_tracing_device_interface_event_elapsed_time, + .destroy_native_event = + iree_hal_cuda_tracing_device_interface_destroy_native_event, + .record_native_event = + iree_hal_cuda_tracing_device_interface_record_native_event, + .add_graph_event_record_node = + iree_hal_cuda_tracing_device_interface_add_graph_event_record_node, +}; diff --git a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c index e5b88df0286f..458faeca751c 100644 --- 
a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c +++ b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c @@ -15,9 +15,9 @@ #include "iree/hal/drivers/cuda/cuda_status_util.h" #include "iree/hal/drivers/cuda/native_executable.h" #include "iree/hal/drivers/cuda/pipeline_layout.h" -#include "iree/hal/drivers/cuda/tracing.h" #include "iree/hal/utils/collective_batch.h" #include "iree/hal/utils/resource_set.h" +#include "iree/hal/utils/stream_tracing.h" // The maximal number of CUDA graph nodes that can run concurrently between // barriers. @@ -32,8 +32,8 @@ typedef struct iree_hal_cuda_graph_command_buffer_t { const iree_hal_cuda_dynamic_symbols_t* symbols; // Per-stream CUDA tracing context. - iree_hal_cuda_tracing_context_t* tracing_context; - iree_hal_cuda_tracing_context_event_list_t tracing_event_list; + iree_hal_stream_tracing_context_t* tracing_context; + iree_hal_stream_tracing_context_event_list_t tracing_event_list; // A resource set to maintain references to all resources used within the // command buffer. @@ -96,12 +96,15 @@ static void iree_cuda_graph_command_buffer_trace_zone_begin_external( CUgraphNode* tracing_event_node = &command_buffer->cu_graph_nodes[command_buffer->graph_node_count++]; size_t dependency_count = command_buffer->cu_barrier_node ? 
1 : 0; - IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( + IREE_HAL_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - tracing_event_node, command_buffer->cu_graph, verbosity, - &command_buffer->cu_barrier_node, dependency_count, file_name, - file_name_length, line, function_name, function_name_length, name, - name_length); + (iree_hal_stream_tracing_native_graph_node_t*)tracing_event_node, + (iree_hal_stream_tracing_native_graph_t*)command_buffer->cu_graph, + verbosity, + (iree_hal_stream_tracing_native_graph_node_t*)&command_buffer + ->cu_barrier_node, + dependency_count, file_name, file_name_length, line, function_name, + function_name_length, name, name_length); // Move the barrier forward to make sure that the tracing event is recorded // before work starts. @@ -123,10 +126,14 @@ static void iree_cuda_graph_command_buffer_trace_zone_end( size_t dependency_count = command_buffer->cu_barrier_node ? 1 : 0; IREE_ASSERT_GT(dependency_count, 0, "ending a zone should at least depend on the beginning"); - IREE_CUDA_GRAPH_TRACE_ZONE_END( + IREE_HAL_GRAPH_TRACE_ZONE_END( command_buffer->tracing_context, &command_buffer->tracing_event_list, - tracing_event_node, command_buffer->cu_graph, verbosity, - &command_buffer->cu_barrier_node, dependency_count); + (iree_hal_stream_tracing_native_graph_node_t*)tracing_event_node, + (iree_hal_stream_tracing_native_graph_t*)command_buffer->cu_graph, + verbosity, + (iree_hal_stream_tracing_native_graph_node_t*)&command_buffer + ->cu_barrier_node, + dependency_count); // We need to wait on the tracing end before other work starts. // GPU tracing zones are first-in, last-out. 
@@ -161,7 +168,7 @@ static void iree_cuda_graph_command_buffer_trace_zone_end( iree_status_t iree_hal_cuda_graph_command_buffer_create( iree_hal_allocator_t* device_allocator, const iree_hal_cuda_dynamic_symbols_t* cuda_symbols, - iree_hal_cuda_tracing_context_t* tracing_context, CUcontext context, + iree_hal_stream_tracing_context_t* tracing_context, CUcontext context, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, @@ -230,8 +237,8 @@ static void iree_hal_cuda_graph_command_buffer_destroy( iree_allocator_t host_allocator = command_buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_cuda_tracing_free(command_buffer->tracing_context, - &command_buffer->tracing_event_list); + iree_hal_stream_tracing_free(command_buffer->tracing_context, + &command_buffer->tracing_event_list); // Drop any pending collective batches before we tear things down. iree_hal_collective_batch_clear(&command_buffer->collective_batch); @@ -278,8 +285,8 @@ void iree_hal_cuda_graph_tracing_notify_submitted_commands( return; } - iree_hal_cuda_tracing_notify_submitted(command_buffer->tracing_context, - &command_buffer->tracing_event_list); + iree_hal_stream_tracing_notify_submitted(command_buffer->tracing_context, + &command_buffer->tracing_event_list); } // Flushes any pending batched collective operations. 
@@ -339,7 +346,7 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_begin( cuGraphCreate(&command_buffer->cu_graph, /*flags=*/0), "cuGraphCreate"); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_COARSE); return iree_ok_status(); } @@ -354,7 +361,7 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_end( iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer)); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_COARSE); // Reset state used during recording. command_buffer->cu_barrier_node = NULL; @@ -389,7 +396,7 @@ static void iree_hal_cuda_graph_command_buffer_begin_debug_group( (void)command_buffer; IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE, + command_buffer, IREE_HAL_TRACING_VERBOSITY_COARSE, location ? location->file.data : NULL, location ? location->file.size : 0, location ? 
location->line : 0, /*func_name=*/NULL, 0, label.data, label.size); @@ -401,7 +408,7 @@ static void iree_hal_cuda_graph_command_buffer_end_debug_group( iree_hal_cuda_graph_command_buffer_cast(base_command_buffer); (void)command_buffer; IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_COARSE); } static iree_status_t @@ -515,7 +522,7 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_fill_buffer( iree_hal_cuda_graph_command_buffer_cast(base_command_buffer); IREE_TRACE_ZONE_BEGIN(z0); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer)); @@ -555,7 +562,7 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_fill_buffer( "cuGraphAddMemsetNode"); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -567,7 +574,7 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_update_buffer( iree_hal_cuda_graph_command_buffer_cast(base_command_buffer); IREE_TRACE_ZONE_BEGIN(z0); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer)); @@ -619,7 +626,7 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_update_buffer( "cuGraphAddMemcpyNode"); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -631,7 +638,7 @@ static iree_status_t 
iree_hal_cuda_graph_command_buffer_copy_buffer( iree_hal_cuda_graph_command_buffer_cast(base_command_buffer); IREE_TRACE_ZONE_BEGIN(z0); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer)); @@ -679,7 +686,7 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_copy_buffer( "cuGraphAddMemcpyNode"); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -776,7 +783,7 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch( executable, entry_point, &kernel_info)); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE, kernel_info.source_filename.data, kernel_info.source_filename.size, kernel_info.source_line, kernel_info.function_name.data, kernel_info.function_name.size, @@ -880,7 +887,7 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch( "cuGraphAddKernelNode"); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -913,7 +920,7 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch2( executable, entry_point, &kernel_info)); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE, kernel_info.source_filename.data, kernel_info.source_filename.size, kernel_info.source_line, kernel_info.function_name.data, kernel_info.function_name.size, /*name=*/NULL, 0); @@ -1006,7 +1013,7 @@ static 
iree_status_t iree_hal_cuda_graph_command_buffer_dispatch2( "cuGraphAddKernelNode"); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } diff --git a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.h b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.h index aa0d2cc3b352..cf101d3a7420 100644 --- a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.h +++ b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.h @@ -17,7 +17,8 @@ extern "C" { #endif // __cplusplus typedef struct iree_arena_block_pool_t iree_arena_block_pool_t; -typedef struct iree_hal_cuda_tracing_context_t iree_hal_cuda_tracing_context_t; +typedef struct iree_hal_stream_tracing_context_t + iree_hal_stream_tracing_context_t; // Creates a command buffer that records into a CUDA graph. // @@ -27,7 +28,7 @@ typedef struct iree_hal_cuda_tracing_context_t iree_hal_cuda_tracing_context_t; iree_status_t iree_hal_cuda_graph_command_buffer_create( iree_hal_allocator_t* device_allocator, const iree_hal_cuda_dynamic_symbols_t* cuda_symbols, - iree_hal_cuda_tracing_context_t* tracing_context, CUcontext context, + iree_hal_stream_tracing_context_t* tracing_context, CUcontext context, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, diff --git a/runtime/src/iree/hal/drivers/cuda/nccl_channel.c b/runtime/src/iree/hal/drivers/cuda/nccl_channel.c index 2f6eb3fe93c6..5a001da0d722 100644 --- a/runtime/src/iree/hal/drivers/cuda/nccl_channel.c +++ b/runtime/src/iree/hal/drivers/cuda/nccl_channel.c @@ -541,8 +541,8 @@ static iree_status_t iree_hal_cuda_nccl_submit_batch_entry( iree_status_t iree_hal_cuda_nccl_submit_batch( const iree_hal_cuda_nccl_dynamic_symbols_t* symbols, - iree_hal_cuda_tracing_context_t* tracing_context, - 
iree_hal_cuda_tracing_context_event_list_t* tracing_event_list, + iree_hal_stream_tracing_context_t* tracing_context, + iree_hal_stream_tracing_context_event_list_t* tracing_event_list, const iree_hal_collective_batch_t* batch, CUstream stream) { IREE_ASSERT_ARGUMENT(symbols); IREE_ASSERT_ARGUMENT(batch); @@ -558,11 +558,10 @@ iree_status_t iree_hal_cuda_nccl_submit_batch( iree_hal_collective_batch_entry_t* entry = &batch->entries[i]; iree_string_view_t collective_str = iree_hal_collective_op_format(&entry->op, &string_temp); - IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( - tracing_context, tracing_event_list, stream, - IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, __FILE__, strlen(__FILE__), - (uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__), - collective_str.data, collective_str.size); + IREE_HAL_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( + tracing_context, tracing_event_list, IREE_HAL_TRACING_VERBOSITY_FINE, + __FILE__, strlen(__FILE__), (uint32_t)__LINE__, __FUNCTION__, + strlen(__FUNCTION__), collective_str.data, collective_str.size); } #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE @@ -579,8 +578,8 @@ iree_status_t iree_hal_cuda_nccl_submit_batch( // End all zones we began above - note that these are just simply nested so // order doesn't matter so long as we end the right number of zones. 
for (iree_host_size_t i = 0; i < batch->count; ++i) { - IREE_CUDA_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list, stream, - IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); + IREE_HAL_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list, + IREE_HAL_TRACING_VERBOSITY_FINE); } #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE diff --git a/runtime/src/iree/hal/drivers/cuda/nccl_channel.h b/runtime/src/iree/hal/drivers/cuda/nccl_channel.h index 182bc5ca44e1..2215355c5d50 100644 --- a/runtime/src/iree/hal/drivers/cuda/nccl_channel.h +++ b/runtime/src/iree/hal/drivers/cuda/nccl_channel.h @@ -12,8 +12,8 @@ #include "iree/hal/drivers/cuda/api.h" #include "iree/hal/drivers/cuda/cuda_dynamic_symbols.h" #include "iree/hal/drivers/cuda/nccl_dynamic_symbols.h" -#include "iree/hal/drivers/cuda/tracing.h" #include "iree/hal/utils/collective_batch.h" +#include "iree/hal/utils/stream_tracing.h" #ifdef __cplusplus extern "C" { @@ -48,8 +48,8 @@ iree_status_t iree_hal_cuda_nccl_channel_create( // Note that operations in the batch may apply to different channels. iree_status_t iree_hal_cuda_nccl_submit_batch( const iree_hal_cuda_nccl_dynamic_symbols_t* nccl_symbols, - iree_hal_cuda_tracing_context_t* tracing_context, - iree_hal_cuda_tracing_context_event_list_t* tracing_event_list, + iree_hal_stream_tracing_context_t* tracing_context, + iree_hal_stream_tracing_context_event_list_t* tracing_event_list, const iree_hal_collective_batch_t* batch, CUstream stream); #ifdef __cplusplus diff --git a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c index 4b8a0b106f8f..927d72f0e0ea 100644 --- a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c +++ b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c @@ -22,8 +22,8 @@ typedef struct iree_hal_cuda_stream_command_buffer_t { const iree_hal_cuda_nccl_dynamic_symbols_t* nccl_symbols; // Per-stream CUDA tracing context. 
- iree_hal_cuda_tracing_context_t* tracing_context; - iree_hal_cuda_tracing_context_event_list_t tracing_event_list; + iree_hal_stream_tracing_context_t* tracing_context; + iree_hal_stream_tracing_context_event_list_t tracing_event_list; CUstream cu_stream; @@ -60,7 +60,7 @@ iree_status_t iree_hal_cuda_stream_command_buffer_create( iree_hal_allocator_t* device_allocator, const iree_hal_cuda_dynamic_symbols_t* cuda_symbols, const iree_hal_cuda_nccl_dynamic_symbols_t* nccl_symbols, - iree_hal_cuda_tracing_context_t* tracing_context, + iree_hal_stream_tracing_context_t* tracing_context, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_host_size_t binding_capacity, CUstream stream, @@ -123,8 +123,8 @@ static void iree_hal_cuda_stream_command_buffer_destroy( iree_allocator_t host_allocator = command_buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_cuda_tracing_free(command_buffer->tracing_context, - &command_buffer->tracing_event_list); + iree_hal_stream_tracing_free(command_buffer->tracing_context, + &command_buffer->tracing_event_list); iree_hal_collective_batch_deinitialize(&command_buffer->collective_batch); iree_hal_resource_set_free(command_buffer->resource_set); @@ -148,8 +148,8 @@ void iree_hal_cuda_stream_notify_submitted_commands( return; } - iree_hal_cuda_tracing_notify_submitted(command_buffer->tracing_context, - &command_buffer->tracing_event_list); + iree_hal_stream_tracing_notify_submitted(command_buffer->tracing_context, + &command_buffer->tracing_event_list); } // Flushes any pending batched collective operations. 
@@ -180,9 +180,9 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_begin( iree_hal_cuda_stream_command_buffer_cast(base_command_buffer); (void)command_buffer; - IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( + IREE_HAL_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE, + IREE_HAL_TRACING_VERBOSITY_COARSE, /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_cuda_stream_command_buffer", strlen("iree_hal_cuda_stream_command_buffer"), /*name=*/NULL, 0); @@ -217,9 +217,9 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_end( command_buffer->resource_set, &command_buffer->collective_batch); - IREE_CUDA_STREAM_TRACE_ZONE_END( - command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE); + IREE_HAL_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, + IREE_HAL_TRACING_VERBOSITY_COARSE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); @@ -233,11 +233,10 @@ static void iree_hal_cuda_stream_command_buffer_begin_debug_group( iree_hal_cuda_stream_command_buffer_cast(base_command_buffer); (void)command_buffer; - IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( + IREE_HAL_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE, - location ? location->file.data : NULL, location ? location->file.size : 0, - location ? location->line : 0, + IREE_HAL_TRACING_VERBOSITY_COARSE, location ? location->file.data : NULL, + location ? location->file.size : 0, location ? location->line : 0, /*func_name=*/NULL, 0, label.data, label.size); // TODO: pass along to CUPTI if available. @@ -251,9 +250,9 @@ static void iree_hal_cuda_stream_command_buffer_end_debug_group( // TODO: pass along to CUPTI if available. 
- IREE_CUDA_STREAM_TRACE_ZONE_END( - command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE); + IREE_HAL_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, + IREE_HAL_TRACING_VERBOSITY_COARSE); } static iree_status_t iree_hal_cuda_stream_command_buffer_execution_barrier( @@ -549,12 +548,11 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_dispatch( z0, iree_hal_cuda_native_executable_entry_point_kernel_info( executable, entry_point, &kernel_info)); - IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( + IREE_HAL_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, - kernel_info.source_filename.data, kernel_info.source_filename.size, - kernel_info.source_line, kernel_info.function_name.data, - kernel_info.function_name.size, + IREE_HAL_TRACING_VERBOSITY_FINE, kernel_info.source_filename.data, + kernel_info.source_filename.size, kernel_info.source_line, + kernel_info.function_name.data, kernel_info.function_name.size, /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( @@ -636,9 +634,9 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_dispatch( params_ptr, NULL), "cuLaunchKernel"); - IREE_CUDA_STREAM_TRACE_ZONE_END( - command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); + IREE_HAL_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, + IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); @@ -672,12 +670,12 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_dispatch2( z0, iree_hal_cuda_native_executable_entry_point_kernel_info( executable, entry_point, &kernel_info)); - IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( + 
IREE_HAL_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, - kernel_info.source_filename.data, kernel_info.source_filename.size, - kernel_info.source_line, kernel_info.function_name.data, - kernel_info.function_name.size, /*name=*/NULL, 0); + IREE_HAL_TRACING_VERBOSITY_FINE, kernel_info.source_filename.data, + kernel_info.source_filename.size, kernel_info.source_line, + kernel_info.function_name.data, kernel_info.function_name.size, + /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, @@ -749,9 +747,9 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_dispatch2( command_buffer->cu_stream, params_ptr, NULL), "cuLaunchKernel"); - IREE_CUDA_STREAM_TRACE_ZONE_END( - command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); + IREE_HAL_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, + IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); diff --git a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.h b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.h index 47a9fdbaaa56..f71d7490f9fb 100644 --- a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.h +++ b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.h @@ -12,7 +12,7 @@ #include "iree/hal/drivers/cuda/cuda_dynamic_symbols.h" #include "iree/hal/drivers/cuda/cuda_headers.h" #include "iree/hal/drivers/cuda/nccl_dynamic_symbols.h" -#include "iree/hal/drivers/cuda/tracing.h" +#include "iree/hal/utils/stream_tracing.h" #ifdef __cplusplus extern "C" { @@ -33,7 +33,7 @@ iree_status_t iree_hal_cuda_stream_command_buffer_create( iree_hal_allocator_t* device_allocator, const iree_hal_cuda_dynamic_symbols_t* cuda_symbols, const 
iree_hal_cuda_nccl_dynamic_symbols_t* nccl_symbols, - iree_hal_cuda_tracing_context_t* tracing_context, + iree_hal_stream_tracing_context_t* tracing_context, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_host_size_t binding_capacity, CUstream stream, diff --git a/runtime/src/iree/hal/drivers/cuda/tracing.h b/runtime/src/iree/hal/drivers/cuda/tracing.h deleted file mode 100644 index 1174f778337c..000000000000 --- a/runtime/src/iree/hal/drivers/cuda/tracing.h +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_HAL_DRIVERS_CUDA_TRACING_H_ -#define IREE_HAL_DRIVERS_CUDA_TRACING_H_ - -#include "iree/base/api.h" -#include "iree/base/internal/arena.h" -#include "iree/base/tracing.h" -#include "iree/hal/api.h" -#include "iree/hal/drivers/cuda/cuda_dynamic_symbols.h" -#include "iree/hal/drivers/cuda/cuda_headers.h" - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -// Per-stream CUDA tracing context. -// No-op if IREE tracing is not enabled. -// -// Use the IREE_CUDA_TRACE_* macros to trace a contiguous set of stream -// operations. Unlike the normal tracy macros there are no zone IDs and instead -// each stream gets an ID allocated once and passed to all tracing macros. -// -// Usage: -// IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(queue->tracing_context, stream); -// cuLaunchKernel(..., stream); -// IREE_CUDA_STREAM_TRACE_ZONE_END(queue->tracing_context, stream); -// ... -// iree_hal_cuda_tracing_context_collect(queue->tracing_context); -// -// NOTE: timestamps can have non-trivial side-effecting behavior and may -// introduce serialization in graph execution. -// -// TODO(benvanik): expose CUevent reservation separate from recording. 
For -// graphs we will need to insert the events but in order to reuse the graphs -// we'll need to reserve and patch new events each graph launch. -// -// Thread-compatible: external synchronization is required if using from -// multiple threads (same as with CUstream itself). -typedef struct iree_hal_cuda_tracing_context_t iree_hal_cuda_tracing_context_t; -typedef struct iree_hal_cuda_tracing_context_event_t - iree_hal_cuda_tracing_context_event_t; - -// This is used when tracing is enabled. Calls to dispatch and event related -// functions will update the pointers to keep the list up to date. -typedef struct iree_hal_cuda_tracing_context_event_list_t { - iree_hal_cuda_tracing_context_event_t* head; - iree_hal_cuda_tracing_context_event_t* tail; -} iree_hal_cuda_tracing_context_event_list_t; - -typedef enum iree_hal_cuda_tracing_verbosity_e { - IREE_HAL_CUDA_TRACING_VERBOSITY_OFF = 0, - IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE, - IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, - IREE_HAL_CUDA_TRACING_VERBOSITY_MAX -} iree_hal_cuda_tracing_verbosity_t; - -// Allocates a tracing context for the given CUDA |stream|. -// Each context must only be used with the stream it was created for. -iree_status_t iree_hal_cuda_tracing_context_allocate( - const iree_hal_cuda_dynamic_symbols_t* symbols, - iree_string_view_t queue_name, CUstream stream, - iree_hal_cuda_tracing_verbosity_t stream_tracing_verbosity, - iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, - iree_hal_cuda_tracing_context_t** out_context); - -// Frees a tracing context and all associated CUDA resources. -// All submissions using the resources must be completed prior to calling. -void iree_hal_cuda_tracing_context_free( - iree_hal_cuda_tracing_context_t* context); - -// Collects in-flight timestamp queries from the stream and feeds them to tracy. -// Must be called frequently (every submission, etc) to drain the backlog; -// tracing may start failing if the internal ringbuffer is exceeded. 
-void iree_hal_cuda_tracing_context_collect( - iree_hal_cuda_tracing_context_t* context); - -// Notifies that the given list of events has been dispached on to the gpu. -void iree_hal_cuda_tracing_notify_submitted( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list); - -// Frees the events and returns them back into the tracing context. -void iree_hal_cuda_tracing_free( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list); - -#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE - -// Begins a normal zone derived on the calling |src_loc|. -// Must be perfectly nested and paired with a corresponding zone end. -void iree_hal_cuda_stream_tracing_zone_begin_impl( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, - iree_hal_cuda_tracing_verbosity_t verbosity, - const iree_tracing_location_t* src_loc); - -// Begins an external zone using the given source information. -// The provided strings will be copied into the tracy buffer. 
-void iree_hal_cuda_stream_tracing_zone_begin_external_impl( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, - iree_hal_cuda_tracing_verbosity_t verbosity, const char* file_name, - size_t file_name_length, uint32_t line, const char* function_name, - size_t function_name_length, const char* name, size_t name_length); - -void iree_hal_cuda_graph_tracing_zone_begin_external_impl( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, - CUgraphNode* out_node, CUgraph graph, - iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes, - size_t dependency_nodes_count, const char* file_name, - size_t file_name_length, uint32_t line, const char* function_name, - size_t function_name_length, const char* name, size_t name_length); - -void iree_hal_cuda_stream_tracing_zone_end_impl( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, - iree_hal_cuda_tracing_verbosity_t verbosity); -void iree_hal_cuda_graph_tracing_zone_end_impl( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, - CUgraphNode* out_node, CUgraph graph, - iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes, - size_t dependency_nodes_count); - -// Begins a new zone with the parent function name. -#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, event_list_begin, \ - event_list_end, stream, verbosity) \ - static const iree_tracing_location_t TracyConcat( \ - __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \ - (uint32_t)__LINE__, 0}; \ - iree_hal_cuda_stream_tracing_zone_begin_impl( \ - context, event_list_begin, event_list_end, stream, verbosity, \ - &TracyConcat(__tracy_source_location, __LINE__)); - -// Begins an externally defined zone with a dynamic source location. 
-// The |file_name|, |function_name|, and optional |name| strings will be copied -// into the trace buffer and do not need to persist. -#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, stream, verbosity, file_name, file_name_length, line, \ - function_name, function_name_length, name, name_length) \ - iree_hal_cuda_stream_tracing_zone_begin_external_impl( \ - context, event_list, stream, verbosity, file_name, file_name_length, \ - line, function_name, function_name_length, name, name_length) -#define IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, out_node, graph, verbosity, dependency_nodes, \ - dependency_nodes_count, file_name, file_name_length, line, function_name, \ - function_name_length, name, name_length) \ - iree_hal_cuda_graph_tracing_zone_begin_external_impl( \ - context, event_list, out_node, graph, verbosity, dependency_nodes, \ - dependency_nodes_count, file_name, file_name_length, line, \ - function_name, function_name_length, name, name_length) - -#define IREE_CUDA_STREAM_TRACE_ZONE_END(context, event_list, stream, \ - verbosity) \ - iree_hal_cuda_stream_tracing_zone_end_impl(context, event_list, stream, \ - verbosity) -#define IREE_CUDA_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \ - verbosity, dependency_nodes, \ - dependency_nodes_count) \ - iree_hal_cuda_graph_tracing_zone_end_impl( \ - context, event_list, out_node, graph, verbosity, dependency_nodes, \ - dependency_nodes_count) -#else - -#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream, \ - verbosity) -#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, stream, verbosity, file_name, file_name_length, line, \ - function_name, function_name_length, name, name_length) -#define IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, out_node, graph, verbosity, dependency_nodes, \ - dependency_nodes_count, file_name, file_name_length, line, function_name, \ - 
function_name_length, name, name_length) -#define IREE_CUDA_STREAM_TRACE_ZONE_END(context, event_list, stream, verbosity) - -#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - -#endif // IREE_HAL_DRIVERS_CUDA_TRACING_H_ diff --git a/runtime/src/iree/hal/drivers/hip/CMakeLists.txt b/runtime/src/iree/hal/drivers/hip/CMakeLists.txt index f48d3e1ce69c..cb9b29e571af 100644 --- a/runtime/src/iree/hal/drivers/hip/CMakeLists.txt +++ b/runtime/src/iree/hal/drivers/hip/CMakeLists.txt @@ -48,8 +48,6 @@ iree_cc_library( "stream_command_buffer.h" "timepoint_pool.c" "timepoint_pool.h" - "tracing.c" - "tracing.h" INCLUDES "${HIP_API_HEADERS_ROOT}" DEPS @@ -72,6 +70,7 @@ iree_cc_library( iree::hal::utils::memory_file iree::hal::utils::resource_set iree::hal::utils::semaphore_base + iree::hal::utils::stream_tracing iree::schemas::rocm_executable_def_c_fbs PUBLIC ) diff --git a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c index afade26d4be0..f0c86aba8b37 100644 --- a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c +++ b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c @@ -16,9 +16,9 @@ #include "iree/hal/drivers/hip/native_executable.h" #include "iree/hal/drivers/hip/pipeline_layout.h" #include "iree/hal/drivers/hip/status_util.h" -#include "iree/hal/drivers/hip/tracing.h" #include "iree/hal/utils/collective_batch.h" #include "iree/hal/utils/resource_set.h" +#include "iree/hal/utils/stream_tracing.h" // The maximal number of HIP graph nodes that can run concurrently between // barriers. @@ -33,8 +33,8 @@ typedef struct iree_hal_hip_graph_command_buffer_t { const iree_hal_hip_dynamic_symbols_t* symbols; // Per-stream HIP tracing context. 
- iree_hal_hip_tracing_context_t* tracing_context; - iree_hal_hip_tracing_context_event_list_t tracing_event_list; + iree_hal_stream_tracing_context_t* tracing_context; + iree_hal_stream_tracing_context_event_list_t tracing_event_list; // A resource set to maintain references to all resources used within the // command buffer. @@ -84,7 +84,7 @@ iree_hal_hip_graph_command_buffer_cast(iree_hal_command_buffer_t* base_value) { static void iree_hip_graph_command_buffer_trace_zone_begin_external( iree_hal_hip_graph_command_buffer_t* command_buffer, - iree_hal_hip_tracing_verbosity_t verbosity, const char* file_name, + iree_hal_stream_tracing_verbosity_t verbosity, const char* file_name, size_t file_name_length, uint32_t line, const char* function_name, size_t function_name_length, const char* name, size_t name_length) { // Make sure there are no new nodes after the last barrier. @@ -97,12 +97,15 @@ static void iree_hip_graph_command_buffer_trace_zone_begin_external( hipGraphNode_t* tracing_event_node = &command_buffer->hip_graph_nodes[command_buffer->graph_node_count++]; size_t dependency_count = command_buffer->hip_barrier_node ? 1 : 0; - IREE_HIP_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( + IREE_HAL_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - tracing_event_node, command_buffer->hip_graph, verbosity, - &command_buffer->hip_barrier_node, dependency_count, file_name, - file_name_length, line, function_name, function_name_length, name, - name_length); + (iree_hal_stream_tracing_native_graph_node_t*)tracing_event_node, + (iree_hal_stream_tracing_native_graph_t*)command_buffer->hip_graph, + verbosity, + (iree_hal_stream_tracing_native_graph_node_t*)&command_buffer + ->hip_barrier_node, + dependency_count, file_name, file_name_length, line, function_name, + function_name_length, name, name_length); // Move the barrier forward to make sure that the tracing event is recorded // before work starts. 
@@ -112,7 +115,7 @@ static void iree_hip_graph_command_buffer_trace_zone_begin_external( static void iree_hip_graph_command_buffer_trace_zone_end( iree_hal_hip_graph_command_buffer_t* command_buffer, - iree_hal_hip_tracing_verbosity_t verbosity) { + iree_hal_stream_tracing_verbosity_t verbosity) { // Make sure there are no new nodes after the last barrier. // Prior work should end before the tracing event is recorded. if (IREE_UNLIKELY(command_buffer->graph_node_count != 0)) { @@ -125,10 +128,14 @@ static void iree_hip_graph_command_buffer_trace_zone_end( size_t dependency_count = command_buffer->hip_barrier_node ? 1 : 0; IREE_ASSERT_GT(dependency_count, 0, "ending a zone should at least depend on the beginning"); - IREE_HIP_GRAPH_TRACE_ZONE_END( + IREE_HAL_GRAPH_TRACE_ZONE_END( command_buffer->tracing_context, &command_buffer->tracing_event_list, - tracing_event_node, command_buffer->hip_graph, verbosity, - &command_buffer->hip_barrier_node, dependency_count); + (iree_hal_stream_tracing_native_graph_node_t*)tracing_event_node, + (iree_hal_stream_tracing_native_graph_t*)command_buffer->hip_graph, + verbosity, + (iree_hal_stream_tracing_native_graph_node_t*)&command_buffer + ->hip_barrier_node, + dependency_count); // We need to wait on the tracing end before other work starts. // GPU tracing zones are first-in, last-out. 
@@ -164,7 +171,7 @@ static void iree_hip_graph_command_buffer_trace_zone_end( iree_status_t iree_hal_hip_graph_command_buffer_create( iree_hal_allocator_t* device_allocator, const iree_hal_hip_dynamic_symbols_t* hip_symbols, - iree_hal_hip_tracing_context_t* tracing_context, hipCtx_t context, + iree_hal_stream_tracing_context_t* tracing_context, hipCtx_t context, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, @@ -235,8 +242,8 @@ static void iree_hal_hip_graph_command_buffer_destroy( iree_allocator_t host_allocator = command_buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_hip_tracing_free(command_buffer->tracing_context, - &command_buffer->tracing_event_list); + iree_hal_stream_tracing_free(command_buffer->tracing_context, + &command_buffer->tracing_event_list); // Drop any pending collective batches before we tear things down. iree_hal_collective_batch_clear(&command_buffer->collective_batch); @@ -283,8 +290,8 @@ void iree_hal_hip_graph_tracing_notify_submitted_commands( return; } - iree_hal_hip_tracing_notify_submitted(command_buffer->tracing_context, - &command_buffer->tracing_event_list); + iree_hal_stream_tracing_notify_submitted(command_buffer->tracing_context, + &command_buffer->tracing_event_list); } // Flushes any pending batched collective operations. 
@@ -345,7 +352,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_begin( "hipGraphCreate"); IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE, + command_buffer, IREE_HAL_TRACING_VERBOSITY_COARSE, /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_hip_graph_command_buffer", strlen("iree_hal_hip_graph_command_buffer"), /*name=*/NULL, 0); @@ -363,7 +370,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_end( iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer)); IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_COARSE); // Reset state used during recording. command_buffer->hip_barrier_node = NULL; @@ -398,7 +405,7 @@ static void iree_hal_hip_graph_command_buffer_begin_debug_group( (void)command_buffer; IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE, + command_buffer, IREE_HAL_TRACING_VERBOSITY_COARSE, location ? location->file.data : NULL, location ? location->file.size : 0, location ? 
location->line : 0, /*func_name=*/NULL, 0, label.data, label.size); @@ -410,7 +417,7 @@ static void iree_hal_hip_graph_command_buffer_end_debug_group( iree_hal_hip_graph_command_buffer_cast(base_command_buffer); (void)command_buffer; IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_COARSE); } static iree_status_t @@ -524,7 +531,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_fill_buffer( iree_hal_hip_graph_command_buffer_cast(base_command_buffer); IREE_TRACE_ZONE_BEGIN(z0); IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer)); @@ -562,8 +569,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_fill_buffer( dependency_count, ¶ms), "hipGraphAddMemsetNode"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, + IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -580,7 +587,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_update_buffer( } IREE_TRACE_ZONE_BEGIN(z0); IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer)); @@ -632,8 +639,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_update_buffer( dependency_count, ¶ms, command_buffer->hip_context), "hipDrvGraphAddMemcpyNode"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, + 
IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -650,7 +657,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_copy_buffer( } IREE_TRACE_ZONE_BEGIN(z0); IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer)); @@ -696,8 +703,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_copy_buffer( dependency_count, ¶ms, command_buffer->hip_context), "hipDrvGraphAddMemcpyNode"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, + IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -801,7 +808,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_dispatch( executable, entry_point, &kernel_info)); IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE, + command_buffer, IREE_HAL_TRACING_VERBOSITY_FINE, kernel_info.source_filename.data, kernel_info.source_filename.size, kernel_info.source_line, kernel_info.function_name.data, kernel_info.function_name.size, @@ -889,8 +896,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_dispatch( dependency_count, ¶ms), "hipGraphAddKernelNode"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, + IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -923,7 +930,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_dispatch2( executable, entry_point, &kernel_info)); IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE, + command_buffer, 
IREE_HAL_TRACING_VERBOSITY_FINE, kernel_info.source_filename.data, kernel_info.source_filename.size, kernel_info.source_line, kernel_info.function_name.data, kernel_info.function_name.size, /*name=*/NULL, 0); @@ -1006,8 +1013,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_dispatch2( dependency_count, ¶ms), "hipGraphAddKernelNode"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( - command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, + IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } diff --git a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.h b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.h index 9701ee86ea9e..424d780521c8 100644 --- a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.h +++ b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.h @@ -21,7 +21,8 @@ extern "C" { // changes and may have outstanding issues. typedef struct iree_arena_block_pool_t iree_arena_block_pool_t; -typedef struct iree_hal_hip_tracing_context_t iree_hal_hip_tracing_context_t; +typedef struct iree_hal_stream_tracing_context_t + iree_hal_stream_tracing_context_t; // Creates a command buffer that records into a HIP graph. 
// @@ -30,7 +31,7 @@ typedef struct iree_hal_hip_tracing_context_t iree_hal_hip_tracing_context_t; iree_status_t iree_hal_hip_graph_command_buffer_create( iree_hal_allocator_t* device_allocator, const iree_hal_hip_dynamic_symbols_t* hip_symbols, - iree_hal_hip_tracing_context_t* tracing_context, hipCtx_t context, + iree_hal_stream_tracing_context_t* tracing_context, hipCtx_t context, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c index df1592ee44b6..1e8e978b6643 100644 --- a/runtime/src/iree/hal/drivers/hip/hip_device.c +++ b/runtime/src/iree/hal/drivers/hip/hip_device.c @@ -27,11 +27,11 @@ #include "iree/hal/drivers/hip/status_util.h" #include "iree/hal/drivers/hip/stream_command_buffer.h" #include "iree/hal/drivers/hip/timepoint_pool.h" -#include "iree/hal/drivers/hip/tracing.h" #include "iree/hal/utils/deferred_command_buffer.h" #include "iree/hal/utils/deferred_work_queue.h" #include "iree/hal/utils/file_transfer.h" #include "iree/hal/utils/memory_file.h" +#include "iree/hal/utils/stream_tracing.h" //===----------------------------------------------------------------------===// // iree_hal_hip_device_t @@ -63,7 +63,7 @@ typedef struct iree_hal_hip_device_t { // The hipStream_t used to issue device kernels and allocations. 
hipStream_t hip_dispatch_stream; - iree_hal_hip_tracing_context_t* tracing_context; + iree_hal_stream_tracing_context_t* tracing_context; iree_allocator_t host_allocator; @@ -257,6 +257,108 @@ iree_hal_hip_deferred_work_queue_device_interface_submit_command_buffer( return status; } +typedef struct iree_hal_hip_tracing_device_interface_t { + iree_hal_stream_tracing_device_interface_t base; + hipDevice_t device; + hipCtx_t context; + hipStream_t dispatch_stream; + iree_allocator_t host_allocator; + const iree_hal_hip_dynamic_symbols_t* hip_symbols; +} iree_hal_hip_tracing_device_interface_t; +static const iree_hal_stream_tracing_device_interface_vtable_t + iree_hal_hip_tracing_device_interface_vtable_t; + +void iree_hal_hip_tracing_device_interface_destroy( + iree_hal_stream_tracing_device_interface_t* base_device_interface) { + iree_hal_hip_tracing_device_interface_t* device_interface = + (iree_hal_hip_tracing_device_interface_t*)base_device_interface; + + iree_allocator_free(device_interface->host_allocator, device_interface); +} + +iree_status_t iree_hal_hip_tracing_device_interface_synchronize_native_event( + iree_hal_stream_tracing_device_interface_t* base_device_interface, + iree_hal_stream_tracing_native_event_t base_event) { + iree_hal_hip_tracing_device_interface_t* device_interface = + (iree_hal_hip_tracing_device_interface_t*)base_device_interface; + + return IREE_HIP_RESULT_TO_STATUS(device_interface->hip_symbols, + hipEventSynchronize((hipEvent_t)base_event)); +} + +iree_status_t iree_hal_hip_tracing_device_interface_create_native_event( + iree_hal_stream_tracing_device_interface_t* base_device_interface, + iree_hal_stream_tracing_native_event_t* base_event) { + iree_hal_hip_tracing_device_interface_t* device_interface = + (iree_hal_hip_tracing_device_interface_t*)base_device_interface; + + return IREE_HIP_RESULT_TO_STATUS( + device_interface->hip_symbols, + hipEventCreateWithFlags((hipEvent_t*)base_event, hipEventDefault)); +} + +iree_status_t 
iree_hal_hip_tracing_device_interface_query_native_event( + iree_hal_stream_tracing_device_interface_t* base_device_interface, + iree_hal_stream_tracing_native_event_t base_event) { + iree_hal_hip_tracing_device_interface_t* device_interface = + (iree_hal_hip_tracing_device_interface_t*)base_device_interface; + + return IREE_HIP_RESULT_TO_STATUS(device_interface->hip_symbols, + hipEventQuery((hipEvent_t)base_event)); +} + +void iree_hal_hip_tracing_device_interface_event_elapsed_time( + iree_hal_stream_tracing_device_interface_t* base_device_interface, + float* relative_millis, iree_hal_stream_tracing_native_event_t start_event, + iree_hal_stream_tracing_native_event_t end_event) { + iree_hal_hip_tracing_device_interface_t* device_interface = + (iree_hal_hip_tracing_device_interface_t*)base_device_interface; + + IREE_HIP_IGNORE_ERROR( + device_interface->hip_symbols, + hipEventElapsedTime(relative_millis, (hipEvent_t)start_event, + (hipEvent_t)end_event)); +} + +void iree_hal_hip_tracing_device_interface_destroy_native_event( + iree_hal_stream_tracing_device_interface_t* base_device_interface, + iree_hal_stream_tracing_native_event_t base_event) { + iree_hal_hip_tracing_device_interface_t* device_interface = + (iree_hal_hip_tracing_device_interface_t*)base_device_interface; + + IREE_HIP_IGNORE_ERROR(device_interface->hip_symbols, + hipEventDestroy((hipEvent_t)base_event)); +} + +iree_status_t iree_hal_hip_tracing_device_interface_record_native_event( + iree_hal_stream_tracing_device_interface_t* base_device_interface, + iree_hal_stream_tracing_native_event_t base_event) { + iree_hal_hip_tracing_device_interface_t* device_interface = + (iree_hal_hip_tracing_device_interface_t*)base_device_interface; + + return IREE_HIP_RESULT_TO_STATUS( + device_interface->hip_symbols, + hipEventRecord((hipEvent_t)base_event, + (hipStream_t)device_interface->dispatch_stream)); +} + +iree_status_t iree_hal_hip_tracing_device_interface_add_graph_event_record_node( + 
iree_hal_stream_tracing_device_interface_t* base_device_interface, + iree_hal_stream_tracing_native_graph_node_t* out_node, + iree_hal_stream_tracing_native_graph_t graph, + iree_hal_stream_tracing_native_graph_node_t* dependency_nodes, + size_t dependency_nodes_count, + iree_hal_stream_tracing_native_event_t event) { + iree_hal_hip_tracing_device_interface_t* device_interface = + (iree_hal_hip_tracing_device_interface_t*)base_device_interface; + + return IREE_HIP_RESULT_TO_STATUS( + device_interface->hip_symbols, + hipGraphAddEventRecordNode((hipGraphNode_t*)out_node, (hipGraph_t)graph, + (hipGraphNode_t*)dependency_nodes, + dependency_nodes_count, (hipEvent_t)event)); +} + static iree_hal_hip_device_t* iree_hal_hip_device_cast( iree_hal_device_t* base_value) { IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_hip_device_vtable); @@ -344,18 +446,36 @@ static iree_status_t iree_hal_hip_device_create_internal( // Enable tracing for the (currently only) stream - no-op if disabled. if (iree_status_is_ok(status) && device->params.stream_tracing) { - if (device->params.stream_tracing >= IREE_HAL_HIP_TRACING_VERBOSITY_MAX || - device->params.stream_tracing < IREE_HAL_HIP_TRACING_VERBOSITY_OFF) { + if (device->params.stream_tracing >= IREE_HAL_TRACING_VERBOSITY_MAX || + device->params.stream_tracing < IREE_HAL_TRACING_VERBOSITY_OFF) { return iree_make_status( IREE_STATUS_INVALID_ARGUMENT, "invalid stream_tracing argument: expected to be between %d and %d", - IREE_HAL_HIP_TRACING_VERBOSITY_OFF, - IREE_HAL_HIP_TRACING_VERBOSITY_MAX); + IREE_HAL_TRACING_VERBOSITY_OFF, IREE_HAL_TRACING_VERBOSITY_MAX); + } + + iree_hal_hip_tracing_device_interface_t* tracing_device_interface = NULL; + status = iree_allocator_malloc( + host_allocator, sizeof(iree_hal_hip_tracing_device_interface_t), + (void**)&tracing_device_interface); + + if (IREE_UNLIKELY(!iree_status_is_ok(status))) { + iree_hal_device_release((iree_hal_device_t*)device); + return status; } - status = 
iree_hal_hip_tracing_context_allocate( - device->hip_symbols, device->identifier, dispatch_stream, - device->params.stream_tracing, &device->block_pool, host_allocator, - &device->tracing_context); + + tracing_device_interface->base.vtable = + &iree_hal_hip_tracing_device_interface_vtable_t; + tracing_device_interface->context = context; + tracing_device_interface->device = hip_device; + tracing_device_interface->dispatch_stream = dispatch_stream; + tracing_device_interface->host_allocator = host_allocator; + tracing_device_interface->hip_symbols = symbols; + + status = iree_hal_stream_tracing_context_allocate( + (iree_hal_stream_tracing_device_interface_t*)tracing_device_interface, + device->identifier, device->params.stream_tracing, &device->block_pool, + host_allocator, &device->tracing_context); } // Memory pool support is conditional. @@ -502,7 +622,7 @@ static void iree_hal_hip_device_destroy(iree_hal_device_t* base_device) { // Destroy memory pools that hold on to reserved memory. iree_hal_hip_memory_pools_deinitialize(&device->memory_pools); - iree_hal_hip_tracing_context_free(device->tracing_context); + iree_hal_stream_tracing_context_free(device->tracing_context); // Destroy various pools for synchronization. 
if (device->timepoint_pool) { @@ -945,8 +1065,8 @@ static iree_status_t iree_hal_hip_device_queue_write( } static void iree_hal_hip_device_collect_tracing_context(void* user_data) { - iree_hal_hip_tracing_context_collect( - (iree_hal_hip_tracing_context_t*)user_data); + iree_hal_stream_tracing_context_collect( + (iree_hal_stream_tracing_context_t*)user_data); } static iree_status_t iree_hal_hip_device_queue_execute( @@ -1071,3 +1191,22 @@ static const iree_hal_deferred_work_queue_device_interface_vtable_t .submit_command_buffer = iree_hal_hip_deferred_work_queue_device_interface_submit_command_buffer, }; + +static const iree_hal_stream_tracing_device_interface_vtable_t + iree_hal_hip_tracing_device_interface_vtable_t = { + .destroy = iree_hal_hip_tracing_device_interface_destroy, + .synchronize_native_event = + iree_hal_hip_tracing_device_interface_synchronize_native_event, + .create_native_event = + iree_hal_hip_tracing_device_interface_create_native_event, + .query_native_event = + iree_hal_hip_tracing_device_interface_query_native_event, + .event_elapsed_time = + iree_hal_hip_tracing_device_interface_event_elapsed_time, + .destroy_native_event = + iree_hal_hip_tracing_device_interface_destroy_native_event, + .record_native_event = + iree_hal_hip_tracing_device_interface_record_native_event, + .add_graph_event_record_node = + iree_hal_hip_tracing_device_interface_add_graph_event_record_node, +}; diff --git a/runtime/src/iree/hal/drivers/hip/rccl_channel.c b/runtime/src/iree/hal/drivers/hip/rccl_channel.c index 84e592cb9ff1..80791ff87eb7 100644 --- a/runtime/src/iree/hal/drivers/hip/rccl_channel.c +++ b/runtime/src/iree/hal/drivers/hip/rccl_channel.c @@ -575,8 +575,8 @@ static iree_status_t iree_hal_hip_nccl_submit_batch_entry( iree_status_t iree_hal_hip_nccl_submit_batch( const iree_hal_hip_nccl_dynamic_symbols_t* symbols, - iree_hal_hip_tracing_context_t* tracing_context, - iree_hal_hip_tracing_context_event_list_t* tracing_event_list, + 
iree_hal_stream_tracing_context_t* tracing_context, + iree_hal_stream_tracing_context_event_list_t* tracing_event_list, const iree_hal_collective_batch_t* batch, hipStream_t stream) { IREE_ASSERT_ARGUMENT(symbols); IREE_ASSERT_ARGUMENT(batch); @@ -592,11 +592,10 @@ iree_status_t iree_hal_hip_nccl_submit_batch( iree_hal_collective_batch_entry_t* entry = &batch->entries[i]; iree_string_view_t collective_str = iree_hal_collective_op_format(&entry->op, &string_temp); - IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( - tracing_context, tracing_event_list, stream, - IREE_HAL_HIP_TRACING_VERBOSITY_FINE, __FILE__, strlen(__FILE__), - (uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__), - collective_str.data, collective_str.size); + IREE_HAL_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( + tracing_context, tracing_event_list, IREE_HAL_TRACING_VERBOSITY_FINE, + __FILE__, strlen(__FILE__), (uint32_t)__LINE__, __FUNCTION__, + strlen(__FUNCTION__), collective_str.data, collective_str.size); } #endif // IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE @@ -613,9 +612,8 @@ iree_status_t iree_hal_hip_nccl_submit_batch( // order doesn't matter so long as we end the right number of zones. 
IREE_TRACE({ for (iree_host_size_t i = 0; i < batch->count; ++i) { - IREE_HIP_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list, - stream, - IREE_HAL_HIP_TRACING_VERBOSITY_FINE); + IREE_HAL_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list, + IREE_HAL_TRACING_VERBOSITY_FINE); } }); diff --git a/runtime/src/iree/hal/drivers/hip/rccl_channel.h b/runtime/src/iree/hal/drivers/hip/rccl_channel.h index 366cf2fd565d..4f25dd6e6356 100644 --- a/runtime/src/iree/hal/drivers/hip/rccl_channel.h +++ b/runtime/src/iree/hal/drivers/hip/rccl_channel.h @@ -12,8 +12,8 @@ #include "iree/hal/drivers/hip/api.h" #include "iree/hal/drivers/hip/dynamic_symbols.h" #include "iree/hal/drivers/hip/rccl_dynamic_symbols.h" -#include "iree/hal/drivers/hip/tracing.h" #include "iree/hal/utils/collective_batch.h" +#include "iree/hal/utils/stream_tracing.h" #ifdef __cplusplus extern "C" { @@ -48,8 +48,8 @@ iree_status_t iree_hal_hip_nccl_channel_create( // Note that operations in the batch may apply to different channels. 
iree_status_t iree_hal_hip_nccl_submit_batch( const iree_hal_hip_nccl_dynamic_symbols_t* nccl_symbols, - iree_hal_hip_tracing_context_t* tracing_context, - iree_hal_hip_tracing_context_event_list_t* tracing_event_list, + iree_hal_stream_tracing_context_t* tracing_context, + iree_hal_stream_tracing_context_event_list_t* tracing_event_list, const iree_hal_collective_batch_t* batch, hipStream_t stream); #ifdef __cplusplus diff --git a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c index 1b8b6b665289..d536612a4b6c 100644 --- a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c +++ b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c @@ -12,9 +12,9 @@ #include "iree/hal/drivers/hip/pipeline_layout.h" #include "iree/hal/drivers/hip/rccl_channel.h" #include "iree/hal/drivers/hip/status_util.h" -#include "iree/hal/drivers/hip/tracing.h" #include "iree/hal/utils/collective_batch.h" #include "iree/hal/utils/resource_set.h" +#include "iree/hal/utils/stream_tracing.h" typedef struct iree_hal_hip_stream_command_buffer_t { iree_hal_command_buffer_t base; @@ -24,8 +24,8 @@ typedef struct iree_hal_hip_stream_command_buffer_t { const iree_hal_hip_nccl_dynamic_symbols_t* nccl_symbols; // Per-stream HIP tracing context. 
- iree_hal_hip_tracing_context_t* tracing_context; - iree_hal_hip_tracing_context_event_list_t tracing_event_list; + iree_hal_stream_tracing_context_t* tracing_context; + iree_hal_stream_tracing_context_event_list_t tracing_event_list; hipStream_t hip_stream; @@ -61,7 +61,7 @@ iree_status_t iree_hal_hip_stream_command_buffer_create( iree_hal_allocator_t* device_allocator, const iree_hal_hip_dynamic_symbols_t* hip_symbols, const iree_hal_hip_nccl_dynamic_symbols_t* nccl_symbols, - iree_hal_hip_tracing_context_t* tracing_context, + iree_hal_stream_tracing_context_t* tracing_context, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_host_size_t binding_capacity, hipStream_t stream, @@ -124,8 +124,8 @@ static void iree_hal_hip_stream_command_buffer_destroy( iree_allocator_t host_allocator = command_buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_hip_tracing_free(command_buffer->tracing_context, - &command_buffer->tracing_event_list); + iree_hal_stream_tracing_free(command_buffer->tracing_context, + &command_buffer->tracing_event_list); iree_hal_collective_batch_deinitialize(&command_buffer->collective_batch); iree_hal_resource_set_free(command_buffer->resource_set); @@ -149,8 +149,8 @@ void iree_hal_hip_stream_notify_submitted_commands( return; } - iree_hal_hip_tracing_notify_submitted(command_buffer->tracing_context, - &command_buffer->tracing_event_list); + iree_hal_stream_tracing_notify_submitted(command_buffer->tracing_context, + &command_buffer->tracing_event_list); } // Flushes any pending batched collective operations. 
@@ -181,9 +181,9 @@ static iree_status_t iree_hal_hip_stream_command_buffer_begin( iree_hal_hip_stream_command_buffer_cast(base_command_buffer); (void)command_buffer; - IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( + IREE_HAL_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE, + IREE_HAL_TRACING_VERBOSITY_COARSE, /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_hip_stream_command_buffer", strlen("iree_hal_hip_stream_command_buffer"), /*name=*/NULL, 0); @@ -212,9 +212,9 @@ static iree_status_t iree_hal_hip_stream_command_buffer_end( z0, iree_hal_resource_set_allocate(command_buffer->arena.block_pool, &command_buffer->resource_set)); - IREE_HIP_STREAM_TRACE_ZONE_END( - command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE); + IREE_HAL_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, + IREE_HAL_TRACING_VERBOSITY_COARSE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); @@ -228,11 +228,10 @@ static void iree_hal_hip_stream_command_buffer_begin_debug_group( iree_hal_hip_stream_command_buffer_cast(base_command_buffer); (void)command_buffer; - IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( + IREE_HAL_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE, - location ? location->file.data : NULL, location ? location->file.size : 0, - location ? location->line : 0, + IREE_HAL_TRACING_VERBOSITY_COARSE, location ? location->file.data : NULL, + location ? location->file.size : 0, location ? 
location->line : 0, /*func_name=*/NULL, 0, label.data, label.size); } @@ -242,9 +241,9 @@ static void iree_hal_hip_stream_command_buffer_end_debug_group( iree_hal_hip_stream_command_buffer_cast(base_command_buffer); (void)command_buffer; - IREE_HIP_STREAM_TRACE_ZONE_END( - command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE); + IREE_HAL_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, + IREE_HAL_TRACING_VERBOSITY_COARSE); } static iree_status_t iree_hal_hip_stream_command_buffer_execution_barrier( @@ -541,12 +540,11 @@ static iree_status_t iree_hal_hip_stream_command_buffer_dispatch( z0, iree_hal_hip_native_executable_entry_point_kernel_info( executable, entry_point, &kernel_info)); - IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( + IREE_HAL_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_FINE, - kernel_info.source_filename.data, kernel_info.source_filename.size, - kernel_info.source_line, kernel_info.function_name.data, - kernel_info.function_name.size, + IREE_HAL_TRACING_VERBOSITY_FINE, kernel_info.source_filename.data, + kernel_info.source_filename.size, kernel_info.source_line, + kernel_info.function_name.data, kernel_info.function_name.size, /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( @@ -617,9 +615,9 @@ static iree_status_t iree_hal_hip_stream_command_buffer_dispatch( command_buffer->hip_stream, params_ptr, NULL), "hipModuleLaunchKernel"); - IREE_HIP_STREAM_TRACE_ZONE_END( - command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); + IREE_HAL_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, + IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return status; @@ -652,12 +650,12 @@ 
static iree_status_t iree_hal_hip_stream_command_buffer_dispatch2( z0, iree_hal_hip_native_executable_entry_point_kernel_info( executable, entry_point, &kernel_info)); - IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( + IREE_HAL_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_FINE, - kernel_info.source_filename.data, kernel_info.source_filename.size, - kernel_info.source_line, kernel_info.function_name.data, - kernel_info.function_name.size, /*name=*/NULL, 0); + IREE_HAL_TRACING_VERBOSITY_FINE, kernel_info.source_filename.data, + kernel_info.source_filename.size, kernel_info.source_line, + kernel_info.function_name.data, kernel_info.function_name.size, + /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, @@ -720,9 +718,9 @@ static iree_status_t iree_hal_hip_stream_command_buffer_dispatch2( params_ptr, NULL), "hipModuleLaunchKernel"); - IREE_HIP_STREAM_TRACE_ZONE_END( - command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); + IREE_HAL_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, + IREE_HAL_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return status; diff --git a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.h b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.h index 50eddf1d9daa..cc88c3a4b6b7 100644 --- a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.h +++ b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.h @@ -12,7 +12,7 @@ #include "iree/hal/drivers/hip/dynamic_symbols.h" #include "iree/hal/drivers/hip/hip_headers.h" #include "iree/hal/drivers/hip/rccl_dynamic_symbols.h" -#include "iree/hal/drivers/hip/tracing.h" +#include "iree/hal/utils/stream_tracing.h" #ifdef __cplusplus extern "C" { @@ -33,7 +33,7 @@ iree_status_t 
iree_hal_hip_stream_command_buffer_create( iree_hal_allocator_t* device_allocator, const iree_hal_hip_dynamic_symbols_t* hip_symbols, const iree_hal_hip_nccl_dynamic_symbols_t* nccl_symbols, - iree_hal_hip_tracing_context_t* tracing_context, + iree_hal_stream_tracing_context_t* tracing_context, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_host_size_t binding_capacity, hipStream_t stream, diff --git a/runtime/src/iree/hal/drivers/hip/tracing.c b/runtime/src/iree/hal/drivers/hip/tracing.c deleted file mode 100644 index 62b15effa71b..000000000000 --- a/runtime/src/iree/hal/drivers/hip/tracing.c +++ /dev/null @@ -1,526 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/hal/drivers/hip/tracing.h" - -#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE - -#include "iree/hal/drivers/hip/dynamic_symbols.h" -#include "iree/hal/drivers/hip/status_util.h" - -// Total number of events per tracing context. This translates to the maximum -// number of outstanding timestamp queries before collection is required. -// To prevent spilling pages we leave some room for the context structure. -#define IREE_HAL_HIP_TRACING_DEFAULT_QUERY_CAPACITY (16 * 1024 - 256) - -// iree_hal_hip_tracing_context_event_t contains a hipEvent that is used to -// record timestamps for tracing GPU execution. In this struct, there are also -// two linked lists that the current event may be added to during its lifetime. -// -// --------------------->---Submissions--->---------- -// \ \ \ -// \ \ \ -// command_buffer command_buffer command_buffer -// -// The submission list is owned by the tracing context and elements are -// inserted and removed as commmand_buffers are submitted and when they -// complete. 
This is a list of the head elements for each command buffer. -// The commnad buffer list is owned by the command buffer. It is the list of -// events used to trace command buffer dispatches. -// -// When the event is in the freelist, next_submission should be null, and -// we reuse next_in_command_buffer to track the next free event. -// -// When the even is grabbed from the freelist to track GPU executions, -// it is added to the list in recording command_buffer. -struct iree_hal_hip_tracing_context_event_t { - hipEvent_t event; - iree_hal_hip_tracing_context_event_t* next_in_command_buffer; - iree_hal_hip_tracing_context_event_t* next_submission; - bool was_submitted; -}; - -struct iree_hal_hip_tracing_context_t { - const iree_hal_hip_dynamic_symbols_t* symbols; - iree_slim_mutex_t event_mutex; - - hipStream_t stream; - iree_arena_block_pool_t* block_pool; - iree_allocator_t host_allocator; - - // A unique GPU zone ID allocated from Tracy. - // There is a global limit of 255 GPU zones (ID 255 is special). - uint8_t id; - - // Base event used for computing relative times for all recorded events. - // This is required as HIP only allows for relative timing between events and - // we need a stable base event. - hipEvent_t base_event; - - // Unallocated event list head. next_in_command_buffer points to the next - // available event. - iree_hal_hip_tracing_context_event_t* event_freelist_head; - - // Submitted events - iree_hal_hip_tracing_context_event_list_t submitted_event_list; - - int32_t verbosity; - - uint32_t query_capacity; - - // Event pool reused to capture tracing timestamps. - // The lifetime of the events are as follows. - // 1) All events are allocated when the tracing context is created. - // 2) When a command_buffer inserts a query via: - // iree_hal_cuda_**_tracing_context_insert_query - // an event is pulled from the event freelist and added to the - // command buffer. 
- // 3) When a command buffer is dispatched and - // iree_hal_hip_tracing_notify_submitted is called, the events - // for that command buffer are added to the submitted_event_list. - // 4) When the command buffer completes iree_hal_cuda_tracing_context_collect - // is called, and the events are removed from submitted_event_list as - // we collect their values. - // 5) When the command buffer is destroyed, all events are put at the front - // of event_freelist. - iree_hal_hip_tracing_context_event_t - event_pool[IREE_HAL_HIP_TRACING_DEFAULT_QUERY_CAPACITY]; -}; - -static iree_status_t iree_hal_hip_tracing_context_initial_calibration( - const iree_hal_hip_dynamic_symbols_t* symbols, hipStream_t stream, - hipEvent_t base_event, int64_t* out_cpu_timestamp, - int64_t* out_gpu_timestamp, float* out_timestamp_period) { - IREE_TRACE_ZONE_BEGIN(z0); - *out_cpu_timestamp = 0; - *out_gpu_timestamp = 0; - *out_timestamp_period = 1.0f; - - // Record event to the stream; in the absence of a synchronize this may not - // flush immediately. - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, - IREE_HIP_RESULT_TO_STATUS(symbols, hipEventRecord(base_event, stream))); - - // Force flush the event and wait for it to complete. - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, IREE_HIP_RESULT_TO_STATUS(symbols, hipEventSynchronize(base_event))); - - // Track when we know the event has completed and has a reasonable timestamp. - // This may drift from the actual time differential between host/device but is - // (maybe?) the best we can do. 
- *out_cpu_timestamp = iree_tracing_time(); - - IREE_TRACE_ZONE_END(z0); - return iree_ok_status(); -} - -iree_status_t iree_hal_hip_tracing_context_allocate( - const iree_hal_hip_dynamic_symbols_t* symbols, - iree_string_view_t queue_name, hipStream_t stream, - iree_hal_hip_tracing_verbosity_t stream_tracing_verbosity, - iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, - iree_hal_hip_tracing_context_t** out_context) { - IREE_TRACE_ZONE_BEGIN(z0); - IREE_ASSERT_ARGUMENT(symbols); - IREE_ASSERT_ARGUMENT(stream); - IREE_ASSERT_ARGUMENT(block_pool); - IREE_ASSERT_ARGUMENT(out_context); - *out_context = NULL; - - iree_hal_hip_tracing_context_t* context = NULL; - iree_status_t status = - iree_allocator_malloc(host_allocator, sizeof(*context), (void**)&context); - if (iree_status_is_ok(status)) { - context->symbols = symbols; - context->stream = stream; - context->block_pool = block_pool; - context->host_allocator = host_allocator; - context->query_capacity = IREE_ARRAYSIZE(context->event_pool); - context->submitted_event_list.head = NULL; - context->submitted_event_list.tail = NULL; - context->verbosity = stream_tracing_verbosity; - iree_slim_mutex_initialize(&context->event_mutex); - } - - // Pre-allocate all events in the event pool. 
- if (iree_status_is_ok(status)) { - IREE_TRACE_ZONE_BEGIN_NAMED( - z_event_pool, "iree_hal_hip_tracing_context_allocate_event_pool"); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z_event_pool, - (int64_t)context->query_capacity); - context->event_freelist_head = &context->event_pool[0]; - for (iree_host_size_t i = 0; i < context->query_capacity; ++i) { - status = IREE_HIP_RESULT_TO_STATUS( - symbols, hipEventCreateWithFlags(&context->event_pool[i].event, - hipEventDefault)); - if (!iree_status_is_ok(status)) break; - if (i > 0) { - context->event_pool[i - 1].next_in_command_buffer = - &context->event_pool[i]; - } - context->event_pool[i].next_submission = NULL; - context->event_pool[i].was_submitted = false; - if (i + 1 == context->query_capacity) { - context->event_pool[i].next_in_command_buffer = NULL; - } - } - IREE_TRACE_ZONE_END(z_event_pool); - } - - // Create the initial GPU event and insert it into the stream. - // All events we record are relative to this event. - int64_t cpu_timestamp = 0; - int64_t gpu_timestamp = 0; - float timestamp_period = 0.0f; - if (iree_status_is_ok(status)) { - status = IREE_HIP_RESULT_TO_STATUS( - symbols, - hipEventCreateWithFlags(&context->base_event, hipEventDefault)); - } - if (iree_status_is_ok(status)) { - status = iree_hal_hip_tracing_context_initial_calibration( - symbols, stream, context->base_event, &cpu_timestamp, &gpu_timestamp, - &timestamp_period); - } - - // Allocate the GPU context and pass initial calibration data.
- if (iree_status_is_ok(status)) { - context->id = iree_tracing_gpu_context_allocate( - IREE_TRACING_GPU_CONTEXT_TYPE_VULKAN, queue_name.data, queue_name.size, - /*is_calibrated=*/false, cpu_timestamp, gpu_timestamp, - timestamp_period); - } - - if (iree_status_is_ok(status)) { - *out_context = context; - } else { - iree_hal_hip_tracing_context_free(context); - } - IREE_TRACE_ZONE_END(z0); - return status; -} - -void iree_hal_hip_tracing_context_free( - iree_hal_hip_tracing_context_t* context) { - if (!context) return; - IREE_TRACE_ZONE_BEGIN(z0); - - // Always perform a collection on shutdown. - iree_hal_hip_tracing_context_collect(context); - - // Release all events; since collection completed they should all be unused. - IREE_TRACE_ZONE_BEGIN_NAMED(z_event_pool, - "iree_hal_hip_tracing_context_free_event_pool"); - for (iree_host_size_t i = 0; i < context->query_capacity; ++i) { - if (context->event_pool[i].event) { - IREE_HIP_IGNORE_ERROR(context->symbols, - hipEventDestroy(context->event_pool[i].event)); - } - } - IREE_TRACE_ZONE_END(z_event_pool); - if (context->base_event) { - IREE_HIP_IGNORE_ERROR(context->symbols, - hipEventDestroy(context->base_event)); - } - - iree_slim_mutex_deinitialize(&context->event_mutex); - - iree_allocator_t host_allocator = context->host_allocator; - iree_allocator_free(host_allocator, context); - - IREE_TRACE_ZONE_END(z0); -} - -void iree_hal_hip_tracing_context_collect( - iree_hal_hip_tracing_context_t* context) { - if (!context) return; - iree_slim_mutex_lock(&context->event_mutex); - // No outstanding queries - if (!context->submitted_event_list.head) { - iree_slim_mutex_unlock(&context->event_mutex); - return; - } - IREE_TRACE_ZONE_BEGIN(z0); - - // submitted_event_list is a list of the head elements for each command - // buffer that has been submitted. Here we loop over all of the events, - // wait for them to complete and gather the results with hipEventQuery. 
- iree_hal_hip_tracing_context_event_t* events = - context->submitted_event_list.head; - uint32_t read_query_count = 0; - // Outer per-command_buffer loop. - while (events) { - iree_hal_hip_tracing_context_event_t* event = events; - // Inner per-event loop. - while (event) { - uint32_t query_id = (uint32_t)(event - &context->event_pool[0]); - - hipError_t result = context->symbols->hipEventSynchronize(event->event); - if (result != hipSuccess) break; - result = context->symbols->hipEventQuery(event->event); - if (result != hipSuccess) break; - - // Calculate context-relative time and notify tracy. - float relative_millis = 0.0f; - IREE_HIP_IGNORE_ERROR( - context->symbols, - hipEventElapsedTime(&relative_millis, context->base_event, - event->event)); - int64_t gpu_timestamp = (int64_t)((double)relative_millis * 1000000.0); - - iree_tracing_gpu_zone_notify(context->id, query_id, gpu_timestamp); - read_query_count += 1; - event = event->next_in_command_buffer; - } - iree_hal_hip_tracing_context_event_t* next = events->next_submission; - events->was_submitted = true; - events = next; - context->submitted_event_list.head = events; - } - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)read_query_count); - - IREE_TRACE_ZONE_END(z0); - iree_slim_mutex_unlock(&context->event_mutex); -} - -void iree_hal_hip_tracing_notify_submitted( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list) { - if (!context) return; - IREE_ASSERT_ARGUMENT(event_list); - iree_slim_mutex_lock(&context->event_mutex); - - if (!event_list->head) { - iree_slim_mutex_unlock(&context->event_mutex); - return; - } - - if (!context->submitted_event_list.head) { - context->submitted_event_list.head = event_list->head; - context->submitted_event_list.tail = event_list->head; - } else { - context->submitted_event_list.tail->next_submission = event_list->head; - context->submitted_event_list.tail = event_list->head; - } - - 
iree_slim_mutex_unlock(&context->event_mutex); -} - -void iree_hal_hip_tracing_free( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list) { - if (!context) return; - iree_slim_mutex_lock(&context->event_mutex); - IREE_ASSERT_ARGUMENT(event_list); - - if (!event_list->head) { - iree_slim_mutex_unlock(&context->event_mutex); - return; - } - // Free an event list that was previously created. There is some book-keeping - // to keep tracy happy, and then we remove the elements from the - // passed in event_list and add them to the front of the free-list. - - // If this event list has never been submitted we still need to add values to - // the timeline otherwise tracy will not behave correctly. - if (!event_list->head->was_submitted) { - iree_hal_hip_tracing_context_event_t* event = event_list->head; - while (event) { - uint32_t query_id = (uint32_t)(event - &context->event_pool[0]); - iree_tracing_gpu_zone_notify(context->id, query_id, 0); - event = event->next_in_command_buffer; - } - } - - if (!context->event_freelist_head) { - context->event_freelist_head = event_list->head; - iree_slim_mutex_unlock(&context->event_mutex); - return; - } - event_list->head->next_submission = NULL; - event_list->head->was_submitted = false; - event_list->tail->next_in_command_buffer = context->event_freelist_head; - context->event_freelist_head = event_list->head; - - event_list->head = NULL; - event_list->tail = NULL; - iree_slim_mutex_unlock(&context->event_mutex); -} - -static void iree_hal_hip_tracing_context_event_list_append_event( - iree_hal_hip_tracing_context_event_list_t* event_list, - iree_hal_hip_tracing_context_event_t* event) { - if (!event_list->head) { - event_list->head = event; - event_list->tail = event; - } else { - event_list->tail->next_in_command_buffer = event; - event_list->tail = event; - } -} - -// Grabs the next available query out of the freelist and adds it to -// the event_list that was passed in. 
Also starts the recording of the -// event. -static uint16_t iree_hal_hip_stream_tracing_context_insert_query( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream) { - iree_slim_mutex_lock(&context->event_mutex); - IREE_ASSERT_ARGUMENT(event_list); - - // Allocate an event from the pool for use by the query. - // TODO: If we have run out of our freelist, then we need to try and recover - // or allocate more events. - iree_hal_hip_tracing_context_event_t* event = context->event_freelist_head; - context->event_freelist_head = event->next_in_command_buffer; - uint32_t query_id = event - &context->event_pool[0]; - IREE_ASSERT(event->next_in_command_buffer != NULL); - event->next_in_command_buffer = NULL; - - IREE_HIP_IGNORE_ERROR(context->symbols, hipEventRecord(event->event, stream)); - - iree_hal_hip_tracing_context_event_list_append_event(event_list, event); - - iree_slim_mutex_unlock(&context->event_mutex); - return query_id; -} - -// Grabs the next available query out of the freelist and adds it to -// the event_list that was passed in. Also inserts the event record -// node into the passed in graph. It returns the index of the -// event. -static uint16_t iree_hal_hip_graph_tracing_context_insert_query( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, - hipGraphNode_t* out_node, hipGraph_t graph, - hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count) { - IREE_ASSERT_ARGUMENT(event_list); - iree_slim_mutex_lock(&context->event_mutex); - // Allocate an event from the pool for use by the query. - // TODO: If we have run out of our freelist, then we need to try and recover - // or - // allocate more events. 
- iree_hal_hip_tracing_context_event_t* event = context->event_freelist_head; - context->event_freelist_head = event->next_in_command_buffer; - uint32_t query_id = event - &context->event_pool[0]; - IREE_ASSERT(event->next_in_command_buffer != NULL); - event->next_in_command_buffer = NULL; - - iree_status_t status = IREE_HIP_RESULT_TO_STATUS( - context->symbols, - hipGraphAddEventRecordNode(out_node, graph, dependency_nodes, - dependency_nodes_count, event->event)); - IREE_ASSERT(iree_status_is_ok(status)); - - iree_hal_hip_tracing_context_event_list_append_event(event_list, event); - - iree_slim_mutex_unlock(&context->event_mutex); - return query_id; -} - -// TODO: optimize this implementation to reduce the number of events required: -// today we insert 2 events per zone (one for begin and one for end) but in -// many cases we could reduce this by inserting events only between zones and -// using the differences between them. -void iree_hal_hip_stream_tracing_zone_begin_impl( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - iree_hal_hip_tracing_verbosity_t verbosity, - const iree_tracing_location_t* src_loc) { - if (!context) return; - if (verbosity > context->verbosity) return; - uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query( - context, event_list, stream); - iree_tracing_gpu_zone_begin(context->id, query_id, src_loc); -} - -void iree_hal_hip_stream_tracing_zone_begin_external_impl( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - iree_hal_hip_tracing_verbosity_t verbosity, const char* file_name, - size_t file_name_length, uint32_t line, const char* function_name, - size_t function_name_length, const char* name, size_t name_length) { - if (!context) return; - if (verbosity > context->verbosity) return; - uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query( - context, event_list, 
stream); - iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name, - file_name_length, line, function_name, - function_name_length, name, name_length); -} - -void iree_hal_hip_graph_tracing_zone_begin_external_impl( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, - hipGraphNode_t* out_node, hipGraph_t graph, - iree_hal_hip_tracing_verbosity_t verbosity, - hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count, - const char* file_name, size_t file_name_length, uint32_t line, - const char* function_name, size_t function_name_length, const char* name, - size_t name_length) { - if (!context) return; - if (verbosity > context->verbosity) return; - uint16_t query_id = iree_hal_hip_graph_tracing_context_insert_query( - context, event_list, out_node, graph, dependency_nodes, - dependency_nodes_count); - iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name, - file_name_length, line, function_name, - function_name_length, name, name_length); -} - -void iree_hal_hip_stream_tracing_zone_end_impl( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - iree_hal_hip_tracing_verbosity_t verbosity) { - if (!context) return; - if (verbosity > context->verbosity) return; - uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query( - context, event_list, stream); - iree_tracing_gpu_zone_end(context->id, query_id); -} - -void iree_hal_hip_graph_tracing_zone_end_impl( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, - hipGraphNode_t* out_node, hipGraph_t graph, - iree_hal_hip_tracing_verbosity_t verbosity, - hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count) { - if (!context) return; - if (verbosity > context->verbosity) return; - uint16_t query_id = iree_hal_hip_graph_tracing_context_insert_query( - context, event_list, out_node, graph, dependency_nodes, 
- dependency_nodes_count); - iree_tracing_gpu_zone_end(context->id, query_id); -} - -#else - -iree_status_t iree_hal_hip_tracing_context_allocate( - const iree_hal_hip_dynamic_symbols_t* symbols, - iree_string_view_t queue_name, hipStream_t stream, - iree_hal_hip_tracing_verbosity_t stream_tracing_verbosity, - iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, - iree_hal_hip_tracing_context_t** out_context) { - *out_context = NULL; - return iree_ok_status(); -} - -void iree_hal_hip_tracing_context_free( - iree_hal_hip_tracing_context_t* context) {} - -void iree_hal_hip_tracing_context_collect( - iree_hal_hip_tracing_context_t* context) {} - -void iree_hal_hip_tracing_notify_submitted( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list) {} - -void iree_hal_hip_tracing_free( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list) {} - -#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE diff --git a/runtime/src/iree/hal/drivers/hip/tracing.h b/runtime/src/iree/hal/drivers/hip/tracing.h deleted file mode 100644 index 8323fd768b5c..000000000000 --- a/runtime/src/iree/hal/drivers/hip/tracing.h +++ /dev/null @@ -1,190 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_HAL_DRIVERS_HIP_TRACING_H_ -#define IREE_HAL_DRIVERS_HIP_TRACING_H_ - -#include "iree/base/api.h" -#include "iree/base/internal/arena.h" -#include "iree/base/tracing.h" -#include "iree/hal/api.h" -#include "iree/hal/drivers/hip/dynamic_symbols.h" -#include "iree/hal/drivers/hip/hip_headers.h" - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -// Per-stream HIP tracing context. -// No-op if IREE tracing is not enabled. 
-// -// Use the IREE_HIP_TRACE_* macros to trace a contiguous set of stream -// operations. Unlike the normal tracy macros there are no zone IDs and instead -// each stream gets an ID allocated once and passed to all tracing macros. -// -// Usage: -// IREE_HIP_STREAM_TRACE_ZONE_BEGIN(queue->tracing_context, stream); -// hipModuleLaunchKernel(..., stream); -// IREE_HIP_STREAM_TRACE_ZONE_END(queue->tracing_context, stream); -// ... -// iree_hal_hip_tracing_context_collect(queue->tracing_context); -// -// NOTE: timestamps can have non-trivial side-effecting behavior and may -// introduce serialization in graph execution. -// -// TODO(benvanik): expose hipEvent_t reservation separate from recording. For -// graphs we will need to insert the events but in order to reuse the graphs -// we'll need to reserve and patch new events each graph launch. -// -// Thread-compatible: external synchronization is required if using from -// multiple threads (same as with hipStream_t itself). -typedef struct iree_hal_hip_tracing_context_t iree_hal_hip_tracing_context_t; -typedef struct iree_hal_hip_tracing_context_event_t - iree_hal_hip_tracing_context_event_t; - -// This is used when tracing is enabled. Calls to dispatch and event related -// functions will update the pointers to keep the list up to date. -typedef struct iree_hal_hip_tracing_context_event_list_t { - iree_hal_hip_tracing_context_event_t* head; - iree_hal_hip_tracing_context_event_t* tail; -} iree_hal_hip_tracing_context_event_list_t; - -typedef enum iree_hal_hip_tracing_verbosity_e { - IREE_HAL_HIP_TRACING_VERBOSITY_OFF = 0, - IREE_HAL_HIP_TRACING_VERBOSITY_COARSE, - IREE_HAL_HIP_TRACING_VERBOSITY_FINE, - IREE_HAL_HIP_TRACING_VERBOSITY_MAX -} iree_hal_hip_tracing_verbosity_t; - -// Allocates a tracing context for the given HIP |stream|. -// Each context must only be used with the stream it was created for. 
-iree_status_t iree_hal_hip_tracing_context_allocate( - const iree_hal_hip_dynamic_symbols_t* symbols, - iree_string_view_t queue_name, hipStream_t stream, - iree_hal_hip_tracing_verbosity_t stream_tracing_verbosity, - iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, - iree_hal_hip_tracing_context_t** out_context); - -// Frees a tracing context and all associated HIP resources. -// All submissions using the resources must be completed prior to calling. -void iree_hal_hip_tracing_context_free(iree_hal_hip_tracing_context_t* context); - -// Collects in-flight timestamp queries from the stream and feeds them to tracy. -// Must be called frequently (every submission, etc) to drain the backlog; -// tracing may start failing if the internal ringbuffer is exceeded. -void iree_hal_hip_tracing_context_collect( - iree_hal_hip_tracing_context_t* context); - -// Notifies that the given list of events has been dispached on to the gpu. -void iree_hal_hip_tracing_notify_submitted( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list); - -// Frees the events and returns them back into the tracing context. -void iree_hal_hip_tracing_free( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list); - -#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE - -// Begins a normal zone derived on the calling |src_loc|. -// Must be perfectly nested and paired with a corresponding zone end. -void iree_hal_hip_stream_tracing_zone_begin_impl( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - iree_hal_hip_tracing_verbosity_t verbosity, - const iree_tracing_location_t* src_loc); - -// Begins an external zone using the given source information. -// The provided strings will be copied into the tracy buffer. 
-void iree_hal_hip_stream_tracing_zone_begin_external_impl( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - iree_hal_hip_tracing_verbosity_t verbosity, const char* file_name, - size_t file_name_length, uint32_t line, const char* function_name, - size_t function_name_length, const char* name, size_t name_length); - -void iree_hal_hip_graph_tracing_zone_begin_external_impl( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, - hipGraphNode_t* out_node, hipGraph_t graph, - iree_hal_hip_tracing_verbosity_t verbosity, - hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count, - const char* file_name, size_t file_name_length, uint32_t line, - const char* function_name, size_t function_name_length, const char* name, - size_t name_length); - -void iree_hal_hip_stream_tracing_zone_end_impl( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - iree_hal_hip_tracing_verbosity_t verbosity); -void iree_hal_hip_graph_tracing_zone_end_impl( - iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, - hipGraphNode_t* out_node, hipGraph_t graph, - iree_hal_hip_tracing_verbosity_t verbosity, - hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count); - -// Begins a new zone with the parent function name. -#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream, \ - verbosity) \ - static const iree_tracing_location_t TracyConcat( \ - __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \ - (uint32_t)__LINE__, 0}; \ - iree_hal_hip_stream_tracing_zone_begin_impl( \ - context, event_list, stream, verbosity, \ - &TracyConcat(__tracy_source_location, __LINE__)); - -// Begins an externally defined zone with a dynamic source location. 
-// The |file_name|, |function_name|, and optional |name| strings will be copied -// into the trace buffer and do not need to persist. -#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, stream, verbosity, file_name, file_name_length, line, \ - function_name, function_name_length, name, name_length) \ - iree_hal_hip_stream_tracing_zone_begin_external_impl( \ - context, event_list, stream, verbosity, file_name, file_name_length, \ - line, function_name, function_name_length, name, name_length) - -#define IREE_HIP_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, out_node, graph, verbosity, dependency_nodes, \ - dependency_nodes_count, file_name, file_name_length, line, function_name, \ - function_name_length, name, name_length) \ - iree_hal_hip_graph_tracing_zone_begin_external_impl( \ - context, event_list, out_node, graph, verbosity, dependency_nodes, \ - dependency_nodes_count, file_name, file_name_length, line, \ - function_name, function_name_length, name, name_length) - -#define IREE_HIP_STREAM_TRACE_ZONE_END(context, event_list, stream, verbosity) \ - iree_hal_hip_stream_tracing_zone_end_impl(context, event_list, stream, \ - verbosity) - -#define IREE_HIP_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \ - verbosity, dependency_nodes, \ - dependency_nodes_count) \ - iree_hal_hip_graph_tracing_zone_end_impl(context, event_list, out_node, \ - graph, verbosity, dependency_nodes, \ - dependency_nodes_count) -#else - -#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream, verbosity) -#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, stream, verbosity, file_name, file_name_length, line, \ - function_name, function_name_length, name, name_length) -#define IREE_HIP_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, out_node, graph, verbosity, dependency_nodes, \ - dependency_nodes_count, file_name, file_name_length, line, function_name, \ - function_name_length, name, 
name_length) -#define IREE_HIP_STREAM_TRACE_ZONE_END(context, evnet_list, stream, verbosity) -#define IREE_HIP_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \ - verbosity, dependency_nodes, \ - dependency_nodes_count) -#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - -#endif // IREE_HAL_DRIVERS_HIP_TRACING_H_ diff --git a/runtime/src/iree/hal/utils/BUILD.bazel b/runtime/src/iree/hal/utils/BUILD.bazel index 0a01f98c4b27..153160121cc0 100644 --- a/runtime/src/iree/hal/utils/BUILD.bazel +++ b/runtime/src/iree/hal/utils/BUILD.bazel @@ -202,6 +202,18 @@ iree_runtime_cc_test( ], ) +iree_runtime_cc_library( + name = "stream_tracing", + srcs = ["stream_tracing.c"], + hdrs = ["stream_tracing.h"], + deps = [ + "//runtime/src/iree/base", + "//runtime/src/iree/base/internal:arena", + "//runtime/src/iree/base/internal:synchronization", + "//runtime/src/iree/hal", + ], +) + iree_cmake_extra_content( content = """ if(NOT IREE_ENABLE_THREADING) diff --git a/runtime/src/iree/hal/utils/CMakeLists.txt b/runtime/src/iree/hal/utils/CMakeLists.txt index 1f58626030db..fed7908d6b72 100644 --- a/runtime/src/iree/hal/utils/CMakeLists.txt +++ b/runtime/src/iree/hal/utils/CMakeLists.txt @@ -238,6 +238,21 @@ iree_cc_test( iree::testing::gtest_main ) +iree_cc_library( + NAME + stream_tracing + HDRS + "stream_tracing.h" + SRCS + "stream_tracing.c" + DEPS + iree::base + iree::base::internal::arena + iree::base::internal::synchronization + iree::hal + PUBLIC +) + if(NOT IREE_ENABLE_THREADING) return() endif() diff --git a/runtime/src/iree/hal/drivers/cuda/tracing.c b/runtime/src/iree/hal/utils/stream_tracing.c similarity index 58% rename from runtime/src/iree/hal/drivers/cuda/tracing.c rename to runtime/src/iree/hal/utils/stream_tracing.c index 057fddad9b02..47a9e3ed48c6 100644 --- a/runtime/src/iree/hal/drivers/cuda/tracing.c +++ b/runtime/src/iree/hal/utils/stream_tracing.c @@ -1,24 +1,22 @@ 
-// Copyright 2023 The IREE Authors +// Copyright 2024 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "iree/hal/drivers/cuda/tracing.h" +#include "iree/hal/utils/stream_tracing.h" #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE -#include "iree/hal/drivers/cuda/cuda_dynamic_symbols.h" -#include "iree/hal/drivers/cuda/cuda_status_util.h" - // Total number of events per tracing context. This translates to the maximum // number of outstanding timestamp queries before collection is required. // To prevent spilling pages we leave some room for the context structure. -#define IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY (16 * 1024 - 256) +#define IREE_HAL_TRACING_DEFAULT_QUERY_CAPACITY (16 * 1024 - 256) -// iree_hal_cuda_tracing_context_event_t contains a cuEvent that is used to -// record timestamps for tracing GPU execution. In this struct, there are also -// two linked lists that the current event may be added to during its lifetime. +// iree_hal_stream_tracing_context_event_t contains a native event that is used +// to record timestamps for tracing GPU execution. In this struct, there are +// also two linked lists that the current event may be added to during its +// lifetime. // // --------------------->---Submissions--->---------- // \ \ \ @@ -36,18 +34,18 @@ // // When the even is grabbed from the freelist to track GPU executions, // it is added to the list in recording command_buffer. 
-struct iree_hal_cuda_tracing_context_event_t { - CUevent event; - iree_hal_cuda_tracing_context_event_t* next_in_command_buffer; - iree_hal_cuda_tracing_context_event_t* next_submission; +struct iree_hal_stream_tracing_context_event_t { + iree_hal_stream_tracing_native_event_t event; + iree_hal_stream_tracing_context_event_t* next_in_command_buffer; + iree_hal_stream_tracing_context_event_t* next_submission; bool was_submitted; }; -struct iree_hal_cuda_tracing_context_t { - const iree_hal_cuda_dynamic_symbols_t* symbols; +struct iree_hal_stream_tracing_context_t { + iree_hal_stream_tracing_device_interface_t* device_interface; + iree_slim_mutex_t event_mutex; - CUstream stream; iree_arena_block_pool_t* block_pool; iree_allocator_t host_allocator; @@ -56,43 +54,45 @@ struct iree_hal_cuda_tracing_context_t { uint8_t id; // Base event used for computing relative times for all recorded events. - // This is required as CUDA (without CUPTI) only allows for relative timing - // between events and we need a stable base event. - CUevent base_event; + // This is required as some apis only allow relative timing between events and + // we need a stable base event. + iree_hal_stream_tracing_native_event_t base_event; // Unallocated event list head. next_in_command_buffer points to the next // available event. - iree_hal_cuda_tracing_context_event_t* event_freelist_head; + iree_hal_stream_tracing_context_event_t* event_freelist_head; - // Submitted events. - iree_hal_cuda_tracing_context_event_list_t submitted_event_list; + // Submitted events + iree_hal_stream_tracing_context_event_list_t submitted_event_list; - uint32_t query_capacity; + int32_t verbosity; - iree_hal_cuda_tracing_verbosity_t verbosity; + uint32_t query_capacity; // Event pool reused to capture tracing timestamps. // The lifetime of the events are as follows. // 1) All events are allocated when the tracing context is created. 
// 2) When a command_buffer inserts a query via: - // iree_hal_cuda_**_tracing_context_insert_query + // iree_hal_stream_tracing_context_insert_query // an event is pulled from the event freelist and added to the // command buffer. // 3) When a command buffer is dispatched and - // iree_hal_cuda_tracing_notify_submitted is called, the events + // iree_hal_stream_tracing_notify_submitted is called, the events // for that command buffer are added to the submitted_event_list. - // 4) When the command buffer completes iree_hal_cuda_tracing_context_collect + // 4) When the command buffer completes + // iree_hal_stream_tracing_context_collect // is called, and the events are removed from submitted_event_list as // we collect their values. // 5) When the command buffer is destroyed, all events are put at the front // of event_freelist. - iree_hal_cuda_tracing_context_event_t - event_pool[IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY]; + iree_hal_stream_tracing_context_event_t + event_pool[IREE_HAL_TRACING_DEFAULT_QUERY_CAPACITY]; }; -static iree_status_t iree_hal_cuda_tracing_context_initial_calibration( - const iree_hal_cuda_dynamic_symbols_t* symbols, CUstream stream, - CUevent base_event, int64_t* out_cpu_timestamp, int64_t* out_gpu_timestamp, +static iree_status_t iree_hal_stream_tracing_context_initial_calibration( + iree_hal_stream_tracing_device_interface_t* device_interface, + iree_hal_stream_tracing_native_event_t base_event, + int64_t* out_cpu_timestamp, int64_t* out_gpu_timestamp, float* out_timestamp_period) { IREE_TRACE_ZONE_BEGIN(z0); *out_cpu_timestamp = 0; @@ -102,40 +102,40 @@ static iree_status_t iree_hal_cuda_tracing_context_initial_calibration( // Record event to the stream; in the absence of a synchronize this may not // flush immediately. 
IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, IREE_CURESULT_TO_STATUS(symbols, cuEventRecord(base_event, stream))); + z0, device_interface->vtable->record_native_event(device_interface, + base_event)); // Force flush the event and wait for it to complete. IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, IREE_CURESULT_TO_STATUS(symbols, cuEventSynchronize(base_event))); + z0, device_interface->vtable->synchronize_native_event(device_interface, + base_event)); // Track when we know the event has completed and has a reasonable timestamp. // This may drift from the actual time differential between host/device but is - // (maybe?) the best we can do without CUPTI. + // (maybe?) the best we can do. *out_cpu_timestamp = iree_tracing_time(); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } -iree_status_t iree_hal_cuda_tracing_context_allocate( - const iree_hal_cuda_dynamic_symbols_t* symbols, - iree_string_view_t queue_name, CUstream stream, - iree_hal_cuda_tracing_verbosity_t stream_tracing_verbosity, +iree_status_t iree_hal_stream_tracing_context_allocate( + iree_hal_stream_tracing_device_interface_t* device_interface, + iree_string_view_t queue_name, + iree_hal_stream_tracing_verbosity_t stream_tracing_verbosity, iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, - iree_hal_cuda_tracing_context_t** out_context) { + iree_hal_stream_tracing_context_t** out_context) { IREE_TRACE_ZONE_BEGIN(z0); - IREE_ASSERT_ARGUMENT(symbols); - IREE_ASSERT_ARGUMENT(stream); + IREE_ASSERT_ARGUMENT(device_interface); IREE_ASSERT_ARGUMENT(block_pool); IREE_ASSERT_ARGUMENT(out_context); *out_context = NULL; - iree_hal_cuda_tracing_context_t* context = NULL; + iree_hal_stream_tracing_context_t* context = NULL; iree_status_t status = iree_allocator_malloc(host_allocator, sizeof(*context), (void**)&context); if (iree_status_is_ok(status)) { - context->symbols = symbols; - context->stream = stream; + context->device_interface = device_interface; context->block_pool = block_pool; 
context->host_allocator = host_allocator; context->query_capacity = IREE_ARRAYSIZE(context->event_pool); @@ -148,14 +148,13 @@ iree_status_t iree_hal_cuda_tracing_context_allocate( // Pre-allocate all events in the event pool. if (iree_status_is_ok(status)) { IREE_TRACE_ZONE_BEGIN_NAMED( - z_event_pool, "iree_hal_cuda_tracing_context_allocate_event_pool"); + z_event_pool, "iree_hal_stream_tracing_context_allocate_event_pool"); IREE_TRACE_ZONE_APPEND_VALUE_I64(z_event_pool, (int64_t)context->query_capacity); context->event_freelist_head = &context->event_pool[0]; for (iree_host_size_t i = 0; i < context->query_capacity; ++i) { - status = IREE_CURESULT_TO_STATUS( - symbols, - cuEventCreate(&context->event_pool[i].event, CU_EVENT_DEFAULT)); + status = device_interface->vtable->create_native_event( + device_interface, &context->event_pool[i].event); if (!iree_status_is_ok(status)) break; if (i > 0) { context->event_pool[i - 1].next_in_command_buffer = @@ -176,12 +175,12 @@ iree_status_t iree_hal_cuda_tracing_context_allocate( int64_t gpu_timestamp = 0; float timestamp_period = 0.0f; if (iree_status_is_ok(status)) { - status = IREE_CURESULT_TO_STATUS( - symbols, cuEventCreate(&context->base_event, CU_EVENT_DEFAULT)); + status = device_interface->vtable->create_native_event( + device_interface, &context->base_event); } if (iree_status_is_ok(status)) { - status = iree_hal_cuda_tracing_context_initial_calibration( - symbols, stream, context->base_event, &cpu_timestamp, &gpu_timestamp, + status = iree_hal_stream_tracing_context_initial_calibration( + device_interface, context->base_event, &cpu_timestamp, &gpu_timestamp, ×tamp_period); } @@ -196,33 +195,33 @@ iree_status_t iree_hal_cuda_tracing_context_allocate( if (iree_status_is_ok(status)) { *out_context = context; } else { - iree_hal_cuda_tracing_context_free(context); + iree_hal_stream_tracing_context_free(context); } IREE_TRACE_ZONE_END(z0); return status; } -void iree_hal_cuda_tracing_context_free( - 
iree_hal_cuda_tracing_context_t* context) { +void iree_hal_stream_tracing_context_free( + iree_hal_stream_tracing_context_t* context) { if (!context) return; IREE_TRACE_ZONE_BEGIN(z0); // Always perform a collection on shutdown. - iree_hal_cuda_tracing_context_collect(context); + iree_hal_stream_tracing_context_collect(context); // Release all events; since collection completed they should all be unused. - IREE_TRACE_ZONE_BEGIN_NAMED(z_event_pool, - "iree_hal_cuda_tracing_context_free_event_pool"); + IREE_TRACE_ZONE_BEGIN_NAMED( + z_event_pool, "iree_hal_stream_tracing_context_free_event_pool"); for (iree_host_size_t i = 0; i < context->query_capacity; ++i) { if (context->event_pool[i].event) { - IREE_CUDA_IGNORE_ERROR(context->symbols, - cuEventDestroy(context->event_pool[i].event)); + context->device_interface->vtable->destroy_native_event( + context->device_interface, context->event_pool[i].event); } } IREE_TRACE_ZONE_END(z_event_pool); if (context->base_event) { - IREE_CUDA_IGNORE_ERROR(context->symbols, - cuEventDestroy(context->base_event)); + context->device_interface->vtable->destroy_native_event( + context->device_interface, context->base_event); } iree_slim_mutex_deinitialize(&context->event_mutex); @@ -233,61 +232,63 @@ void iree_hal_cuda_tracing_context_free( IREE_TRACE_ZONE_END(z0); } -void iree_hal_cuda_tracing_context_collect( - iree_hal_cuda_tracing_context_t* context) { +void iree_hal_stream_tracing_context_collect( + iree_hal_stream_tracing_context_t* context) { if (!context) return; iree_slim_mutex_lock(&context->event_mutex); - // No outstanding queries if (!context->submitted_event_list.head) { iree_slim_mutex_unlock(&context->event_mutex); return; } IREE_TRACE_ZONE_BEGIN(z0); + // submitted_event_list is a list of the head elements for each command // buffer that has been submitted. Here we loop over all of the events, - // wait for them to complete and gather the results with cuEventQuery. 
- - iree_hal_cuda_tracing_context_event_t* events = + // wait for them to complete and gather the results with event_query. + iree_hal_stream_tracing_context_event_t* events = context->submitted_event_list.head; uint32_t read_query_count = 0; // Outer per-command_buffer loop. while (events) { - iree_hal_cuda_tracing_context_event_t* event = events; + iree_hal_stream_tracing_context_event_t* event = events; // Inner per-event loop. while (event) { uint32_t query_id = (uint32_t)(event - &context->event_pool[0]); - - CUresult result = context->symbols->cuEventSynchronize(event->event); - if (result != CUDA_SUCCESS) break; - result = context->symbols->cuEventQuery(event->event); - if (result != CUDA_SUCCESS) break; + iree_status_t status = + context->device_interface->vtable->synchronize_native_event( + context->device_interface, event->event); + if (!iree_status_is_ok(status)) break; + status = context->device_interface->vtable->query_native_event( + context->device_interface, event->event); + if (!iree_status_is_ok(status)) break; // Calculate context-relative time and notify tracy. 
float relative_millis = 0.0f; - IREE_CUDA_IGNORE_ERROR( - context->symbols, - cuEventElapsedTime(&relative_millis, context->base_event, - event->event)); + context->device_interface->vtable->event_elapsed_time( + context->device_interface, &relative_millis, context->base_event, + event->event); + int64_t gpu_timestamp = (int64_t)((double)relative_millis * 1000000.0); - iree_tracing_gpu_zone_notify(context->id, query_id, gpu_timestamp); + iree_tracing_gpu_zone_notify(context->id, query_id, gpu_timestamp); read_query_count += 1; event = event->next_in_command_buffer; } - iree_hal_cuda_tracing_context_event_t* next = events->next_submission; + iree_hal_stream_tracing_context_event_t* next = events->next_submission; events->was_submitted = true; events = next; context->submitted_event_list.head = events; } IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)read_query_count); + IREE_TRACE_ZONE_END(z0); iree_slim_mutex_unlock(&context->event_mutex); } -void iree_hal_cuda_tracing_notify_submitted( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list) { +void iree_hal_stream_tracing_notify_submitted( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list) { if (!context) return; IREE_ASSERT_ARGUMENT(event_list); iree_slim_mutex_lock(&context->event_mutex); @@ -308,19 +309,17 @@ void iree_hal_cuda_tracing_notify_submitted( iree_slim_mutex_unlock(&context->event_mutex); } -void iree_hal_cuda_tracing_free( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list) { +void iree_hal_stream_tracing_free( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list) { if (!context) return; iree_slim_mutex_lock(&context->event_mutex); - IREE_ASSERT_ARGUMENT(event_list); if (!event_list->head) { iree_slim_mutex_unlock(&context->event_mutex); return; } - // Free an event list that was previously created. 
There is some book-keeping // to keep tracy happy, and then we remove the elements from the // passed in event_list and add them to the front of the free-list. @@ -328,7 +327,7 @@ void iree_hal_cuda_tracing_free( // If this event list has never been submitted we still need to add values to // the timeline otherwise tracy will not behave correctly. if (!event_list->head->was_submitted) { - iree_hal_cuda_tracing_context_event_t* event = event_list->head; + iree_hal_stream_tracing_context_event_t* event = event_list->head; while (event) { uint32_t query_id = (uint32_t)(event - &context->event_pool[0]); iree_tracing_gpu_zone_notify(context->id, query_id, 0); @@ -351,9 +350,9 @@ void iree_hal_cuda_tracing_free( iree_slim_mutex_unlock(&context->event_mutex); } -static void iree_hal_cuda_tracing_context_event_list_append_event( - iree_hal_cuda_tracing_context_event_list_t* event_list, - iree_hal_cuda_tracing_context_event_t* event) { +static void iree_hal_stream_tracing_context_event_list_append_event( + iree_hal_stream_tracing_context_event_list_t* event_list, + iree_hal_stream_tracing_context_event_t* event) { if (!event_list->head) { event_list->head = event; event_list->tail = event; @@ -366,25 +365,25 @@ static void iree_hal_cuda_tracing_context_event_list_append_event( // Grabs the next available query out of the freelist and adds it to // the event_list that was passed in. Also starts the recording of the // event. -static uint16_t iree_hal_cuda_stream_tracing_context_insert_query( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, - iree_hal_cuda_tracing_verbosity_t verbosity) { +static uint16_t iree_hal_stream_tracing_context_insert_query( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list) { iree_slim_mutex_lock(&context->event_mutex); IREE_ASSERT_ARGUMENT(event_list); // Allocate an event from the pool for use by the query. 
// TODO: If we have run out of our freelist, then we need to try and recover - // allocate events. - iree_hal_cuda_tracing_context_event_t* event = context->event_freelist_head; + // or allocate more events. + iree_hal_stream_tracing_context_event_t* event = context->event_freelist_head; context->event_freelist_head = event->next_in_command_buffer; uint32_t query_id = event - &context->event_pool[0]; IREE_ASSERT(event->next_in_command_buffer != NULL); event->next_in_command_buffer = NULL; - IREE_CUDA_IGNORE_ERROR(context->symbols, cuEventRecord(event->event, stream)); + IREE_IGNORE_ERROR(context->device_interface->vtable->record_native_event( + context->device_interface, event->event)); - iree_hal_cuda_tracing_context_event_list_append_event(event_list, event); + iree_hal_stream_tracing_context_event_list_append_event(event_list, event); iree_slim_mutex_unlock(&context->event_mutex); return query_id; @@ -394,31 +393,34 @@ static uint16_t iree_hal_cuda_stream_tracing_context_insert_query( // the event_list that was passed in. Also inserts the event record // node into the passed in graph. It returns the index of the // event. -static uint16_t iree_hal_cuda_graph_tracing_context_insert_query( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, - CUgraphNode* out_node, CUgraph graph, - iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes, +static uint16_t iree_hal_graph_tracing_context_insert_query( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list, + iree_hal_stream_tracing_native_graph_node_t* out_node, + iree_hal_stream_tracing_native_graph_t graph, + iree_hal_stream_tracing_native_graph_node_t* dependency_nodes, size_t dependency_nodes_count) { IREE_ASSERT_ARGUMENT(event_list); iree_slim_mutex_lock(&context->event_mutex); - // Allocate an event from the pool for use by the query. 
// TODO: If we have run out of our freelist, then we need to try and recover - // or allocate more events. - iree_hal_cuda_tracing_context_event_t* event = context->event_freelist_head; + // or + // allocate more events. + iree_hal_stream_tracing_context_event_t* event = context->event_freelist_head; context->event_freelist_head = event->next_in_command_buffer; uint32_t query_id = event - &context->event_pool[0]; IREE_ASSERT(event->next_in_command_buffer != NULL); event->next_in_command_buffer = NULL; - iree_status_t status = IREE_CURESULT_TO_STATUS( - context->symbols, - cuGraphAddEventRecordNode(out_node, graph, dependency_nodes, - dependency_nodes_count, event->event)); + iree_status_t status = + context->device_interface->vtable->add_graph_event_record_node( + context->device_interface, out_node, graph, dependency_nodes, + dependency_nodes_count, event->event); + // TODO(awoloszyn): Actually propagate this as an error instead + // of just asserting. IREE_ASSERT(iree_status_is_ok(status)); - iree_hal_cuda_tracing_context_event_list_append_event(event_list, event); + iree_hal_stream_tracing_context_event_list_append_event(event_list, event); iree_slim_mutex_unlock(&context->event_mutex); return query_id; @@ -428,102 +430,104 @@ static uint16_t iree_hal_cuda_graph_tracing_context_insert_query( // today we insert 2 events per zone (one for begin and one for end) but in // many cases we could reduce this by inserting events only between zones and // using the differences between them. 
- -void iree_hal_cuda_stream_tracing_zone_begin_impl( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, - iree_hal_cuda_tracing_verbosity_t verbosity, +void iree_hal_stream_tracing_zone_begin_impl( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list, + iree_hal_stream_tracing_verbosity_t verbosity, const iree_tracing_location_t* src_loc) { if (!context) return; if (verbosity > context->verbosity) return; - - uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query( - context, event_list, stream, verbosity); + uint16_t query_id = + iree_hal_stream_tracing_context_insert_query(context, event_list); iree_tracing_gpu_zone_begin(context->id, query_id, src_loc); } -void iree_hal_cuda_stream_tracing_zone_begin_external_impl( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, - iree_hal_cuda_tracing_verbosity_t verbosity, const char* file_name, +void iree_hal_stream_tracing_zone_begin_external_impl( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list, + iree_hal_stream_tracing_verbosity_t verbosity, const char* file_name, size_t file_name_length, uint32_t line, const char* function_name, size_t function_name_length, const char* name, size_t name_length) { if (!context) return; if (verbosity > context->verbosity) return; - uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query( - context, event_list, stream, verbosity); + uint16_t query_id = + iree_hal_stream_tracing_context_insert_query(context, event_list); iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name, file_name_length, line, function_name, function_name_length, name, name_length); } -void iree_hal_cuda_graph_tracing_zone_begin_external_impl( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* 
event_list, - CUgraphNode* out_node, CUgraph graph, - iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes, +void iree_hal_graph_tracing_zone_begin_external_impl( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list, + iree_hal_stream_tracing_native_graph_node_t* out_node, + iree_hal_stream_tracing_native_graph_t graph, + iree_hal_stream_tracing_verbosity_t verbosity, + iree_hal_stream_tracing_native_graph_node_t* dependency_nodes, size_t dependency_nodes_count, const char* file_name, size_t file_name_length, uint32_t line, const char* function_name, size_t function_name_length, const char* name, size_t name_length) { if (!context) return; if (verbosity > context->verbosity) return; - uint16_t query_id = iree_hal_cuda_graph_tracing_context_insert_query( - context, event_list, out_node, graph, verbosity, dependency_nodes, + uint16_t query_id = iree_hal_graph_tracing_context_insert_query( + context, event_list, out_node, graph, dependency_nodes, dependency_nodes_count); iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name, file_name_length, line, function_name, function_name_length, name, name_length); } -void iree_hal_cuda_stream_tracing_zone_end_impl( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, - iree_hal_cuda_tracing_verbosity_t verbosity) { +void iree_hal_stream_tracing_zone_end_impl( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list, + iree_hal_stream_tracing_verbosity_t verbosity) { if (!context) return; if (verbosity > context->verbosity) return; - uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query( - context, event_list, stream, verbosity); + uint16_t query_id = + iree_hal_stream_tracing_context_insert_query(context, event_list); iree_tracing_gpu_zone_end(context->id, query_id); } -void 
iree_hal_cuda_graph_tracing_zone_end_impl( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, - CUgraphNode* out_node, CUgraph graph, - iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes, +void iree_hal_graph_tracing_zone_end_impl( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list, + iree_hal_stream_tracing_native_graph_node_t* out_node, + iree_hal_stream_tracing_native_graph_t graph, + iree_hal_stream_tracing_verbosity_t verbosity, + iree_hal_stream_tracing_native_graph_node_t* dependency_nodes, size_t dependency_nodes_count) { if (!context) return; if (verbosity > context->verbosity) return; - uint16_t query_id = iree_hal_cuda_graph_tracing_context_insert_query( - context, event_list, out_node, graph, verbosity, dependency_nodes, + uint16_t query_id = iree_hal_graph_tracing_context_insert_query( + context, event_list, out_node, graph, dependency_nodes, dependency_nodes_count); iree_tracing_gpu_zone_end(context->id, query_id); } #else -iree_status_t iree_hal_cuda_tracing_context_allocate( - const iree_hal_cuda_dynamic_symbols_t* symbols, - iree_string_view_t queue_name, CUstream stream, - iree_hal_cuda_tracing_verbosity_t stream_tracing_verbosity, +iree_status_t iree_hal_stream_tracing_context_allocate( + iree_hal_stream_tracing_device_interface_t* interface, + iree_string_view_t queue_name, + iree_hal_stream_tracing_verbosity_t stream_tracing_verbosity, iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, - iree_hal_cuda_tracing_context_t** out_context) { + iree_hal_stream_tracing_context_t** out_context) { *out_context = NULL; return iree_ok_status(); } -void iree_hal_cuda_tracing_context_free( - iree_hal_cuda_tracing_context_t* context) {} +void iree_hal_stream_tracing_context_free( + iree_hal_stream_tracing_context_t* context) {} -void iree_hal_cuda_tracing_context_collect( - iree_hal_cuda_tracing_context_t* 
context) {} +void iree_hal_stream_tracing_context_collect( + iree_hal_stream_tracing_context_t* context) {} -void iree_hal_cuda_tracing_notify_submitted( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list) {} +void iree_hal_stream_tracing_notify_submitted( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list) {} -void iree_hal_cuda_tracing_free( - iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list) {} +void iree_hal_stream_tracing_free( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list) {} #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE diff --git a/runtime/src/iree/hal/utils/stream_tracing.h b/runtime/src/iree/hal/utils/stream_tracing.h new file mode 100644 index 000000000000..9314df237da9 --- /dev/null +++ b/runtime/src/iree/hal/utils/stream_tracing.h @@ -0,0 +1,249 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_HAL_UTILS_STREAM_TRACING_H_ +#define IREE_HAL_UTILS_STREAM_TRACING_H_ + +#include "iree/base/api.h" +#include "iree/base/internal/arena.h" +#include "iree/base/internal/synchronization.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Per-stream tracing context. +// No-op if IREE tracing is not enabled. +// +// Use the IREE_TRACE_* macros to trace a contiguous set of stream +// operations. Unlike the normal tracy macros there are no zone IDs and instead +// each stream gets an ID allocated once and passed to all tracing macros. 
+// +// Usage: +// IREE_HAL_STREAM_TRACE_ZONE_BEGIN(context, &event_list, verbosity); +// *LaunchKernel(..., stream); +// IREE_HAL_STREAM_TRACE_ZONE_END(context, &event_list, verbosity); +// ... +// iree_hal_stream_tracing_context_collect(context); +// +// NOTE: timestamps can have non-trivial side-effecting behavior and may +// introduce serialization in graph execution. +// +// TODO: expose event reservation separate from recording. For +// graphs we will need to insert the events but in order to reuse the graphs +// we'll need to reserve and patch new events each graph launch. +// +// Thread-compatible: external synchronization is required if using from +// multiple threads (same as with the streams themselves). +typedef struct iree_hal_stream_tracing_context_t + iree_hal_stream_tracing_context_t; +typedef struct iree_hal_stream_tracing_context_event_t + iree_hal_stream_tracing_context_event_t; + +// This is used when tracing is enabled. Calls to dispatch and event related +// functions will update the pointers to keep the list up to date.
+typedef struct iree_hal_stream_tracing_context_event_list_t { + iree_hal_stream_tracing_context_event_t* head; + iree_hal_stream_tracing_context_event_t* tail; +} iree_hal_stream_tracing_context_event_list_t; + +typedef enum iree_hal_stream_tracing_verbosity_e { + IREE_HAL_TRACING_VERBOSITY_OFF = 0, + IREE_HAL_TRACING_VERBOSITY_COARSE, + IREE_HAL_TRACING_VERBOSITY_FINE, + IREE_HAL_TRACING_VERBOSITY_MAX +} iree_hal_stream_tracing_verbosity_t; + +typedef struct iree_hal_stream_tracing_device_interface_vtable_t + iree_hal_stream_tracing_device_interface_vtable_t; + +typedef struct iree_hal_stream_tracing_device_interface_t { + const iree_hal_stream_tracing_device_interface_vtable_t* vtable; +} iree_hal_stream_tracing_device_interface_t; + +typedef void* iree_hal_stream_tracing_native_event_t; +typedef void* iree_hal_stream_tracing_native_graph_node_t; +typedef void* iree_hal_stream_tracing_native_graph_t; + +typedef struct iree_hal_stream_tracing_device_interface_vtable_t { + // Destroys the iree_hal_stream_tracing_device_interface_t. + // Called when the iree_hal_stream_tracing_context_t is done with it. + void(IREE_API_PTR* destroy)( + iree_hal_stream_tracing_device_interface_t* device_interface); + // Causes the CPU to wait on the |event| to complete. + iree_status_t(IREE_API_PTR* synchronize_native_event)( + iree_hal_stream_tracing_device_interface_t* device_interface, + iree_hal_stream_tracing_native_event_t event); + // Returns a new native event to be used for timing. + iree_status_t(IREE_API_PTR* create_native_event)( + iree_hal_stream_tracing_device_interface_t* device_interface, + iree_hal_stream_tracing_native_event_t* event); + // Returns iree_status_ok if the event is completed. + iree_status_t(IREE_API_PTR* query_native_event)( + iree_hal_stream_tracing_device_interface_t* device_interface, + iree_hal_stream_tracing_native_event_t event); + // Returns the difference in milliseconds between the + // completed events |start_event| and |end_event|. 
+ void(IREE_API_PTR* event_elapsed_time)( + iree_hal_stream_tracing_device_interface_t* device_interface, + float* relative_millis, + iree_hal_stream_tracing_native_event_t start_event, + iree_hal_stream_tracing_native_event_t end_event); + // Destroys the native event. + void(IREE_API_PTR* destroy_native_event)( + iree_hal_stream_tracing_device_interface_t* device_interface, + iree_hal_stream_tracing_native_event_t event); + // Records the native event into the stream associated + // with the device. + iree_status_t(IREE_API_PTR* record_native_event)( + iree_hal_stream_tracing_device_interface_t* device_interface, + iree_hal_stream_tracing_native_event_t event); + // Adds a record of the native event into the given graph. + iree_status_t(IREE_API_PTR* add_graph_event_record_node)( + iree_hal_stream_tracing_device_interface_t* device_interface, + iree_hal_stream_tracing_native_graph_node_t* out_node, + iree_hal_stream_tracing_native_graph_t graph, + iree_hal_stream_tracing_native_graph_node_t* dependency_nodes, + size_t dependency_nodes_count, + iree_hal_stream_tracing_native_event_t event); +} iree_hal_stream_tracing_device_interface_vtable_t; + +// Allocates a tracing context for the given |interface|. +// Each context must only be used with the interface it was +// created for. +// The tracing context takes ownership of the interface, +// and the interface's destroy method will be called +// when cleanup is required. +iree_status_t iree_hal_stream_tracing_context_allocate( + iree_hal_stream_tracing_device_interface_t* interface, + iree_string_view_t queue_name, + iree_hal_stream_tracing_verbosity_t stream_tracing_verbosity, + iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, + iree_hal_stream_tracing_context_t** out_context); + +// Frees a tracing context and all associated resources. +// All submissions using the resources must be completed prior to calling. 
+void iree_hal_stream_tracing_context_free( + iree_hal_stream_tracing_context_t* context); + +// Collects in-flight timestamp queries from the stream and feeds them to tracy. +// Must be called frequently (every submission, etc) to drain the backlog; +// tracing may start failing if the internal ringbuffer is exceeded. +void iree_hal_stream_tracing_context_collect( + iree_hal_stream_tracing_context_t* context); + +// Notifies that the given list of events has been dispatched to the GPU. +void iree_hal_stream_tracing_notify_submitted( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list); + +// Frees the events and returns them back into the tracing context. +void iree_hal_stream_tracing_free( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list); + +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE + +// Begins a normal zone derived on the calling |src_loc|. +// Must be perfectly nested and paired with a corresponding zone end. +void iree_hal_stream_tracing_zone_begin_impl( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list, + iree_hal_stream_tracing_verbosity_t verbosity, + const iree_tracing_location_t* src_loc); + +// Begins an external zone using the given source information. +// The provided strings will be copied into the tracy buffer.
+void iree_hal_stream_tracing_zone_begin_external_impl( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list, + iree_hal_stream_tracing_verbosity_t verbosity, const char* file_name, + size_t file_name_length, uint32_t line, const char* function_name, + size_t function_name_length, const char* name, size_t name_length); + +void iree_hal_graph_tracing_zone_begin_external_impl( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list, + iree_hal_stream_tracing_native_graph_node_t* out_node, + iree_hal_stream_tracing_native_graph_t graph, + iree_hal_stream_tracing_verbosity_t verbosity, + iree_hal_stream_tracing_native_graph_node_t* dependency_nodes, + size_t dependency_nodes_count, const char* file_name, + size_t file_name_length, uint32_t line, const char* function_name, + size_t function_name_length, const char* name, size_t name_length); + +void iree_hal_stream_tracing_zone_end_impl( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list, + iree_hal_stream_tracing_verbosity_t verbosity); +void iree_hal_graph_tracing_zone_end_impl( + iree_hal_stream_tracing_context_t* context, + iree_hal_stream_tracing_context_event_list_t* event_list, + iree_hal_stream_tracing_native_graph_node_t* out_node, + iree_hal_stream_tracing_native_graph_t graph, + iree_hal_stream_tracing_verbosity_t verbosity, + iree_hal_stream_tracing_native_graph_node_t* dependency_nodes, + size_t dependency_nodes_count); + +// Begins a new zone with the parent function name. 
+#define IREE_HAL_STREAM_TRACE_ZONE_BEGIN(context, event_list, verbosity) \ + static const iree_tracing_location_t TracyConcat( \ + __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \ + (uint32_t)__LINE__, 0}; \ + iree_hal_stream_tracing_zone_begin_impl( \ + context, event_list, verbosity, \ + &TracyConcat(__tracy_source_location, __LINE__)); + +// Begins an externally defined zone with a dynamic source location. +// The |file_name|, |function_name|, and optional |name| strings will be copied +// into the trace buffer and do not need to persist. +#define IREE_HAL_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, event_list, verbosity, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) \ + iree_hal_stream_tracing_zone_begin_external_impl( \ + context, event_list, verbosity, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) + +#define IREE_HAL_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, event_list, out_node, graph, verbosity, dependency_nodes, \ + dependency_nodes_count, file_name, file_name_length, line, function_name, \ + function_name_length, name, name_length) \ + iree_hal_graph_tracing_zone_begin_external_impl( \ + context, event_list, out_node, graph, verbosity, dependency_nodes, \ + dependency_nodes_count, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) + +#define IREE_HAL_STREAM_TRACE_ZONE_END(context, event_list, verbosity) \ + iree_hal_stream_tracing_zone_end_impl(context, event_list, verbosity) + +#define IREE_HAL_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \ + verbosity, dependency_nodes, \ + dependency_nodes_count) \ + iree_hal_graph_tracing_zone_end_impl(context, event_list, out_node, graph, \ + verbosity, dependency_nodes, \ + dependency_nodes_count) +#else + +#define IREE_HAL_STREAM_TRACE_ZONE_BEGIN(context, event_list, verbosity) +#define IREE_HAL_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ + 
context, event_list, verbosity, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) +#define IREE_HAL_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, event_list, out_node, graph, verbosity, dependency_nodes, \ + dependency_nodes_count, file_name, file_name_length, line, function_name, \ + function_name_length, name, name_length) +#define IREE_HAL_STREAM_TRACE_ZONE_END(context, evnet_list, verbosity) +#define IREE_HAL_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \ + verbosity, dependency_nodes, \ + dependency_nodes_count) +#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_HAL_UTILS_STREAM_TRACING_H_