From 0aa4983ec7fc0b849373521bf4fc2f69f77168c9 Mon Sep 17 00:00:00 2001 From: Solomon Bekele Date: Wed, 23 Oct 2024 16:22:51 -0500 Subject: [PATCH] Sampling Improvement (#295) * Dependency update (#286) Use dependency from Efficios deliverable. --------- Co-authored-by: Thomas Applencourt * Add archive (#287) Enable usage of session rotation for lossless online trace consumption. --------- Co-authored-by: Thomas Applencourt * Single rank profiling (#288) * Make only local master do energy profiling. * Use ZES to query devices in order to get around affinity masks. * Use ZES for drivers as well. * set ZES * Update ze/tracer_ze_helpers.include.c Co-authored-by: Brice Videau * Update ze/tracer_ze_helpers.include.c Co-authored-by: Brice Videau * Update xprof/xprof.rb.in --------- Co-authored-by: Brice Videau Co-authored-by: Thomas Applencourt * fabricPort sampling * Fabric Timeline Sampling timeline rewrite timeline cleanup timeline fix Minor fix * Sampling rewrite updated * Timeline cleanup clean up * include telemetry handles * uuid based timeline * Memory sampling timeline cleanup * zes_support * rebased * Code cleanup and fix * Update ze/btx_zeinterval_callbacks.cpp Co-authored-by: Thomas Applencourt * Update xprof/xprof.rb.in Co-authored-by: Thomas Applencourt * Update xprof/xprof.rb.in Co-authored-by: Thomas Applencourt * PR corrections * separate sampling stream * Deltas and Hash-return handled * Remove Ze calls for subDevice * minor change on delta --------- Co-authored-by: Thomas Applencourt Co-authored-by: Thomas Applencourt Co-authored-by: Brice Videau Co-authored-by: sbekele Co-authored-by: Solomon Bekele Co-authored-by: Solomon Bekele Co-authored-by: Solomon Bekele Co-authored-by: Solomon Bekele Co-authored-by: Solomon Bekele --- utils/xprof_utils.hpp | 9 +- xprof/btx_interval_model.yaml | 152 +++++++++- xprof/btx_timeline.cpp | 317 +++++++++++++++------ ze/btx_zeinterval_callbacks.cpp | 399 ++++++++++++++++++++------ ze/btx_zeinterval_callbacks.hpp | 40 
++- ze/tracer_ze_helpers.include.c | 485 ++++++++++++++++++++++---------- ze/ze_events.yaml | 121 ++++++-- 7 files changed, 1149 insertions(+), 374 deletions(-) diff --git a/utils/xprof_utils.hpp b/utils/xprof_utils.hpp index d7b0da96..8d1c4c08 100644 --- a/utils/xprof_utils.hpp +++ b/utils/xprof_utils.hpp @@ -55,7 +55,9 @@ typedef intptr_t process_id_t; typedef uintptr_t thread_id_t; typedef std::string hostname_t; typedef std::string thapi_function_name; -typedef uintptr_t thapi_device_id; +typedef uint64_t thapi_device_id; +typedef uint64_t thapi_telemetry_handle; +typedef uintptr_t thapi_fabricPort_id; typedef uint32_t thapi_domain_idx; typedef uint32_t thapi_sdevice_idx; @@ -69,9 +71,10 @@ typedef std::tuple hpt_device_function_name_t; typedef std::tuple hp_device_t; +typedef std::tuple h_device_t; typedef std::tuple hp_dsd_t; -typedef std::tuple hp_ddomain_t; -typedef std::tuple hp_dsdev_t; +typedef std::tuple h_ddomain_t; +typedef std::tuple h_dfsdev_t; typedef std::tuple sd_t; typedef std::tuple tfn_ts_t; typedef std::tuple fn_ts_t; diff --git a/xprof/btx_interval_model.yaml b/xprof/btx_interval_model.yaml index 56a519c6..02dca13a 100644 --- a/xprof/btx_interval_model.yaml +++ b/xprof/btx_interval_model.yaml @@ -93,7 +93,21 @@ :field_class: :type: string :cast_type: const char* - - :name: lttng:frequency + - :name: interval_sampling + :event_common_context_field_class: + :type: structure + :members: + - :name: hostname + :field_class: + :type: string + :cast_type: const char* + - :name: ts + :field_class: + :type: integer_signed + :field_value_range: 64 + :cast_type: int64_t + :event_classes: + - :name: sampling:frequency :payload_field_class: :type: structure :members: @@ -102,6 +116,16 @@ :type: integer_unsigned :field_value_range: 64 :cast_type: uint64_t + - :name: deviceIdx + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: hFrequency + :field_class: + :type: integer_unsigned + :field_value_range: 64 + 
:cast_type: uint64_t - :name: domain :field_class: :type: integer_unsigned @@ -112,7 +136,7 @@ :type: integer_unsigned :field_value_range: 64 :cast_type: uint64_t - - :name: lttng:power + - :name: sampling:power :payload_field_class: :type: structure :members: @@ -121,6 +145,16 @@ :type: integer_unsigned :field_value_range: 64 :cast_type: uint64_t + - :name: deviceIdx + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: hPower + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t - :name: domain :field_class: :type: integer_unsigned @@ -131,7 +165,7 @@ :type: integer_unsigned :field_value_range: 64 :cast_type: uint64_t - - :name: lttng:computeEU + - :name: sampling:computeEU :payload_field_class: :type: structure :members: @@ -140,6 +174,16 @@ :type: integer_unsigned :field_value_range: 64 :cast_type: uint64_t + - :name: deviceIdx + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: hEngine + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t - :name: subDevice :field_class: :type: integer_unsigned @@ -149,7 +193,7 @@ :field_class: :type: single :cast_type: float - - :name: lttng:copyEU + - :name: sampling:copyEU :payload_field_class: :type: structure :members: @@ -158,6 +202,16 @@ :type: integer_unsigned :field_value_range: 64 :cast_type: uint64_t + - :name: deviceIdx + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: hEngine + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t - :name: subDevice :field_class: :type: integer_unsigned @@ -167,3 +221,93 @@ :field_class: :type: single :cast_type: float + - :name: sampling:fabricPort + :payload_field_class: + :type: structure + :members: + - :name: did + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t + - :name: deviceIdx + 
:field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: hFabricPort + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t + - :name: subDevice + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: portId + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: remotePortId + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: rxThroughput + :field_class: + :type: double + :cast_type: float + - :name: txThroughput + :field_class: + :type: double + :cast_type: float + - :name: rxSpeed + :field_class: + :type: double + :cast_type: float + - :name: txSpeed + :field_class: + :type: double + :cast_type: float + - :name: sampling:memModule + :payload_field_class: + :type: structure + :members: + - :name: did + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t + - :name: deviceIdx + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: hMemModule + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t + - :name: subDevice + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: pBandwidth + :field_class: + :type: double + :cast_type: float + - :name: rdBandwidth + :field_class: + :type: double + :cast_type: float + - :name: wtBandwidth + :field_class: + :type: double + :cast_type: float + - :name: occupancy + :field_class: + :type: double + :cast_type: float diff --git a/xprof/btx_timeline.cpp b/xprof/btx_timeline.cpp index 494da209..06609f2f 100644 --- a/xprof/btx_timeline.cpp +++ b/xprof/btx_timeline.cpp @@ -5,8 +5,8 @@ #include // set precision #include // stdcout #include -#include #include +#include #include #include // pair @@ -30,17 +30,19 @@ struct timeline_dispatch_s { std::map> track2lasts; - 
std::unordered_map hp_device2countertracks; - std::unordered_map hp_ddomain2frqtracks; - std::unordered_map hp_ddomain2pwrtracks; - std::unordered_map hp_dsdev2cpetracks; - std::unordered_map hp_dsdev2cpytracks; - + std::unordered_map hp_device2countertracks; + std::unordered_map hp_ddomain2telmtracks; + std::unordered_map hp_ddomain2cpyalloctracks; + std::unordered_map hp_dfsdev2fptracks; perfetto_pruned::Trace trace; }; +// Keeps extra parameters that do not fit the default getter +using Extras = std::tuple; + using timeline_dispatch_t = struct timeline_dispatch_s; -using uuid_getter_t = perfetto_uuid_t (*)(timeline_dispatch_t *, std::string, uint64_t, uintptr_t, - uint32_t); +using uuid_getter_t = perfetto_uuid_t (*)(timeline_dispatch_t *, const std::string &, uint64_t, + uint32_t, uint64_t, uint32_t, std::optional); + static perfetto_uuid_t gen_perfetto_uuid() { // Start at one, Look like UUID 0 is special static std::atomic uuid{1}; @@ -48,11 +50,10 @@ static perfetto_uuid_t gen_perfetto_uuid() { } static perfetto_uuid_t get_parent_counter_track_uuid(timeline_dispatch_t *dispatch, - std::string hostname, uint64_t process_id, - thapi_device_id did) { + std::string hostname, thapi_device_id did, + uint32_t deviceIdx) { perfetto_uuid_t hp_uuid = 0; - auto [it, inserted] = - dispatch->hp_device2countertracks.insert({{hostname, process_id, did}, hp_uuid}); + auto [it, inserted] = dispatch->hp_device2countertracks.insert({{hostname, did}, hp_uuid}); auto &potential_uuid = it->second; // Exists if (!inserted) @@ -72,78 +73,168 @@ static perfetto_uuid_t get_parent_counter_track_uuid(timeline_dispatch_t *dispat auto *process = track_descriptor->mutable_process(); process->set_pid(hp_uuid); std::ostringstream oss; - oss << "Hostname " << hostname << " | Process " << process_id << " | Device " << did; + oss << "Hostname " << hostname << " | Device " << deviceIdx; // oss << " | " << track_name << " | uuid "; process->set_process_name(oss.str()); return hp_uuid; } 
-static perfetto_uuid_t -get_counter_track_uuuid(timeline_dispatch_t *dispatch, - std::unordered_map &counter_tracks, - const std::string track_name, std::string hostname, uint64_t process_id, - thapi_device_id did, thapi_domain_idx domain, int64_t unit_multiplier = 1) { +template +std::pair +insert_or_get_uuid(MapType &counter_map, KeyType key, timeline_dispatch_t *dispatch, + const std::string &hostname, thapi_device_id did, uint32_t deviceIdx) { + perfetto_uuid_t hp_dev_uuid = 0; - auto [it, inserted] = counter_tracks.insert({{hostname, process_id, did, domain}, hp_dev_uuid}); + auto [it, inserted] = counter_map.insert({key, hp_dev_uuid}); auto &potential_uuid = it->second; - // Exists + if (!inserted) - return potential_uuid; + return {potential_uuid, 0}; // If present, return only hp_dev_uuid. - perfetto_uuid_t hp_uuid = get_parent_counter_track_uuid(dispatch, hostname, process_id, did); + // Generate both parent and device uuid if a new insertion + perfetto_uuid_t hp_uuid = get_parent_counter_track_uuid(dispatch, hostname, did, deviceIdx); hp_dev_uuid = gen_perfetto_uuid(); potential_uuid = hp_dev_uuid; - // Create new track + return {hp_dev_uuid, hp_uuid}; +} + +static perfetto_uuid_t get_counter_track_uuuid( + timeline_dispatch_t *dispatch, + std::unordered_map &counter_tracks, + const std::string &track_name, const std::string &hostname, thapi_device_id did, + uint32_t deviceIdx, uint64_t tHandle, thapi_domain_idx domain, + std::optional fabricExtras = std::nullopt, + std::unordered_map *counter_tracks_fp = nullptr) { + + // Choose key and map + std::pair uuids; + if (fabricExtras && counter_tracks_fp) { + bool RxTx = std::get<0>(*fabricExtras); + auto key = std::make_tuple(hostname, did, tHandle, domain, RxTx); + uuids = insert_or_get_uuid(*counter_tracks_fp, key, dispatch, hostname, did, deviceIdx); + } else { + auto key = std::make_tuple(hostname, did, domain, tHandle); + uuids = insert_or_get_uuid(counter_tracks, key, dispatch, hostname, did, 
deviceIdx); + } + + // Get hp_dev_uuid and hp_uuid + perfetto_uuid_t hp_dev_uuid = uuids.first; + perfetto_uuid_t hp_uuid = uuids.second; + + // Packet creation (independent of the map used for UUID storage) auto *packet = dispatch->trace.add_packet(); packet->set_timestamp(0); packet->set_trusted_packet_sequence_id(TRUSTED_PACKED_SEQUENCE_ID); auto *track_descriptor = packet->mutable_track_descriptor(); track_descriptor->set_uuid(hp_dev_uuid); - track_descriptor->set_parent_uuid(hp_uuid); + + // Set the parent UUID if it was generated + if (hp_uuid != 0) { + track_descriptor->set_parent_uuid(hp_uuid); + } + std::ostringstream oss; - oss << track_name << " | Domain " << domain; - track_descriptor->set_name(oss.str()); + if (track_name == "FabricT" && fabricExtras) { + bool RxTx = std::get<0>(*fabricExtras); + uint32_t fabricId = std::get<1>(*fabricExtras); + uint32_t remotePortId = std::get<2>(*fabricExtras); + oss << track_name << " | SD " << domain << " | " << fabricId << "<->" << remotePortId << " | " + << (RxTx ? 
" TX" : " RX"); + } else if (track_name == "Memory") { + oss << track_name << " BW | Module " << domain; + } else if (track_name == "Allocated Memory (%)") { + oss << track_name << " Module " << domain; + } else if (track_name == "CopyEngine (%)" || track_name == "ComputeEngine (%)") { + oss << track_name << " | SubDevice " << domain; + } else if (track_name == "Power") { + if (domain == 0) { + oss << " Total Power"; + } else { + oss << track_name << " | SubDevice " << domain - 1; + } + } else { + oss << track_name << " | SubDevice " << domain; + } - auto *counter_descriptor = track_descriptor->mutable_counter(); - counter_descriptor->set_unit_multiplier(unit_multiplier); + track_descriptor->set_name(oss.str()); + track_descriptor->mutable_counter(); return hp_dev_uuid; } -static perfetto_uuid_t get_frequency_track_uuuid(timeline_dispatch_t *dispatch, - std::string hostname, uint64_t process_id, - thapi_device_id did, thapi_domain_idx domain) { - return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2frqtracks, " GPU Frequency", - hostname, process_id, did, domain); -} -static perfetto_uuid_t get_power_track_uuuid(timeline_dispatch_t *dispatch, std::string hostname, - uint64_t process_id, thapi_device_id did, - thapi_domain_idx domain) { - // Extra leading space in the name field to make GPU Power the first track - return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2pwrtracks, " GPU Power", hostname, - process_id, did, domain); +static perfetto_uuid_t get_copyEU_track_uuuid(timeline_dispatch_t *dispatch, + const std::string &hostname, uint64_t did, + uint32_t deviceIdx, uint64_t hEngine, + uint32_t subDevice, std::optional options) { + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2cpyalloctracks, "CopyEngine (%)", + hostname, did, deviceIdx, hEngine, subDevice); } static perfetto_uuid_t get_computeEU_track_uuuid(timeline_dispatch_t *dispatch, - std::string hostname, uint64_t process_id, - thapi_device_id did, thapi_sdevice_idx 
subDevice) { - return get_counter_track_uuuid(dispatch, dispatch->hp_dsdev2cpetracks, "ComputeEngine (%)", - hostname, process_id, did, subDevice, 100); + const std::string &hostname, uint64_t did, + uint32_t deviceIdx, uint64_t hEngine, + uint32_t subDevice, + std::optional options) { + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, "ComputeEngine (%)", + hostname, did, deviceIdx, hEngine, subDevice); +} + +static perfetto_uuid_t get_fpThroughput_track_uuuid(timeline_dispatch_t *dispatch, + const std::string &hostname, uint64_t did, + uint32_t deviceIdx, uint64_t hFabricPort, + uint32_t subDevice, + std::optional fabricExtras) { + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, "FabricT", hostname, + did, deviceIdx, hFabricPort, subDevice, fabricExtras, + &dispatch->hp_dfsdev2fptracks); +} + +static perfetto_uuid_t get_power_track_uuuid(timeline_dispatch_t *dispatch, + const std::string &hostname, uint64_t did, + uint32_t deviceIdx, uint64_t hPower, + uint32_t subDevice, std::optional options) { + // Extra space to maintain track sequence in the timeline + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, "Power", hostname, did, + deviceIdx, hPower, subDevice); +} + +static perfetto_uuid_t get_frequency_track_uuuid(timeline_dispatch_t *dispatch, + const std::string &hostname, uint64_t did, + uint32_t deviceIdx, uint64_t hFrequency, + uint32_t subDevice, + std::optional options) { + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, "Frequency", hostname, + did, deviceIdx, hFrequency, subDevice); +} + +static perfetto_uuid_t get_bandwidth_track_uuuid(timeline_dispatch_t *dispatch, + const std::string &hostname, uint64_t did, + uint32_t deviceIdx, uint64_t hMemModule, + uint32_t subDevice, + std::optional options) { + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, "Memory", hostname, did, + deviceIdx, hMemModule, subDevice); } -static 
perfetto_uuid_t get_copyEU_track_uuuid(timeline_dispatch_t *dispatch, std::string hostname, - uint64_t process_id, thapi_device_id did, - thapi_sdevice_idx subDevice) { - return get_counter_track_uuuid(dispatch, dispatch->hp_dsdev2cpytracks, "CopyEngine (%)", hostname, - process_id, did, subDevice, 100); +static perfetto_uuid_t get_allocation_track_uuuid(timeline_dispatch_t *dispatch, + const std::string &hostname, uint64_t did, + uint32_t deviceIdx, uint64_t hMemModule, + uint32_t subDevice, + std::optional options) { + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2cpyalloctracks, + "Allocated Memory (%)", hostname, did, deviceIdx, hMemModule, + subDevice); } -static void add_event_DTelemetry(timeline_dispatch_t *dispatch, std::string hostname, - uint64_t process_id, uint64_t thread_id, uintptr_t did, +static void add_event_DTelemetry(timeline_dispatch_t *dispatch, const std::string &hostname, + uint64_t did, uint32_t deviceIdx, uint64_t tHandle, uint32_t subDevice, uint64_t timestamp, float value, - uuid_getter_t uuid_getter, const std::string &eventName) { - perfetto_uuid_t track_uuid = uuid_getter(dispatch, hostname, process_id, did, subDevice); + uuid_getter_t uuid_getter, const std::string &eventName, + std::optional options = std::nullopt) { + perfetto_uuid_t track_uuid; + track_uuid = uuid_getter(dispatch, hostname, did, deviceIdx, tHandle, subDevice, options); + auto *packet = dispatch->trace.add_packet(); packet->set_trusted_packet_sequence_id(TRUSTED_PACKED_SEQUENCE_ID); packet->set_timestamp(timestamp); @@ -153,31 +244,60 @@ static void add_event_DTelemetry(timeline_dispatch_t *dispatch, std::string host track_event->set_double_counter_value(value); } -static void add_event_frequency(timeline_dispatch_t *dispatch, std::string hostname, - uint64_t process_id, uint64_t thread_id, uintptr_t did, - uint32_t domain, uint64_t timestamp, float frequency) { - add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, domain, timestamp, 
frequency, - get_frequency_track_uuuid, "Frequency"); +static void add_event_memModule(timeline_dispatch_t *dispatch, std::string hostname, uint64_t did, + uint32_t deviceIdx, uintptr_t hMemModule, uint32_t subDevice, + uint64_t timestamp, float pBandwidth, float rdBandwidth, + float wtBandwidth, float allocation) { + // Emit the memory bandwidth sample, then the memory allocation sample. + add_event_DTelemetry(dispatch, hostname, did, deviceIdx, hMemModule, subDevice, timestamp, + pBandwidth, get_bandwidth_track_uuuid, "Memory BW"); + + add_event_DTelemetry(dispatch, hostname, did, deviceIdx, hMemModule, subDevice, timestamp, + allocation, get_allocation_track_uuuid, "Memory Allocation"); } -static void add_event_power(timeline_dispatch_t *dispatch, std::string hostname, - uint64_t process_id, uint64_t thread_id, uintptr_t did, uint32_t domain, +static void add_event_fabricPort(timeline_dispatch_t *dispatch, std::string hostname, uint64_t did, + uint32_t deviceIdx, uintptr_t hFabricPort, uint32_t subDevice, + uint64_t timestamp, uint32_t fabricId, uint32_t remotePortId, + float rxThroughput, float txThroughput, float rxSpeed, + float txSpeed) { + // RX sample (RxTx flag false), then TX sample (flag true). 
+ Extras fabricExtras = {false, fabricId, remotePortId}; + add_event_DTelemetry(dispatch, hostname, did, deviceIdx, hFabricPort, subDevice, timestamp, + rxThroughput, get_fpThroughput_track_uuuid, "Fabric ThroughputRX", + fabricExtras); + + fabricExtras = {true, fabricId, remotePortId}; + add_event_DTelemetry(dispatch, hostname, did, deviceIdx, hFabricPort, subDevice, timestamp, + txThroughput, get_fpThroughput_track_uuuid, "Fabric ThroughputTX", + fabricExtras); +} + +static void add_event_frequency(timeline_dispatch_t *dispatch, std::string hostname, uint64_t did, + uint32_t deviceIdx, uint64_t hFrequency, uint32_t subDevice, + uint64_t timestamp, float frequency) { + add_event_DTelemetry(dispatch, hostname, did, deviceIdx, hFrequency, subDevice, timestamp, + frequency, get_frequency_track_uuuid, "Frequency"); +} + +static void add_event_power(timeline_dispatch_t *dispatch, std::string hostname, uint64_t did, + uint32_t deviceIdx, uint64_t hPower, uint32_t subDevice, uint64_t timestamp, float power) { - add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, domain, timestamp, power, + add_event_DTelemetry(dispatch, hostname, did, deviceIdx, hPower, subDevice, timestamp, power, get_power_track_uuuid, "Power"); } -static void add_event_computeEU(timeline_dispatch_t *dispatch, std::string hostname, - uint64_t process_id, uint64_t thread_id, uintptr_t did, - uint32_t subDevice, uint64_t timestamp, float activeTime) { - add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, subDevice, timestamp, +static void add_event_computeEU(timeline_dispatch_t *dispatch, std::string hostname, uint64_t did, + uint32_t deviceIdx, uint64_t hEngine, uint32_t subDevice, + uint64_t timestamp, float activeTime) { + add_event_DTelemetry(dispatch, hostname, did, deviceIdx, hEngine, subDevice, timestamp, activeTime, get_computeEU_track_uuuid, "ComputeEngine"); } -static void add_event_copyEU(timeline_dispatch_t *dispatch, std::string hostname, - uint64_t 
process_id, uint64_t thread_id, uintptr_t did, - uint32_t subDevice, uint64_t timestamp, float activeTime) { - add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, subDevice, timestamp, +static void add_event_copyEU(timeline_dispatch_t *dispatch, std::string hostname, uint64_t did, + uint32_t deviceIdx, uint64_t hEngine, uint32_t subDevice, + uint64_t timestamp, float activeTime) { + add_event_DTelemetry(dispatch, hostname, did, deviceIdx, hEngine, subDevice, timestamp, activeTime, get_copyEU_track_uuuid, "CopyEngine"); } @@ -391,40 +511,61 @@ static void device_usr_callback(void *btx_handle, void *usr_data, const char *ho } static void frequency_usr_callback(void *btx_handle, void *usr_data, const char *hostname, - int64_t vpid, uint64_t vtid, int64_t ts, int64_t backend, - uint64_t did, uint32_t domain, uint64_t frequency) { + int64_t ts, uint64_t did, uint32_t deviceIdx, + uint64_t hFrequency, uint32_t domain, uint64_t frequency) { auto *dispatch = static_cast(usr_data); - add_event_frequency(dispatch, hostname, vpid, vtid, did, domain, ts, frequency); + add_event_frequency(dispatch, hostname, did, deviceIdx, hFrequency, domain, ts, frequency); } -static void power_usr_callback(void *btx_handle, void *usr_data, const char *hostname, int64_t vpid, - uint64_t vtid, int64_t ts, int64_t backend, uint64_t did, - uint32_t domain, uint64_t power) { +static void power_usr_callback(void *btx_handle, void *usr_data, const char *hostname, int64_t ts, + uint64_t did, uint32_t deviceIdx, uint64_t hPower, uint32_t domain, + uint64_t power) { auto *dispatch = static_cast(usr_data); - add_event_power(dispatch, hostname, vpid, vtid, did, domain, ts, power); + add_event_power(dispatch, hostname, did, deviceIdx, hPower, domain, ts, power); } static void computeEU_usr_callback(void *btx_handle, void *usr_data, const char *hostname, - int64_t vpid, uint64_t vtid, int64_t ts, int64_t backend, - uint64_t did, uint32_t subDevice, float activeTime) { + int64_t ts, 
uint64_t did, uint32_t deviceIdx, uint64_t hEngine, + uint32_t subDevice, float activeTime) { auto *dispatch = static_cast(usr_data); - add_event_computeEU(dispatch, hostname, vpid, vtid, did, subDevice, ts, activeTime); + add_event_computeEU(dispatch, hostname, did, deviceIdx, hEngine, subDevice, ts, activeTime); } -static void copyEU_usr_callback(void *btx_handle, void *usr_data, const char *hostname, - int64_t vpid, uint64_t vtid, int64_t ts, int64_t backend, - uint64_t did, uint32_t subDevice, float activeTime) { +static void copyEU_usr_callback(void *btx_handle, void *usr_data, const char *hostname, int64_t ts, + uint64_t did, uint32_t deviceIdx, uint64_t hEngine, + uint32_t subDevice, float activeTime) { + auto *dispatch = static_cast(usr_data); + add_event_copyEU(dispatch, hostname, did, deviceIdx, hEngine, subDevice, ts, activeTime); +} + +static void fabricPort_usr_callback(void *btx_handle, void *usr_data, const char *hostname, + int64_t ts, uint64_t did, uint32_t deviceIdx, + uint64_t hFabricPort, uint32_t subDevice, uint32_t fabricId, + uint32_t remotePortId, float rxThroughput, float txThroughput, + float rxSpeed, float txSpeed) { + auto *dispatch = static_cast(usr_data); + add_event_fabricPort(dispatch, hostname, did, deviceIdx, hFabricPort, subDevice, ts, fabricId, + remotePortId, rxThroughput, txThroughput, rxSpeed, txSpeed); +} + +static void memModule_usr_callback(void *btx_handle, void *usr_data, const char *hostname, + int64_t ts, uint64_t did, uint32_t deviceIdx, + uint64_t hMemModule, uint32_t subDevice, float pBandwidth, + float rdBandwidth, float wtBandwidth, float allocation) { auto *dispatch = static_cast(usr_data); - add_event_copyEU(dispatch, hostname, vpid, vtid, did, subDevice, ts, activeTime); + add_event_memModule(dispatch, hostname, did, deviceIdx, hMemModule, subDevice, ts, pBandwidth, + rdBandwidth, wtBandwidth, allocation); } void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_lttng_host(btx_handle, 
&host_usr_callback); btx_register_callbacks_lttng_device(btx_handle, &device_usr_callback); - btx_register_callbacks_lttng_frequency(btx_handle, &frequency_usr_callback); - btx_register_callbacks_lttng_power(btx_handle, &power_usr_callback); - btx_register_callbacks_lttng_computeEU(btx_handle, &computeEU_usr_callback); - btx_register_callbacks_lttng_copyEU(btx_handle, &copyEU_usr_callback); + btx_register_callbacks_sampling_frequency(btx_handle, &frequency_usr_callback); + btx_register_callbacks_sampling_power(btx_handle, &power_usr_callback); + btx_register_callbacks_sampling_computeEU(btx_handle, &computeEU_usr_callback); + btx_register_callbacks_sampling_copyEU(btx_handle, &copyEU_usr_callback); + btx_register_callbacks_sampling_fabricPort(btx_handle, &fabricPort_usr_callback); + btx_register_callbacks_sampling_memModule(btx_handle, &memModule_usr_callback); btx_register_callbacks_initialize_component(btx_handle, &btx_initialize_component_callback); btx_register_callbacks_read_params(btx_handle, &read_params_callback); btx_register_callbacks_finalize_component(btx_handle, &btx_finalize_component_callback); diff --git a/ze/btx_zeinterval_callbacks.cpp b/ze/btx_zeinterval_callbacks.cpp index ce975fdc..6fb3e461 100644 --- a/ze/btx_zeinterval_callbacks.cpp +++ b/ze/btx_zeinterval_callbacks.cpp @@ -349,8 +349,8 @@ static void hSignalEvent_hKernel_with_group_entry_callback( std::stringstream metadata_s; metadata_s << "SIMD" << std::get(a).maxSubgroupSize << ", {" << pLaunchFuncArgs_val->groupCountX << "," << pLaunchFuncArgs_val->groupCountY << "," - << pLaunchFuncArgs_val->groupCountZ << "}" - << ", {" << groupSizeX << "," << groupSizeY << "," << groupSizeZ << "}"; + << pLaunchFuncArgs_val->groupCountZ << "}" << ", {" << groupSizeX << "," + << groupSizeY << "," << groupSizeZ << "}"; metadata = metadata_s.str(); } data->threadToLastLaunchInfo[{hostname, vpid, vtid}] = { @@ -765,87 +765,294 @@ static void zeEventDestroy_exit_callback(void *btx_handle, void *usr_data, int64 * 
Sampling */ -static void lttng_ust_ze_sampling_gpu_energy_callback(void *btx_handle, void *usr_data, int64_t ts, - const char *hostname, int64_t vpid, - uint64_t vtid, ze_device_handle_t hDevice, - uint32_t domain, uint64_t energy, - uint64_t sampling_ts) { - +std::optional get_device_hash(void *usr_data, const char *hostname, int64_t vpid, + ze_device_handle_t hDevice) { auto *data = static_cast(usr_data); - auto [it, inserted] = data->device_energy_ref.insert( - {{hostname, vpid, hDevice, domain}, {energy, sampling_ts, ts}}); - // First entry - if (inserted) - return; - auto &[prev_energy, prev_sampling_ts, prev_ts] = it->second; - - // Watt conversion - btx_push_message_lttng_power(btx_handle, hostname, 0, 0, prev_ts, BACKEND_ZE, (uint64_t)hDevice, - (thapi_domain_idx)domain, - (energy - prev_energy) / (double)(sampling_ts - prev_sampling_ts)); + const auto it0 = data->sampling_device_property.find({hostname, vpid, hDevice}); + if (it0 != data->sampling_device_property.cend()) { + const auto &[deviceProp, deviceIdx] = it0->second; + + uint64_t hash = 0xcbf29ce484222325; // FNV offset basis + for (int i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; i++) { + hash ^= (uint64_t)deviceProp.core.uuid.id[i]; + hash *= 0x100000001b3; // FNV prime + } - it->second = {energy, sampling_ts, ts}; + return DeviceHash{hash, deviceIdx}; // Return tuple optional + } + return std::nullopt; // Return empty optional if not found } -static void lttng_ust_ze_sampling_gpu_frequency_callback(void *btx_handle, void *usr_data, - int64_t ts, const char *hostname, - int64_t vpid, uint64_t vtid, - ze_device_handle_t hDevice, - uint32_t domain, uint64_t frequency) { +uint64_t calculate_delta(uint64_t current_val, uint64_t prev_val) { + // calculate the delta, considering overflow + if (current_val >= prev_val) { + return current_val - prev_val; + } else { + return current_val + (UINT64_MAX - prev_val) + 1; + } +} - btx_push_message_lttng_frequency(btx_handle, hostname, 0, 0, ts, BACKEND_ZE, 
(uint64_t)hDevice, - domain, frequency); +static void lttng_ust_ze_sampling_fabricPort_callback( + void *btx_handle, void *usr_data, int64_t ts, const char *hostname, int64_t vpid, uint64_t vtid, + ze_device_handle_t hDevice, zes_fabric_port_handle_t hFabricPort, + size_t _pFabricPortState_val_length, zes_fabric_port_state_t *pFabricPortState_val, + size_t _pFabricPortThroughput_val_length, + zes_fabric_port_throughput_t *pFabricPortThroughput_val) { + auto *data = static_cast(usr_data); + const auto it0 = data->fabricPort_property.find({hostname, vpid, hDevice, hFabricPort}); + if (it0 != data->fabricPort_property.cend()) { + // Get fabricPort properties: subdevice ID, fabricId... + auto subDevice = it0->second.subdeviceId; + auto fabricId = it0->second.portId.fabricId; + auto remotePortId = pFabricPortState_val->remotePortId.fabricId; + // Current Speed (bytes/sec) place holder (not used currently in the timeline) + // https://spec.oneapi.io/level-zero/1.9.3/sysman/PROG.html#operations-on-fabric-ports + double rxSpeed = static_cast(pFabricPortState_val->rxSpeed.bitRate * + pFabricPortState_val->rxSpeed.width) / + 8.0; + double txSpeed = static_cast(pFabricPortState_val->txSpeed.bitRate * + pFabricPortState_val->txSpeed.width) / + 8.0; + + // Insert the current throughput data with timestamp + auto [it, inserted] = data->device_fabricPort_ref.insert( + {{hostname, vpid, hDevice, hFabricPort, subDevice}, {*pFabricPortThroughput_val, ts}}); + if (inserted) + return; + + // Previous throughput data + auto &[prev_throughput, prev_ts] = it->second; + + /* Per doc: When taking the delta, the difference between timestamp samples + * could be 0, if the frequency of sampling the snapshots is higher than the + * frequency of the timestamp update. 
*/ + double time_delta = static_cast( + calculate_delta(pFabricPortThroughput_val->timestamp, prev_throughput.timestamp)); + if (time_delta == 0) + return; + + // Calculate the RX and TX throughput + double rxThroughput = static_cast(calculate_delta(pFabricPortThroughput_val->rxCounter, + prev_throughput.rxCounter)) / + time_delta; + double txThroughput = static_cast(calculate_delta(pFabricPortThroughput_val->txCounter, + prev_throughput.txCounter)) / + time_delta; + auto uuid_idx = get_device_hash(usr_data, hostname, vpid, hDevice); + if (uuid_idx) { + auto [hash, deviceIdx] = *uuid_idx; + if (rxThroughput != 0 || txThroughput != 0) + btx_push_message_sampling_fabricPort( + btx_handle, hostname, prev_ts, hash, deviceIdx, (uint64_t)hFabricPort, subDevice, + fabricId, remotePortId, rxThroughput, txThroughput, rxSpeed, txSpeed); + // Update the stored values + it->second = {*pFabricPortThroughput_val, ts}; + } else { + std::cerr << "Device property not found!" << std::endl; + } + } else { + std::cerr << "Fabricport property not found!" << std::endl; + } } -static void lttng_ust_ze_sampling_computeEngine_callback( +static void lttng_ust_ze_sampling_memStats_callback( void *btx_handle, void *usr_data, int64_t ts, const char *hostname, int64_t vpid, uint64_t vtid, - ze_device_handle_t hDevice, uint32_t subDevice, uint64_t activeTime, uint64_t sampling_ts) { + ze_device_handle_t hDevice, zes_mem_handle_t hMemModule, size_t _pMemState_val_length, + zes_mem_state_t *pMemState_val, size_t _pMemBandwidth_val_length, + zes_mem_bandwidth_t *pMemBandwidth_val) { + auto *data = static_cast(usr_data); + const auto it0 = data->memModule_property.find({hostname, vpid, hDevice, hMemModule}); + if (it0 != data->memModule_property.cend()) { + // Get memModule properties: subdevice ID ... 
+ auto subDevice = it0->second.subdeviceId; + // Insert the current bandwidth data with timestamp + auto [it, inserted] = data->device_memModule_ref.insert( + {{hostname, vpid, hDevice, hMemModule, subDevice}, {*pMemBandwidth_val, ts}}); + if (inserted) + return; + + // Previous bandwidth data + auto &[prev_bandwidth, prev_ts] = it->second; + double time_delta = static_cast( + calculate_delta(pMemBandwidth_val->timestamp, prev_bandwidth.timestamp)); + // if no progress on timestamp, return (can happen when sampling frequency is higher than the + // timestamp counter update frequency) + if (time_delta == 0) + return; + + // Calculate the RD and WT bandwidth + // https://spec.oneapi.io/level-zero/1.9.3/sysman/api.html#zes-mem-bandwidth-t + + double allocation = static_cast(pMemState_val->size - pMemState_val->free) * 100.0 / + static_cast(pMemState_val->size); + // percentage bandwidth based on the manual + uint64_t rdBytes_delta = + calculate_delta(pMemBandwidth_val->readCounter, prev_bandwidth.readCounter); + uint64_t wtBytes_delta = + calculate_delta(pMemBandwidth_val->writeCounter, prev_bandwidth.writeCounter); + double pBandwidth = static_cast(1e6 * (rdBytes_delta + wtBytes_delta) / + (time_delta * pMemBandwidth_val->maxBandwidth)); + // rd and wt bandwidth if further drilling needed (place holder for now) + double rdBandwidth = static_cast(rdBytes_delta) * 1e6 / time_delta; + double wtBandwidth = static_cast(wtBytes_delta) * 1e6 / time_delta; + auto uuid_idx = get_device_hash(usr_data, hostname, vpid, hDevice); + if (uuid_idx) { + auto [hash, deviceIdx] = *uuid_idx; + btx_push_message_sampling_memModule(btx_handle, hostname, prev_ts, hash, deviceIdx, + (uint64_t)hMemModule, subDevice, pBandwidth, rdBandwidth, + wtBandwidth, allocation); + // Update the stored values + it->second = {*pMemBandwidth_val, ts}; + } else { + std::cerr << "Device property not found!" << std::endl; + } + } else { + std::cerr << "Memory property not found!" 
<< std::endl; + } +} + +static void lttng_ust_ze_sampling_engineStats_callback(void *btx_handle, void *usr_data, int64_t ts, + const char *hostname, int64_t vpid, + uint64_t vtid, ze_device_handle_t hDevice, + zes_engine_handle_t hEngine, + size_t _pEngineStats_val_length, + zes_engine_stats_t *pEngineStats_val) { + auto *data = static_cast(usr_data); + const auto it0 = data->engine_property.find({hostname, vpid, hDevice, hEngine}); + if (it0 != data->engine_property.cend()) { + const auto &engineProps = it0->second; + uint32_t subDevice = engineProps.subdeviceId; + + if (engineProps.type == ZES_ENGINE_GROUP_COMPUTE_ALL || + engineProps.type == ZES_ENGINE_GROUP_COPY_ALL) { + auto [it, inserted] = data->device_engines_ref.insert( + {{hostname, vpid, hDevice, hEngine, subDevice}, {*pEngineStats_val, ts}}); + if (inserted) + return; + auto &[prev_engineStats, prev_ts] = it->second; + double time_delta = static_cast( + calculate_delta(pEngineStats_val->timestamp, prev_engineStats.timestamp)); + // if no progress on timestamp, return (can happen when sampling frequency is higher than the + // timestamp counter update frequency) + if (time_delta == 0) + return; + double activeTime = static_cast(calculate_delta(pEngineStats_val->activeTime, + prev_engineStats.activeTime)) * + 100 / time_delta; + auto uuid_idx = get_device_hash(usr_data, hostname, vpid, hDevice); + if (uuid_idx) { + auto [hash, deviceIdx] = *uuid_idx; + if (engineProps.type == ZES_ENGINE_GROUP_COMPUTE_ALL) { + btx_push_message_sampling_computeEU(btx_handle, hostname, prev_ts, hash, deviceIdx, + (uint64_t)hEngine, subDevice, activeTime); + } else if (engineProps.type == ZES_ENGINE_GROUP_COPY_ALL) { + btx_push_message_sampling_copyEU(btx_handle, hostname, prev_ts, hash, deviceIdx, + (uint64_t)hEngine, subDevice, activeTime); + } + it->second = {*pEngineStats_val, ts}; + } else { + std::cerr << "Device property not found! 
" << std::endl; + } + } + } else { + std::cerr << "Engine property not found for device: " << hDevice << std::endl; + } +} +static void lttng_ust_ze_sampling_gpu_energy_callback( + void *btx_handle, void *usr_data, int64_t ts, const char *hostname, int64_t vpid, uint64_t vtid, + ze_device_handle_t hDevice, zes_pwr_handle_t hPower, uint32_t domainIdx, + size_t _pEnergyCounter_val_length, zes_power_energy_counter_t *pEnergyCounter_val) { auto *data = static_cast(usr_data); - auto [it, inserted] = data->device_computeEngine_ref.insert( - {{hostname, vpid, hDevice, subDevice}, {activeTime, sampling_ts, ts}}); - // First entry + auto [it, inserted] = data->device_energy_ref.insert( + {{hostname, vpid, hDevice, hPower, domainIdx}, {*pEnergyCounter_val, ts}}); if (inserted) return; - auto &[prev_activeTime, prev_sampling_ts, prev_ts] = it->second; - /* Per doc: When taking the delta, the difference between timestamp samples - * could be 0, if the frequency of sampling the snapshots is higher than the - * frequency of the timestamp update. 
*/ - if (prev_sampling_ts == sampling_ts) + auto &[prev_EnergyCounter, prev_ts] = it->second; + double time_delta = static_cast( + calculate_delta(pEnergyCounter_val->timestamp, prev_EnergyCounter.timestamp)); + // if no progress on timestamp, return (can happen when sampling frequency is higher than the + // timestamp counter update frequency) + if (time_delta == 0) return; - btx_push_message_lttng_computeEU( - btx_handle, hostname, 0, 0, prev_ts, BACKEND_ZE, (uint64_t)hDevice, subDevice, - (activeTime - prev_activeTime) / (double)(sampling_ts - prev_sampling_ts)); + double power = + static_cast(calculate_delta(pEnergyCounter_val->energy, prev_EnergyCounter.energy)) / + time_delta; + auto uuid_idx = get_device_hash(usr_data, hostname, vpid, hDevice); + if (uuid_idx) { + auto [hash, deviceIdx] = *uuid_idx; + btx_push_message_sampling_power(btx_handle, hostname, prev_ts, hash, deviceIdx, + (uint64_t)hPower, (thapi_domain_idx)domainIdx, power); + it->second = {*pEnergyCounter_val, ts}; + } else { + std::cerr << "Device property not found! " << std::endl; + } +} - it->second = {activeTime, sampling_ts, ts}; +static void lttng_ust_ze_sampling_gpu_frequency_callback( + void *btx_handle, void *usr_data, int64_t ts, const char *hostname, int64_t vpid, uint64_t vtid, + ze_device_handle_t hDevice, zes_freq_handle_t hFrequency, uint32_t domainIdx, + size_t _pFreqState_val_length, zes_freq_state_t *pFreqState_val) { + auto uuid_idx = get_device_hash(usr_data, hostname, vpid, hDevice); + if (uuid_idx) { + auto [hash, deviceIdx] = *uuid_idx; + btx_push_message_sampling_frequency(btx_handle, hostname, ts, hash, deviceIdx, + (uint64_t)hFrequency, domainIdx, pFreqState_val->actual); + } else { + std::cerr << "Device property not found! 
" << std::endl; + } } -static void lttng_ust_ze_sampling_copyEngine_callback(void *btx_handle, void *usr_data, int64_t ts, - const char *hostname, int64_t vpid, - uint64_t vtid, ze_device_handle_t hDevice, - uint32_t subDevice, uint64_t activeTime, - uint64_t sampling_ts) { +// Properties +static void lttng_ust_ze_sampling_deviceProperties_callback( + void *btx_handle, void *usr_data, int64_t ts, const char *hostname, int64_t vpid, uint64_t vtid, + zes_device_handle_t hDevice, uint32_t deviceIdx, size_t _pDeviceProperties_val_length, + zes_device_properties_t *pDeviceProperties_val) { auto *data = static_cast(usr_data); - auto [it, inserted] = data->device_copyEngine_ref.insert( - {{hostname, vpid, hDevice, subDevice}, {activeTime, sampling_ts, ts}}); - // First entry - if (inserted) - return; - auto &[prev_activeTime, prev_sampling_ts, prev_ts] = it->second; + data->sampling_device_property[{hostname, vpid, hDevice}] = {*pDeviceProperties_val, deviceIdx}; +} - /* Per doc: When taking the delta, the difference between timestamp samples - * could be 0, if the frequency of sampling the snapshots is higher than the - * frequency of the timestamp update. 
*/ - if (prev_sampling_ts == sampling_ts) - return; +static void lttng_ust_ze_sampling_fabricPortProperties_callback( + void *btx_handle, void *usr_data, int64_t ts, const char *hostname, int64_t vpid, uint64_t vtid, + ze_device_handle_t hDevice, zes_fabric_port_handle_t hFabricPort, + size_t _pFabricPortProperties_val_length, + zes_fabric_port_properties_t *pFabricPortProperties_val) { + auto *data = static_cast(usr_data); + data->fabricPort_property[{hostname, vpid, hDevice, hFabricPort}] = *pFabricPortProperties_val; +} - btx_push_message_lttng_copyEU( - btx_handle, hostname, 0, 0, prev_ts, BACKEND_ZE, (uint64_t)hDevice, subDevice, - (activeTime - prev_activeTime) / (double)(sampling_ts - prev_sampling_ts)); +static void lttng_ust_ze_sampling_memoryProperties_callback( + void *btx_handle, void *usr_data, int64_t ts, const char *hostname, int64_t vpid, uint64_t vtid, + ze_device_handle_t hDevice, zes_mem_handle_t hMemModule, + size_t _pMemModuleProperties_val_length, zes_mem_properties_t *pMemModuleProperties_val) { + auto *data = static_cast(usr_data); + data->memModule_property[{hostname, vpid, hDevice, hMemModule}] = *pMemModuleProperties_val; +} - it->second = {activeTime, sampling_ts, ts}; +static void lttng_ust_ze_sampling_powerProperties_callback( + void *btx_handle, void *usr_data, int64_t ts, const char *hostname, int64_t vpid, uint64_t vtid, + ze_device_handle_t hDevice, zes_pwr_handle_t hPower, size_t _pPowerProperties_val_length, + zes_power_properties_t *pPowerProperties_val) { + auto *data = static_cast(usr_data); + data->power_property[{hostname, vpid, hDevice, hPower}] = *pPowerProperties_val; +} + +static void lttng_ust_ze_sampling_freqProperties_callback( + void *btx_handle, void *usr_data, int64_t ts, const char *hostname, int64_t vpid, uint64_t vtid, + ze_device_handle_t hDevice, zes_freq_handle_t hFrequency, size_t _pfreqProperties_val_length, + zes_freq_properties_t *pFreqProperties_val) { + auto *data = static_cast(usr_data); + 
data->frequency_property[{hostname, vpid, hDevice, hFrequency}] = *pFreqProperties_val; +} + +static void lttng_ust_ze_sampling_engineProperties_callback( + void *btx_handle, void *usr_data, int64_t ts, const char *hostname, int64_t vpid, uint64_t vtid, + ze_device_handle_t hDevice, zes_engine_handle_t hEngine, size_t _pEngineProperties_val_length, + zes_engine_properties_t *pEngineProperties_val) { + auto *data = static_cast(usr_data); + data->engine_property[{hostname, vpid, hDevice, hEngine}] = *pEngineProperties_val; } /* @@ -874,22 +1081,21 @@ void btx_register_usr_callbacks(void *btx_handle) { /* Device and Subdevice property */ btx_register_callbacks_lttng_ust_ze_properties_device(btx_handle, &property_device_callback); btx_register_callbacks_lttng_ust_ze_properties_subdevice(btx_handle, - &property_subdevice_callback); + &property_subdevice_callback); /* Map command list to device and to command queue dist*/ btx_register_callbacks_lttng_ust_ze_zeCommandListCreateImmediate_entry( btx_handle, zeCommandListCreateImmediate_entry_callback); btx_register_callbacks_lttng_ust_ze_zeCommandListCreateImmediate_exit( btx_handle, zeCommandListCreateImmediate_exit_callback); - btx_register_callbacks_lttng_ust_ze_zeCommandListCreate_entry(btx_handle, - zeCommandListCreate_entry_callback); - btx_register_callbacks_lttng_ust_ze_zeCommandListCreate_exit(btx_handle, - zeCommandListCreate_exit_callback); + btx_register_callbacks_lttng_ust_ze_zeCommandListCreate_entry( + btx_handle, zeCommandListCreate_entry_callback); + btx_register_callbacks_lttng_ust_ze_zeCommandListCreate_exit( + btx_handle, zeCommandListCreate_exit_callback); btx_register_callbacks_lttng_ust_ze_zeCommandQueueCreate_entry( btx_handle, zeCommandQueueCreate_entry_callback); - btx_register_callbacks_lttng_ust_ze_zeCommandQueueCreate_exit(btx_handle, - zeCommandQueueCreate_exit_callback); - + btx_register_callbacks_lttng_ust_ze_zeCommandQueueCreate_exit( + btx_handle, zeCommandQueueCreate_exit_callback); 
btx_register_callbacks_lttng_ust_ze_zeCommandQueueExecuteCommandLists_entry( btx_handle, zeCommandQueueExecuteCommandLists_entry_callback); @@ -901,12 +1107,11 @@ void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_lttng_ust_ze_zeKernelSetGroupSize_entry( btx_handle, &zeKernelSetGroupSize_entry_callback); - btx_register_callbacks_lttng_ust_ze_properties_kernel(btx_handle, &property_kernel_callback); /* Drift */ btx_register_callbacks_lttng_ust_ze_properties_device_timer(btx_handle, - &property_device_timer_callback); + &property_device_timer_callback); /* Profiling Command (everything who signal an event on completion) */ @@ -926,32 +1131,48 @@ void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_lttng_ust_ze_zeModuleGetGlobalPointer_exit( btx_handle, &zeModuleGetGlobalPointer_exit_callback); - btx_register_callbacks_lttng_ust_ze_zeModuleDestroy_exit(btx_handle, - &zeModuleDestroy_exit_callback); + btx_register_callbacks_lttng_ust_ze_zeModuleDestroy_exit( + btx_handle, &zeModuleDestroy_exit_callback); /* Handling of event */ - btx_register_callbacks_lttng_ust_ze_profiling_event_profiling(btx_handle, - &event_profiling_callback); + btx_register_callbacks_lttng_ust_ze_profiling_event_profiling( + btx_handle, &event_profiling_callback); btx_register_callbacks_lttng_ust_ze_profiling_event_profiling_results( btx_handle, &event_profiling_result_callback); - - btx_register_callbacks_lttng_ust_ze_zeEventDestroy_entry(btx_handle, - &zeEventDestroy_entry_callback); - btx_register_callbacks_lttng_ust_ze_zeEventDestroy_exit(btx_handle, - &zeEventDestroy_exit_callback); - - btx_register_callbacks_lttng_ust_ze_zeCommandListReset_entry(btx_handle, - &zeCommandListReset_entry_callback); - btx_register_callbacks_lttng_ust_ze_zeCommandListReset_exit(btx_handle, - &zeCommandListReset_exit_callback); + btx_register_callbacks_lttng_ust_ze_zeEventDestroy_entry( + btx_handle, &zeEventDestroy_entry_callback); + 
btx_register_callbacks_lttng_ust_ze_zeEventDestroy_exit( + btx_handle, &zeEventDestroy_exit_callback); + btx_register_callbacks_lttng_ust_ze_zeCommandListReset_entry( + btx_handle, &zeCommandListReset_entry_callback); + btx_register_callbacks_lttng_ust_ze_zeCommandListReset_exit( + btx_handle, &zeCommandListReset_exit_callback); /* Sampling */ + + // Properties + btx_register_callbacks_lttng_ust_ze_sampling_deviceProperties( + btx_handle, &lttng_ust_ze_sampling_deviceProperties_callback); + btx_register_callbacks_lttng_ust_ze_sampling_fabricPortProperties( + btx_handle, &lttng_ust_ze_sampling_fabricPortProperties_callback); + btx_register_callbacks_lttng_ust_ze_sampling_powerProperties( + btx_handle, &lttng_ust_ze_sampling_powerProperties_callback); + btx_register_callbacks_lttng_ust_ze_sampling_engineProperties( + btx_handle, &lttng_ust_ze_sampling_engineProperties_callback); + btx_register_callbacks_lttng_ust_ze_sampling_freqProperties( + btx_handle, &lttng_ust_ze_sampling_freqProperties_callback); + btx_register_callbacks_lttng_ust_ze_sampling_memoryProperties( + btx_handle, &lttng_ust_ze_sampling_memoryProperties_callback); + + // Telemetries + btx_register_callbacks_lttng_ust_ze_sampling_memStats( + btx_handle, &lttng_ust_ze_sampling_memStats_callback); + btx_register_callbacks_lttng_ust_ze_sampling_fabricPort( + btx_handle, &lttng_ust_ze_sampling_fabricPort_callback); btx_register_callbacks_lttng_ust_ze_sampling_gpu_energy( btx_handle, &lttng_ust_ze_sampling_gpu_energy_callback); btx_register_callbacks_lttng_ust_ze_sampling_gpu_frequency( btx_handle, &lttng_ust_ze_sampling_gpu_frequency_callback); - btx_register_callbacks_lttng_ust_ze_sampling_computeEngine( - btx_handle, &lttng_ust_ze_sampling_computeEngine_callback); - btx_register_callbacks_lttng_ust_ze_sampling_copyEngine( - btx_handle, &lttng_ust_ze_sampling_copyEngine_callback); + btx_register_callbacks_lttng_ust_ze_sampling_engineStats( + btx_handle, &lttng_ust_ze_sampling_engineStats_callback); } diff --git 
a/ze/btx_zeinterval_callbacks.hpp b/ze/btx_zeinterval_callbacks.hpp index 8ca4943a..b1c980e0 100644 --- a/ze/btx_zeinterval_callbacks.hpp +++ b/ze/btx_zeinterval_callbacks.hpp @@ -11,6 +11,8 @@ #include #include +#define ZE_MAX_DEVICE_UUID_SIZE 16 + typedef std::tuple hp_event_t; typedef std::tuple hp_kernel_t; typedef std::tuple hp_command_list_t; @@ -20,13 +22,29 @@ typedef std::tuple hp_module_t; typedef std::map memory_interval_t; typedef std::tuple clock_lttng_device_t; -typedef std::tuple energy_timestamp_t; -typedef std::tuple computeEngine_timestamp_t; -typedef std::tuple copyEngine_timestamp_t; +typedef std::tuple memModule_timestamp_t; +typedef std::tuple fabricPort_timestamp_t; +typedef std::tuple energy_timestamp_t; +typedef std::tuple engines_timestamp_t; + +typedef std::tuple deviceProperty_id_t; + +typedef std::tuple hpd_t; typedef std::tuple hpdd_t; typedef std::tuple hpdsd_t; +typedef std::tuple hpdf_t; +typedef std::tuple hpdm_t; +typedef std::tuple hpdpwr_t; +typedef std::tuple hpdfreq_t; +typedef std::tuple hpdeng_t; + +typedef std::tuple hpdmsd_t; +typedef std::tuple hpdfsd_t; +typedef std::tuple hpdesd_t; +typedef std::tuple hpdpwrd_t; + using btx_kernel_group_size_t = std::tuple; using btx_kernel_desct_t = std::tuple; @@ -51,6 +69,8 @@ using btx_event_desct_t = using btx_command_list_desc_t = std::tuple; +using DeviceHash = std::tuple; + struct data_s { /* Host */ EntryState entry_state; @@ -79,8 +99,16 @@ struct data_s { std::unordered_map device_timestamps_pair_ref; /* Sampling */ - std::unordered_map device_energy_ref; - std::unordered_map device_computeEngine_ref; - std::unordered_map device_copyEngine_ref; + std::unordered_map sampling_device_property; + std::unordered_map fabricPort_property; + std::unordered_map memModule_property; + std::unordered_map power_property; + std::unordered_map frequency_property; + std::unordered_map engine_property; + + std::unordered_map device_energy_ref; + std::unordered_map device_engines_ref; + 
std::unordered_map device_fabricPort_ref; + std::unordered_map device_memModule_ref; }; typedef struct data_s data_t; diff --git a/ze/tracer_ze_helpers.include.c b/ze/tracer_ze_helpers.include.c index 164149a9..8a18f161 100644 --- a/ze/tracer_ze_helpers.include.c +++ b/ze/tracer_ze_helpers.include.c @@ -790,54 +790,71 @@ static inline void _dump_memory_info(ze_command_list_handle_t hCommandList, cons } while (0) static int _sampling_freq_initialized = 0; +static int _sampling_fabricPorts_initialized = 0; +static int _sampling_memModules_initialized = 0; static int _sampling_pwr_initialized = 0; static int _sampling_engines_initialized = 0; // Static handles to stay throughout the execution -static zes_driver_handle_t* _sampling_hDrivers = NULL; -static zes_device_handle_t** _sampling_hDevices = NULL; -static zes_freq_handle_t*** _sampling_hFrequencies = NULL; -static zes_pwr_handle_t*** _sampling_hPowers = NULL; -static zes_engine_handle_t*** _sampling_engineHandles = NULL; -static zes_engine_properties_t*** _sampling_engineProps = NULL; +static zes_driver_handle_t *_sampling_hDrivers = NULL; +static zes_device_handle_t **_sampling_hDevices = NULL; +static zes_freq_handle_t ***_sampling_hFrequencies = NULL; +static zes_pwr_handle_t ***_sampling_hPowers = NULL; +static zes_engine_handle_t ***_sampling_engineHandles = NULL; +static zes_fabric_port_handle_t ***_sampling_hFabricPort = NULL; +static zes_mem_handle_t ***_sampling_hMemModule = NULL; static uint32_t _sampling_driverCount = 0; -static uint32_t* _sampling_deviceCount = NULL; -static uint32_t** _sampling_subDeviceCount = NULL; -static uint32_t** _sampling_freqDomainCounts = NULL; -static uint32_t** _sampling_powerDomainCounts = NULL; -static uint32_t** _sampling_engineCounts = NULL; - -typedef struct { - uint64_t timestamp; - uint64_t computeActive; -} computeEngineData; - -typedef struct { - uint64_t timestamp; - uint64_t copyActive; -} copyEngineData; +static uint32_t *_sampling_deviceCount = NULL; +static 
uint32_t **_sampling_freqDomainCounts = NULL; +static uint32_t **_sampling_fabricPortCount = NULL; +static uint32_t **_sampling_memModuleCount = NULL; +static uint32_t **_sampling_powerDomainCounts = NULL; +static uint32_t **_sampling_engineCounts = NULL; static void intializeFrequency() { ze_result_t res; - _sampling_hFrequencies = (zes_freq_handle_t***) calloc(_sampling_driverCount, sizeof(zes_freq_handle_t**)); - _sampling_freqDomainCounts = (uint32_t**) calloc(_sampling_driverCount, sizeof(uint32_t)); + _sampling_hFrequencies = + (zes_freq_handle_t ***)calloc(_sampling_driverCount, sizeof(zes_freq_handle_t **)); + _sampling_freqDomainCounts = (uint32_t **)calloc(_sampling_driverCount, sizeof(uint32_t *)); for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { - _sampling_freqDomainCounts[driverIdx] = (uint32_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); - _sampling_hFrequencies[driverIdx] = (zes_freq_handle_t**) calloc(_sampling_deviceCount[driverIdx], sizeof(zes_freq_handle_t*)); + _sampling_freqDomainCounts[driverIdx] = + (uint32_t *)calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); + _sampling_hFrequencies[driverIdx] = + (zes_freq_handle_t **)calloc(_sampling_deviceCount[driverIdx], sizeof(zes_freq_handle_t *)); for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { // Get frequency domains for each device - res = ZES_DEVICE_ENUM_FREQUENCY_DOMAINS_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_freqDomainCounts[driverIdx][deviceIdx], NULL); + res = ZES_DEVICE_ENUM_FREQUENCY_DOMAINS_PTR(_sampling_hDevices[driverIdx][deviceIdx], + &_sampling_freqDomainCounts[driverIdx][deviceIdx], + NULL); if (res != ZE_RESULT_SUCCESS) { _ZE_ERROR_MSG("1st ZES_DEVICE_ENUM_FREQUENCY_DOMAINS_PTR", res); _sampling_freqDomainCounts[driverIdx][deviceIdx] = 0; continue; } - _sampling_hFrequencies[driverIdx][deviceIdx] = (zes_freq_handle_t*) 
calloc(_sampling_freqDomainCounts[driverIdx][deviceIdx], sizeof(zes_freq_handle_t)); - res = ZES_DEVICE_ENUM_FREQUENCY_DOMAINS_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_freqDomainCounts[driverIdx][deviceIdx], _sampling_hFrequencies[driverIdx][deviceIdx]); + _sampling_hFrequencies[driverIdx][deviceIdx] = (zes_freq_handle_t *)calloc( + _sampling_freqDomainCounts[driverIdx][deviceIdx], sizeof(zes_freq_handle_t)); + res = ZES_DEVICE_ENUM_FREQUENCY_DOMAINS_PTR(_sampling_hDevices[driverIdx][deviceIdx], + &_sampling_freqDomainCounts[driverIdx][deviceIdx], + _sampling_hFrequencies[driverIdx][deviceIdx]); if (res != ZE_RESULT_SUCCESS) { _ZE_ERROR_MSG("2nd ZES_DEVICE_ENUM_FREQUENCY_DOMAINS_PTR", res); _sampling_freqDomainCounts[driverIdx][deviceIdx] = 0; free(_sampling_hFrequencies[driverIdx][deviceIdx]); } + for (uint32_t domainIdx = 0; domainIdx < _sampling_freqDomainCounts[driverIdx][deviceIdx]; + domainIdx++) { + zes_freq_properties_t freqProps = {0}; + freqProps.stype = ZES_STRUCTURE_TYPE_FREQ_PROPERTIES; + res = ZES_FREQUENCY_GET_PROPERTIES_PTR( + _sampling_hFrequencies[driverIdx][deviceIdx][domainIdx], &freqProps); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_FREQUENCY_GET_PROPERTIES_PTR", res); + free(_sampling_hFrequencies[driverIdx][deviceIdx][domainIdx]); + } + do_tracepoint(lttng_ust_ze_sampling, freqProperties, + (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], + (zes_freq_handle_t)_sampling_hFrequencies[driverIdx][deviceIdx][domainIdx], + &freqProps); + } } } _sampling_freq_initialized = 1; @@ -845,26 +862,49 @@ static void intializeFrequency() { static void intializePower() { ze_result_t res; - _sampling_hPowers = (zes_pwr_handle_t***) calloc(_sampling_driverCount, sizeof(zes_pwr_handle_t**)); - _sampling_powerDomainCounts = (uint32_t**) calloc(_sampling_driverCount, sizeof(uint32_t*)); + _sampling_hPowers = + (zes_pwr_handle_t ***)calloc(_sampling_driverCount, sizeof(zes_pwr_handle_t **)); + _sampling_powerDomainCounts = 
(uint32_t **)calloc(_sampling_driverCount, sizeof(uint32_t *)); for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { - _sampling_hPowers[driverIdx] = (zes_pwr_handle_t**) calloc(_sampling_deviceCount[driverIdx], sizeof(zes_pwr_handle_t*)); - _sampling_powerDomainCounts[driverIdx] = (uint32_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); + _sampling_hPowers[driverIdx] = + (zes_pwr_handle_t **)calloc(_sampling_deviceCount[driverIdx], sizeof(zes_pwr_handle_t *)); + _sampling_powerDomainCounts[driverIdx] = + (uint32_t *)calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { // Get power domains for each device - res = ZES_DEVICE_ENUM_POWER_DOMAINS_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_powerDomainCounts[driverIdx][deviceIdx], NULL); + res = ZES_DEVICE_ENUM_POWER_DOMAINS_PTR(_sampling_hDevices[driverIdx][deviceIdx], + &_sampling_powerDomainCounts[driverIdx][deviceIdx], + NULL); if (res != ZE_RESULT_SUCCESS) { _ZE_ERROR_MSG("1st ZES_DEVICE_ENUM_POWER_DOMAINS_PTR", res); _sampling_powerDomainCounts[driverIdx][deviceIdx] = 0; continue; } - _sampling_hPowers[driverIdx][deviceIdx] = (zes_pwr_handle_t*) calloc(_sampling_powerDomainCounts[driverIdx][deviceIdx], sizeof(zes_pwr_handle_t)); - res = ZES_DEVICE_ENUM_POWER_DOMAINS_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_powerDomainCounts[driverIdx][deviceIdx], _sampling_hPowers[driverIdx][deviceIdx]); + _sampling_hPowers[driverIdx][deviceIdx] = (zes_pwr_handle_t *)calloc( + _sampling_powerDomainCounts[driverIdx][deviceIdx], sizeof(zes_pwr_handle_t)); + res = ZES_DEVICE_ENUM_POWER_DOMAINS_PTR(_sampling_hDevices[driverIdx][deviceIdx], + &_sampling_powerDomainCounts[driverIdx][deviceIdx], + _sampling_hPowers[driverIdx][deviceIdx]); if (res != ZE_RESULT_SUCCESS) { _ZE_ERROR_MSG("2nd ZES_DEVICE_ENUM_POWER_DOMAINS_PTR", res); 
_sampling_powerDomainCounts[driverIdx][deviceIdx] = 0; free(_sampling_hPowers[driverIdx][deviceIdx]); } + for (uint32_t domainIdx = 0; domainIdx < _sampling_powerDomainCounts[driverIdx][deviceIdx]; + domainIdx++) { + zes_power_properties_t powerProperties = {0}; + powerProperties.stype = ZES_STRUCTURE_TYPE_POWER_PROPERTIES; + res = ZES_POWER_GET_PROPERTIES_PTR(_sampling_hPowers[driverIdx][deviceIdx][domainIdx], + &powerProperties); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_POWER_GET_PROPERTIES_PTR", res); + free(_sampling_hPowers[driverIdx][deviceIdx][domainIdx]); + } + do_tracepoint(lttng_ust_ze_sampling, powerProperties, + (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], + (zes_pwr_handle_t)_sampling_hPowers[driverIdx][deviceIdx][domainIdx], + &powerProperties); + } } } _sampling_pwr_initialized = 1; @@ -872,41 +912,153 @@ static void intializePower() { static void intializeEngines() { ze_result_t res; - _sampling_engineProps = (zes_engine_properties_t***) calloc(_sampling_driverCount, sizeof(zes_engine_properties_t**)); - _sampling_engineHandles = (zes_engine_handle_t***) calloc(_sampling_driverCount, sizeof(zes_engine_handle_t**)); - _sampling_engineCounts = (uint32_t**) calloc(_sampling_driverCount, sizeof(uint32_t*)); + _sampling_engineHandles = + (zes_engine_handle_t ***)calloc(_sampling_driverCount, sizeof(zes_engine_handle_t **)); + _sampling_engineCounts = (uint32_t **)calloc(_sampling_driverCount, sizeof(uint32_t *)); for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { - _sampling_engineProps[driverIdx] = (zes_engine_properties_t**) calloc(_sampling_deviceCount[driverIdx], sizeof(zes_engine_properties_t*)); - _sampling_engineHandles[driverIdx] = (zes_engine_handle_t**) calloc(_sampling_deviceCount[driverIdx], sizeof(zes_engine_handle_t*)); - _sampling_engineCounts[driverIdx] = (uint32_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); + _sampling_engineHandles[driverIdx] = 
(zes_engine_handle_t **)calloc( + _sampling_deviceCount[driverIdx], sizeof(zes_engine_handle_t *)); + _sampling_engineCounts[driverIdx] = + (uint32_t *)calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { // Get engine counts for each device - res = ZES_DEVICE_ENUM_ENGINE_GROUPS_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_engineCounts[driverIdx][deviceIdx], NULL); + res = ZES_DEVICE_ENUM_ENGINE_GROUPS_PTR(_sampling_hDevices[driverIdx][deviceIdx], + &_sampling_engineCounts[driverIdx][deviceIdx], NULL); if (res != ZE_RESULT_SUCCESS || _sampling_engineCounts[driverIdx][deviceIdx] == 0) { _ZE_ERROR_MSG("1st ZES_DEVICE_ENUM_ENGINE_GROUPS_PTR", res); _sampling_engineCounts[driverIdx][deviceIdx] = 0; continue; } - _sampling_engineHandles[driverIdx][deviceIdx] = (zes_engine_handle_t*) calloc(_sampling_engineCounts[driverIdx][deviceIdx], sizeof(zes_engine_handle_t)); - res = ZES_DEVICE_ENUM_ENGINE_GROUPS_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_engineCounts[driverIdx][deviceIdx], _sampling_engineHandles[driverIdx][deviceIdx]); + _sampling_engineHandles[driverIdx][deviceIdx] = (zes_engine_handle_t *)calloc( + _sampling_engineCounts[driverIdx][deviceIdx], sizeof(zes_engine_handle_t)); + res = ZES_DEVICE_ENUM_ENGINE_GROUPS_PTR(_sampling_hDevices[driverIdx][deviceIdx], + &_sampling_engineCounts[driverIdx][deviceIdx], + _sampling_engineHandles[driverIdx][deviceIdx]); if (res != ZE_RESULT_SUCCESS) { _ZE_ERROR_MSG("2nd ZES_DEVICE_ENUM_ENGINE_GROUPS_PTR", res); _sampling_engineCounts[driverIdx][deviceIdx] = 0; free(_sampling_engineHandles[driverIdx][deviceIdx]); } - _sampling_engineProps[driverIdx][deviceIdx] = (zes_engine_properties_t*) calloc(_sampling_engineCounts[driverIdx][deviceIdx], sizeof(zes_engine_properties_t)); - for (uint32_t engineIdx = 0; engineIdx < _sampling_engineCounts[driverIdx][deviceIdx]; ++engineIdx) { - 
_sampling_engineProps[driverIdx][deviceIdx][engineIdx].stype = ZES_STRUCTURE_TYPE_ENGINE_PROPERTIES; - res = ZES_ENGINE_GET_PROPERTIES_PTR(_sampling_engineHandles[driverIdx][deviceIdx][engineIdx], &_sampling_engineProps[driverIdx][deviceIdx][engineIdx]); + for (uint32_t engineIdx = 0; engineIdx < _sampling_engineCounts[driverIdx][deviceIdx]; + ++engineIdx) { + zes_engine_properties_t engineProps = {0}; + engineProps.stype = ZES_STRUCTURE_TYPE_ENGINE_PROPERTIES; + res = ZES_ENGINE_GET_PROPERTIES_PTR( + _sampling_engineHandles[driverIdx][deviceIdx][engineIdx], &engineProps); if (res != ZE_RESULT_SUCCESS) { _ZE_ERROR_MSG("ZES_ENGINE_GET_PROPERTIES_PTR", res); } + do_tracepoint(lttng_ust_ze_sampling, engineProperties, + (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], + (zes_engine_handle_t)_sampling_engineHandles[driverIdx][deviceIdx][engineIdx], + &engineProps); } } } _sampling_engines_initialized = 1; } +static void intializeFabricPorts() { + ze_result_t res; + _sampling_hFabricPort = (zes_fabric_port_handle_t ***)calloc(_sampling_driverCount, + sizeof(zes_fabric_port_handle_t **)); + _sampling_fabricPortCount = (uint32_t **)calloc(_sampling_driverCount, sizeof(uint32_t *)); + for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { + _sampling_fabricPortCount[driverIdx] = + (uint32_t *)calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); + _sampling_hFabricPort[driverIdx] = (zes_fabric_port_handle_t **)calloc( + _sampling_deviceCount[driverIdx], sizeof(zes_fabric_port_handle_t *)); + for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { + // Get fabric ports for each device + res = + ZES_DEVICE_ENUM_FABRIC_PORTS_PTR(_sampling_hDevices[driverIdx][deviceIdx], + &_sampling_fabricPortCount[driverIdx][deviceIdx], NULL); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("1st ZES_DEVICE_ENUM_FABRIC_PORTS_PTR", res); + _sampling_fabricPortCount[driverIdx][deviceIdx] = 0; + continue; + } + 
_sampling_hFabricPort[driverIdx][deviceIdx] = (zes_fabric_port_handle_t *)calloc( + _sampling_fabricPortCount[driverIdx][deviceIdx], sizeof(zes_fabric_port_handle_t)); + res = ZES_DEVICE_ENUM_FABRIC_PORTS_PTR(_sampling_hDevices[driverIdx][deviceIdx], + &_sampling_fabricPortCount[driverIdx][deviceIdx], + _sampling_hFabricPort[driverIdx][deviceIdx]); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("2nd ZES_DEVICE_ENUM_FABRIC_PORTS_PTR", res); + _sampling_fabricPortCount[driverIdx][deviceIdx] = 0; + free(_sampling_hFabricPort[driverIdx][deviceIdx]); + } + for (uint32_t fabricPortIdx = 0; + fabricPortIdx < _sampling_fabricPortCount[driverIdx][deviceIdx]; ++fabricPortIdx) { + + zes_fabric_port_properties_t fabricPortProps = {0}; + res = ZES_FABRIC_PORT_GET_PROPERTIES_PTR( + _sampling_hFabricPort[driverIdx][deviceIdx][fabricPortIdx], &fabricPortProps); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_FABRIC_PORT_GET_PROPERTIES_PTR", res); + } + // Dump fabricPortProperties once + do_tracepoint( + lttng_ust_ze_sampling, fabricPortProperties, + (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], + (zes_fabric_port_handle_t)_sampling_hFabricPort[driverIdx][deviceIdx][fabricPortIdx], + &fabricPortProps); + } + } + } + _sampling_fabricPorts_initialized = 1; +} + +static void intializeMemModules() { + ze_result_t res; + _sampling_hMemModule = + (zes_mem_handle_t ***)calloc(_sampling_driverCount, sizeof(zes_mem_handle_t **)); + _sampling_memModuleCount = (uint32_t **)calloc(_sampling_driverCount, sizeof(uint32_t *)); + for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { + _sampling_memModuleCount[driverIdx] = + (uint32_t *)calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); + _sampling_hMemModule[driverIdx] = + (zes_mem_handle_t **)calloc(_sampling_deviceCount[driverIdx], sizeof(zes_mem_handle_t *)); + for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { + // Get memory modules for each 
device + res = + ZES_DEVICE_ENUM_MEMORY_MODULES_PTR(_sampling_hDevices[driverIdx][deviceIdx], + &_sampling_memModuleCount[driverIdx][deviceIdx], NULL); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("1st ZES_DEVICE_ENUM_MEMORY_MODULES_PTR", res); + _sampling_memModuleCount[driverIdx][deviceIdx] = 0; + continue; + } + _sampling_hMemModule[driverIdx][deviceIdx] = (zes_mem_handle_t *)calloc( + _sampling_memModuleCount[driverIdx][deviceIdx], sizeof(zes_mem_handle_t)); + res = ZES_DEVICE_ENUM_MEMORY_MODULES_PTR(_sampling_hDevices[driverIdx][deviceIdx], + &_sampling_memModuleCount[driverIdx][deviceIdx], + _sampling_hMemModule[driverIdx][deviceIdx]); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("2nd ZES_DEVICE_ENUM_MEMORY_MODULES_PTR", res); + _sampling_memModuleCount[driverIdx][deviceIdx] = 0; + free(_sampling_hMemModule[driverIdx][deviceIdx]); + } + for (uint32_t memModuleIdx = 0; memModuleIdx < _sampling_memModuleCount[driverIdx][deviceIdx]; + ++memModuleIdx) { + zes_mem_properties_t memProps = {0}; + memProps.stype = ZES_STRUCTURE_TYPE_MEM_PROPERTIES; + res = ZES_MEMORY_GET_PROPERTIES_PTR( + _sampling_hMemModule[driverIdx][deviceIdx][memModuleIdx], &memProps); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_MEMORY_GET_PROPERTIES_PTR", res); + } + // Dump fabricPortProperties once + do_tracepoint(lttng_ust_ze_sampling, memoryProperties, + (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], + (zes_mem_handle_t)_sampling_hMemModule[driverIdx][deviceIdx][memModuleIdx], + &memProps); + } + } + } + _sampling_memModules_initialized = 1; +} + static int initializeHandles() { ze_result_t res; res = ZES_INIT_PTR(0); @@ -922,167 +1074,188 @@ static int initializeHandles() { _ZE_ERROR_MSG("1st ZES_DRIVER_GET_PTR", res); return -1; } - _sampling_hDrivers = (zes_driver_handle_t*) calloc(_sampling_driverCount, sizeof(zes_driver_handle_t)); + _sampling_hDrivers = + (zes_driver_handle_t *)calloc(_sampling_driverCount, sizeof(zes_driver_handle_t)); res = 
ZES_DRIVER_GET_PTR(&_sampling_driverCount, _sampling_hDrivers); if (res != ZE_RESULT_SUCCESS) { _ZE_ERROR_MSG("2nd ZES_DRIVER_GET_PTR", res); return -1; } - _sampling_deviceCount = (uint32_t*) calloc(_sampling_driverCount, sizeof(uint32_t)); - _sampling_subDeviceCount = (uint32_t**) calloc(_sampling_driverCount, sizeof(uint32_t*)); - _sampling_hDevices = (zes_device_handle_t**) calloc(_sampling_driverCount, sizeof(zes_device_handle_t*)); - // Query device count + _sampling_deviceCount = (uint32_t *)calloc(_sampling_driverCount, sizeof(uint32_t)); + _sampling_hDevices = + (zes_device_handle_t **)calloc(_sampling_driverCount, sizeof(zes_device_handle_t *)); for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { - res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL); + res = + ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL); if (res != ZE_RESULT_SUCCESS || _sampling_deviceCount[driverIdx] == 0) { fprintf(stderr, "ERROR: No device found!\n"); _ZE_ERROR_MSG("1st ZES_DEVICE_GET_PTR", res); return -1; } - _sampling_hDevices[driverIdx] = (zes_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(zes_device_handle_t)); - res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]); + _sampling_hDevices[driverIdx] = (zes_device_handle_t *)calloc(_sampling_deviceCount[driverIdx], + sizeof(zes_device_handle_t)); + res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], + _sampling_hDevices[driverIdx]); if (res != ZE_RESULT_SUCCESS) { _ZE_ERROR_MSG("2nd ZES_DEVICE_GET_PTR", res); free(_sampling_hDevices[driverIdx]); return -1; } - //Get no sub-devices - _sampling_subDeviceCount[driverIdx] = (uint32_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { - 
zes_device_properties_t deviceProperties = {0}; - deviceProperties.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES; - res = ZES_DEVICE_GET_PROPERTIES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &deviceProperties); + + zes_device_properties_t deviceProps = {0}; + deviceProps.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES; + deviceProps.pNext = NULL; + res = ZES_DEVICE_GET_PROPERTIES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &deviceProps); if (res != ZE_RESULT_SUCCESS) { _ZE_ERROR_MSG("ZES_DEVICE_GET_PROPERTIES_PTR", res); - _sampling_subDeviceCount[driverIdx][deviceIdx] = 0; - } else - _sampling_subDeviceCount[driverIdx][deviceIdx] = deviceProperties.numSubdevices; - if (_sampling_subDeviceCount[driverIdx][deviceIdx] == 0) { - _sampling_subDeviceCount[driverIdx][deviceIdx] = 1; } + do_tracepoint(lttng_ust_ze_sampling, deviceProperties, + (zes_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], deviceIdx, + &deviceProps); } } intializeFrequency(); intializePower(); intializeEngines(); + intializeFabricPorts(); + intializeMemModules(); return 0; } -static void readFrequency(uint32_t driverIdx, uint32_t deviceIdx, uint32_t domainIdx, uint32_t *frequency) { - if (!_sampling_freq_initialized) return; - ze_result_t result; - *frequency=0; - zes_freq_state_t freqState; - result = ZES_FREQUENCY_GET_STATE_PTR(_sampling_hFrequencies[driverIdx][deviceIdx][domainIdx], &freqState); - if (result != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("ZES_FREQUENCY_GET_STATE_PTR", result); +static void readFrequency_dump(uint32_t driverIdx, uint32_t deviceIdx) { + if (!_sampling_freq_initialized) return; + ze_result_t result; + for (uint32_t domainIdx = 0; domainIdx < _sampling_freqDomainCounts[driverIdx][deviceIdx]; + domainIdx++) { + zes_freq_state_t freqState = {0}; + result = ZES_FREQUENCY_GET_STATE_PTR(_sampling_hFrequencies[driverIdx][deviceIdx][domainIdx], + &freqState); + if (result != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_FREQUENCY_GET_STATE_PTR", result); + continue; + } + 
do_tracepoint(lttng_ust_ze_sampling, gpu_frequency, + (zes_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], + (zes_freq_handle_t)_sampling_hFrequencies[driverIdx][deviceIdx][domainIdx], + domainIdx, &freqState); } - *frequency = freqState.actual; } -static void readEnergy(uint32_t driverIdx, uint32_t deviceIdx, uint32_t domainIdx, uint64_t *ts_us, uint64_t *energy_uj) { - if (!_sampling_pwr_initialized) return; - ze_result_t result; - *ts_us = 0; - *energy_uj = 0; - zes_power_energy_counter_t energyCounter; - result = ZES_POWER_GET_ENERGY_COUNTER_PTR(_sampling_hPowers[driverIdx][deviceIdx][domainIdx], &energyCounter); - if (result != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("ZES_POWER_GET_ENERGY_COUNTER_PTR", result); +static void readFabricPorts_dump(uint32_t driverIdx, uint32_t deviceIdx) { + if (!_sampling_fabricPorts_initialized) return; + ze_result_t result; + for (uint32_t portIdx = 0; portIdx < _sampling_fabricPortCount[driverIdx][deviceIdx]; portIdx++) { + zes_fabric_port_state_t portState = {0}; + portState.pNext = NULL; + portState.stype = ZES_STRUCTURE_TYPE_FABRIC_PORT_STATE; + result = ZES_FABRIC_PORT_GET_STATE_PTR(_sampling_hFabricPort[driverIdx][deviceIdx][portIdx], + &portState); + if (result != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_FABRIC_PORT_GET_STATE_PTR", result); + continue; + } + zes_fabric_port_throughput_t throughput = {0}; + result = ZES_FABRIC_PORT_GET_THROUGHPUT_PTR( + _sampling_hFabricPort[driverIdx][deviceIdx][portIdx], &throughput); + if (result != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_FABRIC_PORT_GET_THROUGHPUT_PTR", result); + continue; + } + do_tracepoint(lttng_ust_ze_sampling, fabricPort, + (zes_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], + (zes_fabric_port_handle_t)_sampling_hFabricPort[driverIdx][deviceIdx][portIdx], + &portState, &throughput); } - *ts_us = energyCounter.timestamp; - *energy_uj = energyCounter.energy; } -static void readComputeE(uint32_t driverIdx, uint32_t deviceIdx, computeEngineData 
*computeData ){ - if (!_sampling_engines_initialized) return; +static void readMemModules_dump(uint32_t driverIdx, uint32_t deviceIdx) { + if (!_sampling_memModules_initialized) + return; ze_result_t result; - for (uint32_t subDevIdx = 0; subDevIdx < _sampling_subDeviceCount[driverIdx][deviceIdx]; subDevIdx++) { - computeData[subDevIdx].computeActive = 0; - computeData[subDevIdx].timestamp = 0; - } - for (uint32_t engineIdx = 0; engineIdx < _sampling_engineCounts[driverIdx][deviceIdx]; ++engineIdx) { - if (_sampling_engineProps[driverIdx][deviceIdx][engineIdx].type == ZES_ENGINE_GROUP_COMPUTE_ALL){ - zes_engine_stats_t engineStats = {0}; - result = ZES_ENGINE_GET_ACTIVITY_PTR(_sampling_engineHandles[driverIdx][deviceIdx][engineIdx], &engineStats); - if (result != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("ZES_ENGINE_GET_ACTIVITY_PTR", result); - continue; - } - if (_sampling_engineProps[driverIdx][deviceIdx][engineIdx].onSubdevice) { - computeData[_sampling_engineProps[driverIdx][deviceIdx][engineIdx].subdeviceId].computeActive = engineStats.activeTime; - computeData[_sampling_engineProps[driverIdx][deviceIdx][engineIdx].subdeviceId].timestamp = engineStats.timestamp; - } else { - computeData[0].computeActive = engineStats.activeTime; - computeData[0].timestamp = engineStats.timestamp; - } + for (uint32_t memModuleIdx = 0; memModuleIdx < _sampling_memModuleCount[driverIdx][deviceIdx]; + ++memModuleIdx) { + zes_mem_state_t memState = {0}; + memState.stype = ZES_STRUCTURE_TYPE_MEM_STATE; + zes_mem_bandwidth_t memBandwidth = {0}; + result = ZES_MEMORY_GET_STATE_PTR(_sampling_hMemModule[driverIdx][deviceIdx][memModuleIdx], + &memState); + if (result != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_MEMORY_GET_STATE_PTR", result); + continue; + } + result = ZES_MEMORY_GET_BANDWIDTH_PTR(_sampling_hMemModule[driverIdx][deviceIdx][memModuleIdx], + &memBandwidth); + if (result != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_MEMORY_GET_BANDWIDTH_PTR", result); + continue; } + 
do_tracepoint(lttng_ust_ze_sampling, memStats, + (zes_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], + (zes_mem_handle_t)_sampling_hMemModule[driverIdx][deviceIdx][memModuleIdx], + &memState, &memBandwidth); } } -static void readCopyE(uint32_t driverIdx, uint32_t deviceIdx, copyEngineData *copyData ){ - if (!_sampling_engines_initialized) return; +static void readEnergy_dump(uint32_t driverIdx, uint32_t deviceIdx) { + if (!_sampling_pwr_initialized) + return; ze_result_t result; - for (uint32_t subDevIdx = 0; subDevIdx < _sampling_subDeviceCount[driverIdx][deviceIdx]; subDevIdx++) { - copyData[subDevIdx].copyActive = 0; - copyData[subDevIdx].timestamp = 0; + for (uint32_t domainIdx = 0; domainIdx < _sampling_powerDomainCounts[driverIdx][deviceIdx]; + domainIdx++) { + zes_power_energy_counter_t energyCounter = {0}; + result = ZES_POWER_GET_ENERGY_COUNTER_PTR(_sampling_hPowers[driverIdx][deviceIdx][domainIdx], + &energyCounter); + if (result != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_POWER_GET_ENERGY_COUNTER_PTR", result); + continue; + } + do_tracepoint(lttng_ust_ze_sampling, gpu_energy, + (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], + (zes_pwr_handle_t)_sampling_hPowers[driverIdx][deviceIdx][domainIdx], domainIdx, + &energyCounter); } - for (uint32_t engineIdx = 0; engineIdx < _sampling_engineCounts[driverIdx][deviceIdx]; ++engineIdx) { - if (_sampling_engineProps[driverIdx][deviceIdx][engineIdx].type == ZES_ENGINE_GROUP_COPY_ALL){ - zes_engine_stats_t engineStats = {0}; - result = ZES_ENGINE_GET_ACTIVITY_PTR(_sampling_engineHandles[driverIdx][deviceIdx][engineIdx], &engineStats); - if (result != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("ZES_ENGINE_GET_ACTIVITY_PTR", result); - continue; - } - if (_sampling_engineProps[driverIdx][deviceIdx][engineIdx].onSubdevice) { - copyData[_sampling_engineProps[driverIdx][deviceIdx][engineIdx].subdeviceId].copyActive = engineStats.activeTime; - 
copyData[_sampling_engineProps[driverIdx][deviceIdx][engineIdx].subdeviceId].timestamp = engineStats.timestamp; - } else { - copyData[0].copyActive = engineStats.activeTime; - copyData[0].timestamp = engineStats.timestamp; - } +} + +static void readEngines_dump(uint32_t driverIdx, uint32_t deviceIdx) { + if (!_sampling_engines_initialized) + return; + ze_result_t result; + for (uint32_t engineIdx = 0; engineIdx < _sampling_engineCounts[driverIdx][deviceIdx]; + ++engineIdx) { + zes_engine_stats_t engineStats = {0}; + result = ZES_ENGINE_GET_ACTIVITY_PTR(_sampling_engineHandles[driverIdx][deviceIdx][engineIdx], + &engineStats); + if (result != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_ENGINE_GET_ACTIVITY_PTR", result); + continue; } + do_tracepoint(lttng_ust_ze_sampling, engineStats, + (zes_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], + (zes_engine_handle_t)_sampling_engineHandles[driverIdx][deviceIdx][engineIdx], + &engineStats); } } static void thapi_sampling_energy() { - uint64_t ts_us = 0; - uint64_t energy_uj = 0; - uint32_t frequency = 0; for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { - if (tracepoint_enabled(lttng_ust_ze_sampling, gpu_frequency)){ - for (uint32_t domainIdx = 0; domainIdx < _sampling_freqDomainCounts[driverIdx][deviceIdx]; domainIdx++) { - readFrequency(driverIdx, deviceIdx, domainIdx, &frequency); - do_tracepoint(lttng_ust_ze_sampling, gpu_frequency, (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], domainIdx, frequency); - } + if (tracepoint_enabled(lttng_ust_ze_sampling, gpu_frequency)) { + readFrequency_dump(driverIdx, deviceIdx); } - if (tracepoint_enabled(lttng_ust_ze_sampling, gpu_energy)){ - for (uint32_t domainIdx = 0; domainIdx < _sampling_powerDomainCounts[driverIdx][deviceIdx]; domainIdx++) { - readEnergy(driverIdx, deviceIdx, domainIdx, &ts_us, &energy_uj); - 
do_tracepoint(lttng_ust_ze_sampling, gpu_energy, (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], domainIdx, (uint64_t)energy_uj, ts_us); - } + if (tracepoint_enabled(lttng_ust_ze_sampling, gpu_energy)) { + readEnergy_dump(driverIdx, deviceIdx); } - if (tracepoint_enabled(lttng_ust_ze_sampling, computeEngine)){ - if (_sampling_subDeviceCount[driverIdx][deviceIdx] != 0 ) { - computeEngineData computeE[_sampling_subDeviceCount[driverIdx][deviceIdx]]; - readComputeE(driverIdx, deviceIdx, computeE); - for (uint32_t subDevIdx = 0; subDevIdx < _sampling_subDeviceCount[driverIdx][deviceIdx]; subDevIdx++){ - do_tracepoint(lttng_ust_ze_sampling, computeEngine, (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], subDevIdx, computeE[subDevIdx].computeActive, computeE[subDevIdx].timestamp); - } - } + if (tracepoint_enabled(lttng_ust_ze_sampling, engineStats)) { + readEngines_dump(driverIdx, deviceIdx); } - if (tracepoint_enabled(lttng_ust_ze_sampling, copyEngine)){ - if (_sampling_subDeviceCount[driverIdx][deviceIdx] != 0 ) { - copyEngineData copyE[_sampling_subDeviceCount[driverIdx][deviceIdx]]; - readCopyE(driverIdx, deviceIdx, copyE); - for (uint32_t subDevIdx = 0; subDevIdx < _sampling_subDeviceCount[driverIdx][deviceIdx]; subDevIdx++){ - do_tracepoint(lttng_ust_ze_sampling, copyEngine, (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], subDevIdx, copyE[subDevIdx].copyActive, copyE[subDevIdx].timestamp); - } - } + if (tracepoint_enabled(lttng_ust_ze_sampling, fabricPort)) { + readFabricPorts_dump(driverIdx, deviceIdx); + } + if (tracepoint_enabled(lttng_ust_ze_sampling, memStats)) { + readMemModules_dump(driverIdx, deviceIdx); } } } diff --git a/ze/ze_events.yaml b/ze/ze_events.yaml index c8e58263..6158d79a 100644 --- a/ze/ze_events.yaml +++ b/ze/ze_events.yaml @@ -1,48 +1,113 @@ --- lttng_ust_ze_sampling: events: - - name: copyEngine + - name: deviceProperties args: - - [ ze_device_handle_t, hDevice ] - - [ uint32_t, subDevice ] - - [ 
uint64_t, activeTime ] - - [ uint64_t, timestamp ] + - [ zes_device_handle_t, hDevice ] + - [ uint32_t, deviceIdx ] + - [ zes_device_properties_t *, pDeviceProperties ] fields: - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] - - [ ctf_integer, uint32_t, subDevice, "subDevice" ] - - [ ctf_integer, uint64_t, activeTime, "activeTime" ] - - [ ctf_integer, uint64_t, timestamp, "timestamp" ] - - name: computeEngine + - [ ctf_integer, uint32_t, deviceIdx, "deviceIdx" ] + - [ ctf_sequence_text, uint8_t, pDeviceProperties_val, pDeviceProperties, size_t, "sizeof(ze_device_properties_t)" ] + - name: engineProperties args: - - [ ze_device_handle_t, hDevice ] - - [ uint32_t, subDevice ] - - [ uint64_t, activeTime ] - - [ uint64_t, timestamp ] + - [ zes_device_handle_t, hDevice ] + - [ zes_engine_handle_t, hEngines ] + - [ zes_engine_properties_t *, pEngineProperties ] fields: - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] - - [ ctf_integer, uint32_t, subDevice, "subDevice" ] - - [ ctf_integer, uint64_t, activeTime, "activeTime" ] - - [ ctf_integer, uint64_t, timestamp, "timestamp" ] + - [ ctf_integer_hex, uintptr_t, hEngines, "(uintptr_t)hEngines" ] + - [ ctf_sequence_text, uint8_t, pEngineProperties_val, pEngineProperties, size_t, "sizeof(zes_engine_properties_t)" ] + - name: engineStats + args: + - [ zes_device_handle_t, hDevice ] + - [ zes_engine_handle_t, hEngines ] + - [ zes_engine_stats_t *, pEngineStats ] + fields: + - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] + - [ ctf_integer_hex, uintptr_t, hEngines, "(uintptr_t)hEngines" ] + - [ ctf_sequence_text, uint8_t, pEngineStats_val, pEngineStats, size_t, "sizeof(zes_engine_stats_t)" ] + - name: powerProperties + args: + - [ zes_device_handle_t, hDevice ] + - [ zes_pwr_handle_t, hPowers ] + - [ zes_power_properties_t *, pPowerProperties ] + fields: + - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] + - [ ctf_integer_hex, uintptr_t, hPowers, 
"(uintptr_t)hPowers" ] + - [ ctf_sequence_text, uint8_t, pPowerProperties_val, pPowerProperties, size_t, "sizeof(zes_power_properties_t)" ] - name: gpu_energy args: - - [ ze_device_handle_t, hDevice ] - - [ uint32_t, domain ] - - [ uint64_t, energy ] - - [ uint64_t, timestamp ] + - [ zes_device_handle_t, hDevice ] + - [ zes_pwr_handle_t, hPowers ] + - [ uint32_t, domainIdx ] + - [ zes_power_energy_counter_t *, pEnergyCounter ] + fields: + - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] + - [ ctf_integer_hex, uintptr_t, hPowers, "(uintptr_t)hPowers" ] + - [ ctf_integer, uint32_t, domainIdx, "domainIdx" ] + - [ ctf_sequence_text, uint8_t, pEnergyCounter_val, pEnergyCounter, size_t, "sizeof(zes_power_energy_counter_t)" ] + - name: freqProperties + args: + - [ zes_device_handle_t, hDevice ] + - [ zes_freq_handle_t, hFrequency ] + - [ zes_freq_properties_t *, pFreqProperties ] fields: - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] - - [ ctf_integer, uint32_t, domain, "domain" ] - - [ ctf_integer, uint64_t, energy, "energy" ] - - [ ctf_integer, uint64_t, timestamp, "timestamp" ] + - [ ctf_integer_hex, uintptr_t, hFrequency, "(uintptr_t)hFrequency" ] + - [ ctf_sequence_text, uint8_t, pFreqProperties_val, pFreqProperties, size_t, "sizeof(zes_freq_properties_t)" ] - name: gpu_frequency args: - - [ ze_device_handle_t, hDevice ] - - [ uint32_t, domain ] - - [ uint64_t, frequency ] + - [ zes_device_handle_t, hDevice ] + - [ zes_freq_handle_t, hFrequency ] + - [ uint32_t, domainIdx ] + - [ zes_freq_state_t *, pFreqState ] + fields: + - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] + - [ ctf_integer_hex, uintptr_t, hFrequency, "(uintptr_t)hFrequency" ] + - [ ctf_integer, uint32_t, domainIdx, "domainIdx" ] + - [ ctf_sequence_text, uint8_t, pFreqState_val, pFreqState, size_t, "sizeof(zes_freq_state_t)" ] + - name: fabricPortProperties + args: + - [ zes_device_handle_t, hDevice ] + - [ zes_fabric_port_handle_t, hFabricPort ] + - [ 
zes_fabric_port_properties_t *, pFabricPortProperties ] + fields: + - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] + - [ ctf_integer_hex, uintptr_t, hFabricPort, "(uintptr_t)hFabricPort" ] + - [ ctf_sequence_text, uint8_t, pFabricPortProperties_val, pFabricPortProperties, size_t, "sizeof(zes_fabric_port_properties_t)" ] + - name: fabricPort + args: + - [ zes_device_handle_t, hDevice ] + - [ zes_fabric_port_handle_t, hFabricPort ] + - [ zes_fabric_port_state_t *, pFabricPortState ] + - [ zes_fabric_port_throughput_t *, pFabricPortThroughput ] + fields: + - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] + - [ ctf_integer_hex, uintptr_t, hFabricPort, "(uintptr_t)hFabricPort" ] + - [ ctf_sequence_text, uint8_t, pFabricPortState_val, pFabricPortState, size_t, "sizeof(zes_fabric_port_state_t)" ] + - [ ctf_sequence_text, uint8_t, pFabricPortThroughput_val, pFabricPortThroughput, size_t, "sizeof(zes_fabric_port_throughput_t)" ] + - name: memoryProperties + args: + - [ zes_device_handle_t, hDevice ] + - [ zes_mem_handle_t, hMemModule ] + - [ zes_mem_properties_t *, pMemModuleProperties ] + fields: + - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] + - [ ctf_integer_hex, uintptr_t, hMemModule, "(uintptr_t)hMemModule" ] + - [ ctf_sequence_text, uint8_t, pMemModuleProperties_val, pMemModuleProperties, size_t, "sizeof(zes_mem_properties_t)" ] + - name: memStats + args: + - [ zes_device_handle_t, hDevice ] + - [ zes_mem_handle_t, hMemModule ] + - [ zes_mem_state_t *, pMemState ] + - [ zes_mem_bandwidth_t *, pMemBandwidth ] fields: - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] - - [ ctf_integer, uint32_t, domain, "domain" ] - - [ ctf_integer, uint64_t, frequency, "frequency" ] + - [ ctf_integer_hex, uintptr_t, hMemModule, "(uintptr_t)hMemModule" ] + - [ ctf_sequence_text, uint8_t, pMemState_val, pMemState, size_t, "sizeof(zes_mem_state_t)" ] + - [ ctf_sequence_text, uint8_t, pMemBandwidth_val, pMemBandwidth, size_t, 
"sizeof(zes_mem_bandwidth_t)" ] lttng_ust_ze_profiling: events: - name: event_profiling