diff --git a/utils/xprof_utils.cpp b/utils/xprof_utils.cpp index 5527e3eb..35f5229c 100644 --- a/utils/xprof_utils.cpp +++ b/utils/xprof_utils.cpp @@ -118,6 +118,102 @@ bt_message* create_frequency_message(const char* hostname, const process_id_t pr return message; } +bt_message* create_computeEU_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id, + const uintptr_t hDevice, const uint32_t subDevice, const float activeTime, const uint64_t ts, + bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend) { + + /* Message creation */ + bt_message *message = bt_message_event_create( + message_iterator, event_class, stream); + + /* event */ + bt_event *downstream_event = bt_message_event_borrow_event(message); + + /* Common context */ + bt_field *context_field = bt_event_borrow_common_context_field(downstream_event); + + // Hostname + bt_field *hostname_msg_field = bt_field_structure_borrow_member_field_by_index(context_field,0); + bt_field_string_set_value(hostname_msg_field, hostname); + // pid + bt_field *vpid_field = bt_field_structure_borrow_member_field_by_index(context_field,1); + bt_field_integer_signed_set_value(vpid_field, process_id); + // vid + bt_field *vtid_field = bt_field_structure_borrow_member_field_by_index(context_field,2); + bt_field_integer_signed_set_value(vtid_field, thread_id); + // ts + bt_field *ts_field = bt_field_structure_borrow_member_field_by_index(context_field,3); + bt_field_integer_signed_set_value(ts_field, ts); + // backend + bt_field *backend_field = bt_field_structure_borrow_member_field_by_index(context_field,4); + bt_field_integer_signed_set_value(backend_field, backend); + + /* Payload */ + bt_field *payload_field = bt_event_borrow_payload_field(downstream_event); + + // did + bt_field *device_id_field = bt_field_structure_borrow_member_field_by_index(payload_field,0); + bt_field_integer_unsigned_set_value(device_id_field, hDevice); + + 
//subDevice + bt_field *subDevice_field = bt_field_structure_borrow_member_field_by_index(payload_field,1); + bt_field_integer_unsigned_set_value(subDevice_field, subDevice); + + //activeTime + bt_field *activeTime_field = bt_field_structure_borrow_member_field_by_index(payload_field,2); + bt_field_real_single_precision_set_value(activeTime_field, activeTime); + + return message; +} + +bt_message* create_copyEU_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id, + const uintptr_t hDevice, const uint32_t subDevice, const float activeTime, const uint64_t ts, + bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend) { + + /* Message creation */ + bt_message *message = bt_message_event_create( + message_iterator, event_class, stream); + + /* event */ + bt_event *downstream_event = bt_message_event_borrow_event(message); + + /* Common context */ + bt_field *context_field = bt_event_borrow_common_context_field(downstream_event); + + // Hostname + bt_field *hostname_msg_field = bt_field_structure_borrow_member_field_by_index(context_field,0); + bt_field_string_set_value(hostname_msg_field, hostname); + // pid + bt_field *vpid_field = bt_field_structure_borrow_member_field_by_index(context_field,1); + bt_field_integer_signed_set_value(vpid_field, process_id); + // vid + bt_field *vtid_field = bt_field_structure_borrow_member_field_by_index(context_field,2); + bt_field_integer_signed_set_value(vtid_field, thread_id); + // ts + bt_field *ts_field = bt_field_structure_borrow_member_field_by_index(context_field,3); + bt_field_integer_signed_set_value(ts_field, ts); + // backend + bt_field *backend_field = bt_field_structure_borrow_member_field_by_index(context_field,4); + bt_field_integer_signed_set_value(backend_field, backend); + + /* Payload */ + bt_field *payload_field = bt_event_borrow_payload_field(downstream_event); + + // did + bt_field *device_id_field = 
bt_field_structure_borrow_member_field_by_index(payload_field,0); + bt_field_integer_unsigned_set_value(device_id_field, hDevice); + + // subDevice + bt_field *subDevice_field = bt_field_structure_borrow_member_field_by_index(payload_field,1); + bt_field_integer_unsigned_set_value(subDevice_field, subDevice); + + //activeTime + bt_field *activeTime_field = bt_field_structure_borrow_member_field_by_index(payload_field,2); + bt_field_real_single_precision_set_value(activeTime_field, activeTime); + + return message; +} + bt_message* create_host_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id, const char* name, const uint64_t ts, const uint64_t duration, const bool err, bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend) { diff --git a/utils/xprof_utils.hpp b/utils/xprof_utils.hpp index 2695e4d5..28714c9c 100644 --- a/utils/xprof_utils.hpp +++ b/utils/xprof_utils.hpp @@ -46,7 +46,8 @@ typedef uintptr_t thread_id_t; typedef std::string hostname_t; typedef std::string thapi_function_name; typedef uintptr_t thapi_device_id; -typedef uint32_t thapi_domain_id; +typedef uint32_t thapi_domain_idx; +typedef uint32_t thapi_sdevice_idx; // Represent a device and a sub device typedef std::tuple dsd_t; @@ -59,7 +60,8 @@ typedef std::tuple hp_device_t; typedef std::tuple hp_dsd_t; -typedef std::tuple hp_ddomain_t; +typedef std::tuple hp_ddomain_t; +typedef std::tuple hp_dsdev_t; typedef std::tuple sd_t; typedef std::tuple tfn_ts_t; typedef std::tuple fn_ts_t; @@ -116,11 +118,18 @@ bt_message* create_power_message(const char* hostname, const process_id_t propro const uintptr_t hDevice, const uint32_t domain, const uint64_t power, const uint64_t ts, bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend = BACKEND_UNKNOWN); - bt_message* create_frequency_message(const char* hostname, const process_id_t proprocess_id, const 
thread_id_t thread_id, const uintptr_t hDevice, const uint32_t domain, const uint64_t ts, const uint64_t frequency, bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend = BACKEND_UNKNOWN); +bt_message* create_computeEU_message(const char* hostname, const process_id_t proprocess_id, const thread_id_t thread_id, + const uintptr_t hDevice, const uint32_t subDevice, const float activeTime, const uint64_t ts, + bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend = BACKEND_UNKNOWN); + +bt_message* create_copyEU_message(const char* hostname, const process_id_t proprocess_id, const thread_id_t thread_id, + const uintptr_t hDevice, const uint32_t subDevice, const float activeTime, const uint64_t ts, + bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend = BACKEND_UNKNOWN); + bt_message *create_host_message(const char *hostname, const process_id_t, const thread_id_t, const char *name, const uint64_t ts, const uint64_t duration, const bool err, bt_event_class *, bt_self_message_iterator *, diff --git a/xprof/btx_interval_model.yaml b/xprof/btx_interval_model.yaml index f62aac1a..d23459f1 100644 --- a/xprof/btx_interval_model.yaml +++ b/xprof/btx_interval_model.yaml @@ -127,3 +127,39 @@ :type: integer_unsigned :field_value_range: 64 :cast_type: uint64_t + - :name: lttng:computeEU + :payload_field_class: + :type: structure + :members: + - :name: did + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t + - :name: subDevice + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: activeTime + :field_class: + :type: single + :cast_type: float + - :name: lttng:copyEU + :payload_field_class: + :type: structure + :members: + - :name: did + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t + - 
:name: subDevice + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: activeTime + :field_class: + :type: single + :cast_type: float diff --git a/xprof/btx_timeline.cpp b/xprof/btx_timeline.cpp index 278a4060..14bce1a9 100644 --- a/xprof/btx_timeline.cpp +++ b/xprof/btx_timeline.cpp @@ -30,11 +30,13 @@ struct timeline_dispatch_s { std::unordered_map hp_device2countertracks; std::unordered_map hp_ddomain2frqtracks; std::unordered_map hp_ddomain2pwrtracks; - + std::unordered_map hp_dsdev2cpetracks; + std::unordered_map hp_dsdev2cpytracks; + perfetto_pruned::Trace trace; }; using timeline_dispatch_t = struct timeline_dispatch_s; - +using uuid_getter_t = perfetto_uuid_t(*)(timeline_dispatch_t*, std::string, uint64_t, uintptr_t, uint32_t); static perfetto_uuid_t gen_perfetto_uuid() { // Start at one, Look like UUID 0 is special static std::atomic uuid{1}; @@ -55,7 +57,7 @@ static perfetto_uuid_t get_parent_counter_track_uuid(timeline_dispatch_t *dispat // Create packet with track descriptor auto *packet = dispatch->trace.add_packet(); - packet->set_trusted_packet_sequence_id(10000); + packet->set_trusted_packet_sequence_id(TRUSTED_PACKED_SEQUENCE_ID); packet->set_timestamp(0); // TODO: check if this is required packet->set_previous_packet_dropped(true); @@ -72,7 +74,7 @@ static perfetto_uuid_t get_parent_counter_track_uuid(timeline_dispatch_t *dispat static perfetto_uuid_t get_counter_track_uuuid(timeline_dispatch_t *dispatch, std::unordered_map &counter_tracks, const std::string track_name, - std::string hostname, uint64_t process_id, thapi_device_id did, thapi_domain_id domain) { + std::string hostname, uint64_t process_id, thapi_device_id did, thapi_domain_idx domain) { perfetto_uuid_t hp_dev_uuid = 0; auto [it, inserted] = counter_tracks.insert({{hostname, process_id, did, domain}, hp_dev_uuid}); auto &potential_uuid = it->second; @@ -87,7 +89,7 @@ static perfetto_uuid_t get_counter_track_uuuid(timeline_dispatch_t 
*dispatch, // Create new track auto *packet = dispatch->trace.add_packet(); packet->set_timestamp(0); - packet->set_trusted_packet_sequence_id(10000); + packet->set_trusted_packet_sequence_id(TRUSTED_PACKED_SEQUENCE_ID); auto *track_descriptor = packet->mutable_track_descriptor(); track_descriptor->set_uuid(hp_dev_uuid); track_descriptor->set_parent_uuid(hp_uuid); @@ -97,42 +99,68 @@ static perfetto_uuid_t get_counter_track_uuuid(timeline_dispatch_t *dispatch, track_descriptor->mutable_counter(); return hp_dev_uuid; } + static perfetto_uuid_t get_frequency_track_uuuid(timeline_dispatch_t *dispatch, std::string hostname, - uint64_t process_id, thapi_device_id did, thapi_domain_id domain) { - return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2frqtracks, "GPU Frequency", hostname, process_id, did, domain); + uint64_t process_id, thapi_device_id did, thapi_domain_idx domain) { + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2frqtracks, " GPU Frequency", hostname, process_id, did, domain); } static perfetto_uuid_t get_power_track_uuuid(timeline_dispatch_t *dispatch, std::string hostname, - uint64_t process_id, thapi_device_id did, thapi_device_id domain) { - return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2pwrtracks, " GPU Power", hostname, process_id, did, domain); + uint64_t process_id, thapi_device_id did, thapi_domain_idx domain) { + //Extra leading space in the name field to make GPU Power the first track + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2pwrtracks, " GPU Power", hostname, process_id, did, domain); } -static void add_event_frequency(timeline_dispatch_t *dispatch, std::string hostname, - uint64_t process_id, uint64_t thread_id, uintptr_t did, - uint32_t domain, uint64_t timestamp, uint64_t frequency) { - - perfetto_uuid_t track_uuid = get_frequency_track_uuuid(dispatch, hostname, process_id, did, domain); +static perfetto_uuid_t get_computeEU_track_uuuid(timeline_dispatch_t *dispatch, std::string 
hostname, + uint64_t process_id, thapi_device_id did, thapi_sdevice_idx subDevice) { + return get_counter_track_uuuid(dispatch, dispatch->hp_dsdev2cpetracks, "ComputeEngine (%)", hostname, process_id, did, subDevice); +} + +static perfetto_uuid_t get_copyEU_track_uuuid(timeline_dispatch_t *dispatch, std::string hostname, + uint64_t process_id, thapi_device_id did, thapi_sdevice_idx subDevice) { + return get_counter_track_uuuid(dispatch, dispatch->hp_dsdev2cpytracks, "CopyEngine (%)", hostname, process_id, did, subDevice); +} + +static void add_event_DTelemetry(timeline_dispatch_t *dispatch, std::string hostname, uint64_t process_id, + uint64_t thread_id, uintptr_t did, uint32_t subDevice, uint64_t timestamp, + float value, uuid_getter_t uuid_getter, const std::string& eventName) { + perfetto_uuid_t track_uuid = uuid_getter(dispatch, hostname, process_id, did, subDevice); auto *packet = dispatch->trace.add_packet(); - packet->set_trusted_packet_sequence_id(10000); + packet->set_trusted_packet_sequence_id(TRUSTED_PACKED_SEQUENCE_ID); packet->set_timestamp(timestamp); auto *track_event = packet->mutable_track_event(); track_event->set_type(perfetto_pruned::TrackEvent::TYPE_COUNTER); track_event->set_track_uuid(track_uuid); - track_event->set_name("Frequency"); - track_event->set_counter_value(frequency); + track_event->set_name(eventName); + track_event->set_double_counter_value(value); +} + +static void add_event_frequency(timeline_dispatch_t *dispatch, std::string hostname, + uint64_t process_id, uint64_t thread_id, uintptr_t did, + uint32_t domain, uint64_t timestamp, float frequency) { + add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, domain, + timestamp, frequency, get_frequency_track_uuuid, "Frequency"); } static void add_event_power(timeline_dispatch_t *dispatch, std::string hostname, uint64_t process_id, uint64_t thread_id, uintptr_t did, - uint32_t domain, uint64_t timestamp, uint64_t power) { - perfetto_uuid_t track_uuid = 
get_power_track_uuuid(dispatch, hostname, process_id, did, domain); - auto *packet = dispatch->trace.add_packet(); - packet->set_trusted_packet_sequence_id(10000); - packet->set_timestamp(timestamp); - auto *track_event = packet->mutable_track_event(); - track_event->set_type(perfetto_pruned::TrackEvent::TYPE_COUNTER); - track_event->set_track_uuid(track_uuid); - track_event->set_name("Power"); - track_event->set_counter_value(power); + uint32_t domain, uint64_t timestamp, float power) +{ + add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, domain, + timestamp, power, get_power_track_uuuid, "Power"); +} + +static void add_event_computeEU(timeline_dispatch_t *dispatch, std::string hostname, + uint64_t process_id, uint64_t thread_id, uintptr_t did, + uint32_t subDevice, uint64_t timestamp, float activeTime) { + add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, subDevice, + timestamp, activeTime, get_computeEU_track_uuuid, "ComputeEngine"); +} + +static void add_event_copyEU(timeline_dispatch_t *dispatch, std::string hostname, + uint64_t process_id, uint64_t thread_id, uintptr_t did, + uint32_t subDevice, uint64_t timestamp, float activeTime) { + add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, subDevice, + timestamp, activeTime, get_copyEU_track_uuuid, "CopyEngine"); } static void add_event_begin(timeline_dispatch_t *dispatch, perfetto_uuid_t uuid, timestamp_t begin, @@ -351,11 +379,27 @@ static void power_usr_callback(void *btx_handle, void *usr_data, const char *hos add_event_power(dispatch, hostname, vpid, vtid, did, domain, ts, power); } +static void computeEU_usr_callback(void *btx_handle, void *usr_data, const char *hostname, + int64_t vpid, uint64_t vtid, int64_t ts, int64_t backend, + uint64_t did, uint32_t subDevice, float activeTime) { + auto *dispatch = static_cast<timeline_dispatch_t *>(usr_data); + add_event_computeEU(dispatch, hostname, vpid, vtid, did, subDevice, ts, activeTime); +} + +static void 
copyEU_usr_callback(void *btx_handle, void *usr_data, const char *hostname, + int64_t vpid, uint64_t vtid, int64_t ts, int64_t backend, + uint64_t did, uint32_t subDevice, float activeTime) { + auto *dispatch = static_cast<timeline_dispatch_t *>(usr_data); + add_event_copyEU(dispatch, hostname, vpid, vtid, did, subDevice, ts, activeTime); +} + void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_lttng_host(btx_handle, &host_usr_callback); btx_register_callbacks_lttng_device(btx_handle, &device_usr_callback); btx_register_callbacks_lttng_frequency(btx_handle, &frequency_usr_callback); btx_register_callbacks_lttng_power(btx_handle, &power_usr_callback); + btx_register_callbacks_lttng_computeEU(btx_handle, &computeEU_usr_callback); + btx_register_callbacks_lttng_copyEU(btx_handle, &copyEU_usr_callback); btx_register_callbacks_initialize_component(btx_handle, &btx_initialize_component_callback); btx_register_callbacks_finalize_component(btx_handle, &btx_finalize_component_callback); } diff --git a/xprof/interval.c.erb b/xprof/interval.c.erb index 8e09ef4d..6e39ca27 100644 --- a/xprof/interval.c.erb +++ b/xprof/interval.c.erb @@ -99,6 +99,8 @@ bt_component_class_initialize_method_status <%= namespace %>_dispatch_initialize dispatch->device_name_event_class = create_lttng_device_name_event_class_message(trace_class, stream_class); dispatch->frequency_event_class = create_lttng_frequency_event_class_message(trace_class, stream_class); dispatch->power_event_class = create_lttng_power_event_class_message(trace_class, stream_class); + dispatch->computeEU_event_class = create_lttng_computeEU_event_class_message(trace_class, stream_class); + dispatch->copyEU_event_class = create_lttng_copyEU_event_class_message(trace_class, stream_class); /* Create a default trace from (instance of `trace_class`) */ bt_trace *trace = bt_trace_create(trace_class); diff --git a/xprof/interval.h.erb b/xprof/interval.h.erb index 2ebdb184..3a0e1d91 100644 --- a/xprof/interval.h.erb +++ 
b/xprof/interval.h.erb @@ -58,6 +58,8 @@ struct <%= namespace %>_dispatch { bt_event_class *device_name_event_class; bt_event_class *frequency_event_class; bt_event_class *power_event_class; + bt_event_class *computeEU_event_class; + bt_event_class *copyEU_event_class; /* Component's input port (weak) */ bt_self_component_port_input *in_port; }; diff --git a/xprof/interval_model.yaml b/xprof/interval_model.yaml index 0a9df04a..af2e6e45 100644 --- a/xprof/interval_model.yaml +++ b/xprof/interval_model.yaml @@ -78,3 +78,27 @@ :field_value_range: 32 - :name: power :class: unsigned +- :name: lttng:computeEU + :payload: + - :name: did + :class: unsigned + :class_properties: + :preferred_display_base: 16 + - :name: subDevice + :class: unsigned + :class_properties: + :field_value_range: 32 + - :name: activeTime + :class: unsigned +- :name: lttng:copyEU + :payload: + - :name: did + :class: unsigned + :class_properties: + :preferred_display_base: 16 + - :name: subDevice + :class: unsigned + :class_properties: + :field_value_range: 32 + - :name: activeTime + :class: unsigned diff --git a/ze/tracer_ze_helpers.include.c b/ze/tracer_ze_helpers.include.c index 583299f3..56bcc127 100644 --- a/ze/tracer_ze_helpers.include.c +++ b/ze/tracer_ze_helpers.include.c @@ -753,22 +753,142 @@ static inline void _dump_memory_info(ze_command_list_handle_t hCommandList, cons } //////////////////////////////////////////// -#define _ZE_ERROR_MSG(NAME,RES) {fprintf(stderr,"%s() failed at %d(%s): res=%x\n",(NAME),__LINE__,__FILE__,(RES));} -#define _ZE_ERROR_MSG_NOTERMINATE(NAME,RES) {fprintf(stderr,"%s() error at %d(%s): res=%x\n",(NAME),__LINE__,__FILE__,(RES));} -#define _ERROR_MSG(MSG) {perror((MSG)); fprintf(stderr,"errno=%d at %d(%s)",errno,__LINE__,__FILE__);} +#define _ZE_ERROR_MSG(NAME,RES) do {\ + fprintf(stderr,"%s() failed at %d(%s): res=%x\n",(NAME),__LINE__,__FILE__,(RES));\ +} while (0) +#define _ZE_ERROR_MSG_NOTERMINATE(NAME,RES) do {\ + fprintf(stderr,"%s() error at %d(%s): 
res=%x\n",(NAME),__LINE__,__FILE__,(RES));\ +} while (0) +#define _ERROR_MSG(MSG) do {\ + perror((MSG)); fprintf(stderr,"errno=%d at %d(%s)",errno,__LINE__,__FILE__);\ +} while (0) -static int _sampling_initialized = 0; +static int _sampling_freq_initialized = 0; +static int _sampling_pwr_initialized = 0; +static int _sampling_engines_initialized = 0; // Static handles to stay throughout the execution -static ze_device_handle_t* _sampling_hDevices; -static zes_freq_handle_t** _sampling_hFrequencies; -static zes_pwr_handle_t** _sampling_hPowers; -static uint32_t _sampling_deviceCount; -static uint32_t* _sampling_freqDomainCounts; -static uint32_t* _sampling_powerDomainCounts; +static ze_device_handle_t** _sampling_hDevices = NULL; +static zes_freq_handle_t*** _sampling_hFrequencies = NULL; +static zes_pwr_handle_t*** _sampling_hPowers = NULL; +static zes_engine_handle_t*** _sampling_engineHandles = NULL; +static zes_engine_properties_t*** _sampling_engineProps = NULL; +static uint32_t _sampling_driverCount = 0; +static uint32_t* _sampling_deviceCount = NULL; +static uint32_t** _sampling_subDeviceCount = NULL; +static uint32_t** _sampling_freqDomainCounts = NULL; +static uint32_t** _sampling_powerDomainCounts = NULL; +static uint32_t** _sampling_engineCounts = NULL; + +typedef struct { + uint64_t timestamp; + uint64_t computeActive; +} computeEngineData; + +typedef struct { + uint64_t timestamp; + uint64_t copyActive; +} copyEngineData; + +void intializeFrequency() { + ze_result_t res; + _sampling_hFrequencies = (zes_freq_handle_t***) malloc(_sampling_driverCount * sizeof(zes_freq_handle_t**)); + _sampling_freqDomainCounts = (uint32_t**) malloc(_sampling_driverCount * sizeof(uint32_t*)); + for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { + _sampling_freqDomainCounts[driverIdx] = (uint32_t*) malloc(_sampling_deviceCount[driverIdx] * sizeof(uint32_t)); + _sampling_hFrequencies[driverIdx] = (zes_freq_handle_t**) 
malloc(_sampling_deviceCount[driverIdx] * sizeof(zes_freq_handle_t*)); + for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { + // Get frequency domains for each device + _sampling_hFrequencies[driverIdx][deviceIdx] = NULL; + _sampling_freqDomainCounts[driverIdx][deviceIdx] = 0; + res = ZES_DEVICE_ENUM_FREQUENCY_DOMAINS_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_freqDomainCounts[driverIdx][deviceIdx], NULL); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("1st ZES_DEVICE_ENUM_FREQUENCY_DOMAINS_PTR", res); + _sampling_freqDomainCounts[driverIdx][deviceIdx] = 0; + continue; + } + _sampling_hFrequencies[driverIdx][deviceIdx] = (zes_freq_handle_t*) malloc(_sampling_freqDomainCounts[driverIdx][deviceIdx] * sizeof(zes_freq_handle_t)); + res = ZES_DEVICE_ENUM_FREQUENCY_DOMAINS_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_freqDomainCounts[driverIdx][deviceIdx], _sampling_hFrequencies[driverIdx][deviceIdx]); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("2nd ZES_DEVICE_ENUM_FREQUENCY_DOMAINS_PTR", res); + _sampling_freqDomainCounts[driverIdx][deviceIdx] = 0; + free(_sampling_hFrequencies[driverIdx][deviceIdx]); + } + } + } + _sampling_freq_initialized = 1; +} -int initializeHandles() { +void intializePower() { ze_result_t res; + _sampling_hPowers = (zes_pwr_handle_t***) malloc(_sampling_driverCount * sizeof(zes_pwr_handle_t**)); + _sampling_powerDomainCounts = (uint32_t**) malloc(_sampling_driverCount * sizeof(uint32_t*)); + for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { + _sampling_hPowers[driverIdx] = (zes_pwr_handle_t**) malloc(_sampling_deviceCount[driverIdx] * sizeof(zes_pwr_handle_t*)); + _sampling_powerDomainCounts[driverIdx] = (uint32_t*) malloc(_sampling_deviceCount[driverIdx] * sizeof(uint32_t)); + for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { + // Get power domains for each device + 
_sampling_hPowers[driverIdx][deviceIdx] = NULL; + _sampling_powerDomainCounts[driverIdx][deviceIdx] = 0; + res = ZES_DEVICE_ENUM_POWER_DOMAINS_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_powerDomainCounts[driverIdx][deviceIdx], NULL); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("1st ZES_DEVICE_ENUM_POWER_DOMAINS_PTR", res); + _sampling_powerDomainCounts[driverIdx][deviceIdx] = 0; + continue; + } + _sampling_hPowers[driverIdx][deviceIdx] = (zes_pwr_handle_t*) malloc(_sampling_powerDomainCounts[driverIdx][deviceIdx] * sizeof(zes_pwr_handle_t)); + res = ZES_DEVICE_ENUM_POWER_DOMAINS_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_powerDomainCounts[driverIdx][deviceIdx], _sampling_hPowers[driverIdx][deviceIdx]); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("2nd ZES_DEVICE_ENUM_POWER_DOMAINS_PTR", res); + _sampling_powerDomainCounts[driverIdx][deviceIdx] = 0; + free(_sampling_hPowers[driverIdx][deviceIdx]); + } + } + } + _sampling_pwr_initialized = 1; +} + +void intializeEngines() { + ze_result_t res; + _sampling_engineProps = (zes_engine_properties_t***) malloc(_sampling_driverCount * sizeof(zes_engine_properties_t**)); + _sampling_engineHandles = (zes_engine_handle_t***) malloc(_sampling_driverCount * sizeof(zes_engine_handle_t**)); + _sampling_engineCounts = (uint32_t**) malloc(_sampling_driverCount * sizeof(uint32_t*)); + for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { + _sampling_engineProps[driverIdx] = (zes_engine_properties_t**) malloc(_sampling_deviceCount[driverIdx] * sizeof(zes_engine_properties_t*)); + _sampling_engineHandles[driverIdx] = (zes_engine_handle_t**) malloc(_sampling_deviceCount[driverIdx] * sizeof(zes_engine_handle_t*)); + _sampling_engineCounts[driverIdx] = (uint32_t*) malloc(_sampling_deviceCount[driverIdx] * sizeof(uint32_t)); + for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { + // Get engine counts for each device + 
_sampling_engineProps[driverIdx][deviceIdx] = NULL; + _sampling_engineHandles[driverIdx][deviceIdx] = NULL; + _sampling_engineCounts[driverIdx][deviceIdx] = 0; + res = ZES_DEVICE_ENUM_ENGINE_GROUPS_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_engineCounts[driverIdx][deviceIdx], NULL); + if (res != ZE_RESULT_SUCCESS || _sampling_engineCounts[driverIdx][deviceIdx] == 0) { + _ZE_ERROR_MSG("1st ZES_DEVICE_ENUM_ENGINE_GROUPS_PTR", res); + _sampling_engineCounts[driverIdx][deviceIdx] = 0; + continue; + } + _sampling_engineHandles[driverIdx][deviceIdx] = (zes_engine_handle_t*) malloc(_sampling_engineCounts[driverIdx][deviceIdx] * sizeof(zes_engine_handle_t)); + res = ZES_DEVICE_ENUM_ENGINE_GROUPS_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_engineCounts[driverIdx][deviceIdx], _sampling_engineHandles[driverIdx][deviceIdx]); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("2nd ZES_DEVICE_ENUM_ENGINE_GROUPS_PTR", res); + _sampling_engineCounts[driverIdx][deviceIdx] = 0; + free(_sampling_engineHandles[driverIdx][deviceIdx]); + } + _sampling_engineProps[driverIdx][deviceIdx] = (zes_engine_properties_t*) calloc(_sampling_engineCounts[driverIdx][deviceIdx], sizeof(zes_engine_properties_t)); + for (uint32_t engineIdx = 0; engineIdx < _sampling_engineCounts[driverIdx][deviceIdx]; ++engineIdx) { + _sampling_engineProps[driverIdx][deviceIdx][engineIdx].stype = ZES_STRUCTURE_TYPE_ENGINE_PROPERTIES; + res = ZES_ENGINE_GET_PROPERTIES_PTR(_sampling_engineHandles[driverIdx][deviceIdx][engineIdx], &_sampling_engineProps[driverIdx][deviceIdx][engineIdx]); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_ENGINE_GET_PROPERTIES_PTR", res); + } + } + } + } + _sampling_engines_initialized = 1; +} +int initializeHandles() { + ze_result_t res; const char *e = getenv("ZES_ENABLE_SYSMAN"); if (!(e && e[0] == '1')) { fprintf(stderr,"ZES_ENABLE_SYSMAN needs to be set!\n"); @@ -783,94 +903,133 @@ int initializeHandles() { #endif // Query driver - uint32_t driverCount; - 
res = ZE_DRIVER_GET_PTR(&driverCount, NULL); + _sampling_driverCount = 0; + res = ZE_DRIVER_GET_PTR(&_sampling_driverCount, NULL); if (res != ZE_RESULT_SUCCESS) { _ZE_ERROR_MSG("1st ZE_DRIVER_GET_PTR", res); return -1; } - - ze_driver_handle_t *hDriver = (ze_driver_handle_t*) malloc(driverCount * sizeof(ze_driver_handle_t)); - res = ZE_DRIVER_GET_PTR(&driverCount, hDriver); + ze_driver_handle_t *hDriver = (ze_driver_handle_t*) alloca(_sampling_driverCount * sizeof(ze_driver_handle_t)); + res = ZE_DRIVER_GET_PTR(&_sampling_driverCount, hDriver); if (res != ZE_RESULT_SUCCESS) { _ZE_ERROR_MSG("2nd ZE_DRIVER_GET_PTR", res); return -1; } - + _sampling_deviceCount = (uint32_t*) malloc(_sampling_driverCount * sizeof(uint32_t)); + _sampling_subDeviceCount = (uint32_t**) malloc(_sampling_driverCount * sizeof(uint32_t*)); + _sampling_hDevices = (ze_device_handle_t**) malloc(_sampling_driverCount * sizeof(ze_device_handle_t*)); // Query device count - res = ZE_DEVICE_GET_PTR(hDriver[0], &_sampling_deviceCount, NULL); - if (res != ZE_RESULT_SUCCESS || _sampling_deviceCount == 0) { - fprintf(stderr, "ERROR: No device found!\n"); - _ZE_ERROR_MSG("ZE_DEVICE_GET_PTR", res); - return -1; - } - - _sampling_hDevices = (ze_device_handle_t*) malloc(_sampling_deviceCount * sizeof(ze_device_handle_t)); - res = ZE_DEVICE_GET_PTR(hDriver[0], &_sampling_deviceCount, _sampling_hDevices); - if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("2nd ZE_DRIVER_GET_PTR", res); - return -1; - } - - _sampling_hFrequencies = (zes_freq_handle_t**) malloc(_sampling_deviceCount * sizeof(zes_freq_handle_t*)); - _sampling_freqDomainCounts = (uint32_t*) malloc(_sampling_deviceCount * sizeof(uint32_t)); - - _sampling_hPowers = (zes_pwr_handle_t**) malloc(_sampling_deviceCount * sizeof(zes_pwr_handle_t*)); - _sampling_powerDomainCounts = (uint32_t*) malloc(_sampling_deviceCount * sizeof(uint32_t)); - - for (uint32_t i = 0; i < _sampling_deviceCount; i++) { - // Get frequency domains for each device - res = 
zesDeviceEnumFrequencyDomains(_sampling_hDevices[i], &_sampling_freqDomainCounts[i], NULL); - if (res != ZE_RESULT_SUCCESS) { - printf("zesDeviceEnumFrequencyDomains (count query) failed for device %d: %d\n", i, res); - return(-1); + for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { + res = ZE_DEVICE_GET_PTR(hDriver[driverIdx], &_sampling_deviceCount[driverIdx], NULL); + if (res != ZE_RESULT_SUCCESS || _sampling_deviceCount[driverIdx] == 0) { + fprintf(stderr, "ERROR: No device found!\n"); + _ZE_ERROR_MSG("1st ZE_DEVICE_GET_PTR", res); + return -1; } - - _sampling_hFrequencies[i] = (zes_freq_handle_t*) malloc(_sampling_freqDomainCounts[i] * sizeof(zes_freq_handle_t)); - res = zesDeviceEnumFrequencyDomains(_sampling_hDevices[i], &_sampling_freqDomainCounts[i], _sampling_hFrequencies[i]); + _sampling_hDevices[driverIdx] = (ze_device_handle_t*) malloc(_sampling_deviceCount[driverIdx] * sizeof(ze_device_handle_t)); + res = ZE_DEVICE_GET_PTR(hDriver[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]); if (res != ZE_RESULT_SUCCESS) { - printf("zesDeviceEnumFrequencyDomains failed for device %d: %d\n", i, res); - return(-1); + _ZE_ERROR_MSG("2nd ZE_DEVICE_GET_PTR", res); + free(_sampling_hDevices[driverIdx]); + return -1; } - - // Get power domains for each device - res = zesDeviceEnumPowerDomains(_sampling_hDevices[i], &_sampling_powerDomainCounts[i], NULL); - if (res != ZE_RESULT_SUCCESS) { - printf("zesDeviceEnumPowerDomains (count query) failed for device %d: %d\n", i, res); - return(-1); - } - - _sampling_hPowers[i] = (zes_pwr_handle_t*) malloc(_sampling_powerDomainCounts[i] * sizeof(zes_pwr_handle_t)); - res = zesDeviceEnumPowerDomains(_sampling_hDevices[i], &_sampling_powerDomainCounts[i], _sampling_hPowers[i]); - if (res != ZE_RESULT_SUCCESS) { - printf("zesDeviceEnumPowerDomains failed for device %d: %d\n", i, res); - return(-1); + //Get no sub-devices + _sampling_subDeviceCount[driverIdx] = (uint32_t*) 
malloc(_sampling_deviceCount[driverIdx] * sizeof(uint32_t)); + for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { + _sampling_subDeviceCount[driverIdx][deviceIdx] = 0; + res = ZE_DEVICE_GET_SUB_DEVICES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_subDeviceCount[driverIdx][deviceIdx], NULL); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZE_DEVICE_GET_SUB_DEVICES_PTR", res); + _sampling_subDeviceCount[driverIdx][deviceIdx] = 0; + } + if (_sampling_subDeviceCount[driverIdx][deviceIdx] == 0) { + _sampling_subDeviceCount[driverIdx][deviceIdx] = 1; + } } } - free(hDriver); - _sampling_initialized=1; + intializeFrequency(); + intializePower(); + intializeEngines(); return 0; } -void readFrequency(uint32_t deviceIdx, uint32_t domainIdx, uint32_t *frequency) { - if (!_sampling_initialized) return; +void readFrequency(uint32_t driverIdx, uint32_t deviceIdx, uint32_t domainIdx, uint32_t *frequency) { + if (!_sampling_freq_initialized) return; + ze_result_t result; *frequency=0; zes_freq_state_t freqState; - if (zesFrequencyGetState(_sampling_hFrequencies[deviceIdx][domainIdx], &freqState) == ZE_RESULT_SUCCESS) { - // printf("Device %d - Frequency Domain %d: Current frequency: %lf MHz\n", deviceIdx, domainIdx, freqState.actual); - *frequency = freqState.actual; + result = ZES_FREQUENCY_GET_STATE_PTR(_sampling_hFrequencies[driverIdx][deviceIdx][domainIdx], &freqState); + if (result != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_FREQUENCY_GET_STATE_PTR", result); + return; } + *frequency = freqState.actual; } -void readEnergy(uint32_t deviceIdx, uint32_t domainIdx, uint64_t *ts_us, uint64_t *energy_uj) { - if (!_sampling_initialized) return; +void readEnergy(uint32_t driverIdx, uint32_t deviceIdx, uint32_t domainIdx, uint64_t *ts_us, uint64_t *energy_uj) { + if (!_sampling_pwr_initialized) return; + ze_result_t result; *ts_us = 0; *energy_uj = 0; zes_power_energy_counter_t energyCounter; - if 
(zesPowerGetEnergyCounter(_sampling_hPowers[deviceIdx][domainIdx], &energyCounter) == ZE_RESULT_SUCCESS) { - // printf("Device %d - Power Domain %d: Total energy consumption: %lu Joules\n", deviceIdx, domainIdx, energyCounter.energy); - *ts_us = energyCounter.timestamp; - *energy_uj = energyCounter.energy; + result = ZES_POWER_GET_ENERGY_COUNTER_PTR(_sampling_hPowers[driverIdx][deviceIdx][domainIdx], &energyCounter); + if (result != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_POWER_GET_ENERGY_COUNTER_PTR", result); + return; + } + *ts_us = energyCounter.timestamp; + *energy_uj = energyCounter.energy; +} + +void readComputeE(uint32_t driverIdx, uint32_t deviceIdx, computeEngineData *computeData ){ + if (!_sampling_engines_initialized) return; + ze_result_t result; + for (uint32_t subDevIdx = 0; subDevIdx < _sampling_subDeviceCount[driverIdx][deviceIdx]; subDevIdx++) { + computeData[subDevIdx].computeActive = 0; + computeData[subDevIdx].timestamp = 0; + } + for (uint32_t engineIdx = 0; engineIdx < _sampling_engineCounts[driverIdx][deviceIdx]; ++engineIdx) { + if (_sampling_engineProps[driverIdx][deviceIdx][engineIdx].type == ZES_ENGINE_GROUP_COMPUTE_ALL){ + zes_engine_stats_t engineStats = {0}; + result = ZES_ENGINE_GET_ACTIVITY_PTR(_sampling_engineHandles[driverIdx][deviceIdx][engineIdx], &engineStats); + if (result != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_ENGINE_GET_ACTIVITY_PTR", result); + continue; + } + if (_sampling_engineProps[driverIdx][deviceIdx][engineIdx].onSubdevice) { + computeData[_sampling_engineProps[driverIdx][deviceIdx][engineIdx].subdeviceId].computeActive = engineStats.activeTime; + computeData[_sampling_engineProps[driverIdx][deviceIdx][engineIdx].subdeviceId].timestamp = engineStats.timestamp; + } else { + computeData[0].computeActive = engineStats.activeTime; + computeData[0].timestamp = engineStats.timestamp; + } + } + } +} + +void readCopyE(uint32_t driverIdx, uint32_t deviceIdx, copyEngineData *copyData ){ + if 
(!_sampling_engines_initialized) return; + ze_result_t result; + for (uint32_t subDevIdx = 0; subDevIdx < _sampling_subDeviceCount[driverIdx][deviceIdx]; subDevIdx++) { + copyData[subDevIdx].copyActive = 0; + copyData[subDevIdx].timestamp = 0; + } + for (uint32_t engineIdx = 0; engineIdx < _sampling_engineCounts[driverIdx][deviceIdx]; ++engineIdx) { + if (_sampling_engineProps[driverIdx][deviceIdx][engineIdx].type == ZES_ENGINE_GROUP_COPY_ALL){ + zes_engine_stats_t engineStats = {0}; + result = ZES_ENGINE_GET_ACTIVITY_PTR(_sampling_engineHandles[driverIdx][deviceIdx][engineIdx], &engineStats); + if (result != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_ENGINE_GET_ACTIVITY_PTR", result); + continue; + } + if (_sampling_engineProps[driverIdx][deviceIdx][engineIdx].onSubdevice) { + copyData[_sampling_engineProps[driverIdx][deviceIdx][engineIdx].subdeviceId].copyActive = engineStats.activeTime; + copyData[_sampling_engineProps[driverIdx][deviceIdx][engineIdx].subdeviceId].timestamp = engineStats.timestamp; + } else { + copyData[0].copyActive = engineStats.activeTime; + copyData[0].timestamp = engineStats.timestamp; + } + } } } @@ -878,14 +1037,38 @@ static void thapi_sampling_energy() { uint64_t ts_us; uint64_t energy_uj; uint32_t frequency; - for (uint32_t i = 0; i < _sampling_deviceCount; i++) { - for (uint32_t j = 0; j < _sampling_freqDomainCounts[i]; j++) { - readFrequency(i, j, &frequency); - do_tracepoint(lttng_ust_ze_sampling, gpu_frequency, (ze_device_handle_t)_sampling_hDevices[i], j, ts_us, frequency); - } - for (uint32_t j = 0; j < _sampling_powerDomainCounts[i]; j++) { - readEnergy(i, j, &ts_us, &energy_uj); - do_tracepoint(lttng_ust_ze_sampling, gpu_energy, (ze_device_handle_t)_sampling_hDevices[i], j, (uint64_t)energy_uj, ts_us); + for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { + for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { + if (tracepoint_enabled(lttng_ust_ze_sampling, 
gpu_frequency)){ + for (uint32_t domainIdx = 0; domainIdx < _sampling_freqDomainCounts[driverIdx][deviceIdx]; domainIdx++) { + readFrequency(driverIdx, deviceIdx, domainIdx, &frequency); + do_tracepoint(lttng_ust_ze_sampling, gpu_frequency, (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], domainIdx, frequency); + } + } + if (tracepoint_enabled(lttng_ust_ze_sampling, gpu_energy)){ + for (uint32_t domainIdx = 0; domainIdx < _sampling_powerDomainCounts[driverIdx][deviceIdx]; domainIdx++) { + readEnergy(driverIdx, deviceIdx, domainIdx, &ts_us, &energy_uj); + do_tracepoint(lttng_ust_ze_sampling, gpu_energy, (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], domainIdx, (uint64_t)energy_uj, ts_us); + } + } + if (tracepoint_enabled(lttng_ust_ze_sampling, computeEngine)){ + if (_sampling_subDeviceCount[driverIdx][deviceIdx] != 0 ) { + computeEngineData computeE[_sampling_subDeviceCount[driverIdx][deviceIdx]]; + readComputeE(driverIdx, deviceIdx, computeE); + for (uint32_t subDevIdx = 0; subDevIdx < _sampling_subDeviceCount[driverIdx][deviceIdx]; subDevIdx++){ + do_tracepoint(lttng_ust_ze_sampling, computeEngine, (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], subDevIdx, computeE[subDevIdx].computeActive, computeE[subDevIdx].timestamp); + } + } + } + if (tracepoint_enabled(lttng_ust_ze_sampling, copyEngine)){ + if (_sampling_subDeviceCount[driverIdx][deviceIdx] != 0 ) { + copyEngineData copyE[_sampling_subDeviceCount[driverIdx][deviceIdx]]; + readCopyE(driverIdx, deviceIdx, copyE); + for (uint32_t subDevIdx = 0; subDevIdx < _sampling_subDeviceCount[driverIdx][deviceIdx]; subDevIdx++){ + do_tracepoint(lttng_ust_ze_sampling, copyEngine, (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], subDevIdx, copyE[subDevIdx].copyActive, copyE[subDevIdx].timestamp); + } + } + } } } } diff --git a/ze/ze_events.yaml b/ze/ze_events.yaml index 4d914186..c8e58263 100644 --- a/ze/ze_events.yaml +++ b/ze/ze_events.yaml @@ -1,10 +1,32 @@ --- 
lttng_ust_ze_sampling: events: + - name: copyEngine + args: + - [ ze_device_handle_t, hDevice ] + - [ uint32_t, subDevice ] + - [ uint64_t, activeTime ] + - [ uint64_t, timestamp ] + fields: + - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] + - [ ctf_integer, uint32_t, subDevice, "subDevice" ] + - [ ctf_integer, uint64_t, activeTime, "activeTime" ] + - [ ctf_integer, uint64_t, timestamp, "timestamp" ] + - name: computeEngine + args: + - [ ze_device_handle_t, hDevice ] + - [ uint32_t, subDevice ] + - [ uint64_t, activeTime ] + - [ uint64_t, timestamp ] + fields: + - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] + - [ ctf_integer, uint32_t, subDevice, "subDevice" ] + - [ ctf_integer, uint64_t, activeTime, "activeTime" ] + - [ ctf_integer, uint64_t, timestamp, "timestamp" ] - name: gpu_energy args: - [ ze_device_handle_t, hDevice ] - - [ uint32_t, domain] + - [ uint32_t, domain ] - [ uint64_t, energy ] - [ uint64_t, timestamp ] fields: @@ -15,13 +37,11 @@ lttng_ust_ze_sampling: - name: gpu_frequency args: - [ ze_device_handle_t, hDevice ] - - [ uint32_t, domain] - - [ uint64_t, timestamp ] + - [ uint32_t, domain ] - [ uint64_t, frequency ] fields: - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] - [ ctf_integer, uint32_t, domain, "domain" ] - - [ ctf_integer, uint64_t, timestamp, "timestamp" ] - [ ctf_integer, uint64_t, frequency, "frequency" ] lttng_ust_ze_profiling: events: diff --git a/ze/zeinterval_callbacks.cpp.erb b/ze/zeinterval_callbacks.cpp.erb index e3199e1e..0f9a4f52 100644 --- a/ze/zeinterval_callbacks.cpp.erb +++ b/ze/zeinterval_callbacks.cpp.erb @@ -53,6 +53,7 @@ static uint64_t convert_device_cycle(uint64_t device_cycle, } static uint64_t compute_and_convert_delta(uint64_t start, uint64_t end, const ze_device_properties_t &device_property) { + assert (device_property.kernelTimestampValidBits <= 64); const uint64_t max_val = ((uint64_t)1 << device_property.kernelTimestampValidBits) - 1; start &= max_val; @@ 
-78,8 +79,10 @@ void *init_zeinterval_callbacks_state() { return (void*) s; } -static void create_and_enqueue_power_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id, - const uintptr_t hDevice, const uint32_t domain, const uint64_t energy, const uint64_t ts) { +static void create_and_enqueue_power_message(const char* hostname, const process_id_t process_id, + const thread_id_t thread_id, const uintptr_t hDevice, + const uint32_t domain, const uint64_t energy, const uint64_t ts) { + zeinterval_callbacks_state* state = (zeinterval_callbacks_state*) zeinterval_iter_g->callbacks_state; auto [it, inserted] = state->device_energy_ref.insert({{hostname, process_id, hDevice, domain}, {energy, ts}}); // First entry @@ -89,41 +92,88 @@ static void create_and_enqueue_power_message(const char* hostname, const process auto &[prev_energy, prev_ts] = it->second; bt_message *message = create_power_message(hostname, process_id, - thread_id, hDevice, domain, - static_cast(((energy-prev_energy) / static_cast(ts-prev_ts))*1000.0), - prev_ts, - zeinterval_iter_g->dispatch->power_event_class, - zeinterval_self_message_iterator_g, - zeinterval_iter_g->dispatch->stream, BACKEND_ZE); + thread_id, hDevice, domain, + static_cast(((energy-prev_energy) / static_cast(ts-prev_ts)) * 1000.0), + prev_ts, + zeinterval_iter_g->dispatch->power_event_class, + zeinterval_self_message_iterator_g, + zeinterval_iter_g->dispatch->stream, BACKEND_ZE); state->downstream_message_queue.push(message); prev_energy = energy; prev_ts = ts; } -static void create_and_enqueue_frequency_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id, - const uintptr_t hDevice, const uint32_t domain, const uint64_t ts, const uint64_t frequency) { - bt_message *message = create_frequency_message(hostname, process_id, thread_id, hDevice, domain, ts, frequency, - zeinterval_iter_g->dispatch->frequency_event_class, - zeinterval_self_message_iterator_g, - 
zeinterval_iter_g->dispatch->stream, BACKEND_ZE); +static void create_and_enqueue_frequency_message(const char* hostname, const process_id_t process_id, + const thread_id_t thread_id, const uintptr_t hDevice, + const uint32_t domain, const uint64_t ts, const uint64_t frequency) { + + bt_message *message = create_frequency_message(hostname, process_id, thread_id, hDevice, domain, ts, frequency, + zeinterval_iter_g->dispatch->frequency_event_class, + zeinterval_self_message_iterator_g, + zeinterval_iter_g->dispatch->stream, BACKEND_ZE); zeinterval_callbacks_state* state = (zeinterval_callbacks_state*) zeinterval_iter_g->callbacks_state; state->downstream_message_queue.push(message); } -static void create_and_enqueue_host_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id, const char* name, - const uint64_t ts, const uint64_t duration, const bool err) { +static void create_and_enqueue_host_message(const char* hostname, const process_id_t process_id, + const thread_id_t thread_id, const char* name, const uint64_t ts, + const uint64_t duration, const bool err) { /* Message creation */ bt_message *message = create_host_message(hostname, process_id, thread_id, name, ts, duration, err, - zeinterval_iter_g->dispatch->host_event_class, - zeinterval_self_message_iterator_g, - zeinterval_iter_g->dispatch->stream, BACKEND_ZE); + zeinterval_iter_g->dispatch->host_event_class, + zeinterval_self_message_iterator_g, + zeinterval_iter_g->dispatch->stream, BACKEND_ZE); zeinterval_callbacks_state* state = (zeinterval_callbacks_state*) zeinterval_iter_g->callbacks_state; state->downstream_message_queue.push(message); } +template +static void create_and_enqueue_EU_message(const char* hostname, const process_id_t process_id, + const thread_id_t thread_id, const uintptr_t hDevice, + const uint32_t subDevice, const uint64_t activeTime, + const uint64_t ts, device_ref_t& device_ref, + Func message_creator, void* event_class) { + 
zeinterval_callbacks_state* state = (zeinterval_callbacks_state*) zeinterval_iter_g->callbacks_state; + auto [it, inserted] = device_ref.insert({{hostname, process_id, hDevice, subDevice}, {activeTime, ts}}); + if (inserted) + return; + + auto &[prev_activeTime, prev_ts] = it->second; + bt_message *message = message_creator(hostname, process_id, thread_id, hDevice, subDevice, + ((activeTime-prev_activeTime) / static_cast(ts-prev_ts)) * 1000.0, + prev_ts, static_cast(event_class), zeinterval_self_message_iterator_g, + zeinterval_iter_g->dispatch->stream, BACKEND_ZE); + + state->downstream_message_queue.push(message); + prev_activeTime = activeTime; + prev_ts = ts; +} + +static void create_and_enqueue_computeEU_message(const char* hostname, const process_id_t process_id, + const thread_id_t thread_id, const uintptr_t hDevice, + const uint32_t subDevice, const uint64_t activeTime, + const uint64_t ts) { + create_and_enqueue_EU_message(hostname, process_id, thread_id, hDevice, subDevice, + activeTime, ts, + static_cast(zeinterval_iter_g->callbacks_state)->device_computeEngine_ref, + create_computeEU_message, + zeinterval_iter_g->dispatch->computeEU_event_class); +} + +static void create_and_enqueue_copyEU_message(const char* hostname, const process_id_t process_id, + const thread_id_t thread_id, const uintptr_t hDevice, + const uint32_t subDevice, const uint64_t activeTime, + const uint64_t ts) { + create_and_enqueue_EU_message(hostname, process_id, thread_id, hDevice, subDevice, + activeTime, ts, + static_cast(zeinterval_iter_g->callbacks_state)->device_copyEngine_ref, + create_copyEU_message, + zeinterval_iter_g->dispatch->copyEU_event_class); +} + static void create_and_enqueue_device_message( const char* hostname, const process_id_t process_id, const thread_id_t thread_id, thapi_device_id device, const char* commandname, const char* metadata, @@ -293,6 +343,20 @@ static void zeinterval_<%= dbt_event.name %>_callback( int64_t ns_from_origin; 
bt_clock_snapshot_get_ns_from_origin(bt_clock, &ns_from_origin); create_and_enqueue_frequency_message(hostname.c_str(), process_id, thread_id, (uintptr_t)hDevice, domain, ns_from_origin, frequency); + <% elsif dbt_event.name_unsanitized == "lttng_ust_ze_sampling:computeEngine" %> + const hostname_t hostname = borrow_hostname(bt_evt); + const process_id_t process_id = 0; + const thread_id_t thread_id = 0; + int64_t ns_from_origin; + bt_clock_snapshot_get_ns_from_origin(bt_clock, &ns_from_origin); + create_and_enqueue_computeEU_message(hostname.c_str(), process_id, thread_id, (uintptr_t)hDevice, subDevice, activeTime, ns_from_origin); + <% elsif dbt_event.name_unsanitized == "lttng_ust_ze_sampling:copyEngine" %> + const hostname_t hostname = borrow_hostname(bt_evt); + const process_id_t process_id = 0; + const thread_id_t thread_id = 0; + int64_t ns_from_origin; + bt_clock_snapshot_get_ns_from_origin(bt_clock, &ns_from_origin); + create_and_enqueue_copyEU_message(hostname.c_str(), process_id, thread_id, (uintptr_t)hDevice, subDevice, activeTime, ns_from_origin); <% elsif dbt_event.name_unsanitized.start_with?('lttng_ust_ze:') or dbt_event.name_unsanitized.start_with?('lttng_ust_zet:') or dbt_event.name_unsanitized.start_with?('lttng_ust_zes:') or diff --git a/ze/zeinterval_callbacks.hpp b/ze/zeinterval_callbacks.hpp index d48d0bef..6d889ab3 100644 --- a/ze/zeinterval_callbacks.hpp +++ b/ze/zeinterval_callbacks.hpp @@ -18,11 +18,15 @@ typedef std::tuple hp_comma typedef std::tuple hp_module_t; typedef hp_device_t hpd_t; typedef hp_dsd_t hpdd_t; +typedef hp_dsdev_t hpdsd_t; typedef hp_event_t hpe_t; typedef hp_kernel_t hpk_t; typedef std::tuple clock_lttng_device_t; typedef std::tuple energy_timestamp_t; +typedef std::tuple computeEngine_timestamp_t; +typedef std::tuple copyEngine_timestamp_t; +typedef std::unordered_map, std::tuple> device_ref_t; typedef std::tuple t_tfnm_m_d_ts_cld_t; typedef std::tuple l_tfnm_m_d_ts_t; @@ -58,6 +62,10 @@ struct 
zeinterval_callbacks_state { std::unordered_map> last_command; /*Energy */ std::unordered_map device_energy_ref; + /*computeEngine */ + std::unordered_map device_computeEngine_ref; + /*copyEngine */ + std::unordered_map device_copyEngine_ref; }; template