Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for utilization metrics #153

Merged
merged 27 commits into from
Nov 1, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
4045398
Support for utilization metrics
solo2abera Sep 22, 2023
2e12717
Update ze/tracer_ze_helpers.include.c
sbekele81 Sep 25, 2023
8b5c3b2
Apply suggestions from code review
sbekele81 Sep 25, 2023
9c15987
sysman-metrics updated
solo2abera Sep 25, 2023
423c2aa
Update ze/tracer_ze_helpers.include.c
Kerilk Sep 29, 2023
58dd9e8
tracepoint_enabled check added
solo2abera Sep 29, 2023
759e669
Apply suggestions from code review
sbekele81 Sep 29, 2023
bff3be7
Suggestions incorporated
solo2abera Sep 30, 2023
4b8fdab
Separation of initialization
solo2abera Oct 6, 2023
8c3f0e0
Apply suggestions from code review
sbekele81 Oct 9, 2023
4093579
Support for utilization metrics
solo2abera Sep 22, 2023
49b3744
Update ze/tracer_ze_helpers.include.c
sbekele81 Sep 25, 2023
bb677ed
Apply suggestions from code review
sbekele81 Sep 25, 2023
05ed28d
sysman-metrics updated
solo2abera Sep 25, 2023
e449c3b
Update ze/tracer_ze_helpers.include.c
Kerilk Sep 29, 2023
f0b75d8
tracepoint_enabled check added
solo2abera Sep 29, 2023
c62a74d
Apply suggestions from code review
sbekele81 Sep 29, 2023
bcb7d4a
Suggestions incorporated
solo2abera Sep 30, 2023
0894dc3
Separation of initialization
solo2abera Oct 6, 2023
c47c598
Apply suggestions from code review
sbekele81 Oct 9, 2023
5a24bca
subDevice Handled
solo2abera Oct 9, 2023
ba7a6e3
Multi-driver support
solo2abera Oct 10, 2023
31c45a2
Merge branch 'sysman-metrics' of github.com:argonne-lcf/THAPI into sy…
solo2abera Oct 10, 2023
3d79d27
EngineProps and Subdevice Handled
solo2abera Oct 13, 2023
b6438c0
Index consistancy maintianed
solo2abera Oct 13, 2023
1a93ad0
Templated
solo2abera Oct 28, 2023
8c456c9
Merge branch 'master' into sysman-metrics
TApplencourt Oct 31, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions utils/xprof_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,102 @@ bt_message* create_frequency_message(const char* hostname, const process_id_t pr
return message;
}

// Build a downstream `computeEU` event message on `stream`.
//
// Common-context members are filled by position: 0 hostname, 1 process id,
// 2 thread id, 3 timestamp, 4 backend. Payload members by position:
// 0 device handle, 1 sub-device, 2 active time (single-precision real).
// Returns the message produced by bt_message_event_create().
bt_message* create_computeEU_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id,
                                     const uintptr_t hDevice, const uint32_t subDevice, const float activeTime, const uint64_t ts,
                                     bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend) {
  bt_message *message = bt_message_event_create(message_iterator, event_class, stream);
  bt_event *event = bt_message_event_borrow_event(message);

  /* Common context: hostname, pid, tid, ts, backend (in that order) */
  bt_field *ctx = bt_event_borrow_common_context_field(event);
  bt_field_string_set_value(bt_field_structure_borrow_member_field_by_index(ctx, 0), hostname);
  bt_field_integer_signed_set_value(bt_field_structure_borrow_member_field_by_index(ctx, 1), process_id);
  bt_field_integer_signed_set_value(bt_field_structure_borrow_member_field_by_index(ctx, 2), thread_id);
  bt_field_integer_signed_set_value(bt_field_structure_borrow_member_field_by_index(ctx, 3), ts);
  bt_field_integer_signed_set_value(bt_field_structure_borrow_member_field_by_index(ctx, 4), backend);

  /* Payload: did, subDevice, activeTime (in that order) */
  bt_field *payload = bt_event_borrow_payload_field(event);
  bt_field_integer_unsigned_set_value(bt_field_structure_borrow_member_field_by_index(payload, 0), hDevice);
  bt_field_integer_unsigned_set_value(bt_field_structure_borrow_member_field_by_index(payload, 1), subDevice);
  bt_field_real_single_precision_set_value(bt_field_structure_borrow_member_field_by_index(payload, 2), activeTime);

  return message;
}

// Build a downstream `copyEU` event message on `stream`.
//
// The common context receives hostname, process id, thread id, timestamp and
// backend at member indices 0..4; the payload receives the device handle,
// the sub-device and the active time (single-precision real) at indices 0..2.
// Returns the message produced by bt_message_event_create().
bt_message* create_copyEU_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id,
                                  const uintptr_t hDevice, const uint32_t subDevice, const float activeTime, const uint64_t ts,
                                  bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend) {
  // Shorthand for borrowing the idx-th member of a structure field.
  auto member = [](bt_field *parent, uint64_t idx) -> bt_field * {
    return bt_field_structure_borrow_member_field_by_index(parent, idx);
  };

  bt_message *msg = bt_message_event_create(message_iterator, event_class, stream);
  bt_event *evt = bt_message_event_borrow_event(msg);

  /* Common context */
  bt_field *common = bt_event_borrow_common_context_field(evt);
  bt_field_string_set_value(member(common, 0), hostname);          // hostname
  bt_field_integer_signed_set_value(member(common, 1), process_id); // pid
  bt_field_integer_signed_set_value(member(common, 2), thread_id);  // tid
  bt_field_integer_signed_set_value(member(common, 3), ts);         // ts
  bt_field_integer_signed_set_value(member(common, 4), backend);    // backend

  /* Payload */
  bt_field *body = bt_event_borrow_payload_field(evt);
  bt_field_integer_unsigned_set_value(member(body, 0), hDevice);          // did
  bt_field_integer_unsigned_set_value(member(body, 1), subDevice);        // subDevice
  bt_field_real_single_precision_set_value(member(body, 2), activeTime);  // activeTime

  return msg;
}

bt_message* create_host_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id, const char* name,
const uint64_t ts, const uint64_t duration, const bool err,
bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend) {
Expand Down
11 changes: 10 additions & 1 deletion utils/xprof_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ typedef std::string hostname_t;
typedef std::string thapi_function_name;
typedef uintptr_t thapi_device_id;
typedef uint32_t thapi_domain_id;
// Sub-device identifier; same 32-bit width as the `subDevice` argument of the
// computeEU/copyEU message builders.
typedef uint32_t thapi_sdevice_id;
sbekele81 marked this conversation as resolved.
Show resolved Hide resolved
sbekele81 marked this conversation as resolved.
Show resolved Hide resolved

// Represent a device and a sub device
typedef std::tuple<thapi_device_id, thapi_device_id> dsd_t;
Expand All @@ -60,6 +61,7 @@ typedef std::tuple<hostname_t, process_id_t, thread_id_t, thapi_device_id, thapi
typedef std::tuple<hostname_t, process_id_t, thapi_device_id> hp_device_t;
typedef std::tuple<hostname_t, process_id_t, thapi_device_id, thapi_device_id> hp_dsd_t;
typedef std::tuple<hostname_t, process_id_t, thapi_device_id, thapi_domain_id> hp_ddomain_t;
// (hostname, process, device, sub-device): key type of the compute/copy engine
// utilization track maps in the timeline dispatch.
typedef std::tuple<hostname_t, process_id_t, thapi_device_id, thapi_sdevice_id> hp_dsdev_t;
typedef std::tuple<long, long> sd_t;
typedef std::tuple<thread_id_t, thapi_function_name, long> tfn_ts_t;
typedef std::tuple<thapi_function_name, long> fn_ts_t;
Expand Down Expand Up @@ -116,11 +118,18 @@ bt_message* create_power_message(const char* hostname, const process_id_t propro
const uintptr_t hDevice, const uint32_t domain, const uint64_t power, const uint64_t ts,
bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend = BACKEND_UNKNOWN);


/**
 * Create a frequency event message.
 *
 * @param backend  Backend tag stored with the event; defaults to BACKEND_UNKNOWN.
 * @return The newly created message.
 */
bt_message* create_frequency_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id,
                                     const uintptr_t hDevice, const uint32_t domain, const uint64_t ts, const uint64_t frequency,
                                     bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend = BACKEND_UNKNOWN);

/**
 * Create a `computeEU` (compute-engine utilization) event message.
 *
 * @param hostname    Written to the common-context hostname member.
 * @param process_id  Written to the common-context process id member.
 * @param thread_id   Written to the common-context thread id member.
 * @param hDevice     Device handle, stored as the `did` payload member.
 * @param subDevice   Sub-device identifier payload member.
 * @param activeTime  Active-time value, stored as a single-precision real.
 * @param ts          Timestamp written to the common context.
 * @param backend     Backend tag; defaults to BACKEND_UNKNOWN.
 * @return The newly created message.
 */
bt_message* create_computeEU_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id,
                                     const uintptr_t hDevice, const uint32_t subDevice, const float activeTime, const uint64_t ts,
                                     bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend = BACKEND_UNKNOWN);

/**
 * Create a `copyEU` (copy-engine utilization) event message.
 *
 * @param hostname    Written to the common-context hostname member.
 * @param process_id  Written to the common-context process id member.
 * @param thread_id   Written to the common-context thread id member.
 * @param hDevice     Device handle, stored as the `did` payload member.
 * @param subDevice   Sub-device identifier payload member.
 * @param activeTime  Active-time value, stored as a single-precision real.
 * @param ts          Timestamp written to the common context.
 * @param backend     Backend tag; defaults to BACKEND_UNKNOWN.
 * @return The newly created message.
 */
bt_message* create_copyEU_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id,
                                  const uintptr_t hDevice, const uint32_t subDevice, const float activeTime, const uint64_t ts,
                                  bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend = BACKEND_UNKNOWN);

bt_message *create_host_message(const char *hostname, const process_id_t, const thread_id_t,
const char *name, const uint64_t ts, const uint64_t duration,
const bool err, bt_event_class *, bt_self_message_iterator *,
Expand Down
36 changes: 36 additions & 0 deletions xprof/btx_interval_model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,39 @@
:type: integer_unsigned
:field_value_range: 64
:cast_type: uint64_t
# Compute-engine utilization event: 64-bit device handle, 32-bit sub-device
# identifier, and the active-time value as a single-precision real.
- :name: lttng:computeEU
:payload_field_class:
:type: structure
:members:
- :name: did
:field_class:
:type: integer_unsigned
:field_value_range: 64
:cast_type: uint64_t
- :name: subDevice
:field_class:
:type: integer_unsigned
:field_value_range: 32
:cast_type: uint32_t
- :name: activeTime
:field_class:
:type: single
:cast_type: float
# Copy-engine utilization event: 64-bit device handle, 32-bit sub-device
# identifier, and the active-time value as a single-precision real.
- :name: lttng:copyEU
:payload_field_class:
:type: structure
:members:
- :name: did
:field_class:
:type: integer_unsigned
:field_value_range: 64
:cast_type: uint64_t
- :name: subDevice
:field_class:
:type: integer_unsigned
:field_value_range: 32
:cast_type: uint32_t
- :name: activeTime
:field_class:
:type: single
:cast_type: float
64 changes: 60 additions & 4 deletions xprof/btx_timeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ struct timeline_dispatch_s {
std::unordered_map<hp_device_t, perfetto_uuid_t> hp_device2countertracks;
std::unordered_map<hp_ddomain_t, perfetto_uuid_t> hp_ddomain2frqtracks;
std::unordered_map<hp_ddomain_t, perfetto_uuid_t> hp_ddomain2pwrtracks;

// Track UUIDs for the "ComputeE Utilization" counter tracks, keyed by
// (hostname, process, device, sub-device).
std::unordered_map<hp_dsdev_t, perfetto_uuid_t> hp_dsdev2cpetracks;
// Track UUIDs for the "CopyE Utilization" counter tracks, same key.
std::unordered_map<hp_dsdev_t, perfetto_uuid_t> hp_dsdev2cpytracks;
sbekele81 marked this conversation as resolved.
Show resolved Hide resolved

perfetto_pruned::Trace trace;
};
using timeline_dispatch_t = struct timeline_dispatch_s;
Expand Down Expand Up @@ -99,11 +101,21 @@ static perfetto_uuid_t get_counter_track_uuuid(timeline_dispatch_t *dispatch,
}
// Look up the counter-track UUID for the GPU frequency of one
// (hostname, process, device, domain), delegating to get_counter_track_uuuid()
// with the frequency track map.
static perfetto_uuid_t get_frequency_track_uuuid(timeline_dispatch_t *dispatch, std::string hostname,
                                                 uint64_t process_id, thapi_device_id did, thapi_domain_id domain) {
  return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2frqtracks, " GPU Frequency", hostname, process_id, did, domain);
}
// Look up the counter-track UUID for the GPU power of one
// (hostname, process, device, domain), delegating to get_counter_track_uuuid()
// with the power track map. `domain` uses thapi_domain_id, like the
// frequency variant.
static perfetto_uuid_t get_power_track_uuuid(timeline_dispatch_t *dispatch, std::string hostname,
                                             uint64_t process_id, thapi_device_id did, thapi_domain_id domain) {
  return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2pwrtracks, " GPU Power", hostname, process_id, did, domain);
}

// Look up the counter-track UUID for compute-engine utilization of one
// (hostname, process, device, sub-device), delegating to
// get_counter_track_uuuid() with the compute-engine track map.
static perfetto_uuid_t get_computeEU_track_uuuid(timeline_dispatch_t *dispatch, std::string hostname,
                                                 uint64_t process_id, thapi_device_id did, thapi_sdevice_id subDevice) {
  return get_counter_track_uuuid(dispatch, dispatch->hp_dsdev2cpetracks, "ComputeE Utilization", hostname, process_id, did, subDevice);
}

// Look up the counter-track UUID for copy-engine utilization of one
// (hostname, process, device, sub-device), delegating to
// get_counter_track_uuuid() with the copy-engine track map.
static perfetto_uuid_t get_copyEU_track_uuuid(timeline_dispatch_t *dispatch, std::string hostname,
                                              uint64_t process_id, thapi_device_id did, thapi_sdevice_id subDevice) {
  return get_counter_track_uuuid(dispatch, dispatch->hp_dsdev2cpytracks, "CopyE Utilization", hostname, process_id, did, subDevice);
}

static void add_event_frequency(timeline_dispatch_t *dispatch, std::string hostname,
Expand Down Expand Up @@ -135,6 +147,34 @@ static void add_event_power(timeline_dispatch_t *dispatch, std::string hostname,
track_event->set_counter_value(power);
}

// Append one compute-engine utilization sample to the trace as a
// TYPE_COUNTER track event on the (hostname, process, device, sub-device)
// track. `activeTime` is stored as the double counter value. `thread_id` is
// accepted but not used here.
static void add_event_computeEU(timeline_dispatch_t *dispatch, std::string hostname,
                                uint64_t process_id, uint64_t thread_id, uintptr_t did,
                                uint32_t subDevice, uint64_t timestamp, float activeTime) {
  // Resolve the track before adding the sample packet, as the original does.
  perfetto_uuid_t track_uuid = get_computeEU_track_uuuid(dispatch, hostname, process_id, did, subDevice);

  auto *packet = dispatch->trace.add_packet();
  packet->set_trusted_packet_sequence_id(10000);
  packet->set_timestamp(timestamp);

  auto *track_event = packet->mutable_track_event();
  track_event->set_type(perfetto_pruned::TrackEvent::TYPE_COUNTER);
  track_event->set_track_uuid(track_uuid);
  track_event->set_name("computeEngine Usage");
  track_event->set_double_counter_value(activeTime);
}

// Append one copy-engine utilization sample to the trace as a TYPE_COUNTER
// track event on the (hostname, process, device, sub-device) track.
// `activeTime` is stored as the double counter value. `thread_id` is
// accepted but not used here.
static void add_event_copyEU(timeline_dispatch_t *dispatch, std::string hostname,
                             uint64_t process_id, uint64_t thread_id, uintptr_t did,
                             uint32_t subDevice, uint64_t timestamp, float activeTime) {
  // The track lookup stays ahead of add_packet() to keep the original call order.
  const perfetto_uuid_t uuid = get_copyEU_track_uuuid(dispatch, hostname, process_id, did, subDevice);

  auto *sample = dispatch->trace.add_packet();
  auto *counter = sample->mutable_track_event();

  counter->set_name("copyEngine Usage");
  counter->set_track_uuid(uuid);
  counter->set_type(perfetto_pruned::TrackEvent::TYPE_COUNTER);
  counter->set_double_counter_value(activeTime);

  sample->set_timestamp(timestamp);
  sample->set_trusted_packet_sequence_id(10000);
}

sbekele81 marked this conversation as resolved.
Show resolved Hide resolved
static void add_event_begin(timeline_dispatch_t *dispatch, perfetto_uuid_t uuid, timestamp_t begin,
std::string name) {
auto *packet = dispatch->trace.add_packet();
Expand Down Expand Up @@ -352,11 +392,27 @@ static void power_usr_callback(void *btx_handle, void *usr_data, const char *hos
add_event_power(dispatch, hostname, vpid, vtid, did, domain, ts, power);
}

static void computeEU_usr_callback(void *btx_handle, void *usr_data, const char *hostname,
int64_t vpid, uint64_t vtid, int64_t ts, int64_t backend,
uint64_t did, uint32_t subDevice, float activeTime) {
auto *dispatch = static_cast<timeline_dispatch_t *>(usr_data);
add_event_computeEU(dispatch, hostname, vpid, vtid, did, subDevice, ts, activeTime);
}

static void copyEU_usr_callback(void *btx_handle, void *usr_data, const char *hostname,
int64_t vpid, uint64_t vtid, int64_t ts, int64_t backend,
uint64_t did, uint32_t subDevice, float activeTime) {
auto *dispatch = static_cast<timeline_dispatch_t *>(usr_data);
add_event_copyEU(dispatch, hostname, vpid, vtid, did, subDevice, ts, activeTime);
}

// Register every user callback of the timeline component on `btx_handle`:
// one per event kind, followed by the user-data initialize/finalize hooks.
void btx_register_usr_callbacks(void *btx_handle) {
  /* Event callbacks */
  btx_register_callbacks_lttng_host(btx_handle, host_usr_callback);
  btx_register_callbacks_lttng_device(btx_handle, device_usr_callback);
  btx_register_callbacks_lttng_frequency(btx_handle, frequency_usr_callback);
  btx_register_callbacks_lttng_power(btx_handle, power_usr_callback);
  btx_register_callbacks_lttng_computeEU(btx_handle, computeEU_usr_callback);
  btx_register_callbacks_lttng_copyEU(btx_handle, copyEU_usr_callback);

  /* User-data lifecycle */
  btx_register_callbacks_initialize_usr_data(btx_handle, btx_initialize_usr_data);
  btx_register_callbacks_finalize_usr_data(btx_handle, btx_finalize_usr_data);
}
2 changes: 2 additions & 0 deletions xprof/interval.c.erb
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ bt_component_class_initialize_method_status <%= namespace %>_dispatch_initialize
dispatch->device_name_event_class = create_lttng_device_name_event_class_message(trace_class, stream_class);
dispatch->frequency_event_class = create_lttng_frequency_event_class_message(trace_class, stream_class);
dispatch->power_event_class = create_lttng_power_event_class_message(trace_class, stream_class);
dispatch->computeEU_event_class = create_lttng_computeEU_event_class_message(trace_class, stream_class);
dispatch->copyEU_event_class = create_lttng_copyEU_event_class_message(trace_class, stream_class);
sbekele81 marked this conversation as resolved.
Show resolved Hide resolved

/* Create a default trace from (instance of `trace_class`) */
bt_trace *trace = bt_trace_create(trace_class);
Expand Down
2 changes: 2 additions & 0 deletions xprof/interval.h.erb
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ struct <%= namespace %>_dispatch {
bt_event_class *device_name_event_class;
bt_event_class *frequency_event_class;
bt_event_class *power_event_class;
bt_event_class *computeEU_event_class;
bt_event_class *copyEU_event_class;
/* Component's input port (weak) */
bt_self_component_port_input *in_port;
};
Expand Down
24 changes: 24 additions & 0 deletions xprof/interval_model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,27 @@
:field_value_range: 32
- :name: power
:class: unsigned
# Compute-engine utilization event. activeTime is a float written through the
# single-precision real setter, so it is declared as a real field rather than
# an unsigned integer.
# NOTE(review): confirm `single` is the class name this model's generator expects.
- :name: lttng:computeEU
:payload:
- :name: did
:class: unsigned
:class_properties:
:preferred_display_base: 16
- :name: subDevice
:class: unsigned
:class_properties:
:field_value_range: 32
- :name: activeTime
:class: single
# Copy-engine utilization event. activeTime is a float written through the
# single-precision real setter, so it is declared as a real field rather than
# an unsigned integer.
# NOTE(review): confirm `single` is the class name this model's generator expects.
- :name: lttng:copyEU
:payload:
- :name: did
:class: unsigned
:class_properties:
:preferred_display_base: 16
- :name: subDevice
:class: unsigned
:class_properties:
:field_value_range: 32
- :name: activeTime
:class: single
sbekele81 marked this conversation as resolved.
Show resolved Hide resolved
Loading