Skip to content

Commit

Permalink
[SWDEV-463406] Update API with fields for gfx_clock_below_host_limit …
Browse files Browse the repository at this point in the history
…and low_utilization violations

Updated API with fields for gfx_clock_below_host_limit and low_utilization violations
Change-Id: I25647bae6e7b785f44dab024272767658688bcad

---------
Signed-off-by: Scaffidi, Salvatore <[email protected]>
Signed-off-by: Arif, Maisam <[email protected]>
Co-authored-by: Charis Poag <[email protected]>
  • Loading branch information
salvatoreg3 authored Jan 9, 2025
1 parent 4901327 commit 3793be7
Show file tree
Hide file tree
Showing 8 changed files with 246 additions and 85 deletions.
148 changes: 113 additions & 35 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,46 +135,124 @@ GPU: 0

### Changed


- **All `amd-smi set` and `amd-smi reset` options are now mutually exclusive**.
- Users can only use one set option at a time now.
- **All `amd-smi set` and `amd-smi reset` options are now mutually exclusive**.
- Users can only use one set option at a time now.

- **Python API for `amdsmi_get_energy_count()` will change the name for the `power` field to `energy_accumulator`**.

- **Added violation status output for Graphics Clock Below Host Limit to our API and CLI: `amdsmi_get_violation_status()`, `amd-smi metric --throttle`, and `amd-smi monitor --violation`.**
***Only available for MI300+ ASICs.***
Users can retrieve violation statuses through either our Python or C++ APIs.
Additionally, we have added the capability to view these outputs conveniently through `amd-smi metric --throttle` and `amd-smi monitor --violation`.
Example outputs are listed below (below is for reference, output is subject to change):

```shell
$ amd-smi monitor --violation
GPU PVIOL TVIOL TVIOL_ACTIVE PHOT_TVIOL VR_TVIOL HBM_TVIOL GFX_CLKVIOL
0 0 % 0 % False 0 % 0 % 0 % 0 %
1 0 % 0 % False 0 % 0 % 0 % 0 %
...
```

```shell
$ amd-smi metric --throttle
GPU: 0
THROTTLE:
ACCUMULATION_COUNTER: 11240028
PROCHOT_ACCUMULATED: 0
PPT_ACCUMULATED: 0
SOCKET_THERMAL_ACCUMULATED: 0
VR_THERMAL_ACCUMULATED: 0
HBM_THERMAL_ACCUMULATED: 0
GFX_CLK_BELOW_HOST_LIMIT_ACCUMULATED: N/A
PROCHOT_VIOLATION_STATUS: NOT ACTIVE
PPT_VIOLATION_STATUS: NOT ACTIVE
SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE
VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE
HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE
GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_STATUS: N/A
PROCHOT_VIOLATION_ACTIVITY: 0 %
PPT_VIOLATION_ACTIVITY: 0 %
SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 %
VR_THERMAL_VIOLATION_ACTIVITY: 0 %
HBM_THERMAL_VIOLATION_ACTIVITY: 0 %
GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_ACTIVITY: 0 %
GPU: 1
THROTTLE:
ACCUMULATION_COUNTER: 11238232
PROCHOT_ACCUMULATED: 0
PPT_ACCUMULATED: 0
SOCKET_THERMAL_ACCUMULATED: 0
VR_THERMAL_ACCUMULATED: 0
HBM_THERMAL_ACCUMULATED: 0
GFX_CLK_BELOW_HOST_LIMIT_ACCUMULATED: 0
PROCHOT_VIOLATION_STATUS: NOT ACTIVE
PPT_VIOLATION_STATUS: NOT ACTIVE
SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE
VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE
HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE
GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_STATUS: NOT ACTIVE
PROCHOT_VIOLATION_ACTIVITY: 0 %
PPT_VIOLATION_ACTIVITY: 0 %
SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 %
VR_THERMAL_VIOLATION_ACTIVITY: 0 %
HBM_THERMAL_VIOLATION_ACTIVITY: 0 %
GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_ACTIVITY: 0 %
...
```

- **Updated API `amdsmi_get_violation_status()` structure `amdsmi_violation_status_t` and the CLI to include GFX clock below host limit**
Updated structure `amdsmi_violation_status_t`:
```C
typedef struct {
...
uint64_t acc_gfx_clk_below_host_limit; //!< Current graphics clock below host limit count; Max uint64 means unsupported
...
uint64_t per_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation % (greater than 0% is a violation); Max uint64 means unsupported
...
uint8_t active_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation; 1 = active 0 = not active; Max uint8 means unsupported
...
} amdsmi_violation_status_t;
```

- **Updated API `amdsmi_get_gpu_vram_info()` structure and CLI `amd-smi static --vram`**
Updated structure `amdsmi_vram_info_t`:
```C
typedef struct {
amdsmi_vram_type_t vram_type;
amdsmi_vram_vendor_type_t vram_vendor;
uint64_t vram_size;
uint32_t vram_bit_width;
uint64_t vram_max_bandwidth; //!< The VRAM max bandwidth at current memory clock (GB/s)
uint64_t reserved[4];
} amdsmi_vram_info_t;

amdsmi_status_t amdsmi_get_gpu_vram_info(amdsmi_processor_handle processor_handle, amdsmi_vram_info_t *info)
```
Example CLI output:
```shell
$ amd-smi static --vram
GPU: 0
VRAM:
TYPE: GDDR6
VENDOR: N/A
SIZE: 16368 MB
BIT_WIDTH: 256
MAX_BANDWIDTH: 1555 GB/s
GPU: 1
VRAM:
TYPE: GDDR6
VENDOR: N/A
SIZE: 30704 MB
BIT_WIDTH: 256
MAX_BANDWIDTH: 1555 GB/s
...
Updated structure `amdsmi_vram_info_t`:
```
```C
typedef struct {
amdsmi_vram_type_t vram_type;
amdsmi_vram_vendor_type_t vram_vendor;
uint64_t vram_size;
uint32_t vram_bit_width;
uint64_t vram_max_bandwidth; //!< The VRAM max bandwidth at current memory clock (GB/s)
uint64_t reserved[4];
} amdsmi_vram_info_t;

amdsmi_status_t amdsmi_get_gpu_vram_info(amdsmi_processor_handle processor_handle, amdsmi_vram_info_t *info)
```

Example CLI output:

```shell
$ amd-smi static --vram
GPU: 0
VRAM:
TYPE: GDDR6
VENDOR: N/A
SIZE: 16368 MB
BIT_WIDTH: 256
MAX_BANDWIDTH: 1555 GB/s
GPU: 1
VRAM:
TYPE: GDDR6
VENDOR: N/A
SIZE: 30704 MB
BIT_WIDTH: 256
MAX_BANDWIDTH: 1555 GB/s
...
```

### Removed

Expand Down
29 changes: 22 additions & 7 deletions amdsmi_cli/amdsmi_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -2277,21 +2277,23 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No
'socket_thermal_accumulated': "N/A",
'vr_thermal_accumulated': "N/A",
'hbm_thermal_accumulated': "N/A",
'gfx_below_host_limit_acc': "N/A",
'gfx_clk_below_host_limit_accumulated': "N/A",

# violation status values - active/not active
'prochot_violation_status': "N/A",
'ppt_violation_status': "N/A",
'socket_thermal_violation_status': "N/A",
'vr_thermal_violation_status': "N/A",
'hbm_thermal_violation_status': "N/A",
'gfx_clk_below_host_limit_violation_status': "N/A",

# violation activity values - percent
'prochot_violation_activity': "N/A",
'ppt_violation_activity': "N/A",
'socket_thermal_violation_activity': "N/A",
'vr_thermal_violation_activity': "N/A",
'hbm_thermal_violation_activity': "N/A"
'hbm_thermal_violation_activity': "N/A",
'gfx_clk_below_host_limit_violation_activity': "N/A",
}

try:
Expand All @@ -2302,18 +2304,21 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No
throttle_status['socket_thermal_accumulated'] = violation_status['acc_socket_thrm']
throttle_status['vr_thermal_accumulated'] = violation_status['acc_vr_thrm']
throttle_status['hbm_thermal_accumulated'] = violation_status['acc_hbm_thrm']
throttle_status['gfx_clk_below_host_limit_accumulated'] = violation_status['acc_gfx_clk_below_host_limit']

throttle_status['prochot_violation_status'] = violation_status['active_prochot_thrm']
throttle_status['ppt_violation_status'] = violation_status['active_ppt_pwr']
throttle_status['socket_thermal_violation_status'] = violation_status['active_socket_thrm']
throttle_status['vr_thermal_violation_status'] = violation_status['active_vr_thrm']
throttle_status['hbm_thermal_violation_status'] = violation_status['active_hbm_thrm']
throttle_status['gfx_clk_below_host_limit_violation_status'] = violation_status['active_gfx_clk_below_host_limit']

throttle_status['prochot_violation_activity'] = violation_status['per_prochot_thrm']
throttle_status['ppt_violation_activity'] = violation_status['per_ppt_pwr']
throttle_status['socket_thermal_violation_activity'] = violation_status['per_socket_thrm']
throttle_status['vr_thermal_violation_activity'] = violation_status['per_vr_thrm']
throttle_status['hbm_thermal_violation_activity'] = violation_status['per_hbm_thrm']
throttle_status['gfx_clk_below_host_limit_violation_activity'] = violation_status['per_gfx_clk_below_host_limit']

except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['throttle'] = throttle_status
Expand Down Expand Up @@ -5274,6 +5279,7 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
"phot_tviol": "N/A",
"vr_tviol": "N/A",
"hbm_tviol": "N/A",
"gfx_clkviol": "N/A",
}
try:
violations = amdsmi_interface.amdsmi_get_violation_status(args.gpu)
Expand All @@ -5283,39 +5289,48 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
violation_status['phot_tviol'] = violations['per_prochot_thrm']
violation_status['vr_tviol'] = violations['per_vr_thrm']
violation_status['hbm_tviol'] = violations['per_hbm_thrm']
violation_status['gfx_clkviol'] = violations['per_gfx_clk_below_host_limit']
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['pviol'] = violation_status['pviol']
monitor_values['tviol'] = violation_status['tviol']
monitor_values['tviol_active'] = violation_status['tviol_active']
monitor_values['phot_tviol'] = violation_status['phot_tviol']
monitor_values['vr_tviol'] = violation_status['vr_tviol']
monitor_values['hbm_tviol'] = violation_status['hbm_tviol']
monitor_values['gfx_clkviol'] = violation_status['gfx_clkviol']
logging.debug("Failed to get violation status on gpu %s | %s", gpu_id, e.get_error_info())
violation_status_unit = "%"
kPVIOL_MAX_WIDTH = 7
kTVIOL_MAX_WIDTH = 7
kTVIOL_ACTIVE_MAX_WIDTH = 14
kPVIOL_MAX_WIDTH = 7
kPHOT_MAX_WIDTH = 12
kVR_MAX_WIDTH = 10
kHBM_MAX_WIDTH = 11
kGFXC_MAX_WIDTH = 13

for key, value in violation_status.items():
if key == "tviol_active":
monitor_values[key] = value
elif key != "tviol_active":
monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit)
if value != "N/A":
if key == "tviol_active":
monitor_values[key] = value
else:
monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit)
else:
monitor_values[key] = violation_status[key]

if self.logger.is_human_readable_format():
monitor_values['pviol'] = monitor_values['pviol'].rjust(kPVIOL_MAX_WIDTH, ' ')
monitor_values['tviol'] = monitor_values['tviol'].rjust(kTVIOL_MAX_WIDTH, ' ')
monitor_values['phot_tviol'] = monitor_values['phot_tviol'].rjust(kPHOT_MAX_WIDTH, ' ')
monitor_values['vr_tviol'] = monitor_values['vr_tviol'].rjust(kVR_MAX_WIDTH, ' ')
monitor_values['hbm_tviol'] = monitor_values['hbm_tviol'].rjust(kHBM_MAX_WIDTH, ' ')
monitor_values['gfx_clkviol'] = monitor_values['gfx_clkviol'].rjust(kGFXC_MAX_WIDTH, ' ')
self.logger.table_header += 'PVIOL'.rjust(kPVIOL_MAX_WIDTH, ' ')
self.logger.table_header += 'TVIOL'.rjust(kTVIOL_MAX_WIDTH, ' ')
self.logger.table_header += 'TVIOL_ACTIVE'.rjust(kTVIOL_ACTIVE_MAX_WIDTH, ' ')
self.logger.table_header += 'PHOT_TVIOL'.rjust(kPHOT_MAX_WIDTH, ' ')
self.logger.table_header += 'VR_TVIOL'.rjust(kVR_MAX_WIDTH, ' ')
self.logger.table_header += 'HBM_TVIOL'.rjust(kHBM_MAX_WIDTH, ' ')
self.logger.table_header += 'GFX_CLKVIOL'.rjust(kGFXC_MAX_WIDTH, ' ')

self.logger.store_output(args.gpu, 'values', monitor_values)

Expand Down
2 changes: 2 additions & 0 deletions amdsmi_cli/amdsmi_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ def _convert_json_to_tabular(self, json_object: Dict[str, any]):
table_values += string_value.rjust(10)
elif key == "hbm_tviol":
table_values += string_value.rjust(11)
elif key == "gfx_clkviol":
table_values += string_value.rjust(13)
elif key == "process_list":
#Add an additional padding between the first instance of GPU and NAME
table_values += ' '
Expand Down
6 changes: 5 additions & 1 deletion include/amd_smi/amdsmi.h
Original file line number Diff line number Diff line change
Expand Up @@ -526,18 +526,22 @@ typedef struct {
uint64_t acc_socket_thrm; //!< TVIOL; Current accumulated Socket thermal count; Max uint64 means unsupported
uint64_t acc_vr_thrm; //!< Current accumulated voltage regulator count; Max uint64 means unsupported
uint64_t acc_hbm_thrm; //!< Current accumulated High Bandwidth Memory (HBM) thermal count; Max uint64 means unsupported
uint64_t acc_gfx_clk_below_host_limit; //!< Current graphic clock below host limit count; Max uint64 means unsupported
uint64_t per_prochot_thrm; //!< Processor hot violation % (greater than 0% is a violation); Max uint64 means unsupported
uint64_t per_ppt_pwr; //!< PVIOL; Package Power Tracking (PPT) violation % (greater than 0% is a violation); Max uint64 means unsupported
uint64_t per_socket_thrm; //!< TVIOL; Socket thermal violation % (greater than 0% is a violation); Max uint64 means unsupported
uint64_t per_vr_thrm; //!< Voltage regulator violation % (greater than 0% is a violation); Max uint64 means unsupported
uint64_t per_hbm_thrm; //!< High Bandwidth Memory (HBM) thermal violation % (greater than 0% is a violation); Max uint64 means unsupported
uint64_t per_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation % (greater than 0% is a violation); Max uint64 means unsupported
uint8_t active_prochot_thrm; //!< Processor hot violation; 1 = active 0 = not active; Max uint8 means unsupported
uint8_t active_ppt_pwr; //!< Package Power Tracking (PPT) violation; 1 = active 0 = not active; Max uint8 means unsupported
uint8_t active_socket_thrm; //!< Socket thermal violation; 1 = active 0 = not active; Max uint8 means unsupported
uint8_t active_vr_thrm; //!< Voltage regulator violation; 1 = active 0 = not active; Max uint8 means unsupported
uint8_t active_hbm_thrm; //!< High Bandwidth Memory (HBM) thermal violation; 1 = active 0 = not active; Max uint8 means unsupported
uint64_t reserved[30]; // Reserved for new violation info
uint8_t active_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation; 1 = active 0 = not active; Max uint8 means unsupported
uint64_t reserved[3]; // Reserved for new violation info
} amdsmi_violation_status_t;

typedef struct {
amdsmi_range_t supported_freq_range;
amdsmi_range_t current_freq_range;
Expand Down
13 changes: 10 additions & 3 deletions py-interface/amdsmi_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import sys
import math
from time import localtime, asctime, time
import json

MAX_NUM_PROCESSES = 1024

Expand Down Expand Up @@ -1559,7 +1560,9 @@ def amdsmi_get_hsmp_metrics_table(
"mtbl_ppt_residency_acc": mtbl.ppt_residency_acc,
"mtbl_socket_thm_residency_acc": mtbl.socket_thm_residency_acc,
"mtbl_vr_thm_residency_acc": mtbl.vr_thm_residency_acc,
"mtbl_hbm_thm_residency_acc": mtbl.hbm_thm_residency_acc
"mtbl_hbm_thm_residency_acc": mtbl.hbm_thm_residency_acc,
"mtbl_gfx_clk_below_host_residency_acc": mtbl.gfx_clk_below_host_residency_acc,
"mtbl_low_utilization_residency_acc": mtbl.low_utilization_residency_acc
}

def amdsmi_first_online_core_on_cpu_socket(
Expand Down Expand Up @@ -2035,7 +2038,7 @@ def amdsmi_get_violation_status(
processor_handle, ctypes.byref(violation_status))
)

return {
dict_return = {
"reference_timestamp": _validate_if_max_uint(violation_status.reference_timestamp, MaxUIntegerTypes.UINT64_T),
"violation_timestamp": _validate_if_max_uint(violation_status.violation_timestamp, MaxUIntegerTypes.UINT64_T),
"acc_counter": _validate_if_max_uint(violation_status.acc_counter, MaxUIntegerTypes.UINT64_T),
Expand All @@ -2044,17 +2047,21 @@ def amdsmi_get_violation_status(
"acc_socket_thrm": _validate_if_max_uint(violation_status.acc_socket_thrm, MaxUIntegerTypes.UINT64_T), #TVIOL
"acc_vr_thrm": _validate_if_max_uint(violation_status.acc_vr_thrm, MaxUIntegerTypes.UINT64_T),
"acc_hbm_thrm": _validate_if_max_uint(violation_status.acc_hbm_thrm, MaxUIntegerTypes.UINT64_T),
"acc_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.acc_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT64_T),
"per_prochot_thrm": _validate_if_max_uint(violation_status.per_prochot_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True),
"per_ppt_pwr": _validate_if_max_uint(violation_status.per_ppt_pwr, MaxUIntegerTypes.UINT64_T, isActivity=True), #PVIOL
"per_socket_thrm": _validate_if_max_uint(violation_status.per_socket_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), #TVIOL
"per_vr_thrm": _validate_if_max_uint(violation_status.per_vr_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True),
"per_hbm_thrm": _validate_if_max_uint(violation_status.per_hbm_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True),
"per_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.per_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT64_T, isActivity=True),
"active_prochot_thrm": _validate_if_max_uint(violation_status.active_prochot_thrm, MaxUIntegerTypes.UINT8_T, isBool=True),
"active_ppt_pwr": _validate_if_max_uint(violation_status.active_ppt_pwr, MaxUIntegerTypes.UINT8_T, isBool=True), #PVIOL
"active_socket_thrm": _validate_if_max_uint(violation_status.active_socket_thrm, MaxUIntegerTypes.UINT8_T, isBool=True), #TVIOL
"active_vr_thrm": _validate_if_max_uint(violation_status.active_vr_thrm, MaxUIntegerTypes.UINT8_T, isBool=True),
"active_hbm_thrm": _validate_if_max_uint(violation_status.active_hbm_thrm, MaxUIntegerTypes.UINT8_T, isBool=True)
"active_hbm_thrm": _validate_if_max_uint(violation_status.active_hbm_thrm, MaxUIntegerTypes.UINT8_T, isBool=True),
"active_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.active_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT8_T, isBool=True),
}
return dict_return

def amdsmi_get_gpu_total_ecc_count(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
Expand Down
Loading

0 comments on commit 3793be7

Please sign in to comment.