Skip to content

Commit

Permalink
fix(gpu): fix accelerator registry map
Browse files Browse the repository at this point in the history
Signed-off-by: Vimal Kumar <[email protected]>
  • Loading branch information
vimalk78 committed Jan 21, 2025
1 parent b0f6cdc commit 40b36a1
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 7 deletions.
3 changes: 3 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,9 @@ func logBoolConfigs() {
klog.V(5).Infof("EXPOSE_ESTIMATED_IDLE_POWER_METRICS: %t. This only impacts when the power is estimated using pre-prained models. Estimated idle power is meaningful only when Kepler is running on bare-metal or with a single virtual machine (VM) on the node.", instance.Kepler.ExposeIdlePowerMetrics)
klog.V(5).Infof("EXPERIMENTAL_BPF_SAMPLE_RATE: %d", instance.Kepler.BPFSampleRate)
klog.V(5).Infof("EXCLUDE_SWAPPER_PROCESS: %t", instance.Kepler.ExcludeSwapperProcess)
if instance.Kepler.EnabledGPU {
klog.V(5).Infof("DCGMHostEngineEndpoint %s", instance.DCGMHostEngineEndpoint)
}
}
}

Expand Down
6 changes: 5 additions & 1 deletion pkg/sensors/accelerator/accelerator.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ package accelerator

//nolint:gci // The supported device imports are kept separate.
import (
"encoding/json"
"slices"
"sync"
"time"
Expand Down Expand Up @@ -130,7 +131,10 @@ func New(atype string, sleep bool) (Accelerator, error) {

// Init the available devices.

devs := devices.GetRegistry().GetAllDeviceTypes()
r := devices.GetRegistry()
j, _ := json.Marshal(r.GetAllDevices())
klog.V(5).Infof("Accelerator Registry AllDevices: %s", string(j))
devs := r.GetAllDeviceTypes()
numDevs := len(devs)
if numDevs == 0 || !slices.Contains(devs, atype) {
return nil, errors.New("no devices found")
Expand Down
13 changes: 9 additions & 4 deletions pkg/sensors/accelerator/devices/dcgm.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,21 +76,22 @@ func dcgmCheck(r *Registry) {
}

// dcgmDeviceStartup brings up the DCGM-backed accelerator and hands it back as
// a Device. It takes its working value from dcgmAccImpl, calls InitLib and then
// Init on it, and returns nil (after logging via klog.Errorf) as soon as either
// call reports an error. On success it logs which device type is used for GPU
// power and returns the address of the local variable.
//
// NOTE(review): the local is assigned from dcgmAccImpl by value; if that is a
// struct (not a pointer), the returned Device refers to a copy — confirm.
// NOTE(review): this listing is a rendered diff, so the pre-change lines (local
// named `a`) and post-change lines (local named `d`) appear side by side.
func dcgmDeviceStartup() Device {
a := dcgmAccImpl
klog.V(3).Infof("Attempting to startup DCGM")
d := dcgmAccImpl

if err := a.InitLib(); err != nil {
if err := d.InitLib(); err != nil {
klog.Errorf("Error initializing %s: %v", dcgmType.String(), err)
return nil
}

if err := a.Init(); err != nil {
if err := d.Init(); err != nil {
klog.Errorf("failed to StartupDevice: %v", err)
return nil
}

klog.Infof("Using %s to obtain gpu power", dcgmType.String())

return &a
return &d
}

func (d *gpuDcgm) Init() error {
Expand Down Expand Up @@ -138,6 +139,7 @@ func (d *gpuDcgm) InitLib() (err error) {
if err != nil {
klog.Infof("There is no DCGM daemon running in the host: %s", err)
// embedded mode is not recommended for production per https://github.com/NVIDIA/dcgm-exporter/issues/22#issuecomment-1321521995
klog.Info("Attempting to inilialize dcgm in Embedded mode.")
cleanup, err = dcgm.Init(dcgm.Embedded)
if err != nil {
klog.Errorf("Could not start DCGM. Error: %s", err)
Expand All @@ -147,6 +149,8 @@ func (d *gpuDcgm) InitLib() (err error) {
return fmt.Errorf("not able to connect to DCGM: %s", err)
}
klog.Info("Started DCGM in the Embedded mode ")
} else {
klog.Info("Started DCGM in the Standalone mode ")
}
d.nvmlInited = false
d.devs = make(map[int]GPUDevice)
Expand All @@ -172,6 +176,7 @@ func (d *gpuDcgm) InitLib() (err error) {
}

func (d *gpuDcgm) loadDevices() error {
klog.V(5).Infof("Attempting to load dcgm devices.")
d.devs = map[int]GPUDevice{}
count, err := nvml.DeviceGetCount()
if err != nvml.SUCCESS {
Expand Down
21 changes: 19 additions & 2 deletions pkg/sensors/accelerator/devices/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,13 @@ func (r *Registry) MustRegister(a string, d DeviceType, deviceStartup deviceStar
return
}
klog.V(5).Infof("Adding the device to the registry [%s][%s]", a, d.String())
r.Registry[a] = map[DeviceType]deviceStartupFunc{
d: deviceStartup,
m, ok := r.Registry[a]
if !ok {
r.Registry[a] = map[DeviceType]deviceStartupFunc{
d: deviceStartup,
}
} else {
m[d] = deviceStartup
}
}

Expand All @@ -143,6 +148,18 @@ func (r *Registry) GetAllDeviceTypes() []string {
return devices
}

// GetAllDevices returns a freshly built, two-level view of the registry.
// The outer map uses the same string keys as r.Registry; each inner map is
// keyed by the String() name of every DeviceType registered under that key
// and carries an empty struct as its value. The startup functions themselves
// are not included in the result.
func (r *Registry) GetAllDevices() map[string]map[string]interface{} {
	snapshot := make(map[string]map[string]interface{}, len(r.Registry))
	for key, startups := range r.Registry {
		names := make(map[string]interface{}, len(startups))
		snapshot[key] = names
		for devType := range startups {
			names[devType.String()] = struct{}{}
		}
	}
	return snapshot
}

func addDeviceInterface(registry *Registry, dtype DeviceType, accType string, deviceStartup deviceStartupFunc) error {
switch accType {
case config.GPU:
Expand Down

0 comments on commit 40b36a1

Please sign in to comment.