From a9c25de696174995378a490d8cfe4e47760b900e Mon Sep 17 00:00:00 2001 From: Nimbus318 <136771156+Nimbus318@users.noreply.github.com> Date: Mon, 13 Jan 2025 00:11:30 +0800 Subject: [PATCH] feat: support ascend and nvidia use mode(hami-core mig mps) Signed-off-by: Nimbus318 <136771156+Nimbus318@users.noreply.github.com> --- .../projects/vgpu/components/previewBar.vue | 20 +++++++- .../projects/vgpu/views/card/admin/Detail.vue | 9 ++++ .../projects/vgpu/views/card/admin/index.vue | 9 ++++ .../projects/vgpu/views/task/admin/Detail.vue | 9 +++- server/api/v1/card.proto | 1 + server/internal/biz/node.go | 1 + server/internal/data/node.go | 1 + server/internal/exporter/exporter.go | 5 ++ server/internal/provider/ascend/device.go | 19 +++++--- server/internal/provider/ascend/provider.go | 46 ++++++++----------- server/internal/provider/util/types.go | 1 + server/internal/provider/util/util.go | 23 ++++++++-- server/internal/service/card.go | 2 + server/internal/service/container.go | 2 +- 14 files changed, 106 insertions(+), 42 deletions(-) diff --git a/packages/web/projects/vgpu/components/previewBar.vue b/packages/web/projects/vgpu/components/previewBar.vue index a15662d..27f2aaf 100644 --- a/packages/web/projects/vgpu/components/previewBar.vue +++ b/packages/web/projects/vgpu/components/previewBar.vue @@ -173,16 +173,32 @@ ul { .nodeCard { height: 100%; + .pie { width: 200px; height: 200px; margin: 0 auto; } + .nodeCard-legend { width: 100%; display: flex; flex-direction: column; gap: 15px; + max-height: calc(3 * (12px + 15px)); + overflow-y: auto; + padding-right: 10px; + + /* 自定义滚动条样式(可选) */ + &::-webkit-scrollbar { + width: 6px; + } + + &::-webkit-scrollbar-thumb { + background-color: rgba(0, 0, 0, 0.2); + border-radius: 3px; + } + li { display: flex; justify-content: space-between; @@ -194,8 +210,8 @@ ul { gap: 5px; } .color-box { - width: 4px; - height: 4px; + width: 10px; + height: 10px; display: inline-block; } } diff --git 
a/packages/web/projects/vgpu/views/card/admin/Detail.vue b/packages/web/projects/vgpu/views/card/admin/Detail.vue index 63e7436..1f9ee21 100644 --- a/packages/web/projects/vgpu/views/card/admin/Detail.vue +++ b/packages/web/projects/vgpu/views/card/admin/Detail.vue @@ -196,6 +196,15 @@ const columns = [ label: '驱动版本', value: 'driver_version', }, + { + label: '使用模式', + value: 'mode', + render: ({ mode, type }) => ( + + {type?.split('-')[0] === "NVIDIA" ? mode : 'default'} + + ) + } ]; const cp = useInstantVector( diff --git a/packages/web/projects/vgpu/views/card/admin/index.vue b/packages/web/projects/vgpu/views/card/admin/index.vue index e52f74d..bc61c0b 100644 --- a/packages/web/projects/vgpu/views/card/admin/index.vue +++ b/packages/web/projects/vgpu/views/card/admin/index.vue @@ -63,6 +63,15 @@ const columns = [ ) }, + { + title: '使用模式', + dataIndex: 'mode', + render: ({ mode, type }) => ( + + {type?.split('-')[0] === "NVIDIA" ? mode : 'default'} + + ) + }, { title: '所属节点', dataIndex: 'nodeName', diff --git a/packages/web/projects/vgpu/views/task/admin/Detail.vue b/packages/web/projects/vgpu/views/task/admin/Detail.vue index 0100aae..78d1ec7 100644 --- a/packages/web/projects/vgpu/views/task/admin/Detail.vue +++ b/packages/web/projects/vgpu/views/task/admin/Detail.vue @@ -42,11 +42,16 @@ - diff --git a/server/api/v1/card.proto b/server/api/v1/card.proto index 3e4756c..5087dc1 100644 --- a/server/api/v1/card.proto +++ b/server/api/v1/card.proto @@ -64,6 +64,7 @@ message GPUReply { int32 memory_total = 9; string node_uid = 10; bool health = 11; + string mode = 12; } message GPUsReply { diff --git a/server/internal/biz/node.go b/server/internal/biz/node.go index 9d32e78..3202772 100644 --- a/server/internal/biz/node.go +++ b/server/internal/biz/node.go @@ -32,6 +32,7 @@ type DeviceInfo struct { Devcore int32 Type string Numa int + Mode string Health bool NodeName string NodeUid string diff --git a/server/internal/data/node.go b/server/internal/data/node.go index 
2757899..d5a54a1 100644 --- a/server/internal/data/node.go +++ b/server/internal/data/node.go @@ -86,6 +86,7 @@ func (r *nodeRepo) updateLocalNodes() { Devcore: device.Devcore, Type: device.Type, Numa: device.Numa, + Mode: device.Mode, Health: device.Health, NodeName: node.Name, NodeUid: string(node.UID), diff --git a/server/internal/exporter/exporter.go b/server/internal/exporter/exporter.go index 755a11a..bd27668 100644 --- a/server/internal/exporter/exporter.go +++ b/server/internal/exporter/exporter.go @@ -438,6 +438,8 @@ func (s *MetricsGenerator) queryDeviceAdditional(ctx context.Context, provider, switch provider { case biz.NvidiaGPUDevice: query = fmt.Sprintf("DCGM_FI_DEV_POWER_USAGE{UUID=\"%s\"}", deviceUUID) + case biz.AscendGPUDevice: + query = fmt.Sprintf("npu_chip_info_power{vdie_id=\"%s\"}", deviceUUID) case biz.CambriconGPUDevice: query = fmt.Sprintf("mlu_power_usage{uuid=\"%s\"}", deviceUUID) case biz.HygonGPUDevice: @@ -462,6 +464,9 @@ func (s *MetricsGenerator) queryDeviceAdditional(ctx context.Context, provider, case biz.CambriconGPUDevice: info.DriverVersion = metric["driver"] info.DeviceNo = metric["sn"] + case biz.AscendGPUDevice: + info.DriverVersion = "暂无" + info.DeviceNo = "ascend-" + metric["id"] case biz.HygonGPUDevice: info.DriverVersion = "暂无" info.DeviceNo = "dcu-" + metric["minor_number"] diff --git a/server/internal/provider/ascend/device.go b/server/internal/provider/ascend/device.go index 49ec6c7..5573c54 100644 --- a/server/internal/provider/ascend/device.go +++ b/server/internal/provider/ascend/device.go @@ -8,17 +8,22 @@ const ( // IluvatarUseUUID is user can use specify Iluvatar device for set Iluvatar UUID. AscendDeviceUseUUID = "huawei.com/use-ascenduuid" // IluvatarNoUseUUID is user can not use specify Iluvatar device for set Iluvatar UUID. 
- AscendNoUseUUID = "huawei.com/nouse-ascenduuid" - AscendResourceCoreCount = "huawei.com/Ascend910" + AscendNoUseUUID = "huawei.com/nouse-ascenduuid" + Ascend910BNodeRegisterAnno = "hami.io/node-register-Ascend910B" + Ascend310PNodeRegisterAnno = "hami.io/node-register-Ascend310P" ) var ( - AscendResourceCount string - AscendResourceMemory string - AscendResourceCores string + AscendResourceCount string + AscendResourceMemory string + AscendResourceCores string + AscendNodeRegisterAnnos []string ) func init() { - util.InRequestDevices[AscendDevice] = "hami.io/ascend-devices-to-allocate" - util.SupportDevices[AscendDevice] = "hami.io/ascend-devices-allocated" + AscendNodeRegisterAnnos = []string{Ascend910BNodeRegisterAnno, Ascend310PNodeRegisterAnno} + util.InRequestDevices[AscendDevice] = "hami.io/Ascend910B-devices-to-allocate" + util.SupportDevices[AscendDevice] = "hami.io/Ascend910B-devices-allocated" + util.InRequestDevices["Ascend310P"] = "hami.io/Ascend310P-devices-to-allocate" + util.SupportDevices["Ascend310P"] = "hami.io/Ascend310P-devices-allocated" } diff --git a/server/internal/provider/ascend/provider.go b/server/internal/provider/ascend/provider.go index 2658d5e..6c84d83 100644 --- a/server/internal/provider/ascend/provider.go +++ b/server/internal/provider/ascend/provider.go @@ -6,9 +6,8 @@ import ( "github.com/go-kratos/kratos/v2/log" "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/labels" - "strings" + "strconv" "vgpu/internal/data/prom" "vgpu/internal/provider/util" ) @@ -72,31 +71,26 @@ func (a *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.De } func (a *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) { - - nodedevices := []*util.DeviceInfo{} - i := 0 - cards, _ := node.Status.Capacity.Name(corev1.ResourceName(AscendResourceCoreCount), resource.DecimalSI).AsInt64() - tmpDevice := a.GetDevicesFromPrometheus(node) - for 
int64(i)*10 < cards { - index := fmt.Sprintf("%d", i) - if _, ok := tmpDevice[index]; !ok { - i++ + tmpDevice := a.GetDevicesFromPrometheus(node) + for _, anno := range AscendNodeRegisterAnnos { + devicesJSON, ok := node.Annotations[anno] + if !ok { + log.Infof("anno %s not found", anno) continue } - mode := strings.Split(tmpDevice[index].Type, "-") - nodedevices = append(nodedevices, &util.DeviceInfo{ - Index: uint(i), - ID: tmpDevice[index].ID, - AliasId: node.Name + "-Ascend910-" + fmt.Sprint(i), - Count: 10, - Devmem: int32(65536), - Devcore: 100, - Type: fmt.Sprintf("%s-%s", mode[1], mode[0]), - Numa: 0, - Health: true, - Driver: "xxx", - }) - i++ + nodeDevices, err := util.UnMarshalNodeDevices(devicesJSON) + if err != nil { + return []*util.DeviceInfo{}, err + } + for i, nodedevice := range nodeDevices { + nodeDevices[i].AliasId = nodedevice.ID + if device, exists := tmpDevice[strconv.Itoa(i)]; exists { + nodeDevices[i].ID = device.ID + } else { + log.Infof("Key %d not found in tmpDevice", i) + } + } + return nodeDevices, nil } - return nodedevices, nil + return []*util.DeviceInfo{}, fmt.Errorf("node %s has none of the annotations %v", node.Name, AscendNodeRegisterAnnos) } diff --git a/server/internal/provider/util/types.go b/server/internal/provider/util/types.go index 65727a2..cd99070 100644 --- a/server/internal/provider/util/types.go +++ b/server/internal/provider/util/types.go @@ -97,6 +97,7 @@ type DeviceInfo struct { Devcore int32 Type string Numa int + Mode string Health bool Driver string } diff --git a/server/internal/provider/util/util.go b/server/internal/provider/util/util.go index 2cfcac9..f1265b0 100644 --- a/server/internal/provider/util/util.go +++ b/server/internal/provider/util/util.go @@ -1,6 +1,7 @@ package util import ( + "encoding/json" "errors" "fmt" "github.com/go-kratos/kratos/v2/log" @@ -60,12 +61,18 @@ func DecodeNodeDevices(str string, log *log.Helper) ([]*DeviceInfo, error) { for _, val := range tmp { if strings.Contains(val, ",") { items := strings.Split(val, ",") - if len(items) == 7 { - count, _ := 
strconv.Atoi(items[1]) - devmem, _ := strconv.Atoi(items[2]) - devcore, _ := strconv.Atoi(items[3]) + if len(items) >= 7 || len(items) == 9 { + count, _ := strconv.ParseInt(items[1], 10, 32) + devmem, _ := strconv.ParseInt(items[2], 10, 32) + devcore, _ := strconv.ParseInt(items[3], 10, 32) health, _ := strconv.ParseBool(items[6]) numa, _ := strconv.Atoi(items[5]) + mode := "hami-core" + index := 0 + if len(items) == 9 { + index, _ = strconv.Atoi(items[7]) + mode = items[8] + } i := DeviceInfo{ ID: items[0], AliasId: items[0], @@ -75,6 +82,8 @@ func DecodeNodeDevices(str string, log *log.Helper) ([]*DeviceInfo, error) { Type: items[4], Numa: numa, Health: health, + Mode: mode, + Index: uint(index), } retval = append(retval, &i) } else { @@ -307,3 +316,9 @@ func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) { log.Infof("Decoded pod annos: poddevices %v", pd) return pd, nil } + +func UnMarshalNodeDevices(str string) ([]*DeviceInfo, error) { + var dlist []*DeviceInfo + err := json.Unmarshal([]byte(str), &dlist) + return dlist, err +} diff --git a/server/internal/service/card.go b/server/internal/service/card.go index 1f33015..6569c1f 100644 --- a/server/internal/service/card.go +++ b/server/internal/service/card.go @@ -50,6 +50,7 @@ func (s *CardService) GetAllGPUs(ctx context.Context, req *pb.GetAllGpusReq) (*p gpu.MemoryTotal = device.Devmem gpu.NodeUid = device.NodeUid gpu.Health = device.Health + gpu.Mode = device.Mode vGPU, core, memory, err := s.pod.StatisticsByDeviceId(ctx, device.AliasId) if err == nil { @@ -120,6 +121,7 @@ func (s *CardService) GetGPU(ctx context.Context, req *pb.GetGpuReq) (*pb.GPURep gpu.MemoryTotal = device.Devmem gpu.NodeUid = device.NodeUid gpu.Health = device.Health + gpu.Mode = device.Mode vGPU, core, memory, err := s.pod.StatisticsByDeviceId(ctx, device.AliasId) if err == nil { diff --git a/server/internal/service/container.go b/server/internal/service/container.go index aa031ee..8e1da81 100644 --- 
a/server/internal/service/container.go +++ b/server/internal/service/container.go @@ -69,7 +69,7 @@ func (s *ContainerService) GetAllContainers(ctx context.Context, req *pb.GetAllC deviceID = device.Id } - if filters.DeviceId != "" && filters.DeviceId != deviceID { + if filters.DeviceId != "" && !strings.HasPrefix(deviceID, filters.DeviceId) { continue }