From a9c25de696174995378a490d8cfe4e47760b900e Mon Sep 17 00:00:00 2001
From: Nimbus318 <136771156+Nimbus318@users.noreply.github.com>
Date: Mon, 13 Jan 2025 00:11:30 +0800
Subject: [PATCH] feat: support Ascend devices and NVIDIA usage modes (hami-core, MIG, MPS)
Signed-off-by: Nimbus318 <136771156+Nimbus318@users.noreply.github.com>
---
.../projects/vgpu/components/previewBar.vue | 20 +++++++-
.../projects/vgpu/views/card/admin/Detail.vue | 9 ++++
.../projects/vgpu/views/card/admin/index.vue | 9 ++++
.../projects/vgpu/views/task/admin/Detail.vue | 9 +++-
server/api/v1/card.proto | 1 +
server/internal/biz/node.go | 1 +
server/internal/data/node.go | 1 +
server/internal/exporter/exporter.go | 5 ++
server/internal/provider/ascend/device.go | 19 +++++---
server/internal/provider/ascend/provider.go | 46 ++++++++-----------
server/internal/provider/util/types.go | 1 +
server/internal/provider/util/util.go | 23 ++++++++--
server/internal/service/card.go | 2 +
server/internal/service/container.go | 2 +-
14 files changed, 106 insertions(+), 42 deletions(-)
diff --git a/packages/web/projects/vgpu/components/previewBar.vue b/packages/web/projects/vgpu/components/previewBar.vue
index a15662d..27f2aaf 100644
--- a/packages/web/projects/vgpu/components/previewBar.vue
+++ b/packages/web/projects/vgpu/components/previewBar.vue
@@ -173,16 +173,32 @@ ul {
.nodeCard {
height: 100%;
+
.pie {
width: 200px;
height: 200px;
margin: 0 auto;
}
+
.nodeCard-legend {
width: 100%;
display: flex;
flex-direction: column;
gap: 15px;
+ max-height: calc(3 * (12px + 15px));
+ overflow-y: auto;
+ padding-right: 10px;
+
+ /* 自定义滚动条样式(可选) */
+ &::-webkit-scrollbar {
+ width: 6px;
+ }
+
+ &::-webkit-scrollbar-thumb {
+ background-color: rgba(0, 0, 0, 0.2);
+ border-radius: 3px;
+ }
+
li {
display: flex;
justify-content: space-between;
@@ -194,8 +210,8 @@ ul {
gap: 5px;
}
.color-box {
- width: 4px;
- height: 4px;
+ width: 10px;
+ height: 10px;
display: inline-block;
}
}
diff --git a/packages/web/projects/vgpu/views/card/admin/Detail.vue b/packages/web/projects/vgpu/views/card/admin/Detail.vue
index 63e7436..1f9ee21 100644
--- a/packages/web/projects/vgpu/views/card/admin/Detail.vue
+++ b/packages/web/projects/vgpu/views/card/admin/Detail.vue
@@ -196,6 +196,15 @@ const columns = [
label: '驱动版本',
value: 'driver_version',
},
+ {
+ label: '使用模式',
+ value: 'mode',
+ render: ({ mode, type }) => (
+
+ {type?.split('-')[0] === "NVIDIA" ? mode : 'default'}
+
+ )
+ }
];
const cp = useInstantVector(
diff --git a/packages/web/projects/vgpu/views/card/admin/index.vue b/packages/web/projects/vgpu/views/card/admin/index.vue
index e52f74d..bc61c0b 100644
--- a/packages/web/projects/vgpu/views/card/admin/index.vue
+++ b/packages/web/projects/vgpu/views/card/admin/index.vue
@@ -63,6 +63,15 @@ const columns = [
)
},
+ {
+ title: '使用模式',
+ dataIndex: 'mode',
+ render: ({ mode, type }) => (
+
+ {type?.split('-')[0] === "NVIDIA" ? mode : 'default'}
+
+ )
+ },
{
title: '所属节点',
dataIndex: 'nodeName',
diff --git a/packages/web/projects/vgpu/views/task/admin/Detail.vue b/packages/web/projects/vgpu/views/task/admin/Detail.vue
index 0100aae..78d1ec7 100644
--- a/packages/web/projects/vgpu/views/task/admin/Detail.vue
+++ b/packages/web/projects/vgpu/views/task/admin/Detail.vue
@@ -42,11 +42,16 @@
-
+
-
+
+
+
+
+
+
diff --git a/server/api/v1/card.proto b/server/api/v1/card.proto
index 3e4756c..5087dc1 100644
--- a/server/api/v1/card.proto
+++ b/server/api/v1/card.proto
@@ -64,6 +64,7 @@ message GPUReply {
int32 memory_total = 9;
string node_uid = 10;
bool health = 11;
+ string mode = 12;
}
message GPUsReply {
diff --git a/server/internal/biz/node.go b/server/internal/biz/node.go
index 9d32e78..3202772 100644
--- a/server/internal/biz/node.go
+++ b/server/internal/biz/node.go
@@ -32,6 +32,7 @@ type DeviceInfo struct {
Devcore int32
Type string
Numa int
+ Mode string
Health bool
NodeName string
NodeUid string
diff --git a/server/internal/data/node.go b/server/internal/data/node.go
index 2757899..d5a54a1 100644
--- a/server/internal/data/node.go
+++ b/server/internal/data/node.go
@@ -86,6 +86,7 @@ func (r *nodeRepo) updateLocalNodes() {
Devcore: device.Devcore,
Type: device.Type,
Numa: device.Numa,
+ Mode: device.Mode,
Health: device.Health,
NodeName: node.Name,
NodeUid: string(node.UID),
diff --git a/server/internal/exporter/exporter.go b/server/internal/exporter/exporter.go
index 755a11a..bd27668 100644
--- a/server/internal/exporter/exporter.go
+++ b/server/internal/exporter/exporter.go
@@ -438,6 +438,8 @@ func (s *MetricsGenerator) queryDeviceAdditional(ctx context.Context, provider,
switch provider {
case biz.NvidiaGPUDevice:
query = fmt.Sprintf("DCGM_FI_DEV_POWER_USAGE{UUID=\"%s\"}", deviceUUID)
+ case biz.AscendGPUDevice:
+ query = fmt.Sprintf("npu_chip_info_power{vdie_id=\"%s\"}", deviceUUID)
case biz.CambriconGPUDevice:
query = fmt.Sprintf("mlu_power_usage{uuid=\"%s\"}", deviceUUID)
case biz.HygonGPUDevice:
@@ -462,6 +464,9 @@ func (s *MetricsGenerator) queryDeviceAdditional(ctx context.Context, provider,
case biz.CambriconGPUDevice:
info.DriverVersion = metric["driver"]
info.DeviceNo = metric["sn"]
+ case biz.AscendGPUDevice:
+ info.DriverVersion = "暂无"
+ info.DeviceNo = "ascend-" + metric["id"]
case biz.HygonGPUDevice:
info.DriverVersion = "暂无"
info.DeviceNo = "dcu-" + metric["minor_number"]
diff --git a/server/internal/provider/ascend/device.go b/server/internal/provider/ascend/device.go
index 49ec6c7..5573c54 100644
--- a/server/internal/provider/ascend/device.go
+++ b/server/internal/provider/ascend/device.go
@@ -8,17 +8,22 @@ const (
// IluvatarUseUUID is user can use specify Iluvatar device for set Iluvatar UUID.
AscendDeviceUseUUID = "huawei.com/use-ascenduuid"
// IluvatarNoUseUUID is user can not use specify Iluvatar device for set Iluvatar UUID.
- AscendNoUseUUID = "huawei.com/nouse-ascenduuid"
- AscendResourceCoreCount = "huawei.com/Ascend910"
+ AscendNoUseUUID = "huawei.com/nouse-ascenduuid"
+ Ascend910BNodeRegisterAnno = "hami.io/node-register-Ascend910B"
+ Ascend310PNodeRegisterAnno = "hami.io/node-register-Ascend310P"
)
var (
- AscendResourceCount string
- AscendResourceMemory string
- AscendResourceCores string
+ AscendResourceCount string
+ AscendResourceMemory string
+ AscendResourceCores string
+ AscendNodeRegisterAnnos []string
)
+// init records the node-register annotation keys consulted for Ascend nodes
+// and publishes the pod annotation names for the Ascend910B and Ascend310P
+// device types in the shared util registries.
func init() {
- util.InRequestDevices[AscendDevice] = "hami.io/ascend-devices-to-allocate"
- util.SupportDevices[AscendDevice] = "hami.io/ascend-devices-allocated"
+ AscendNodeRegisterAnnos = []string{
+ Ascend910BNodeRegisterAnno,
+ Ascend310PNodeRegisterAnno,
+ }
+ // Annotation names for devices that are still waiting to be allocated.
+ util.InRequestDevices[AscendDevice] = "hami.io/Ascend910B-devices-to-allocate"
+ util.InRequestDevices["Ascend310P"] = "hami.io/Ascend310P-devices-to-allocate"
+ // Annotation names for devices that have already been allocated.
+ util.SupportDevices[AscendDevice] = "hami.io/Ascend910B-devices-allocated"
+ util.SupportDevices["Ascend310P"] = "hami.io/Ascend310P-devices-allocated"
}
diff --git a/server/internal/provider/ascend/provider.go b/server/internal/provider/ascend/provider.go
index 2658d5e..6c84d83 100644
--- a/server/internal/provider/ascend/provider.go
+++ b/server/internal/provider/ascend/provider.go
@@ -6,9 +6,8 @@ import (
"github.com/go-kratos/kratos/v2/log"
"github.com/prometheus/common/model"
corev1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/labels"
- "strings"
+ "strconv"
"vgpu/internal/data/prom"
"vgpu/internal/provider/util"
)
@@ -72,31 +71,26 @@ func (a *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.De
}
+// FetchDevices builds the device list for node from the first annotation in
+// AscendNodeRegisterAnnos that is present on the node.
+//
+// Every decoded device keeps the ID from the annotation as its AliasId. When
+// GetDevicesFromPrometheus reports an entry under the device's positional
+// index, that entry's ID replaces the annotation ID.
+//
+// An error is returned when the annotation payload cannot be decoded, or when
+// none of the known annotations is set on the node.
func (a *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
-
- nodedevices := []*util.DeviceInfo{}
- i := 0
- cards, _ := node.Status.Capacity.Name(corev1.ResourceName(AscendResourceCoreCount), resource.DecimalSI).AsInt64()
- tmpDevice := a.GetDevicesFromPrometheus(node)
- for int64(i)*10 < cards {
- index := fmt.Sprintf("%d", i)
- if _, ok := tmpDevice[index]; !ok {
- i++
+ // The Prometheus lookup does not depend on the annotation being inspected,
+ // so it is done once instead of once per loop iteration.
+ tmpDevice := a.GetDevicesFromPrometheus(node)
+ for _, annoKey := range AscendNodeRegisterAnnos {
+ // Keep the key and the value in separate variables: reusing one name
+ // made the log below print the (empty) value instead of the key.
+ annoValue, ok := node.Annotations[annoKey]
+ if !ok {
+ log.Infof("anno %s not found", annoKey)
continue
}
- mode := strings.Split(tmpDevice[index].Type, "-")
- nodedevices = append(nodedevices, &util.DeviceInfo{
- Index: uint(i),
- ID: tmpDevice[index].ID,
- AliasId: node.Name + "-Ascend910-" + fmt.Sprint(i),
- Count: 10,
- Devmem: int32(65536),
- Devcore: 100,
- Type: fmt.Sprintf("%s-%s", mode[1], mode[0]),
- Numa: 0,
- Health: true,
- Driver: "xxx",
- })
- i++
+ nodeDevices, err := util.UnMarshalNodeDevices(annoValue)
+ if err != nil {
+ return []*util.DeviceInfo{}, fmt.Errorf("decode annotation %s on node %s: %w", annoKey, node.Name, err)
+ }
+ for i, nodedevice := range nodeDevices {
+ nodeDevices[i].AliasId = nodedevice.ID
+ // tmpDevice is keyed by the decimal device index.
+ if device, exists := tmpDevice[strconv.Itoa(i)]; exists {
+ nodeDevices[i].ID = device.ID
+ } else {
+ log.Infof("Key %d not found in tmpDevice", i)
+ }
+ }
+ // Only the first annotation found on the node is used.
+ return nodeDevices, nil
}
- return nodedevices, nil
+ return []*util.DeviceInfo{}, fmt.Errorf("no ascend node-register annotation found on node %s", node.Name)
}
diff --git a/server/internal/provider/util/types.go b/server/internal/provider/util/types.go
index 65727a2..cd99070 100644
--- a/server/internal/provider/util/types.go
+++ b/server/internal/provider/util/types.go
@@ -97,6 +97,7 @@ type DeviceInfo struct {
Devcore int32
Type string
Numa int
+ Mode string
Health bool
Driver string
}
diff --git a/server/internal/provider/util/util.go b/server/internal/provider/util/util.go
index 2cfcac9..f1265b0 100644
--- a/server/internal/provider/util/util.go
+++ b/server/internal/provider/util/util.go
@@ -1,6 +1,7 @@
package util
import (
+ "encoding/json"
"errors"
"fmt"
"github.com/go-kratos/kratos/v2/log"
@@ -60,12 +61,18 @@ func DecodeNodeDevices(str string, log *log.Helper) ([]*DeviceInfo, error) {
for _, val := range tmp {
if strings.Contains(val, ",") {
items := strings.Split(val, ",")
- if len(items) == 7 {
- count, _ := strconv.Atoi(items[1])
- devmem, _ := strconv.Atoi(items[2])
- devcore, _ := strconv.Atoi(items[3])
+ if len(items) >= 7 || len(items) == 9 {
+ count, _ := strconv.ParseInt(items[1], 10, 32)
+ devmem, _ := strconv.ParseInt(items[2], 10, 32)
+ devcore, _ := strconv.ParseInt(items[3], 10, 32)
health, _ := strconv.ParseBool(items[6])
numa, _ := strconv.Atoi(items[5])
+ mode := "hami-core"
+ index := 0
+ if len(items) == 9 {
+ index, _ = strconv.Atoi(items[7])
+ mode = items[8]
+ }
i := DeviceInfo{
ID: items[0],
AliasId: items[0],
@@ -75,6 +82,8 @@ func DecodeNodeDevices(str string, log *log.Helper) ([]*DeviceInfo, error) {
Type: items[4],
Numa: numa,
Health: health,
+ Mode: mode,
+ Index: uint(index),
}
retval = append(retval, &i)
} else {
@@ -307,3 +316,9 @@ func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
log.Infof("Decoded pod annos: poddevices %v", pd)
return pd, nil
}
+
+// UnMarshalNodeDevices decodes str, a JSON array, into a slice of DeviceInfo
+// pointers. The slice and the error reported by json.Unmarshal are returned
+// together without further processing.
+func UnMarshalNodeDevices(str string) ([]*DeviceInfo, error) {
+ var devices []*DeviceInfo
+ decodeErr := json.Unmarshal([]byte(str), &devices)
+ return devices, decodeErr
+}
diff --git a/server/internal/service/card.go b/server/internal/service/card.go
index 1f33015..6569c1f 100644
--- a/server/internal/service/card.go
+++ b/server/internal/service/card.go
@@ -50,6 +50,7 @@ func (s *CardService) GetAllGPUs(ctx context.Context, req *pb.GetAllGpusReq) (*p
gpu.MemoryTotal = device.Devmem
gpu.NodeUid = device.NodeUid
gpu.Health = device.Health
+ gpu.Mode = device.Mode
vGPU, core, memory, err := s.pod.StatisticsByDeviceId(ctx, device.AliasId)
if err == nil {
@@ -120,6 +121,7 @@ func (s *CardService) GetGPU(ctx context.Context, req *pb.GetGpuReq) (*pb.GPURep
gpu.MemoryTotal = device.Devmem
gpu.NodeUid = device.NodeUid
gpu.Health = device.Health
+ gpu.Mode = device.Mode
vGPU, core, memory, err := s.pod.StatisticsByDeviceId(ctx, device.AliasId)
if err == nil {
diff --git a/server/internal/service/container.go b/server/internal/service/container.go
index aa031ee..8e1da81 100644
--- a/server/internal/service/container.go
+++ b/server/internal/service/container.go
@@ -69,7 +69,7 @@ func (s *ContainerService) GetAllContainers(ctx context.Context, req *pb.GetAllC
deviceID = device.Id
}
- if filters.DeviceId != "" && filters.DeviceId != deviceID {
+ if filters.DeviceId != "" && !strings.HasPrefix(deviceID, filters.DeviceId) {
continue
}