Skip to content

Commit

Permalink
feat: support ascend and nvidia use mode(hami-core mig mps)
Browse files Browse the repository at this point in the history
Signed-off-by: Nimbus318 <[email protected]>
  • Loading branch information
Nimbus318 committed Jan 12, 2025
1 parent 73d6a53 commit a9c25de
Show file tree
Hide file tree
Showing 14 changed files with 106 additions and 42 deletions.
20 changes: 18 additions & 2 deletions packages/web/projects/vgpu/components/previewBar.vue
Original file line number Diff line number Diff line change
Expand Up @@ -173,16 +173,32 @@ ul {
.nodeCard {
height: 100%;
.pie {
width: 200px;
height: 200px;
margin: 0 auto;
}
.nodeCard-legend {
width: 100%;
display: flex;
flex-direction: column;
gap: 15px;
max-height: calc(3 * (12px + 15px));
overflow-y: auto;
padding-right: 10px;
/* 自定义滚动条样式(可选) */
&::-webkit-scrollbar {
width: 6px;
}
&::-webkit-scrollbar-thumb {
background-color: rgba(0, 0, 0, 0.2);
border-radius: 3px;
}
li {
display: flex;
justify-content: space-between;
Expand All @@ -194,8 +210,8 @@ ul {
gap: 5px;
}
.color-box {
width: 4px;
height: 4px;
width: 10px;
height: 10px;
display: inline-block;
}
}
Expand Down
9 changes: 9 additions & 0 deletions packages/web/projects/vgpu/views/card/admin/Detail.vue
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,15 @@ const columns = [
label: '驱动版本',
value: 'driver_version',
},
{
label: '使用模式',
value: 'mode',
render: ({ mode, type }) => (
<el-tag disable-transitions>
{type?.split('-')[0] === "NVIDIA" ? mode : 'default'}
</el-tag>
)
}
];
const cp = useInstantVector(
Expand Down
9 changes: 9 additions & 0 deletions packages/web/projects/vgpu/views/card/admin/index.vue
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,15 @@ const columns = [
</el-tag>
)
},
{
title: '使用模式',
dataIndex: 'mode',
render: ({ mode, type }) => (
<el-tag disable-transitions>
{type?.split('-')[0] === "NVIDIA" ? mode : 'default'}
</el-tag>
)
},
{
title: '所属节点',
dataIndex: 'nodeName',
Expand Down
9 changes: 7 additions & 2 deletions packages/web/projects/vgpu/views/task/admin/Detail.vue
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,16 @@
</block-box>

<block-box v-for="{ title, data } in lineConfig" :key="title" :title="title">
<template #extra>
<template #extra v-if="detail.type && detail.type.startsWith('NVIDIA')">
<time-picker v-model="times" type="datetimerange" size="small" />
</template>
<div style="height: 200px">
<echarts-plus :options="getLineOptions({ data })" />
<template v-if="detail.type && !detail.type.startsWith('NVIDIA')">
<el-empty description="该设备厂商暂不支持任务维度监控" :image-size="60" />
</template>
<template v-else>
<echarts-plus :options="getLineOptions({ data })" />
</template>
</div>
</block-box>
</template>
Expand Down
1 change: 1 addition & 0 deletions server/api/v1/card.proto
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ message GPUReply {
int32 memory_total = 9;
string node_uid = 10;
bool health = 11;
string mode = 12;
}

message GPUsReply {
Expand Down
1 change: 1 addition & 0 deletions server/internal/biz/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ type DeviceInfo struct {
Devcore int32
Type string
Numa int
Mode string
Health bool
NodeName string
NodeUid string
Expand Down
1 change: 1 addition & 0 deletions server/internal/data/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ func (r *nodeRepo) updateLocalNodes() {
Devcore: device.Devcore,
Type: device.Type,
Numa: device.Numa,
Mode: device.Mode,
Health: device.Health,
NodeName: node.Name,
NodeUid: string(node.UID),
Expand Down
5 changes: 5 additions & 0 deletions server/internal/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,8 @@ func (s *MetricsGenerator) queryDeviceAdditional(ctx context.Context, provider,
switch provider {
case biz.NvidiaGPUDevice:
query = fmt.Sprintf("DCGM_FI_DEV_POWER_USAGE{UUID=\"%s\"}", deviceUUID)
case biz.AscendGPUDevice:
query = fmt.Sprintf("npu_chip_info_power{vdie_id=\"%s\"}", deviceUUID)
case biz.CambriconGPUDevice:
query = fmt.Sprintf("mlu_power_usage{uuid=\"%s\"}", deviceUUID)
case biz.HygonGPUDevice:
Expand All @@ -462,6 +464,9 @@ func (s *MetricsGenerator) queryDeviceAdditional(ctx context.Context, provider,
case biz.CambriconGPUDevice:
info.DriverVersion = metric["driver"]
info.DeviceNo = metric["sn"]
case biz.AscendGPUDevice:
info.DriverVersion = "暂无"
info.DeviceNo = "ascend-" + metric["id"]
case biz.HygonGPUDevice:
info.DriverVersion = "暂无"
info.DeviceNo = "dcu-" + metric["minor_number"]
Expand Down
19 changes: 12 additions & 7 deletions server/internal/provider/ascend/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,22 @@ const (
// AscendDeviceUseUUID is the annotation a user sets to select specific Ascend devices by UUID.
AscendDeviceUseUUID = "huawei.com/use-ascenduuid"
// AscendNoUseUUID is the annotation a user sets to exclude specific Ascend devices by UUID.
AscendNoUseUUID = "huawei.com/nouse-ascenduuid"
AscendResourceCoreCount = "huawei.com/Ascend910"
AscendNoUseUUID = "huawei.com/nouse-ascenduuid"
Ascend910BNodeRegisterAnno = "hami.io/node-register-Ascend910B"
Ascend310PNodeRegisterAnno = "hami.io/node-register-Ascend310P"
)

var (
AscendResourceCount string
AscendResourceMemory string
AscendResourceCores string
AscendResourceCount string
AscendResourceMemory string
AscendResourceCores string
AscendNodeRegisterAnnos []string
)

func init() {
util.InRequestDevices[AscendDevice] = "hami.io/ascend-devices-to-allocate"
util.SupportDevices[AscendDevice] = "hami.io/ascend-devices-allocated"
AscendNodeRegisterAnnos = []string{Ascend910BNodeRegisterAnno, Ascend310PNodeRegisterAnno}
util.InRequestDevices[AscendDevice] = "hami.io/Ascend910B-devices-to-allocate"
util.SupportDevices[AscendDevice] = "hami.io/Ascend910B-devices-allocated"
util.InRequestDevices["Ascend310P"] = "hami.io/Ascend310P-devices-to-allocate"
util.SupportDevices["Ascend310P"] = "hami.io/Ascend310P-devices-allocated"
}
46 changes: 20 additions & 26 deletions server/internal/provider/ascend/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@ import (
"github.com/go-kratos/kratos/v2/log"
"github.com/prometheus/common/model"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/labels"
"strings"
"strconv"
"vgpu/internal/data/prom"
"vgpu/internal/provider/util"
)
Expand Down Expand Up @@ -72,31 +71,26 @@ func (a *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.De
}

func (a *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {

nodedevices := []*util.DeviceInfo{}
i := 0
cards, _ := node.Status.Capacity.Name(corev1.ResourceName(AscendResourceCoreCount), resource.DecimalSI).AsInt64()
tmpDevice := a.GetDevicesFromPrometheus(node)
for int64(i)*10 < cards {
index := fmt.Sprintf("%d", i)
if _, ok := tmpDevice[index]; !ok {
i++
for _, anno := range AscendNodeRegisterAnnos {
tmpDevice := a.GetDevicesFromPrometheus(node)
anno, ok := node.Annotations[anno]
if !ok {
log.Infof("anno %s not found", anno)
continue
}
mode := strings.Split(tmpDevice[index].Type, "-")
nodedevices = append(nodedevices, &util.DeviceInfo{
Index: uint(i),
ID: tmpDevice[index].ID,
AliasId: node.Name + "-Ascend910-" + fmt.Sprint(i),
Count: 10,
Devmem: int32(65536),
Devcore: 100,
Type: fmt.Sprintf("%s-%s", mode[1], mode[0]),
Numa: 0,
Health: true,
Driver: "xxx",
})
i++
nodeDevices, err := util.UnMarshalNodeDevices(anno)
if err != nil {
return []*util.DeviceInfo{}, err
}
for i, nodedevice := range nodeDevices {
nodeDevices[i].AliasId = nodedevice.ID
if device, exists := tmpDevice[strconv.Itoa(i)]; exists {
nodeDevices[i].ID = device.ID
} else {
log.Infof("Key %d not found in tmpDevice", i)
}
}
return nodeDevices, nil
}
return nodedevices, nil
return []*util.DeviceInfo{}, fmt.Errorf("")
}
1 change: 1 addition & 0 deletions server/internal/provider/util/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ type DeviceInfo struct {
Devcore int32
Type string
Numa int
Mode string
Health bool
Driver string
}
Expand Down
23 changes: 19 additions & 4 deletions server/internal/provider/util/util.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package util

import (
"encoding/json"
"errors"
"fmt"
"github.com/go-kratos/kratos/v2/log"
Expand Down Expand Up @@ -60,12 +61,18 @@ func DecodeNodeDevices(str string, log *log.Helper) ([]*DeviceInfo, error) {
for _, val := range tmp {
if strings.Contains(val, ",") {
items := strings.Split(val, ",")
if len(items) == 7 {
count, _ := strconv.Atoi(items[1])
devmem, _ := strconv.Atoi(items[2])
devcore, _ := strconv.Atoi(items[3])
if len(items) >= 7 || len(items) == 9 {
count, _ := strconv.ParseInt(items[1], 10, 32)
devmem, _ := strconv.ParseInt(items[2], 10, 32)
devcore, _ := strconv.ParseInt(items[3], 10, 32)
health, _ := strconv.ParseBool(items[6])
numa, _ := strconv.Atoi(items[5])
mode := "hami-core"
index := 0
if len(items) == 9 {
index, _ = strconv.Atoi(items[7])
mode = items[8]
}
i := DeviceInfo{
ID: items[0],
AliasId: items[0],
Expand All @@ -75,6 +82,8 @@ func DecodeNodeDevices(str string, log *log.Helper) ([]*DeviceInfo, error) {
Type: items[4],
Numa: numa,
Health: health,
Mode: mode,
Index: uint(index),
}
retval = append(retval, &i)
} else {
Expand Down Expand Up @@ -307,3 +316,9 @@ func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
log.Infof("Decoded pod annos: poddevices %v", pd)
return pd, nil
}

// UnMarshalNodeDevices decodes a JSON array of device descriptions into a
// list of DeviceInfo values.
//
// str is the raw JSON text (e.g. the value of a node annotation). On success
// the decoded list is returned with a nil error; a JSON "null" yields a nil
// list and a nil error.
//
// On any decoding failure the returned list is nil and the error wraps the
// underlying encoding/json error, so callers never observe a partially
// populated list and can still inspect the cause with errors.Is / errors.As.
func UnMarshalNodeDevices(str string) ([]*DeviceInfo, error) {
	var dlist []*DeviceInfo
	if err := json.Unmarshal([]byte(str), &dlist); err != nil {
		// json.Unmarshal may have filled part of dlist before failing;
		// discard it so the result is all-or-nothing.
		return nil, fmt.Errorf("unmarshal node devices: %w", err)
	}
	return dlist, nil
}
2 changes: 2 additions & 0 deletions server/internal/service/card.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ func (s *CardService) GetAllGPUs(ctx context.Context, req *pb.GetAllGpusReq) (*p
gpu.MemoryTotal = device.Devmem
gpu.NodeUid = device.NodeUid
gpu.Health = device.Health
gpu.Mode = device.Mode

vGPU, core, memory, err := s.pod.StatisticsByDeviceId(ctx, device.AliasId)
if err == nil {
Expand Down Expand Up @@ -120,6 +121,7 @@ func (s *CardService) GetGPU(ctx context.Context, req *pb.GetGpuReq) (*pb.GPURep
gpu.MemoryTotal = device.Devmem
gpu.NodeUid = device.NodeUid
gpu.Health = device.Health
gpu.Mode = device.Mode

vGPU, core, memory, err := s.pod.StatisticsByDeviceId(ctx, device.AliasId)
if err == nil {
Expand Down
2 changes: 1 addition & 1 deletion server/internal/service/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func (s *ContainerService) GetAllContainers(ctx context.Context, req *pb.GetAllC
deviceID = device.Id
}

if filters.DeviceId != "" && filters.DeviceId != deviceID {
if filters.DeviceId != "" && !strings.HasPrefix(deviceID, filters.DeviceId) {
continue
}

Expand Down

0 comments on commit a9c25de

Please sign in to comment.