Skip to content

Commit

Permalink
fix: fix comments
Browse files Browse the repository at this point in the history
Signed-off-by: Rouzip <[email protected]>
  • Loading branch information
Rouzip committed Jan 5, 2025
1 parent 1dca501 commit ab3c8f8
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 26 deletions.
3 changes: 2 additions & 1 deletion pkg/koordlet/metriccache/metric_resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,5 +74,6 @@ var (
HostAppMemoryUsageWithPageCacheMetric = defaultMetricFactory.New(HostAppMemoryWithPageCacheUsage).withPropertySchema(MetricPropertyHostAppName)

// Resctrl
QosResctrl = defaultMetricFactory.New(ResctrlQos).withPropertySchema(MetricPropertyNodeQos, MetricPropertyResctrlType, MetricPropertyResctrlCacheId, MetricPropertyResctrlMbType)
ResctrlLLCMetric = defaultMetricFactory.New(ResctrlLLC).withPropertySchema(MetricPropertyQos, MetricPropertyResctrlCacheId)
ResctrlMBMetric = defaultMetricFactory.New(ResctrlMB).withPropertySchema(MetricPropertyQos, MetricPropertyResctrlCacheId, MetricPropertyResctrlMbType)
)
26 changes: 19 additions & 7 deletions pkg/koordlet/metriccache/metric_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ const (
ContainerMetricCPI MetricKind = "container_cpi"

// Resctrl
ResctrlQos MetricKind = "qos_resctrl_resource"
ResctrlLLC MetricKind = "resctrl_resource_llc"
ResctrlMB MetricKind = "resctrl_resource_mb"

// PSI
ContainerMetricPSI MetricKind = "container_psi"
Expand Down Expand Up @@ -106,7 +107,7 @@ const (

MetricPropertyCPIResource MetricProperty = "cpi_resource"

MetricPropertyNodeQos MetricProperty = "node_qos"
MetricPropertyQos MetricProperty = "qos"

MetricPropertyResctrlType MetricProperty = "resctrl_type"
MetricPropertyResctrlCacheId MetricProperty = "cache_id"
Expand Down Expand Up @@ -138,6 +139,9 @@ const (
PSIDegreeFull MetricPropertyValue = "full"
PSIDegreeSome MetricPropertyValue = "some"

ResctrlTypeLLC MetricPropertyValue = "llc"
ResctrlTypeMB MetricPropertyValue = "mb"

BEResourceCPU MetricPropertyValue = "cpu"
BEResourceAllocationUsage MetricPropertyValue = "usage"
BEResourceAllocationRealLimit MetricPropertyValue = "real-limit"
Expand All @@ -151,7 +155,8 @@ var MetricPropertiesFunc = struct {
GPU func(string, string) map[MetricProperty]string
PSICPUFullSupported func(string, string) map[MetricProperty]string
ContainerCPI func(string, string, string) map[MetricProperty]string
QosResctrl func(string, int, string, string) map[MetricProperty]string
ResctrlLLC func(string, int) map[MetricProperty]string
ResctrlMB func(string, int, string) map[MetricProperty]string
PodPSI func(string, string, string, string) map[MetricProperty]string
ContainerPSI func(string, string, string, string, string) map[MetricProperty]string
PodGPU func(string, string, string) map[MetricProperty]string
Expand All @@ -171,13 +176,20 @@ var MetricPropertiesFunc = struct {
PSICPUFullSupported: func(podUID, containerID string) map[MetricProperty]string {
return map[MetricProperty]string{MetricPropertyPodUID: podUID, MetricPropertyContainerID: containerID}
},
QosResctrl: func(qos string, cacheid int, resctrlType string, resctrlMbType string) map[MetricProperty]string {
ResctrlLLC: func(qos string, cacheid int) map[MetricProperty]string {
return map[MetricProperty]string{
MetricPropertyResctrlCacheId: strconv.Itoa(cacheid),
MetricPropertyQos: qos,
}
},
ResctrlMB: func(qos string, cacheid int, mbType string) map[MetricProperty]string {
return map[MetricProperty]string{
MetricPropertyResctrlType: resctrlType,
MetricPropertyResctrlCacheId: strconv.Itoa(cacheid),
MetricPropertyResctrlMbType: resctrlMbType,
MetricPropertyNodeQos: qos,
MetricPropertyResctrlType: string(ResctrlTypeMB),
MetricPropertyResctrlMbType: mbType,
MetricPropertyQos: qos,
}

},
ContainerCPI: func(podUID, containerID, cpiResource string) map[MetricProperty]string {
return map[MetricProperty]string{MetricPropertyPodUID: podUID, MetricPropertyContainerID: containerID, MetricPropertyCPIResource: cpiResource}
Expand Down
36 changes: 26 additions & 10 deletions pkg/koordlet/metrics/resctrl.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,26 +33,42 @@ const (
)

var (
QosResctrl = prometheus.NewGaugeVec(prometheus.GaugeOpts{
ResctrlLLC = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Subsystem: KoordletSubsystem,
Name: "qos_resctrl",
Help: "qos resctrl collected by koordlet",
}, []string{NodeKey, ResctrlResourceType, ResctrlCacheId, ResctrlQos, ResctrlMbType})
Name: "resctrl_llc_occupancy",
Help: "resctrl default qos(LSR, LS, BE) llc occupancy collected by koordlet",
}, []string{NodeKey, ResctrlCacheId, ResctrlQos})
ResctrlMB = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Subsystem: KoordletSubsystem,
Name: "resctrl_memory_bandwidth",
Help: "resctrl default qos(LSR, LS, BE) memory bandwidth collected by koordlet",
}, []string{NodeKey, ResctrlCacheId, ResctrlQos, ResctrlMbType})

ResctrlCollectors = []prometheus.Collector{
QosResctrl,
ResctrlLLC,
ResctrlMB,
}
)

func ResetQosResctrl() {
QosResctrl.Reset()
// ResetResctrlLLCQos calls Reset on the ResctrlLLC gauge vector, so that
// previously recorded LLC occupancy series are dropped before new values are set.
func ResetResctrlLLCQos() {
ResctrlLLC.Reset()
}

// ResetResctrlMBQos calls Reset on the ResctrlMB gauge vector, so that
// previously recorded memory bandwidth series are dropped before new values are set.
func ResetResctrlMBQos() {
ResctrlMB.Reset()
}

// RecordResctrlLLC sets the ResctrlLLC gauge to value for the given cache ID
// and QoS. The label set is the one returned by genNodeLabels, extended with
// the ResctrlCacheId and ResctrlQos labels.
func RecordResctrlLLC(cacheId int, qos string, value uint64) {
	labels := genNodeLabels()
	// Assigning into a nil map panics, so skip recording when no base labels
	// are available.
	// NOTE(review): genNodeLabels is defined outside this view; confirm that it
	// can return nil (e.g. before the node is registered).
	if labels == nil {
		return
	}
	labels[ResctrlCacheId] = strconv.Itoa(cacheId)
	labels[ResctrlQos] = qos
	ResctrlLLC.With(labels).Set(float64(value))
}

func RecordQosResctrl(resourceType string, cacheId int, qos, mbType string, value uint64) {
func RecordResctrlMB(cacheId int, qos, mbType string, value uint64) {
labels := genNodeLabels()
labels[ResctrlResourceType] = resourceType
labels[ResctrlCacheId] = strconv.Itoa(cacheId)
labels[ResctrlQos] = qos
labels[ResctrlMbType] = mbType
QosResctrl.With(labels).Set(float64(value))
ResctrlMB.With(labels).Set(float64(value))
}
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ func (r *resctrlCollector) collectQoSResctrlStat() {
continue
}
for cacheId, value := range l3Map {
metrics.RecordQosResctrl(metrics.ResourceTypeLLC, int(cacheId), qos, "", value)
llcSample, err := metriccache.QosResctrl.GenerateSample(metriccache.MetricPropertiesFunc.QosResctrl(qos, int(cacheId), metrics.ResourceTypeLLC, ""), collectTime, float64(value))
metrics.RecordResctrlLLC(int(cacheId), qos, value)
llcSample, err := metriccache.ResctrlLLCMetric.GenerateSample(metriccache.MetricPropertiesFunc.ResctrlLLC(qos, int(cacheId)), collectTime, float64(value))
if err != nil {
klog.Warningf("generate QoS %s resctrl llc sample error: %v", qos, err)
}
Expand All @@ -114,8 +114,8 @@ func (r *resctrlCollector) collectQoSResctrlStat() {
}
for cacheId, value := range mbMap {
for mbType, mbValue := range value {
metrics.RecordQosResctrl(metrics.ResourceTypeMB, int(cacheId), qos, mbType, mbValue)
mbSample, err := metriccache.QosResctrl.GenerateSample(metriccache.MetricPropertiesFunc.QosResctrl(qos, int(cacheId), metrics.ResourceTypeMB, mbType), collectTime, float64(mbValue))
metrics.RecordResctrlMB(int(cacheId), qos, mbType, mbValue)
mbSample, err := metriccache.ResctrlMBMetric.GenerateSample(metriccache.MetricPropertiesFunc.ResctrlMB(qos, int(cacheId), mbType), collectTime, float64(mbValue))
if err != nil {
klog.V(4).Infof("generate QoS %s resctrl mb sample error: %v", qos, err)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/koordlet/metricsadvisor/framework/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func NewDefaultConfig() *Config {
PSICollectorInterval: 10 * time.Second,
CPICollectorTimeWindow: 10 * time.Second,
ColdPageCollectorInterval: 5 * time.Second,
ResctrlCollectorInterval: 1 * time.Second,
ResctrlCollectorInterval: 10 * time.Second,
EnablePageCacheCollector: false,
EnableResctrlCollector: false,
}
Expand Down
17 changes: 14 additions & 3 deletions pkg/koordlet/resourceexecutor/resctrl.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,14 @@ func NewResctrlQoSReader() ResctrlReader {
return &ResctrlAMDReader{}
}

// ReadResctrlL3Stat reads the resctrl L3 cache occupancy of every monitoring domain found under parent, keyed by cache ID.
// For more information about x86 resctrl, refer to: https://docs.kernel.org/arch/x86/resctrl.html
func (rr *ResctrlBaseReader) ReadResctrlL3Stat(parent string) (map[CacheId]uint64, error) {
l3Stat := make(map[CacheId]uint64)
monDataPath := system.GetResctrlMonDataPath(parent)
fd, err := os.Open(monDataPath)
if err != nil {
return nil, fmt.Errorf(ErrResctrlDir)
return nil, errors.New(ErrResctrlDir)
}
defer fd.Close()
// read all l3-memory domains
Expand All @@ -103,43 +105,52 @@ func (rr *ResctrlBaseReader) ReadResctrlL3Stat(parent string) (map[CacheId]uint6
return nil, fmt.Errorf("%s, cannot find L3 domains, err: %w", ErrResctrlDir, err)
}
for _, domain := range domains {
// Convert the cache ID from the domain name string to an integer.
cacheId, err := strconv.Atoi(strings.Split(domain.Name(), "_")[CacheIdIndex])
if err != nil {
return nil, fmt.Errorf("%s, cannot get cacheid, err: %w", ErrResctrlDir, err)
}
// Construct the path to the resctrl L3 cache occupancy file.
path := system.ResctrlLLCOccupancy.Path(filepath.Join(parent, system.ResctrlMonData, domain.Name()))
l3Byte, err := os.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("%s, cannot read from resctrl file system, err: %w",
ErrResctrlDir, err)
}
// Parse the L3 cache usage data from the file content.
l3Usage, err := strconv.ParseUint(string(l3Byte), 10, 64)
if err != nil {
return nil, fmt.Errorf("cannot parse result, err: %w", err)
return nil, fmt.Errorf("cannot parse L3 cache usage, err: %w", err)
}
l3Stat[CacheId(cacheId)] = l3Usage
}
return l3Stat, nil
}

// ReadResctrlMBStat reads the resctrl memory bandwidth statistics of every monitoring domain found under parent, keyed by cache ID.
// For more information about x86 resctrl, refer to: https://docs.kernel.org/arch/x86/resctrl.html
func (rr *ResctrlBaseReader) ReadResctrlMBStat(parent string) (map[CacheId]system.MBStatData, error) {
mbStat := make(map[CacheId]system.MBStatData)
monDataPath := system.GetResctrlMonDataPath(parent)
fd, err := os.Open(monDataPath)
if err != nil {
return nil, fmt.Errorf(ErrResctrlDir)
return nil, errors.New(ErrResctrlDir)
}
// read all l3-memory domains
domains, err := fd.ReadDir(-1)
if err != nil {
return nil, fmt.Errorf("%s, cannot find L3 domains, err: %w", ErrResctrlDir, err)
}
for _, domain := range domains {
// Convert the cache ID from the domain name string to an integer.
cacheId, err := strconv.Atoi(strings.Split(domain.Name(), "_")[CacheIdIndex])
if err != nil {
return nil, fmt.Errorf("%s, cannot get cacheid, err: %w", ErrResctrlDir, err)
}
mbStat[CacheId(cacheId)] = make(system.MBStatData)
// Read both the local and the total memory bandwidth counters of this domain.
// The local memory bandwidth is the traffic between the domain and the memory attached to it.
// The total memory bandwidth is the local traffic plus the traffic to memory attached to other domains.
for _, mbResource := range []system.Resource{
system.ResctrlMBLocal, system.ResctrlMBTotal,
} {
Expand Down

0 comments on commit ab3c8f8

Please sign in to comment.