diff --git a/pkg/koordlet/metriccache/metric_resources.go b/pkg/koordlet/metriccache/metric_resources.go index f40e100fa..623ab0e08 100644 --- a/pkg/koordlet/metriccache/metric_resources.go +++ b/pkg/koordlet/metriccache/metric_resources.go @@ -74,5 +74,6 @@ var ( HostAppMemoryUsageWithPageCacheMetric = defaultMetricFactory.New(HostAppMemoryWithPageCacheUsage).withPropertySchema(MetricPropertyHostAppName) // Resctrl - QosResctrl = defaultMetricFactory.New(ResctrlQos).withPropertySchema(MetricPropertyNodeQos, MetricPropertyResctrlType, MetricPropertyResctrlCacheId, MetricPropertyResctrlMbType) + ResctrlLLCMetric = defaultMetricFactory.New(ResctrlLLC).withPropertySchema(MetricPropertyQos, MetricPropertyResctrlCacheId) + ResctrlMBMetric = defaultMetricFactory.New(ResctrlMB).withPropertySchema(MetricPropertyQos, MetricPropertyResctrlCacheId, MetricPropertyResctrlMbType) ) diff --git a/pkg/koordlet/metriccache/metric_types.go b/pkg/koordlet/metriccache/metric_types.go index bfdc0ee0a..15d53c58e 100644 --- a/pkg/koordlet/metriccache/metric_types.go +++ b/pkg/koordlet/metriccache/metric_types.go @@ -75,7 +75,8 @@ const ( ContainerMetricCPI MetricKind = "container_cpi" // Resctrl - ResctrlQos MetricKind = "qos_resctrl_resource" + ResctrlLLC MetricKind = "resctrl_resource_llc" + ResctrlMB MetricKind = "resctrl_resource_mb" // PSI ContainerMetricPSI MetricKind = "container_psi" @@ -106,7 +107,7 @@ const ( MetricPropertyCPIResource MetricProperty = "cpi_resource" - MetricPropertyNodeQos MetricProperty = "node_qos" + MetricPropertyQos MetricProperty = "qos" MetricPropertyResctrlType MetricProperty = "resctrl_type" MetricPropertyResctrlCacheId MetricProperty = "cache_id" @@ -138,6 +139,9 @@ const ( PSIDegreeFull MetricPropertyValue = "full" PSIDegreeSome MetricPropertyValue = "some" + ResctrlTypeLLC MetricPropertyValue = "llc" + ResctrlTypeMB MetricPropertyValue = "mb" + BEResourceCPU MetricPropertyValue = "cpu" BEResourceAllocationUsage MetricPropertyValue = "usage" 
BEResourceAllocationRealLimit MetricPropertyValue = "real-limit" @@ -151,7 +155,8 @@ var MetricPropertiesFunc = struct { GPU func(string, string) map[MetricProperty]string PSICPUFullSupported func(string, string) map[MetricProperty]string ContainerCPI func(string, string, string) map[MetricProperty]string - QosResctrl func(string, int, string, string) map[MetricProperty]string + ResctrlLLC func(string, int) map[MetricProperty]string + ResctrlMB func(string, int, string) map[MetricProperty]string PodPSI func(string, string, string, string) map[MetricProperty]string ContainerPSI func(string, string, string, string, string) map[MetricProperty]string PodGPU func(string, string, string) map[MetricProperty]string @@ -171,13 +176,20 @@ var MetricPropertiesFunc = struct { PSICPUFullSupported: func(podUID, containerID string) map[MetricProperty]string { return map[MetricProperty]string{MetricPropertyPodUID: podUID, MetricPropertyContainerID: containerID} }, - QosResctrl: func(qos string, cacheid int, resctrlType string, resctrlMbType string) map[MetricProperty]string { + ResctrlLLC: func(qos string, cacheid int) map[MetricProperty]string { + return map[MetricProperty]string{ + MetricPropertyResctrlCacheId: strconv.Itoa(cacheid), + MetricPropertyQos: qos, + } + }, + ResctrlMB: func(qos string, cacheid int, mbType string) map[MetricProperty]string { return map[MetricProperty]string{ - MetricPropertyResctrlType: resctrlType, MetricPropertyResctrlCacheId: strconv.Itoa(cacheid), - MetricPropertyResctrlMbType: resctrlMbType, - MetricPropertyNodeQos: qos, + MetricPropertyResctrlType: string(ResctrlTypeMB), + MetricPropertyResctrlMbType: mbType, + MetricPropertyQos: qos, } + }, ContainerCPI: func(podUID, containerID, cpiResource string) map[MetricProperty]string { return map[MetricProperty]string{MetricPropertyPodUID: podUID, MetricPropertyContainerID: containerID, MetricPropertyCPIResource: cpiResource} diff --git a/pkg/koordlet/metrics/resctrl.go b/pkg/koordlet/metrics/resctrl.go 
index f340c25fe..6c5379727 100644 --- a/pkg/koordlet/metrics/resctrl.go +++ b/pkg/koordlet/metrics/resctrl.go @@ -33,26 +33,42 @@ const ( ) var ( - QosResctrl = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + ResctrlLLC = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Subsystem: KoordletSubsystem, - Name: "qos_resctrl", - Help: "qos resctrl collected by koordlet", - }, []string{NodeKey, ResctrlResourceType, ResctrlCacheId, ResctrlQos, ResctrlMbType}) + Name: "resctrl_llc_occupancy", + Help: "resctrl default qos(LSR, LS, BE) llc occupancy collected by koordlet", + }, []string{NodeKey, ResctrlCacheId, ResctrlQos}) + ResctrlMB = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: KoordletSubsystem, + Name: "resctrl_memory_bandwidth", + Help: "resctrl default qos(LSR, LS, BE) memory bandwidth collected by koordlet", + }, []string{NodeKey, ResctrlCacheId, ResctrlQos, ResctrlMbType}) ResctrlCollectors = []prometheus.Collector{ - QosResctrl, + ResctrlLLC, + ResctrlMB, } ) -func ResetQosResctrl() { - QosResctrl.Reset() +func ResetResctrlLLCQos() { + ResctrlLLC.Reset() +} + +func ResetResctrlMBQos() { + ResctrlMB.Reset() +} + +func RecordResctrlLLC(cacheId int, qos string, value uint64) { + labels := genNodeLabels() + labels[ResctrlCacheId] = strconv.Itoa(cacheId) + labels[ResctrlQos] = qos + ResctrlLLC.With(labels).Set(float64(value)) } -func RecordQosResctrl(resourceType string, cacheId int, qos, mbType string, value uint64) { +func RecordResctrlMB(cacheId int, qos, mbType string, value uint64) { labels := genNodeLabels() - labels[ResctrlResourceType] = resourceType labels[ResctrlCacheId] = strconv.Itoa(cacheId) labels[ResctrlQos] = qos labels[ResctrlMbType] = mbType - QosResctrl.With(labels).Set(float64(value)) + ResctrlMB.With(labels).Set(float64(value)) } diff --git a/pkg/koordlet/metricsadvisor/collectors/resctrl/resctrl_collector.go b/pkg/koordlet/metricsadvisor/collectors/resctrl/resctrl_collector.go index 7631f5c6e..6d5ca9649 100644 --- 
a/pkg/koordlet/metricsadvisor/collectors/resctrl/resctrl_collector.go +++ b/pkg/koordlet/metricsadvisor/collectors/resctrl/resctrl_collector.go @@ -100,8 +100,8 @@ func (r *resctrlCollector) collectQoSResctrlStat() { continue } for cacheId, value := range l3Map { - metrics.RecordQosResctrl(metrics.ResourceTypeLLC, int(cacheId), qos, "", value) - llcSample, err := metriccache.QosResctrl.GenerateSample(metriccache.MetricPropertiesFunc.QosResctrl(qos, int(cacheId), metrics.ResourceTypeLLC, ""), collectTime, float64(value)) + metrics.RecordResctrlLLC(int(cacheId), qos, value) + llcSample, err := metriccache.ResctrlLLCMetric.GenerateSample(metriccache.MetricPropertiesFunc.ResctrlLLC(qos, int(cacheId)), collectTime, float64(value)) if err != nil { klog.Warningf("generate QoS %s resctrl llc sample error: %v", qos, err) } @@ -114,8 +114,8 @@ func (r *resctrlCollector) collectQoSResctrlStat() { } for cacheId, value := range mbMap { for mbType, mbValue := range value { - metrics.RecordQosResctrl(metrics.ResourceTypeMB, int(cacheId), qos, mbType, mbValue) - mbSample, err := metriccache.QosResctrl.GenerateSample(metriccache.MetricPropertiesFunc.QosResctrl(qos, int(cacheId), metrics.ResourceTypeMB, mbType), collectTime, float64(mbValue)) + metrics.RecordResctrlMB(int(cacheId), qos, mbType, mbValue) + mbSample, err := metriccache.ResctrlMBMetric.GenerateSample(metriccache.MetricPropertiesFunc.ResctrlMB(qos, int(cacheId), mbType), collectTime, float64(mbValue)) if err != nil { klog.V(4).Infof("generate QoS %s resctrl mb sample error: %v", qos, err) } diff --git a/pkg/koordlet/metricsadvisor/framework/config.go b/pkg/koordlet/metricsadvisor/framework/config.go index c1c2ca17d..0bf8e97c5 100644 --- a/pkg/koordlet/metricsadvisor/framework/config.go +++ b/pkg/koordlet/metricsadvisor/framework/config.go @@ -50,7 +50,7 @@ func NewDefaultConfig() *Config { PSICollectorInterval: 10 * time.Second, CPICollectorTimeWindow: 10 * time.Second, ColdPageCollectorInterval: 5 * time.Second, - 
ResctrlCollectorInterval: 1 * time.Second, + ResctrlCollectorInterval: 10 * time.Second, EnablePageCacheCollector: false, EnableResctrlCollector: false, } diff --git a/pkg/koordlet/resourceexecutor/resctrl.go b/pkg/koordlet/resourceexecutor/resctrl.go index 2f77b93f3..03112e75c 100644 --- a/pkg/koordlet/resourceexecutor/resctrl.go +++ b/pkg/koordlet/resourceexecutor/resctrl.go @@ -89,12 +89,14 @@ func NewResctrlQoSReader() ResctrlReader { return &ResctrlAMDReader{} } +// ReadResctrlL3Stat: Reads the resctrl L3 cache statistics based on NUMA domain. +// For more information about x86 resctrl, refer to: https://docs.kernel.org/arch/x86/resctrl.html func (rr *ResctrlBaseReader) ReadResctrlL3Stat(parent string) (map[CacheId]uint64, error) { l3Stat := make(map[CacheId]uint64) monDataPath := system.GetResctrlMonDataPath(parent) fd, err := os.Open(monDataPath) if err != nil { - return nil, fmt.Errorf(ErrResctrlDir) + return nil, errors.New(ErrResctrlDir) } defer fd.Close() // read all l3-memory domains @@ -103,31 +105,36 @@ func (rr *ResctrlBaseReader) ReadResctrlL3Stat(parent string) (map[CacheId]uint6 return nil, fmt.Errorf("%s, cannot find L3 domains, err: %w", ErrResctrlDir, err) } for _, domain := range domains { + // Convert the cache ID from the domain name string to an integer. cacheId, err := strconv.Atoi(strings.Split(domain.Name(), "_")[CacheIdIndex]) if err != nil { return nil, fmt.Errorf("%s, cannot get cacheid, err: %w", ErrResctrlDir, err) } + // Construct the path to the resctrl L3 cache occupancy file. path := system.ResctrlLLCOccupancy.Path(filepath.Join(parent, system.ResctrlMonData, domain.Name())) l3Byte, err := os.ReadFile(path) if err != nil { return nil, fmt.Errorf("%s, cannot read from resctrl file system, err: %w", ErrResctrlDir, err) } + // Parse the L3 cache usage data from the file content. 
l3Usage, err := strconv.ParseUint(string(l3Byte), 10, 64) if err != nil { - return nil, fmt.Errorf("cannot parse result, err: %w", err) + return nil, fmt.Errorf("cannot parse L3 cache usage, err: %w", err) } l3Stat[CacheId(cacheId)] = l3Usage } return l3Stat, nil } +// ReadResctrlMBStat: Reads the resctrl memory bandwidth statistics based on NUMA domain. +// For more information about x86 resctrl, refer to: https://docs.kernel.org/arch/x86/resctrl.html func (rr *ResctrlBaseReader) ReadResctrlMBStat(parent string) (map[CacheId]system.MBStatData, error) { mbStat := make(map[CacheId]system.MBStatData) monDataPath := system.GetResctrlMonDataPath(parent) fd, err := os.Open(monDataPath) if err != nil { - return nil, fmt.Errorf(ErrResctrlDir) + return nil, errors.New(ErrResctrlDir) } // read all l3-memory domains domains, err := fd.ReadDir(-1) @@ -135,11 +142,15 @@ func (rr *ResctrlBaseReader) ReadResctrlMBStat(parent string) (map[CacheId]syste return nil, fmt.Errorf("%s, cannot find L3 domains, err: %w", ErrResctrlDir, err) } for _, domain := range domains { + // Convert the cache ID from the domain name string to an integer. cacheId, err := strconv.Atoi(strings.Split(domain.Name(), "_")[CacheIdIndex]) if err != nil { return nil, fmt.Errorf("%s, cannot get cacheid, err: %w", ErrResctrlDir, err) } mbStat[CacheId(cacheId)] = make(system.MBStatData) + // Read the memory bandwidth statistics for the local and total memory bandwidth. + // The local memory bandwidth is the memory bandwidth consumed by the domain itself. + // The total memory bandwidth is the memory bandwidth consumed by the domain and accessed by other domains. for _, mbResource := range []system.Resource{ system.ResctrlMBLocal, system.ResctrlMBTotal, } {