From 1957d95a68c7d61bb5c3344e09439611a3977405 Mon Sep 17 00:00:00 2001
From: Jeffrey Chien
Date: Mon, 2 Oct 2023 12:04:53 -0400
Subject: [PATCH] Filter terminated pods from node request metrics. (#104)

---
 .../internal/stores/podstore.go      | 11 ++--
 .../internal/stores/podstore_test.go | 53 +++++++++++++++++++
 2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/receiver/awscontainerinsightreceiver/internal/stores/podstore.go b/receiver/awscontainerinsightreceiver/internal/stores/podstore.go
index e907b1b6cc96..3babc6a37b45 100644
--- a/receiver/awscontainerinsightreceiver/internal/stores/podstore.go
+++ b/receiver/awscontainerinsightreceiver/internal/stores/podstore.go
@@ -303,10 +303,13 @@ func (p *PodStore) refreshInternal(now time.Time, podList []corev1.Pod) {
 			p.logger.Warn(fmt.Sprintf("podKey is unavailable, refresh pod store for pod %s", pod.Name))
 			continue
 		}
-		tmpCPUReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getCPUCapacity(), cpuKey, getRequestForContainer)
-		cpuRequest += tmpCPUReq
-		tmpMemReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getMemCapacity(), memoryKey, getRequestForContainer)
-		memRequest += tmpMemReq
+		// filter out terminated pods
+		if pod.Status.Phase != corev1.PodSucceeded && pod.Status.Phase != corev1.PodFailed {
+			tmpCPUReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getCPUCapacity(), cpuKey, getRequestForContainer)
+			cpuRequest += tmpCPUReq
+			tmpMemReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getMemCapacity(), memoryKey, getRequestForContainer)
+			memRequest += tmpMemReq
+		}
 		if pod.Status.Phase == corev1.PodRunning {
 			podCount++
 		}
diff --git a/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go b/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go
index 982fca985cdb..6a5c98b32455 100644
--- a/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go
+++ b/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go
@@ -1141,6 +1141,59 @@ func TestPodStore_decorateNode(t *testing.T) {
 	assert.Equal(t, uint64(15), metric.GetField("node_status_allocatable_pods").(uint64))
 }
 
+func TestPodStore_decorateNode_multiplePodStates(t *testing.T) {
+	podStore := getPodStore()
+	defer require.NoError(t, podStore.Shutdown())
+
+	tags := map[string]string{ci.MetricType: ci.TypeNode}
+	fields := map[string]interface{}{
+		ci.MetricName(ci.TypeNode, ci.CPUTotal):      float64(100),
+		ci.MetricName(ci.TypeNode, ci.CPULimit):      uint64(4000),
+		ci.MetricName(ci.TypeNode, ci.MemWorkingset): float64(100 * 1024 * 1024),
+		ci.MetricName(ci.TypeNode, ci.MemLimit):      uint64(400 * 1024 * 1024),
+	}
+	metric := generateMetric(fields, tags)
+
+	// terminated pods should not contribute to requests
+	failedPod := generatePodInfo("./test_resources/pod_in_phase_failed.json")
+	succeededPod := generatePodInfo("./test_resources/pod_in_phase_succeeded.json")
+	podList := []corev1.Pod{*failedPod, *succeededPod}
+	podStore.refreshInternal(time.Now(), podList)
+	podStore.decorateNode(metric)
+
+	assert.Equal(t, uint64(0), metric.GetField("node_cpu_request").(uint64))
+	assert.Equal(t, uint64(4000), metric.GetField("node_cpu_limit").(uint64))
+	assert.Equal(t, float64(0), metric.GetField("node_cpu_reserved_capacity").(float64))
+	assert.Equal(t, float64(100), metric.GetField("node_cpu_usage_total").(float64))
+
+	assert.Equal(t, uint64(0), metric.GetField("node_memory_request").(uint64))
+	assert.Equal(t, uint64(400*1024*1024), metric.GetField("node_memory_limit").(uint64))
+	assert.Equal(t, float64(0), metric.GetField("node_memory_reserved_capacity").(float64))
+	assert.Equal(t, float64(100*1024*1024), metric.GetField("node_memory_working_set").(float64))
+
+	// non-terminated pods should contribute to requests
+	pendingPod := generatePodInfo("./test_resources/pod_in_phase_pending.json")
+	podList = append(podList, *pendingPod)
+	podStore.refreshInternal(time.Now(), podList)
+	podStore.decorateNode(metric)
+	assert.Equal(t, uint64(10), metric.GetField("node_cpu_request").(uint64))
+	assert.Equal(t, float64(0.25), metric.GetField("node_cpu_reserved_capacity").(float64))
+
+	assert.Equal(t, uint64(50*1024*1024), metric.GetField("node_memory_request").(uint64))
+	assert.Equal(t, float64(12.5), metric.GetField("node_memory_reserved_capacity").(float64))
+
+	runningPod := generatePodInfo("./test_resources/pod_in_phase_running.json")
+	podList = append(podList, *runningPod)
+	podStore.refreshInternal(time.Now(), podList)
+	podStore.decorateNode(metric)
+
+	assert.Equal(t, uint64(20), metric.GetField("node_cpu_request").(uint64))
+	assert.Equal(t, float64(0.5), metric.GetField("node_cpu_reserved_capacity").(float64))
+
+	assert.Equal(t, uint64(100*1024*1024), metric.GetField("node_memory_request").(uint64))
+	assert.Equal(t, float64(25), metric.GetField("node_memory_reserved_capacity").(float64))
+}
+
 func TestPodStore_Decorate(t *testing.T) {
 	// not the metrics for decoration
 	tags := map[string]string{}
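
Note on the change above: the behavior shift in refreshInternal reduces to a single phase check, which can be read as a standalone predicate. The sketch below is illustrative only; the isTerminated helper does not exist in the patch, which inlines the condition.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// isTerminated restates the phase check added in refreshInternal: a pod in
// the Succeeded or Failed phase has finished running, so its CPU and memory
// requests no longer count toward node_cpu_request/node_memory_request.
func isTerminated(pod *corev1.Pod) bool {
	return pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed
}

func main() {
	pod := corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodSucceeded}}
	fmt.Println(isTerminated(&pod)) // true: excluded from node request totals
}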
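The expected reserved-capacity values in the new test follow from request over limit expressed as a percentage, against the node limits set in the metric fields (4000 millicores CPU, 400MiB memory). The per-pod requests of 10 millicores and 50MiB are inferred from the asserted deltas, not stated in the patch. A minimal sketch of the arithmetic:

package main

import "fmt"

// reservedPercent mirrors the arithmetic behind node_cpu_reserved_capacity
// and node_memory_reserved_capacity: the aggregated request expressed as a
// percentage of the node's limit.
func reservedPercent(request, limit uint64) float64 {
	return float64(request) / float64(limit) * 100
}

func main() {
	// Pending pod only: 10m of 4000m CPU, 50MiB of 400MiB memory.
	fmt.Println(reservedPercent(10, 4000))                     // 0.25
	fmt.Println(reservedPercent(50*1024*1024, 400*1024*1024))  // 12.5
	// Pending + running pods: requests double.
	fmt.Println(reservedPercent(20, 4000))                     // 0.5
	fmt.Println(reservedPercent(100*1024*1024, 400*1024*1024)) // 25
}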