From 4f0ee2170b7449f7ae373a8873d6f6fca1c4f172 Mon Sep 17 00:00:00 2001 From: Paige Rubendall Date: Thu, 26 May 2022 15:03:11 -0400 Subject: [PATCH 1/5] combining metrics profiles --- .../metrics-profiles/metrics-ovn.yaml | 137 ------------------ .../kube-burner/metrics-profiles/metrics.yaml | 5 + 2 files changed, 5 insertions(+), 137 deletions(-) delete mode 100644 workloads/kube-burner/metrics-profiles/metrics-ovn.yaml diff --git a/workloads/kube-burner/metrics-profiles/metrics-ovn.yaml b/workloads/kube-burner/metrics-profiles/metrics-ovn.yaml deleted file mode 100644 index 2b9f329f..00000000 --- a/workloads/kube-burner/metrics-profiles/metrics-ovn.yaml +++ /dev/null @@ -1,137 +0,0 @@ -# API server - -- query: irate(apiserver_request_total{verb="POST", resource="pods", subresource="binding",code="201"}[2m]) > 0 - metricName: schedulingThroughput - -- query: histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb, scope)) > 0 - metricName: readOnlyAPICallsLatency - -- query: histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb, scope)) > 0 - metricName: mutatingAPICallsLatency - -- query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH"}[2m])) by (verb,resource,code) > 0 - metricName: APIRequestRate - -# Kubeproxy and OVN service sync latency - -- query: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 0 - metricName: serviceSyncLatency - -# Containers & pod metrics -- query: sum(irate(container_cpu_usage_seconds_total{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}[2m]) * 100) by (pod, namespace, node) - metricName: podCPU - -- query: sum(container_memory_rss{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}) by (pod, namespace, node) - metricName: podMemory - -- query: (sum(irate(container_cpu_usage_seconds_total{name!="",container!~"POD|",namespace=~"openshift-(etcd|.*apiserver|ovn-kubernetes|sdn|ingress|.*controller-manager|.*scheduler)"}[2m]) * 100) by (container, pod, namespace, node)) > 0 - metricName: containerCPU - -- query: sum(container_memory_rss{name!="",container!~"POD|",namespace=~"openshift-(etcd|.*apiserver|ovn-kubernetes|sdn|ingress|.*controller-manager|.*scheduler)"}) by (container, pod, namespace, node) - metricName: containerMemory - -# Kubelet & CRI-O runtime metrics - -- query: sum(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"} - metricName: kubeletCPU - -- query: sum(process_resident_memory_bytes{service="kubelet",job="kubelet"}) by (node) and on (node) kube_node_role{role="worker"} - metricName: kubeletMemory - -- query: sum(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"} - metricName: crioCPU - -- query: sum(process_resident_memory_bytes{service="kubelet",job="crio"}) by (node) and on (node) kube_node_role{role="worker"} - metricName: crioMemory - -- query: irate(container_runtime_crio_operations_latency_microseconds{operation_type="network_setup_pod"}[2m]) > 0 - metricName: containerNetworkSetupLatency - -# Node metrics: CPU & Memory - -- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) > 0 - metricName: nodeCPU-Workers - -- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0 - metricName: nodeCPU-Masters - -- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) > 0 - metricName: nodeCPU-Infra - -- query: node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)") - metricName: nodeMemoryAvailable-Masters - -- query: node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)") - metricName: nodeMemoryAvailable-Workers - -# We compute memory utilization by substrating available memory to the total - -- query: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)") - metricName: nodeMemoryUtilization-Workers - -- query: node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)") - metricName: nodeMemoryAvailable-Infra - -- query: node_memory_MemTotal_bytes and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)") - metricName: nodeMemoryTotal-Masters - -- query: node_memory_MemTotal_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)") - metricName: nodeMemoryTotal-Workers - -- query: node_memory_MemTotal_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)") - metricName: nodeMemoryTotal-Infra - -# Etcd metrics - -- query: sum(rate(etcd_server_leader_changes_seen_total[2m])) - metricName: etcdLeaderChangesRate - -- query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m])) - metricName: 99thEtcdDiskBackendCommitDurationSeconds - -- query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m])) - metricName: 99thEtcdDiskWalFsyncDurationSeconds - -- query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) - metricName: 99thEtcdRoundTripTimeSeconds - -- query: sum by (cluster_version)(etcd_cluster_version) - metricName: etcdVersion - instant: true - -# Cluster metrics - -- query: sum(kube_namespace_status_phase) by (phase) > 0 - metricName: namespaceCount - -- query: sum(kube_pod_status_phase{}) by (phase) - metricName: podStatusCount - -- query: count(kube_secret_info{}) - metricName: secretCount - instant: true - -- query: count(kube_deployment_labels{}) - metricName: deploymentCount - instant: true - -- query: count(kube_configmap_info{}) - metricName: configmapCount - instant: true - -- query: count(kube_service_info{}) - metricName: serviceCount - instant: true - -- query: kube_node_role - metricName: nodeRoles - -- query: sum(kube_node_status_condition{status="true"}) by (condition) - metricName: nodeStatus - -- query: count(kube_pod_info{}) by (node) - metricName: podDistribution - -- query: cluster_version{type="completed"} - metricName: clusterVersion - instant: true diff --git a/workloads/kube-burner/metrics-profiles/metrics.yaml b/workloads/kube-burner/metrics-profiles/metrics.yaml index 30e775c4..2b9f329f 100644 --- a/workloads/kube-burner/metrics-profiles/metrics.yaml +++ b/workloads/kube-burner/metrics-profiles/metrics.yaml @@ -18,6 +18,11 @@ metricName: serviceSyncLatency # Containers & pod metrics +- query: sum(irate(container_cpu_usage_seconds_total{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}[2m]) * 100) by (pod, namespace, node) + metricName: podCPU + +- query: sum(container_memory_rss{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}) by (pod, namespace, node) + metricName: podMemory - query: (sum(irate(container_cpu_usage_seconds_total{name!="",container!~"POD|",namespace=~"openshift-(etcd|.*apiserver|ovn-kubernetes|sdn|ingress|.*controller-manager|.*scheduler)"}[2m]) * 100) by (container, pod, namespace, node)) > 0 metricName: containerCPU From c7f2eed1a54e2c8fd385e832e5b063f3ec4f97fb Mon Sep 17 00:00:00 2001 From: Paige Rubendall Date: Thu, 29 Sep 2022 13:48:18 -0400 Subject: [PATCH 2/5] adding container touchstone configs --- .../touchstone-configs/containerCPU-avg.json | 21 +++++++++++++++++++ .../touchstone-configs/containerCPU-max.json | 21 +++++++++++++++++++ .../containerMemory-avg.json | 21 +++++++++++++++++++ .../containerMemory-max.json | 21 +++++++++++++++++++ 4 files changed, 84 insertions(+) create mode 100644 workloads/kube-burner/touchstone-configs/containerCPU-avg.json create mode 100644 workloads/kube-burner/touchstone-configs/containerCPU-max.json create mode 100644 workloads/kube-burner/touchstone-configs/containerMemory-avg.json create mode 100644 workloads/kube-burner/touchstone-configs/containerMemory-max.json diff --git a/workloads/kube-burner/touchstone-configs/containerCPU-avg.json b/workloads/kube-burner/touchstone-configs/containerCPU-avg.json new file mode 100644 index 00000000..b1e9dd44 --- /dev/null +++ b/workloads/kube-burner/touchstone-configs/containerCPU-avg.json @@ -0,0 +1,21 @@ +{ + "elasticsearch": { + "ripsaw-kube-burner": [ + { + "filter": { + "metricName.keyword": "containerCPU", + "labels.namespace.keyword": "${TOUCHSTONE_NAMESPACE}" + }, + "buckets": [ + "labels.pod.keyword", + "labels.node.keyword" + ], + "aggregations": { + "value": [ + "avg" + ] + } + } + ] + } +} diff --git a/workloads/kube-burner/touchstone-configs/containerCPU-max.json b/workloads/kube-burner/touchstone-configs/containerCPU-max.json new file mode 100644 index 00000000..b83c52ec --- /dev/null +++ b/workloads/kube-burner/touchstone-configs/containerCPU-max.json @@ -0,0 +1,21 @@ +{ + "elasticsearch": { + "ripsaw-kube-burner": [ + { + "filter": { + "metricName.keyword": "containerCPU", + "labels.namespace.keyword": "${TOUCHSTONE_NAMESPACE}" + }, + "buckets": [ + "labels.pod.keyword", + "labels.node.keyword" + ], + "aggregations": { + "value": [ + "max" + ] + } + } + ] + } +} diff --git a/workloads/kube-burner/touchstone-configs/containerMemory-avg.json b/workloads/kube-burner/touchstone-configs/containerMemory-avg.json new file mode 100644 index 00000000..2a8e805c --- /dev/null +++ b/workloads/kube-burner/touchstone-configs/containerMemory-avg.json @@ -0,0 +1,21 @@ +{ + "elasticsearch": { + "ripsaw-kube-burner": [ + { + "filter": { + "metricName.keyword": "containerMemory", + "labels.namespace.keyword": "${TOUCHSTONE_NAMESPACE}" + }, + "buckets": [ + "labels.pod.keyword", + "labels.node.keyword" + ], + "aggregations": { + "value": [ + "avg" + ] + } + } + ] + } +} diff --git a/workloads/kube-burner/touchstone-configs/containerMemory-max.json b/workloads/kube-burner/touchstone-configs/containerMemory-max.json new file mode 100644 index 00000000..4c48e5c6 --- /dev/null +++ b/workloads/kube-burner/touchstone-configs/containerMemory-max.json @@ -0,0 +1,21 @@ +{ + "elasticsearch": { + "ripsaw-kube-burner": [ + { + "filter": { + "metricName.keyword": "containerMemory", + "labels.namespace.keyword": "${TOUCHSTONE_NAMESPACE}" + }, + "buckets": [ + "labels.pod.keyword", + "labels.node.keyword" + ], + "aggregations": { + "value": [ + "max" + ] + } + } + ] + } +} From 7ab532df9cd29b1fa19399cfeed314f2f71fc283 Mon Sep 17 00:00:00 2001 From: Paige Rubendall Date: Fri, 30 Sep 2022 09:20:24 -0400 Subject: [PATCH 3/5] adding overall container metrics --- .../touchstone-configs/containerMetrics.json | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 workloads/kube-burner/touchstone-configs/containerMetrics.json diff --git a/workloads/kube-burner/touchstone-configs/containerMetrics.json b/workloads/kube-burner/touchstone-configs/containerMetrics.json new file mode 100644 index 00000000..d55a478e --- /dev/null +++ b/workloads/kube-burner/touchstone-configs/containerMetrics.json @@ -0,0 +1,40 @@ +{ + "elasticsearch": { + "ripsaw-kube-burner": [ + { + "filter": { + "metricName.keyword": "containerCPU", + "labels.namespace.keyword": "${TOUCHSTONE_NAMESPACE}" + }, + "buckets": [ + "labels.pod.keyword", + "labels.node.keyword", + "labels.container.keyword" + ], + "aggregations": { + "value": [ + "avg", + "max" + ] + } + }, + { + "filter": { + "metricName.keyword": "containerMemory", + "labels.namespace.keyword": "${TOUCHSTONE_NAMESPACE}" + }, + "buckets": [ + "labels.pod.keyword", + "labels.node.keyword", + "labels.container.keyword" + ], + "aggregations": { + "value": [ + "avg", + "max" + ] + } + } + ] + } +} From cca56c2ed37f0bdb7578dc3b6755d800e42472ee Mon Sep 17 00:00:00 2001 From: Paige Rubendall Date: Fri, 30 Sep 2022 09:21:45 -0400 Subject: [PATCH 4/5] deleting old files --- .../touchstone-configs/containerCPU-avg.json | 21 ------------------- .../touchstone-configs/containerCPU-max.json | 21 ------------------- .../containerMemory-avg.json | 21 ------------------- .../containerMemory-max.json | 21 ------------------- 4 files changed, 84 deletions(-) delete mode 100644 workloads/kube-burner/touchstone-configs/containerCPU-avg.json delete mode 100644 workloads/kube-burner/touchstone-configs/containerCPU-max.json delete mode 100644 workloads/kube-burner/touchstone-configs/containerMemory-avg.json delete mode 100644 workloads/kube-burner/touchstone-configs/containerMemory-max.json diff --git a/workloads/kube-burner/touchstone-configs/containerCPU-avg.json b/workloads/kube-burner/touchstone-configs/containerCPU-avg.json deleted file mode 100644 index b1e9dd44..00000000 --- a/workloads/kube-burner/touchstone-configs/containerCPU-avg.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "elasticsearch": { - "ripsaw-kube-burner": [ - { - "filter": { - "metricName.keyword": "containerCPU", - "labels.namespace.keyword": "${TOUCHSTONE_NAMESPACE}" - }, - "buckets": [ - "labels.pod.keyword", - "labels.node.keyword" - ], - "aggregations": { - "value": [ - "avg" - ] - } - } - ] - } -} diff --git a/workloads/kube-burner/touchstone-configs/containerCPU-max.json b/workloads/kube-burner/touchstone-configs/containerCPU-max.json deleted file mode 100644 index b83c52ec..00000000 --- a/workloads/kube-burner/touchstone-configs/containerCPU-max.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "elasticsearch": { - "ripsaw-kube-burner": [ - { - "filter": { - "metricName.keyword": "containerCPU", - "labels.namespace.keyword": "${TOUCHSTONE_NAMESPACE}" - }, - "buckets": [ - "labels.pod.keyword", - "labels.node.keyword" - ], - "aggregations": { - "value": [ - "max" - ] - } - } - ] - } -} diff --git a/workloads/kube-burner/touchstone-configs/containerMemory-avg.json b/workloads/kube-burner/touchstone-configs/containerMemory-avg.json deleted file mode 100644 index 2a8e805c..00000000 --- a/workloads/kube-burner/touchstone-configs/containerMemory-avg.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "elasticsearch": { - "ripsaw-kube-burner": [ - { - "filter": { - "metricName.keyword": "containerMemory", - "labels.namespace.keyword": "${TOUCHSTONE_NAMESPACE}" - }, - "buckets": [ - "labels.pod.keyword", - "labels.node.keyword" - ], - "aggregations": { - "value": [ - "avg" - ] - } - } - ] - } -} diff --git a/workloads/kube-burner/touchstone-configs/containerMemory-max.json b/workloads/kube-burner/touchstone-configs/containerMemory-max.json deleted file mode 100644 index 4c48e5c6..00000000 --- a/workloads/kube-burner/touchstone-configs/containerMemory-max.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "elasticsearch": { - "ripsaw-kube-burner": [ - { - "filter": { - "metricName.keyword": "containerMemory", - "labels.namespace.keyword": "${TOUCHSTONE_NAMESPACE}" - }, - "buckets": [ - "labels.pod.keyword", - "labels.node.keyword" - ], - "aggregations": { - "value": [ - "max" - ] - } - } - ] - } -} From 28aec6fba1a2ce6a198922f129a330a61bec49da Mon Sep 17 00:00:00 2001 From: Paige Rubendall Date: Fri, 30 Sep 2022 09:23:43 -0400 Subject: [PATCH 5/5] adding metrics-ovn back --- .../metrics-profiles/metrics-ovn.yaml | 137 ++++++++++++++++++ .../kube-burner/metrics-profiles/metrics.yaml | 5 - 2 files changed, 137 insertions(+), 5 deletions(-) create mode 100644 workloads/kube-burner/metrics-profiles/metrics-ovn.yaml diff --git a/workloads/kube-burner/metrics-profiles/metrics-ovn.yaml b/workloads/kube-burner/metrics-profiles/metrics-ovn.yaml new file mode 100644 index 00000000..2b9f329f --- /dev/null +++ b/workloads/kube-burner/metrics-profiles/metrics-ovn.yaml @@ -0,0 +1,137 @@ +# API server + +- query: irate(apiserver_request_total{verb="POST", resource="pods", subresource="binding",code="201"}[2m]) > 0 + metricName: schedulingThroughput + +- query: histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb, scope)) > 0 + metricName: readOnlyAPICallsLatency + +- query: histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb, scope)) > 0 + metricName: mutatingAPICallsLatency + +- query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH"}[2m])) by (verb,resource,code) > 0 + metricName: APIRequestRate + +# Kubeproxy and OVN service sync latency + +- query: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 0 + metricName: serviceSyncLatency + +# Containers & pod metrics +- query: sum(irate(container_cpu_usage_seconds_total{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}[2m]) * 100) by (pod, namespace, node) + metricName: podCPU + +- query: sum(container_memory_rss{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}) by (pod, namespace, node) + metricName: podMemory + +- query: (sum(irate(container_cpu_usage_seconds_total{name!="",container!~"POD|",namespace=~"openshift-(etcd|.*apiserver|ovn-kubernetes|sdn|ingress|.*controller-manager|.*scheduler)"}[2m]) * 100) by (container, pod, namespace, node)) > 0 + metricName: containerCPU + +- query: sum(container_memory_rss{name!="",container!~"POD|",namespace=~"openshift-(etcd|.*apiserver|ovn-kubernetes|sdn|ingress|.*controller-manager|.*scheduler)"}) by (container, pod, namespace, node) + metricName: containerMemory + +# Kubelet & CRI-O runtime metrics + +- query: sum(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"} + metricName: kubeletCPU + +- query: sum(process_resident_memory_bytes{service="kubelet",job="kubelet"}) by (node) and on (node) kube_node_role{role="worker"} + metricName: kubeletMemory + +- query: sum(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"} + metricName: crioCPU + +- query: sum(process_resident_memory_bytes{service="kubelet",job="crio"}) by (node) and on (node) kube_node_role{role="worker"} + metricName: crioMemory + +- query: irate(container_runtime_crio_operations_latency_microseconds{operation_type="network_setup_pod"}[2m]) > 0 + metricName: containerNetworkSetupLatency + +# Node metrics: CPU & Memory + +- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) > 0 + metricName: nodeCPU-Workers + +- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0 + metricName: nodeCPU-Masters + +- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) > 0 + metricName: nodeCPU-Infra + +- query: node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)") + metricName: nodeMemoryAvailable-Masters + +- query: node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)") + metricName: nodeMemoryAvailable-Workers + +# We compute memory utilization by substrating available memory to the total + +- query: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)") + metricName: nodeMemoryUtilization-Workers + +- query: node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)") + metricName: nodeMemoryAvailable-Infra + +- query: node_memory_MemTotal_bytes and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)") + metricName: nodeMemoryTotal-Masters + +- query: node_memory_MemTotal_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)") + metricName: nodeMemoryTotal-Workers + +- query: node_memory_MemTotal_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)") + metricName: nodeMemoryTotal-Infra + +# Etcd metrics + +- query: sum(rate(etcd_server_leader_changes_seen_total[2m])) + metricName: etcdLeaderChangesRate + +- query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m])) + metricName: 99thEtcdDiskBackendCommitDurationSeconds + +- query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m])) + metricName: 99thEtcdDiskWalFsyncDurationSeconds + +- query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) + metricName: 99thEtcdRoundTripTimeSeconds + +- query: sum by (cluster_version)(etcd_cluster_version) + metricName: etcdVersion + instant: true + +# Cluster metrics + +- query: sum(kube_namespace_status_phase) by (phase) > 0 + metricName: namespaceCount + +- query: sum(kube_pod_status_phase{}) by (phase) + metricName: podStatusCount + +- query: count(kube_secret_info{}) + metricName: secretCount + instant: true + +- query: count(kube_deployment_labels{}) + metricName: deploymentCount + instant: true + +- query: count(kube_configmap_info{}) + metricName: configmapCount + instant: true + +- query: count(kube_service_info{}) + metricName: serviceCount + instant: true + +- query: kube_node_role + metricName: nodeRoles + +- query: sum(kube_node_status_condition{status="true"}) by (condition) + metricName: nodeStatus + +- query: count(kube_pod_info{}) by (node) + metricName: podDistribution + +- query: cluster_version{type="completed"} + metricName: clusterVersion + instant: true diff --git a/workloads/kube-burner/metrics-profiles/metrics.yaml b/workloads/kube-burner/metrics-profiles/metrics.yaml index 2b9f329f..30e775c4 100644 --- a/workloads/kube-burner/metrics-profiles/metrics.yaml +++ b/workloads/kube-burner/metrics-profiles/metrics.yaml @@ -18,11 +18,6 @@ metricName: serviceSyncLatency # Containers & pod metrics -- query: sum(irate(container_cpu_usage_seconds_total{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}[2m]) * 100) by (pod, namespace, node) - metricName: podCPU - -- query: sum(container_memory_rss{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}) by (pod, namespace, node) - metricName: podMemory - query: (sum(irate(container_cpu_usage_seconds_total{name!="",container!~"POD|",namespace=~"openshift-(etcd|.*apiserver|ovn-kubernetes|sdn|ingress|.*controller-manager|.*scheduler)"}[2m]) * 100) by (container, pod, namespace, node)) > 0 metricName: containerCPU