From a1c94fa232d949d0152095e33229446395bafc04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacob=20Baung=C3=A5rd=20Hansen?= Date: Tue, 28 May 2024 15:38:31 +0200 Subject: [PATCH] Add optimized grafana dashboard (#1454) * Add optimized cluster overview dashboard This optimized dashboard mainly lowers the cardinality of the CPU metrics. Specifically, instead of using `avg(rate(node_cpu_seconds_total`, which has a cardinality equal to the total number of CPUs across all managed clusters, we use `cluster:node_cpu:ratio`, which has a cardinality of 1 per cluster. That is, with 100 clusters of 16 CPUs each, the cardinality before was 100*16 = 1600, whereas with this change we now only fetch 100 metrics. This should scale quite a bit better on larger installations with many clusters/nodes. Signed-off-by: Jacob Baungard Hansen * Grafana: Use wildcard for all on cluster overview Instead of listing all clusters manually in the query, e.g.: ``` cluster=~"(local-cluster|simulated-managed-cluster-1|simulated-managed-cluster-1-1|simulated-managed-cluster-1-10|simulated-managed-cluster-1-2|simulated-managed-cluster-1-3..." ``` We set it to `".+"`, simplifying the query significantly. Signed-off-by: Jacob Baungard Hansen * Tests: Add basic test for dashboard existence A quick test that checks if the dashboards exist. Signed-off-by: Jacob Baungard Hansen * Kind test: Actually use CI built MCO image in test While the auxiliary images (endpoint-monitoring-operator, etc.) correctly used the CI built images in kind, this was not the case for MCO itself. In this commit we make sure to load in the `MULTICLUSTER_OBSERVABILITY_OPERATOR_IMAGE_REF` from the kind env file, so that the CI image for MCO is used as well.
Signed-off-by: Jacob Baungard Hansen --------- Signed-off-by: Jacob Baungard Hansen --- cicd-scripts/setup-e2e-tests.sh | 8 + .../dash-acm-clusters-overview-optimized.yaml | 1910 +++++++++++++++++ .../grafana/dash-acm-clusters-overview.yaml | 2 +- .../manifests/base/grafana/kustomization.yaml | 1 + .../pkg/tests/observability_dashboard_test.go | 33 +- 5 files changed, 1950 insertions(+), 4 deletions(-) create mode 100644 operators/multiclusterobservability/manifests/base/grafana/dash-acm-clusters-overview-optimized.yaml diff --git a/cicd-scripts/setup-e2e-tests.sh b/cicd-scripts/setup-e2e-tests.sh index 028a7a2df5..2f1a079c0a 100755 --- a/cicd-scripts/setup-e2e-tests.sh +++ b/cicd-scripts/setup-e2e-tests.sh @@ -130,6 +130,11 @@ EOF # deploy the MCO operator via the kustomize resources deploy_mco_operator() { + # makes sure we get the MULTICLUSTER_OBSERVABILITY_OPERATOR_IMAGE_REF + if [[ -n ${IS_KIND_ENV} ]]; then + source ${ROOTDIR}/tests/run-in-kind/env.sh + fi + if [[ -n ${MULTICLUSTER_OBSERVABILITY_OPERATOR_IMAGE_REF} ]]; then cd ${ROOTDIR}/operators/multiclusterobservability/config/manager && kustomize edit set image quay.io/stolostron/multicluster-observability-operator=${MULTICLUSTER_OBSERVABILITY_OPERATOR_IMAGE_REF} else @@ -138,6 +143,9 @@ deploy_mco_operator() { cd ${ROOTDIR} kustomize build ${ROOTDIR}/operators/multiclusterobservability/config/default | kubectl apply -n ${OCM_DEFAULT_NS} --server-side=true -f - + cat ${ROOTDIR}/operators/multiclusterobservability/config/manager/manager.yaml + cat ${ROOTDIR}/operators/multiclusterobservability/config/manager/kustomization.yaml + # wait until mco is ready wait_for_deployment_ready 10 60s ${OCM_DEFAULT_NS} multicluster-observability-operator echo "mco operator is deployed successfully." 
diff --git a/operators/multiclusterobservability/manifests/base/grafana/dash-acm-clusters-overview-optimized.yaml b/operators/multiclusterobservability/manifests/base/grafana/dash-acm-clusters-overview-optimized.yaml new file mode 100644 index 0000000000..6457c0c456 --- /dev/null +++ b/operators/multiclusterobservability/manifests/base/grafana/dash-acm-clusters-overview-optimized.yaml @@ -0,0 +1,1910 @@ +apiVersion: v1 +data: + acm-clusters-overview-optimized.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 1, + "iteration": 1682528664304, + "links": [ + { + "asDropdown": true, + "icon": "external link", + "includeVars": false, + "keepTime": false, + "tags": [], + "targetBlank": true, + "title": "All Dashboards", + "tooltip": "", + "type": "dashboards", + "url": "" + } + ], + "panels": [ + { + "collapsed": false, + "datasource": "$datasource", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 138, + "panels": [], + "title": "Control Plane Health", + "type": "row" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": null, + "displayMode": "auto", + "filterable": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "displayName", + "value": "Max latency (99th percentile)" + }, + { + "id": "unit", + "value": "s" + }, + { + "id": "custom.displayMode", + "value": "color-background" + }, + 
{ + "id": "custom.align", + "value": "left" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "red", + "value": 2 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #B" + }, + "properties": [ + { + "id": "displayName", + "value": "API Errors [1h]" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "custom.displayMode", + "value": "color-text" + }, + { + "id": "custom.align", + "value": "left" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "red", + "value": 2 + } + ] + } + }, + { + "id": "noValue", + "value": "0" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "api_up" + }, + "properties": [ + { + "id": "displayName", + "value": "API servers up" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "custom.displayMode", + "value": "color-text" + }, + { + "id": "custom.align", + "value": "left" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cluster" + }, + "properties": [ + { + "id": "displayName", + "value": "Cluster" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": false, + "title": "Drill down to cluster", + "url": "/d/09ec8aa1e996d6ffcd6817bbaff4db1b/kubernetes-api-server?${__url_time_range}&var-cluster=${__data.fields.cluster}&var-instance=All" + } + ] + }, + { + "id": "custom.align", + "value": "left" + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 146, + "interval": 
"4m", + "options": { + "showHeader": true + }, + "pluginVersion": "8.5.20", + "targets": [ + { + "exemplar": true, + "expr": "topk(50, max(apiserver_request_duration_seconds:histogram_quantile_99{cluster=~\"$cluster\",clusterType!=\"ocp3\"}) by (cluster))\n* on(cluster) group_left(api_up) count_values without() (\"api_up\", (sum(up{cluster=~\"$cluster\",job=\"apiserver\",clusterType!=\"ocp3\"} == 1) by (cluster) / count(up{cluster=~\"$cluster\",job=\"apiserver\",clusterType!=\"ocp3\"}) by (cluster)))", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum by (cluster)(sum:apiserver_request_total:1h{cluster=~\"$cluster\",code=~\"5..\",clusterType!=\"ocp3\"})", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "B" + } + ], + "title": "Top 50 Max Latency API Server", + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "Time": 0, + "Value #A": 2, + "Value #B": 4, + "api_up": 3, + "cluster": 1 + }, + "renameByName": {} + } + } + ], + "type": "table" + }, + { + "datasource": null, + "description": "Leader election changes per cluster over the time range selected for dashboard.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": null, + "displayMode": "auto", + "filterable": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "cluster" + }, + "properties": [ + { + "id": "displayName", + "value": "Cluster" + }, + { + "id": "links", + "value": [ + { + "title": "Drill down to cluster", + "url": "/d/N8BxQ2jMz/kubernetes-etcd-cluster?${__url_time_range}&var-cluster=${__data.fields.cluster}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": 
"Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Leader Election Changes" + }, + { + "id": "custom.align", + "value": "left" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 2 + } + ] + } + }, + { + "id": "custom.displayMode", + "value": "color-text" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "db_size" + }, + "properties": [ + { + "id": "displayName", + "value": "DB Size" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "custom.align", + "value": "left" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "has_leader" + }, + "properties": [ + { + "id": "displayName", + "value": "Has a Leader" + }, + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "text": "No" + }, + "1": { + "text": "Yes" + } + }, + "type": "value" + } + ] + }, + { + "id": "custom.align", + "value": "left" + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 150, + "interval": "1m", + "options": { + "frameIndex": 2, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "8.5.20", + "targets": [ + { + "exemplar": true, + "expr": "sum(changes(etcd_server_leader_changes_seen_total{cluster=~\"$cluster\",job=\"etcd\"}[$__range])) by (cluster)\n* on(cluster) group_left(db_size) count_values without() (\"db_size\", max(etcd_debugging_mvcc_db_total_size_in_bytes{cluster=~\"$cluster\",job=\"etcd\"}) by (cluster))\n* on(cluster) group_left(has_leader) count_values without() (\"has_leader\", max(etcd_server_has_leader{cluster=~\"$cluster\",job=\"etcd\"}) by (cluster))", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "etcd", + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + 
"cluster", + "db_size", + "has_leader", + "Value" + ] + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": { + "Value": 2, + "cluster": 0, + "db_size": 3, + "has_leader": 1 + }, + "renameByName": {} + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "datasource": "$datasource", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 140, + "panels": [], + "title": "Optimization", + "type": "row" + }, + { + "datasource": "$datasource", + "description": "Highlights % differences between CPU requests commitments vs utilization. When this difference is large ( >20%), it means that resources are reserved but unused.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": null, + "displayMode": "auto", + "filterable": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Overestimation" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.displayMode", + "value": "color-background" + }, + { + "id": "custom.align", + "value": "left" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.2 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cpu_requested" + }, + "properties": [ + { + "id": "displayName", + "value": "Requested" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": "left" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cpu_utilized" + }, + "properties": [ + { + "id": "displayName", + "value": "Utilized" + 
}, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": "left" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cluster" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": false, + "title": "Drill down to cluster", + "url": "/d/8Qvi3edMz/acm-resource-optimization-cluster?${__url_time_range}&var-cluster=${__data.fields.cluster}" + } + ] + }, + { + "id": "custom.align", + "value": "left" + }, + { + "id": "displayName", + "value": "Cluster" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 151, + "interval": "5m", + "options": { + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "8.5.20", + "targets": [ + { + "exemplar": true, + "expr": "topk(50, sum by (cluster) (cluster:cpu_requested:ratio)- sum by (cluster) (cluster:node_cpu:ratio{cluster=~\"$cluster\",clusterType!=\"ocp3\"}))\n* on(cluster) group_left(cpu_requested) count_values without() (\"cpu_requested\", cluster:cpu_requested:ratio)\n* on(cluster) group_left(cpu_utilized) count_values without() (\"cpu_utilized\", cluster:node_cpu:ratio{cluster=~\"$cluster\",clusterType!=\"ocp3\"})", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Top 50 CPU Overestimation Clusters", + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "Time": 0, + "Value": 2, + "cluster": 1, + "cpu_requested": 3, + "cpu_utilized": 4 + }, + "renameByName": {} + } + } + ], + "type": "table" + }, + { + "datasource": "$datasource", + "description": "Highlights % differences between Memory requests 
commitments vs utilization. When this difference is large ( >20%), it means that resources are reserved but unused.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": null, + "displayMode": "auto", + "filterable": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "cluster" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": false, + "title": "Drill down to cluster", + "url": "/d/8Qvi3edMz/acm-resource-optimization-cluster?${__url_time_range}&var-cluster=${__data.fields.cluster}" + } + ] + }, + { + "id": "custom.align", + "value": "left" + }, + { + "id": "displayName", + "value": "Cluster" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Overestimation" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.displayMode", + "value": "color-background" + }, + { + "id": "custom.align", + "value": "left" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.2 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "memory_requested" + }, + "properties": [ + { + "id": "displayName", + "value": "Requested" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": "left" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "memory_utilized" + }, + "properties": [ + { + "id": "displayName", + "value": "Utilized" + }, + { + "id": "unit", + "value": "percentunit" + }, + 
{ + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": "left" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 153, + "interval": "5m", + "options": { + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "8.5.20", + "targets": [ + { + "exemplar": true, + "expr": "topk(50, cluster:memory_requested:ratio{cluster=~\"$cluster\"} - ignoring(usage) cluster:memory_utilized:ratio{cluster=~\"$cluster\"})\n* on(cluster) group_left(memory_requested) count_values without() (\"memory_requested\", cluster:memory_requested:ratio)\n* on(cluster) group_left(memory_utilized) count_values without() (\"memory_utilized\", cluster:memory_utilized:ratio)", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Top 50 Memory Overestimation Clusters", + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "Time": 0, + "Value": 2, + "cluster": 1, + "memory_requested": 3, + "memory_utilized": 4 + }, + "renameByName": {} + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "datasource": "$datasource", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 34, + "panels": [], + "repeat": null, + "title": "Capacity / Utilization", + "type": "row" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": null, + "displayMode": "auto", + "filterable": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "cluster" + }, + "properties": [ + { + 
"id": "displayName", + "value": "Cluster" + }, + { + "id": "links", + "value": [ + { + "targetBlank": false, + "title": "Drill down to cluster", + "url": "/d/8Qvi3edMz/acm-resource-optimization-cluster?${__url_time_range}&var-cluster=${__data.fields.cluster}" + } + ] + }, + { + "id": "custom.align", + "value": "left" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": "left" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "machine_cpu_cores_sum" + }, + "properties": [ + { + "id": "displayName", + "value": "Total Cores" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "custom.align", + "value": "left" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node_allocatable_cpu_cores_sum" + }, + "properties": [ + { + "id": "displayName", + "value": "Allocatable Cores" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "custom.align", + "value": "left" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cpu_requested" + }, + "properties": [ + { + "id": "displayName", + "value": "Requested" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": "left" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Utilized" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": "left" + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 47, + "interval": "5m", + "options": { + "showHeader": true + }, + "pluginVersion": "8.5.20", + "targets": [ + { + "exemplar": true, + "expr": "topk(50, cluster:node_cpu:ratio{cluster=~\"$cluster\"})\n* on(cluster) group_left(machine_cpu_cores_sum) count_values without() (\"machine_cpu_cores_sum\", cluster:cpu_cores:sum)\n* on(cluster) 
group_left(node_allocatable_cpu_cores_sum) count_values without() (\"node_allocatable_cpu_cores_sum\", cluster:cpu_allocatable:sum)\n* on(cluster) group_left(cpu_requested) count_values without() (\"cpu_requested\", cluster:cpu_requested:ratio)", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Top 50 CPU Utilized Clusters", + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "usage": true + }, + "indexByName": { + "Time": 0, + "Value": 5, + "cluster": 1, + "cpu_requested": 4, + "machine_cpu_cores_sum": 2, + "node_allocatable_cpu_cores_sum": 3 + }, + "renameByName": {} + } + } + ], + "type": "table" + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 18 + }, + "hiddenSeries": false, + "id": 64, + "interval": "4m", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.20", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(5, cluster:node_cpu:ratio{cluster=~\"$cluster\",clusterType!=\"ocp3\"})", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{cluster}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Top 5 Utilized Clusters (% CPU usage)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + 
"type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": 1 + } + }, + { + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": null, + "displayMode": "auto", + "filterable": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "cluster" + }, + "properties": [ + { + "id": "displayName", + "value": "Cluster" + }, + { + "id": "links", + "value": [ + { + "targetBlank": false, + "title": "Drill down to cluster", + "url": "/d/8Qvi3edMz/acm-resource-optimization-cluster?${__url_time_range}&var-cluster=${__data.fields.cluster}" + } + ] + }, + { + "id": "custom.align", + "value": "left" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "machine_memory_sum" + }, + "properties": [ + { + "id": "displayName", + "value": "Available Memory" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": "left" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "machine_memory_requested" + }, + "properties": [ + { + "id": "displayName", + "value": "Requested" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": "left" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + 
"properties": [ + { + "id": "displayName", + "value": "Utilized" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": "left" + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 60, + "interval": "5m", + "options": { + "showHeader": true + }, + "pluginVersion": "8.5.20", + "targets": [ + { + "exemplar": true, + "expr": "topk(50, cluster:memory_utilized:ratio{cluster=~\"$cluster\"})\n* on(cluster) group_left(machine_memory_sum) count_values without() (\"machine_memory_sum\", cluster:machine_memory:sum)\n* on(cluster) group_left(machine_memory_requested) count_values without() (\"machine_memory_requested\", cluster:memory_requested:ratio)", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Top 50 Memory Utilized Clusters", + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "usage": true + }, + "indexByName": { + "Time": 0, + "Value": 4, + "cluster": 1, + "machine_memory_requested": 3, + "machine_memory_sum": 2 + }, + "renameByName": {} + } + } + ], + "type": "table" + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 25 + }, + "hiddenSeries": false, + "id": 65, + "interval": "4m", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.20", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + {} + ], + 
"spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(5, (1 - sum(:node_memory_MemAvailable_bytes:sum) by (cluster) / sum(kube_node_status_allocatable{cluster=~\"$cluster\",resource=\"memory\"}) by (cluster)))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{cluster}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Top 5 Utilized Clusters (% Memory usage)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": null, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": null, + "displayMode": "auto", + "filterable": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Current Bandwidth Received" + }, + { + "id": "unit", + "value": "Bps" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": "left" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node_transmit" + }, + "properties": [ + { + "id": "displayName", + "value": "Current Bandwidth Transmitted" + }, + { + "id": "unit", + "value": "Bps" + }, + { + "id": "decimals", + "value": 2 + }, + { + 
"id": "custom.align", + "value": "left" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cluster" + }, + "properties": [ + { + "id": "displayName", + "value": "Cluster" + }, + { + "id": "links", + "value": [ + { + "title": "Drill down to cluster", + "url": "/d/ff635a025bcfea7bc3dd4f508990a3e9/kubernetes-networking-cluster?${__url_time_range}&var-cluster=${__data.fields.cluster}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node_transmit_drop" + }, + "properties": [ + { + "id": "displayName", + "value": "Rate of Transmitted Packets Dropped" + }, + { + "id": "unit", + "value": "pps" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node_receive_drop" + }, + "properties": [ + { + "id": "displayName", + "value": "Rate of Received Packets Dropped" + }, + { + "id": "unit", + "value": "pps" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 148, + "options": { + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Current Bandwidth Received" + } + ] + }, + "pluginVersion": "8.5.20", + "targets": [ + { + "exemplar": true, + "expr": "sum(instance:node_network_receive_bytes_excluding_lo:rate1m{cluster=~\"$cluster\",job=\"node-exporter\",clusterType!=\"ocp3\"}) by (cluster)\n* on(cluster) group_left(node_transmit) count_values without() (\"node_transmit\", sum(instance:node_network_transmit_bytes_excluding_lo:rate1m{cluster=~\"$cluster\",job=\"node-exporter\"}) by (cluster))\n* on(cluster) group_left(node_receive_drop) count_values without() (\"node_receive_drop\", sum(instance:node_network_receive_drop_excluding_lo:rate1m{cluster=~\"$cluster\",job=\"node-exporter\"}) by (cluster))\n* on(cluster) group_left(node_transmit_drop) count_values without() (\"node_transmit_drop\", sum(instance:node_network_transmit_drop_excluding_lo:rate1m{cluster=~\"$cluster\",job=\"node-exporter\"}) by (cluster))", + "format": "table", + "instant": true, + "interval": "", + 
"legendFormat": "", + "refId": "A" + } + ], + "title": "Bandwidth Utilization", + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "cluster", + "node_receive_drop", + "node_transmit", + "node_transmit_drop", + "Value" + ] + } + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "Value #A" + } + ] + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": { + "Value": 1, + "cluster": 0, + "node_receive_drop": 3, + "node_transmit": 2, + "node_transmit_drop": 4 + }, + "renameByName": {} + } + } + ], + "type": "table" + } + ], + "refresh": "5m", + "schemaVersion": 30, + "style": "light", + "tags": [ + "ACM" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Observatorium", + "value": "Observatorium" + }, + "description": null, + "error": null, + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "name", + "value": "name" + }, + "datasource": null, + "definition": "label_values(acm_label_names, label_name)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "Label", + "multi": false, + "name": "acm_label_names", + "options": [], + "query": { + "query": "label_values(acm_label_names, label_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": null, + "definition": "label_values(acm_managed_cluster_labels, $acm_label_names)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": "Value", + "multi": true, + "name": "value", + 
"options": [], + "query": { + "query": "label_values(acm_managed_cluster_labels, $acm_label_names)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": null, + "definition": "label_values(acm_managed_cluster_labels{$acm_label_names=~\"$value\"}, name)", + "description": null, + "error": null, + "hide": 2, + "includeAll": true, + "label": "Cluster", + "multi": true, + "name": "cluster", + "options": [], + "query": { + "query": "label_values(acm_managed_cluster_labels{$acm_label_names=~\"$value\"}, name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "1m", + "5m", + "10m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "browser", + "title": "ACM - Clusters Overview (Optimized)", + "uid": "b4733fbea8104bae951b04961f47bd20", + "version": 1 + } +kind: ConfigMap +metadata: + name: grafana-dashboard-acm-clusters-overview-optimized + namespace: open-cluster-management-observability + labels: + general-folder: 'true' diff --git a/operators/multiclusterobservability/manifests/base/grafana/dash-acm-clusters-overview.yaml b/operators/multiclusterobservability/manifests/base/grafana/dash-acm-clusters-overview.yaml index 90d6089e8a..d9724cec3e 100644 --- a/operators/multiclusterobservability/manifests/base/grafana/dash-acm-clusters-overview.yaml +++ b/operators/multiclusterobservability/manifests/base/grafana/dash-acm-clusters-overview.yaml @@ -1849,7 +1849,7 @@ data: "type": "query" }, { - "allValue": null, + "allValue": ".+", "current": { "selected": true, "text": [ diff --git a/operators/multiclusterobservability/manifests/base/grafana/kustomization.yaml 
b/operators/multiclusterobservability/manifests/base/grafana/kustomization.yaml index 1bd342154a..4f1cda4dc9 100644 --- a/operators/multiclusterobservability/manifests/base/grafana/kustomization.yaml +++ b/operators/multiclusterobservability/manifests/base/grafana/kustomization.yaml @@ -10,6 +10,7 @@ resources: - dash-acm-cluster-by-alerts.yaml - dash-acm-optimization-overview.yaml - dash-acm-clusters-overview.yaml +- dash-acm-clusters-overview-optimized.yaml - dash-acm-hcp-overview.yaml - dash-acm-resources-hcp.yaml - dash-k8s-etcd.yaml diff --git a/tests/pkg/tests/observability_dashboard_test.go b/tests/pkg/tests/observability_dashboard_test.go index a2ab147658..e43ee580a2 100644 --- a/tests/pkg/tests/observability_dashboard_test.go +++ b/tests/pkg/tests/observability_dashboard_test.go @@ -13,9 +13,11 @@ import ( ) const ( - dashboardName = "sample-dashboard" - dashboardTitle = "Sample Dashboard for E2E" - updateDashboardTitle = "Update Sample Dashboard for E2E" + dashboardName = "sample-dashboard" + dashboardTitle = "Sample Dashboard for E2E" + updateDashboardTitle = "Update Sample Dashboard for E2E" + clusterOverviewTitle = "ACM - Clusters Overview" + clusterOverviewOptimizedTitle = "ACM - Clusters Overview (Optimized)" ) var _ = Describe("Observability:", func() { @@ -89,4 +91,29 @@ var _ = Describe("Observability:", func() { } testFailed = testFailed || CurrentGinkgoTestDescription().Failed }) + + It("[P2][Sev2][observability][Stable] Should have default overview dashboards (dashboard/g0)", func() { + // Check Original dash exists + Eventually(func() bool { + _, result := utils.ContainDashboard(testOptions, clusterOverviewTitle) + return result + }, EventuallyTimeoutMinute*3, EventuallyIntervalSecond*5).Should(BeTrue()) + // Check optimized dash + Eventually(func() bool { + _, result := utils.ContainDashboard(testOptions, clusterOverviewOptimizedTitle) + return result + }, EventuallyTimeoutMinute*3, EventuallyIntervalSecond*5).Should(BeTrue()) + + }) + + 
JustAfterEach(func() { + Expect(utils.IntegrityChecking(testOptions)).NotTo(HaveOccurred()) + }) + + AfterEach(func() { + if CurrentGinkgoTestDescription().Failed { + utils.LogFailingTestStandardDebugInfo(testOptions) + } + testFailed = testFailed || CurrentGinkgoTestDescription().Failed + }) })