From 7c498a89a0fd967e1823d515a33b934c6c6a0552 Mon Sep 17 00:00:00 2001
From: Murali Krishnasamy <70236227+mukrishn@users.noreply.github.com>
Date: Mon, 30 Oct 2023 04:26:54 -0400
Subject: [PATCH] Added serving node metrics (#88)

* added serving node panels

* corrected syntax

* updated query
---
Review notes (text between the "---" line and the diff is not part of the commit):

- Layout restored from a whitespace-collapsed copy of this patch. Indentation
  and blank context lines are reconstructed; run `git apply --check` (or apply
  with `--ignore-whitespace`) before relying on it.
- New panels, all on the 'Cluster Prometheus' datasource: 'Serving Node Memory',
  'Serving Node CPU Basic', and Suricata CPU / Memory. They are grouped in a
  collapsed row repeated per $namespace.
- The node panels use label_replace to copy the "node" label of
  cluster:nodes_roles into "instance" so it can be matched against node_* series.
  The Suricata queries already match on "node", so they reference
  cluster:nodes_roles directly instead of wrapping it in a label_replace that
  copied "node" onto itself.
- Suricata panels are placed at y=10, directly below the 8-high node panels that
  start at y=2.
- The panels of the former 'Hosted Clusters ETCD General Info' row are moved into
  the row that precedes it, at y=34 and below.
- The added lines no longer match the post-image blob named on the "index" line.

 .../General/hypershift-performance.jsonnet    | 91 +++++++++++++++----
 1 file changed, 74 insertions(+), 17 deletions(-)

diff --git a/templates/General/hypershift-performance.jsonnet b/templates/General/hypershift-performance.jsonnet
index 8416234..81851b8 100644
--- a/templates/General/hypershift-performance.jsonnet
+++ b/templates/General/hypershift-performance.jsonnet
@@ -35,6 +35,58 @@ local hostedControlPlaneMemory = genericGraphLegendPanel('Hosted Control Plane M
   )
 );
 
+// Serving node stats and other daemons
+
+local nodeMemory = genericGraphLegendPanel('Serving Node Memory', 'Cluster Prometheus', 'bytes').addTarget(
+  prometheus.target(
+    'node_memory_Active_bytes and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", "$1", "node", "(.+)")',
+    legendFormat='{{instance}} - Active',
+  )
+).addTarget(
+  prometheus.target(
+    'node_memory_MemTotal_bytes and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", "$1", "node", "(.+)")',
+    legendFormat='{{instance}} - Total',
+  )
+).addTarget(
+  prometheus.target(
+    '(node_memory_Cached_bytes + node_memory_Buffers_bytes) and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", "$1", "node", "(.+)")',
+    legendFormat='{{instance}} - Cached + Buffers',
+  )
+).addTarget(
+  prometheus.target(
+    'node_memory_MemAvailable_bytes and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", "$1", "node", "(.+)")',
+    legendFormat='{{instance}} - Available',
+  )
+).addTarget(
+  prometheus.target(
+    '(node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", "$1", "node", "(.+)")',
+    legendFormat='{{instance}} - Used',
+  )
+);
+
+
+local nodeCPU = genericGraphLegendPanel('Serving Node CPU Basic', 'Cluster Prometheus', 'percent').addTarget(
+  prometheus.target(
+    'sum by (instance, mode)(irate(node_cpu_seconds_total{job=~".*"}[2m])) * 100 and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", "$1", "node", "(.+)")',
+    legendFormat='{{instance}} - {{mode}}',
+  )
+);
+
+local suricataCPU = genericGraphLegendPanel('Suricata CPU(Running on Serving node)', 'Cluster Prometheus', 'percent').addTarget(
+  prometheus.target(
+    'sum(irate(container_cpu_usage_seconds_total{namespace=~"openshift-suricata",container!="POD",name!=""}[2m])*100) by (node) and on (node) cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}',
+    legendFormat='{{node}}',
+  )
+);
+
+local suricataMemory = genericGraphLegendPanel('Suricata Memory(Running on Serving node)', 'Cluster Prometheus', 'bytes').addTarget(
+  prometheus.target(
+    'sum(container_memory_rss{namespace=~"openshift-suricata",container!="POD",name!=""}) by (node) and on (node) cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}',
+    legendFormat='{{node}}',
+  )
+);
+
+
 // Overall stats on the management cluster
 
 // Cluster Operators details and status
@@ -1694,6 +1746,17 @@ grafana.dashboard.new(
   ],
 ), { gridPos: { x: 0, y: 4, w: 24, h: 1 } })
 
+.addPanel(
+  grafana.row.new(title='Hosted Clusters Serving Node stats - $namespace', collapse=true, repeat='namespace').addPanels(
+    [
+      nodeCPU { gridPos: { x: 0, y: 2, w: 12, h: 8 } },
+      nodeMemory { gridPos: { x: 12, y: 2, w: 12, h: 8 } },
+      suricataCPU { gridPos: { x: 0, y: 10, w: 12, h: 8 } },
+      suricataMemory { gridPos: { x: 12, y: 10, w: 12, h: 8 } },
+    ]
+  ), { gridPos: { x: 0, y: 4, w: 24, h: 1 } }
+)
+
 .addPanel(grafana.row.new(title='HostedControlPlane stats - $namespace', collapse=true, repeat='namespace').addPanels(
   [
     infrastructure { gridPos: { x: 0, y: 0, w: 8, h: 4 } },
@@ -1735,23 +1798,17 @@ grafana.dashboard.new(
       grpc_traffic { gridPos: { x: 12, y: 18, w: 12, h: 8 } },
       active_streams { gridPos: { x: 0, y: 26, w: 12, h: 8 } },
       snapshot_duration { gridPos: { x: 12, y: 26, w: 12, h: 8 } },
-    ]
-  ), { gridPos: { x: 0, y: 0, w: 24, h: 1 } }
-)
-.addPanel(
-  grafana.row.new(title='Hosted Clusters ETCD General Info - $namespace', collapse=true, repeat='namespace').addPanels(
-    [
-      raft_proposals { gridPos: { x: 0, y: 1, w: 12, h: 8 } },
-      num_leader_changes { gridPos: { x: 12, y: 1, w: 12, h: 8 } },
-      etcd_has_leader { gridPos: { x: 0, y: 8, w: 6, h: 2 } },
-      num_failed_proposals { gridPos: { x: 6, y: 8, w: 6, h: 2 } },
-      leader_elections_per_day { gridPos: { x: 0, y: 12, w: 12, h: 6 } },
-      keys { gridPos: { x: 12, y: 12, w: 12, h: 8 } },
-      slow_operations { gridPos: { x: 0, y: 20, w: 12, h: 8 } },
-      key_operations { gridPos: { x: 12, y: 20, w: 12, h: 8 } },
-      heartbeat_failures { gridPos: { x: 0, y: 28, w: 12, h: 8 } },
-      compacted_keys { gridPos: { x: 12, y: 28, w: 12, h: 8 } },
+      raft_proposals { gridPos: { x: 0, y: 34, w: 12, h: 8 } },
+      num_leader_changes { gridPos: { x: 12, y: 34, w: 12, h: 8 } },
+      etcd_has_leader { gridPos: { x: 0, y: 42, w: 6, h: 2 } },
+      num_failed_proposals { gridPos: { x: 6, y: 42, w: 6, h: 2 } },
+      leader_elections_per_day { gridPos: { x: 0, y: 44, w: 12, h: 6 } },
+      keys { gridPos: { x: 12, y: 44, w: 12, h: 8 } },
+      slow_operations { gridPos: { x: 0, y: 52, w: 12, h: 8 } },
+      key_operations { gridPos: { x: 12, y: 52, w: 12, h: 8 } },
+      heartbeat_failures { gridPos: { x: 0, y: 60, w: 12, h: 8 } },
+      compacted_keys { gridPos: { x: 12, y: 60, w: 12, h: 8 } },
     ]
-  ), { gridPos: { x: 0, y: 3, w: 24, h: 1 } }
+  ), { gridPos: { x: 0, y: 0, w: 24, h: 1 } }
 )
 