diff --git a/assets/cilium-k8s-perf/panels.libsonnet b/assets/cilium-k8s-perf/panels.libsonnet index b7d9d6b..76ba885 100644 --- a/assets/cilium-k8s-perf/panels.libsonnet +++ b/assets/cilium-k8s-perf/panels.libsonnet @@ -48,15 +48,6 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn + options.legend.withPlacement('bottom') + options.legend.withDisplayMode('table') + options.legend.withCalcs([]) - - - - - - - - - }, stat: { @@ -77,16 +68,11 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn + options.withGraphMode("area") + options.text.withTitleSize(12), - withclusterAgg(title, unit, targets, gridPos): self.base(title, unit, targets, gridPos) + options.reduceOptions.withCalcs([ 'last', ]) + stat.standardOptions.thresholds.withSteps([]), - } - - - } \ No newline at end of file diff --git a/assets/cilium-k8s-perf/queries.libsonnet b/assets/cilium-k8s-perf/queries.libsonnet index 6e76273..9ca634c 100644 --- a/assets/cilium-k8s-perf/queries.libsonnet +++ b/assets/cilium-k8s-perf/queries.libsonnet @@ -23,7 +23,7 @@ local prometheus = g.query.prometheus; ciliumContainerCPU: { query(): - prometheus.withExpr('sum(irate(container_cpu_usage_seconds_total{container=~\"cilium.*\",container!=\"cilium-operator.*\",namespace!=\"\"}[$interval])) by (instance,pod,container,namespace,name,service) * 100') + prometheus.withExpr('sum(irate(container_cpu_usage_seconds_total{container=~"cilium.*",container!="cilium-operator.*",namespace!=""}[$interval])) by (instance,pod,container,namespace,name,service) * 100') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{ instance }} - {{ pod }}') @@ -32,7 +32,7 @@ local prometheus = g.query.prometheus; ciliumConatinerMemory: { query(): - prometheus.withExpr('container_memory_rss{container=~\"cilium.*\",namespace!=\"\"}') + prometheus.withExpr('container_memory_rss{container=~"cilium.*",namespace!=""}') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{ instance }} - {{ pod }}') @@ -65,7 +65,8 @@ local prometheus = g.query.prometheus; + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('Number of nodes') + prometheus.withDatasource('$Datasource') , - prometheus.withExpr('sum(kube_node_status_condition{status=\"true\"}) by (condition) > 0') + + prometheus.withExpr('sum(kube_node_status_condition{status="true"}) by (condition) > 0') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('Node: {{ condition }}') @@ -99,7 +100,8 @@ local prometheus = g.query.prometheus; + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('Number of nodes') + prometheus.withDatasource('$Datasource'), - prometheus.withExpr('sum(kube_node_status_condition{status=\"true\"}) by (condition) > 0') + + prometheus.withExpr('sum(kube_node_status_condition{status="true"}) by (condition) > 0') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('Node: {{ condition }}') @@ -161,7 +163,7 @@ local prometheus = g.query.prometheus; top10ContainerRSS: { query(): - prometheus.withExpr('topk(10, container_memory_rss{namespace!=\"\",container!=\"POD\",name!=\"\"})') + prometheus.withExpr('topk(10, container_memory_rss{namespace!="",container!="POD",name!=""})') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{ namespace }} - {{ name }}') @@ -170,7 +172,7 @@ local prometheus = g.query.prometheus; top10ContainerCPU: { query(): - prometheus.withExpr('topk(10,irate(container_cpu_usage_seconds_total{namespace!=\"\",container!=\"POD\",name!=\"\"}[$interval])*100)') + prometheus.withExpr('topk(10,irate(container_cpu_usage_seconds_total{namespace!="",container!="POD",name!=""}[$interval])*100)') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{ namespace }} - {{ name }}') @@ -197,35 +199,35 @@ local prometheus = g.query.prometheus; CPUBasic: { query(): - prometheus.withExpr('sum by (instance, mode)(rate(node_cpu_seconds_total{node=~\"$_worker_node\",job=~\".*\"}[$interval])) * 100') + prometheus.withExpr('sum by (instance, mode)(rate(node_cpu_seconds_total{node=~"$_worker_node",job=~".*"}[$interval])) * 100') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('Busy {{mode}}') + prometheus.withDatasource('$Datasource') }, - SystemMemory: { + systemMemory: { query(): [ - prometheus.withExpr('node_memory_Active_bytes{node=~\"$_worker_node\"}') + prometheus.withExpr('node_memory_Active_bytes{node=~"$_worker_node"}') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('Active') + prometheus.withDatasource('$Datasource') , - prometheus.withExpr('node_memory_MemTotal_bytes{node=~\"$_worker_node\"}') + prometheus.withExpr('node_memory_MemTotal_bytes{node=~"$_worker_node"}') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('Total') + prometheus.withDatasource('$Datasource'), - prometheus.withExpr('node_memory_Cached_bytes{node=~\"$_worker_node\"} + node_memory_Buffers_bytes{node=~\"$_worker_node\"}') + prometheus.withExpr('node_memory_Cached_bytes{node=~"$_worker_node"} + node_memory_Buffers_bytes{node=~"$_worker_node"}') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('Total') + prometheus.withDatasource('$Datasource'), - prometheus.withExpr('node_memory_MemAvailable_bytes{node=~\"$_worker_node\"}') + prometheus.withExpr('node_memory_MemAvailable_bytes{node=~"$_worker_node"}') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('Total') @@ -234,16 +236,16 @@ local prometheus = g.query.prometheus; ] }, - DiskThroughput: { + diskThroughput: { query(): [ - prometheus.withExpr('rate(node_disk_read_bytes_total{device=~\"$block_device\",node=~\"$_worker_node\"}[$interval])') + prometheus.withExpr('rate(node_disk_read_bytes_total{device=~"$block_device",node=~"$_worker_node"}[$interval])') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{ device }} - read') + prometheus.withDatasource('$Datasource') , - prometheus.withExpr('rate(node_disk_written_bytes_total{device=~\"$block_device\",node=~\"$_worker_node\"}[$interval])') + prometheus.withExpr('rate(node_disk_written_bytes_total{device=~"$block_device",node=~"$_worker_node"}[$interval])') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{ device }} - write') @@ -251,16 +253,16 @@ local prometheus = g.query.prometheus; ] }, - DiskIOPS: { + diskIOPS: { query(): [ - prometheus.withExpr('rate(node_disk_reads_completed_total{device=~\"$block_device\",node=~\"$_worker_node\"}[$interval])') + prometheus.withExpr('rate(node_disk_reads_completed_total{device=~"$block_device",node=~"$_worker_node"}[$interval])') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{ device }} - read') + prometheus.withDatasource('$Datasource') , - prometheus.withExpr('rate(node_disk_writes_completed_total{device=~\"$block_device\",node=~\"$_worker_node\"}[$interval])') + prometheus.withExpr('rate(node_disk_writes_completed_total{device=~"$block_device",node=~"$_worker_node"}[$interval])') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{ device }} - write') @@ -271,13 +273,13 @@ local prometheus = g.query.prometheus; networkUtilization: { query(): [ - prometheus.withExpr('rate(node_network_receive_bytes_total{node=~\"$_worker_node\",device=~\"$net_device\"}[$interval]) * 8') + prometheus.withExpr('rate(node_network_receive_bytes_total{node=~"$_worker_node",device=~"$net_device"}[$interval]) * 8') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{instance}} - {{device}} - RX') + prometheus.withDatasource('$Datasource'), - prometheus.withExpr('rate(node_network_transmit_bytes_total{node=~\"$_worker_node\",device=~\"$net_device\"}[$interval]) * 8') + prometheus.withExpr('rate(node_network_transmit_bytes_total{node=~"$_worker_node",device=~"$net_device"}[$interval]) * 8') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{instance}} - {{device}} - TX') @@ -288,13 +290,13 @@ local prometheus = g.query.prometheus; networkPackets: { query(): [ - prometheus.withExpr('rate(node_network_receive_packets_total{node=~\"$_worker_node\",device=~\"$net_device\"}[$interval])') + prometheus.withExpr('rate(node_network_receive_packets_total{node=~"$_worker_node",device=~"$net_device"}[$interval])') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{instance}} - {{device}} - RX') + prometheus.withDatasource('$Datasource'), - prometheus.withExpr('rate(node_network_transmit_packets_total{node=~\"$_worker_node\",device=~\"$net_device\"}[$interval])') + prometheus.withExpr('rate(node_network_transmit_packets_total{node=~"$_worker_node",device=~"$net_device"}[$interval])') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{instance}} - {{device}} - TX') @@ -307,13 +309,13 @@ local prometheus = g.query.prometheus; networkPacketDrop: { query(): [ - prometheus.withExpr('topk(10, rate(node_network_receive_drop_total{node=~\"$_worker_node\"}[$interval]))') + prometheus.withExpr('topk(10, rate(node_network_receive_drop_total{node=~"$_worker_node"}[$interval]))') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('rx-drop-{{ device }}') + prometheus.withDatasource('$Datasource'), - prometheus.withExpr('topk(10,rate(node_network_transmit_drop_total{node=~\"$_worker_node\"}[$interval]))') + prometheus.withExpr('topk(10,rate(node_network_transmit_drop_total{node=~"$_worker_node"}[$interval]))') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('tx-drop-{{ device }}') @@ -324,13 +326,13 @@ local prometheus = g.query.prometheus; conntrackStats: { query(): [ - prometheus.withExpr('node_nf_conntrack_entries{node=~\"$_worker_node\"}') + prometheus.withExpr('node_nf_conntrack_entries{node=~"$_worker_node"}') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('conntrack_entries') + prometheus.withDatasource('$Datasource'), - prometheus.withExpr('node_nf_conntrack_entries_limit{node=~\"$_worker_node\"}') + prometheus.withExpr('node_nf_conntrack_entries_limit{node=~"$_worker_node"}') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('conntrack_limit') @@ -340,7 +342,7 @@ local prometheus = g.query.prometheus; top10ContainerCPUNode: { query(): - prometheus.withExpr('topk(10, sum(irate(container_cpu_usage_seconds_total{container!=\"POD\",name!=\"\",instance=~\"$_worker_node\",namespace!=\"\",namespace=~\"$namespace\"}[$interval])) by (pod,container,namespace,name,service) * 100)') + prometheus.withExpr('topk(10, sum(irate(container_cpu_usage_seconds_total{container!="POD", instance=~"$_worker_node", namespace=~"$namespace"}[$interval])) by (pod, container) * 100)') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{ pod }}: {{ container }}') @@ -349,16 +351,10 @@ local prometheus = g.query.prometheus; top10ContainerRSSNode: { query(): - prometheus.withExpr('topk(10, container_memory_rss{container!=\"POD\",name!=\"\",instance=~\"$_worker_node\",namespace!=\"\",namespace=~\"$namespace\"})') + prometheus.withExpr('topk(10, container_memory_rss{container!="POD",name!="",instance=~"$_worker_node",namespace!="",namespace=~"$namespace"})') + prometheus.withFormat('time_series') + prometheus.withIntervalFactor(2) + prometheus.withLegendFormat('{{ pod }}: {{ container }}') + prometheus.withDatasource('$Datasource') }, - - - - - - } \ No newline at end of file diff --git a/templates/CPT/cilium-k8s-perf-v2.jsonnet b/templates/CPT/cilium-k8s-perf-v2.jsonnet index 902c692..65f162b 100644 --- a/templates/CPT/cilium-k8s-perf-v2.jsonnet +++ b/templates/CPT/cilium-k8s-perf-v2.jsonnet @@ -51,16 +51,15 @@ g.dashboard.new('Cilium k8s Performance') panels.timeSeries.withClusterAgg('Goroutines count', 'none', queries.goroutinesCount.query(), { x: 12, y: 53, w: 12, h: 8 }), panels.timeSeries.withCiliumAgg('Pod Distribution', 'none', queries.podDistribution.query(), { x: 0, y: 61, w: 24, h: 8 }), ]), - g.panel.row.new('Node: $_worker_node') + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + g.panel.row.withCollapsed(true) + g.panel.row.withRepeat('_worker_node') + g.panel.row.withPanels([ panels.timeSeries.withCiliumAgg('CPU Basic: $_worker_node', 'percent', queries.CPUBasic.query(), { x: 0, y: 70, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('System Memory: $_worker_node', 'bytes', queries.SystemMemory.query(), { x: 12, y: 70, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Disk throughput: $_worker_node', 'Bps', queries.DiskThroughput.query(), { x: 0, y: 78, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Disk IOPS: $_worker_node', 'iops', queries.DiskIOPS.query(), { x: 12, y: 78, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('System Memory: $_worker_node', 'bytes', queries.systemMemory.query(), { x: 12, y: 70, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Disk throughput: $_worker_node', 'Bps', queries.diskThroughput.query(), { x: 0, y: 78, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Disk IOPS: $_worker_node', 'iops', queries.diskIOPS.query(), { x: 12, y: 78, w: 12, h: 8 }), panels.timeSeries.withCiliumAgg('Network Utilization: $_worker_node', 'bps', queries.networkUtilization.query(), { x: 0, y: 86, w: 12, h: 8 }), panels.timeSeries.withCiliumAgg('Network Packets: $_worker_node', 'pps', queries.networkPackets.query(), { x: 12, y: 86, w: 12, h: 8 }), panels.timeSeries.withCiliumAgg('Network packets drop: $_worker_node', 'pps', queries.networkPacketDrop.query(), { x: 0, y: 94, w: 12, h: 8 }), @@ -68,6 +67,4 @@ g.dashboard.new('Cilium k8s Performance') panels.timeSeries.withCiliumAgg('Top 10 container CPU: $_worker_node', 'percent', queries.top10ContainerCPUNode.query(), { x: 0, y: 102, w: 12, h: 8 }), panels.timeSeries.withCiliumAgg('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainerRSSNode.query(), { x: 12, y: 102, w: 12, h: 8 }), ]), - - ])