Skip to content

Commit

Permalink
Cilium k8s performance dashboard changes updated
Browse files Browse the repository at this point in the history
  • Loading branch information
smanda99 committed Dec 20, 2023
1 parent 8b47ffe commit 602a159
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 54 deletions.
14 changes: 0 additions & 14 deletions assets/cilium-k8s-perf/panels.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,6 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
+ options.legend.withPlacement('bottom')
+ options.legend.withDisplayMode('table')
+ options.legend.withCalcs([])









},

stat: {
Expand All @@ -77,16 +68,11 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
+ options.withGraphMode("area")
+ options.text.withTitleSize(12),


withclusterAgg(title, unit, targets, gridPos):
self.base(title, unit, targets, gridPos)
+ options.reduceOptions.withCalcs([
'last',
])
+ stat.standardOptions.thresholds.withSteps([]),

}



}
64 changes: 30 additions & 34 deletions assets/cilium-k8s-perf/queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ local prometheus = g.query.prometheus;

ciliumContainerCPU: {
query():
prometheus.withExpr('sum(irate(container_cpu_usage_seconds_total{container=~\"cilium.*\",container!=\"cilium-operator.*\",namespace!=\"\"}[$interval])) by (instance,pod,container,namespace,name,service) * 100')
prometheus.withExpr('sum(irate(container_cpu_usage_seconds_total{container=~"cilium.*",container!="cilium-operator.*",namespace!=""}[$interval])) by (instance,pod,container,namespace,name,service) * 100')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{ instance }} - {{ pod }}')
Expand All @@ -32,7 +32,7 @@ local prometheus = g.query.prometheus;

ciliumConatinerMemory: {
query():
prometheus.withExpr('container_memory_rss{container=~\"cilium.*\",namespace!=\"\"}')
prometheus.withExpr('container_memory_rss{container=~"cilium.*",namespace!=""}')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{ instance }} - {{ pod }}')
Expand Down Expand Up @@ -65,7 +65,8 @@ local prometheus = g.query.prometheus;
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('Number of nodes')
+ prometheus.withDatasource('$Datasource') ,
prometheus.withExpr('sum(kube_node_status_condition{status=\"true\"}) by (condition) > 0')

prometheus.withExpr('sum(kube_node_status_condition{status="true"}) by (condition) > 0')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('Node: {{ condition }}')
Expand Down Expand Up @@ -99,7 +100,8 @@ local prometheus = g.query.prometheus;
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('Number of nodes')
+ prometheus.withDatasource('$Datasource'),
prometheus.withExpr('sum(kube_node_status_condition{status=\"true\"}) by (condition) > 0')

prometheus.withExpr('sum(kube_node_status_condition{status="true"}) by (condition) > 0')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('Node: {{ condition }}')
Expand Down Expand Up @@ -161,7 +163,7 @@ local prometheus = g.query.prometheus;

top10ContainerRSS: {
query():
prometheus.withExpr('topk(10, container_memory_rss{namespace!=\"\",container!=\"POD\",name!=\"\"})')
prometheus.withExpr('topk(10, container_memory_rss{namespace!="",container!="POD",name!=""})')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{ namespace }} - {{ name }}')
Expand All @@ -170,7 +172,7 @@ local prometheus = g.query.prometheus;

top10ContainerCPU: {
query():
prometheus.withExpr('topk(10,irate(container_cpu_usage_seconds_total{namespace!=\"\",container!=\"POD\",name!=\"\"}[$interval])*100)')
prometheus.withExpr('topk(10,irate(container_cpu_usage_seconds_total{namespace!="",container!="POD",name!=""}[$interval])*100)')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{ namespace }} - {{ name }}')
Expand All @@ -197,35 +199,35 @@ local prometheus = g.query.prometheus;

CPUBasic: {
query():
prometheus.withExpr('sum by (instance, mode)(rate(node_cpu_seconds_total{node=~\"$_worker_node\",job=~\".*\"}[$interval])) * 100')
prometheus.withExpr('sum by (instance, mode)(rate(node_cpu_seconds_total{node=~"$_worker_node",job=~".*"}[$interval])) * 100')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('Busy {{mode}}')
+ prometheus.withDatasource('$Datasource')
},

SystemMemory: {
systemMemory: {
query():
[
prometheus.withExpr('node_memory_Active_bytes{node=~\"$_worker_node\"}')
prometheus.withExpr('node_memory_Active_bytes{node=~"$_worker_node"}')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('Active')
+ prometheus.withDatasource('$Datasource') ,

prometheus.withExpr('node_memory_MemTotal_bytes{node=~\"$_worker_node\"}')
prometheus.withExpr('node_memory_MemTotal_bytes{node=~"$_worker_node"}')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('Total')
+ prometheus.withDatasource('$Datasource'),

prometheus.withExpr('node_memory_Cached_bytes{node=~\"$_worker_node\"} + node_memory_Buffers_bytes{node=~\"$_worker_node\"}')
prometheus.withExpr('node_memory_Cached_bytes{node=~"$_worker_node"} + node_memory_Buffers_bytes{node=~"$_worker_node"}')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('Total')
+ prometheus.withDatasource('$Datasource'),

prometheus.withExpr('node_memory_MemAvailable_bytes{node=~\"$_worker_node\"}')
prometheus.withExpr('node_memory_MemAvailable_bytes{node=~"$_worker_node"}')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('Total')
Expand All @@ -234,33 +236,33 @@ local prometheus = g.query.prometheus;
]
},

DiskThroughput: {
diskThroughput: {
query():
[
prometheus.withExpr('rate(node_disk_read_bytes_total{device=~\"$block_device\",node=~\"$_worker_node\"}[$interval])')
prometheus.withExpr('rate(node_disk_read_bytes_total{device=~"$block_device",node=~"$_worker_node"}[$interval])')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{ device }} - read')
+ prometheus.withDatasource('$Datasource') ,

prometheus.withExpr('rate(node_disk_written_bytes_total{device=~\"$block_device\",node=~\"$_worker_node\"}[$interval])')
prometheus.withExpr('rate(node_disk_written_bytes_total{device=~"$block_device",node=~"$_worker_node"}[$interval])')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{ device }} - write')
+ prometheus.withDatasource('$Datasource')
]
},

DiskIOPS: {
diskIOPS: {
query():
[
prometheus.withExpr('rate(node_disk_reads_completed_total{device=~\"$block_device\",node=~\"$_worker_node\"}[$interval])')
prometheus.withExpr('rate(node_disk_reads_completed_total{device=~"$block_device",node=~"$_worker_node"}[$interval])')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{ device }} - read')
+ prometheus.withDatasource('$Datasource') ,

prometheus.withExpr('rate(node_disk_writes_completed_total{device=~\"$block_device\",node=~\"$_worker_node\"}[$interval])')
prometheus.withExpr('rate(node_disk_writes_completed_total{device=~"$block_device",node=~"$_worker_node"}[$interval])')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{ device }} - write')
Expand All @@ -271,13 +273,13 @@ local prometheus = g.query.prometheus;
networkUtilization: {
query():
[
prometheus.withExpr('rate(node_network_receive_bytes_total{node=~\"$_worker_node\",device=~\"$net_device\"}[$interval]) * 8')
prometheus.withExpr('rate(node_network_receive_bytes_total{node=~"$_worker_node",device=~"$net_device"}[$interval]) * 8')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{instance}} - {{device}} - RX')
+ prometheus.withDatasource('$Datasource'),

prometheus.withExpr('rate(node_network_transmit_bytes_total{node=~\"$_worker_node\",device=~\"$net_device\"}[$interval]) * 8')
prometheus.withExpr('rate(node_network_transmit_bytes_total{node=~"$_worker_node",device=~"$net_device"}[$interval]) * 8')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{instance}} - {{device}} - TX')
Expand All @@ -288,13 +290,13 @@ local prometheus = g.query.prometheus;
networkPackets: {
query():
[
prometheus.withExpr('rate(node_network_receive_packets_total{node=~\"$_worker_node\",device=~\"$net_device\"}[$interval])')
prometheus.withExpr('rate(node_network_receive_packets_total{node=~"$_worker_node",device=~"$net_device"}[$interval])')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{instance}} - {{device}} - RX')
+ prometheus.withDatasource('$Datasource'),

prometheus.withExpr('rate(node_network_transmit_packets_total{node=~\"$_worker_node\",device=~\"$net_device\"}[$interval])')
prometheus.withExpr('rate(node_network_transmit_packets_total{node=~"$_worker_node",device=~"$net_device"}[$interval])')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{instance}} - {{device}} - TX')
Expand All @@ -307,13 +309,13 @@ local prometheus = g.query.prometheus;
networkPacketDrop: {
query():
[
prometheus.withExpr('topk(10, rate(node_network_receive_drop_total{node=~\"$_worker_node\"}[$interval]))')
prometheus.withExpr('topk(10, rate(node_network_receive_drop_total{node=~"$_worker_node"}[$interval]))')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('rx-drop-{{ device }}')
+ prometheus.withDatasource('$Datasource'),

prometheus.withExpr('topk(10,rate(node_network_transmit_drop_total{node=~\"$_worker_node\"}[$interval]))')
prometheus.withExpr('topk(10,rate(node_network_transmit_drop_total{node=~"$_worker_node"}[$interval]))')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('tx-drop-{{ device }}')
Expand All @@ -324,13 +326,13 @@ local prometheus = g.query.prometheus;
conntrackStats: {
query():
[
prometheus.withExpr('node_nf_conntrack_entries{node=~\"$_worker_node\"}')
prometheus.withExpr('node_nf_conntrack_entries{node=~"$_worker_node"}')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('conntrack_entries')
+ prometheus.withDatasource('$Datasource'),

prometheus.withExpr('node_nf_conntrack_entries_limit{node=~\"$_worker_node\"}')
prometheus.withExpr('node_nf_conntrack_entries_limit{node=~"$_worker_node"}')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('conntrack_limit')
Expand All @@ -340,7 +342,7 @@ local prometheus = g.query.prometheus;

top10ContainerCPUNode: {
query():
prometheus.withExpr('topk(10, sum(irate(container_cpu_usage_seconds_total{container!=\"POD\",name!=\"\",instance=~\"$_worker_node\",namespace!=\"\",namespace=~\"$namespace\"}[$interval])) by (pod,container,namespace,name,service) * 100)')
prometheus.withExpr('topk(10, sum(irate(container_cpu_usage_seconds_total{container!="POD", instance=~"$_worker_node", namespace=~"$namespace"}[$interval])) by (pod, container) * 100)')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{ pod }}: {{ container }}')
Expand All @@ -349,16 +351,10 @@ local prometheus = g.query.prometheus;

top10ContainerRSSNode: {
query():
prometheus.withExpr('topk(10, container_memory_rss{container!=\"POD\",name!=\"\",instance=~\"$_worker_node\",namespace!=\"\",namespace=~\"$namespace\"})')
prometheus.withExpr('topk(10, container_memory_rss{container!="POD",name!="",instance=~"$_worker_node",namespace!="",namespace=~"$namespace"})')
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{ pod }}: {{ container }}')
+ prometheus.withDatasource('$Datasource')
},






}
9 changes: 3 additions & 6 deletions templates/CPT/cilium-k8s-perf-v2.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -51,23 +51,20 @@ g.dashboard.new('Cilium k8s Performance')
panels.timeSeries.withClusterAgg('Goroutines count', 'none', queries.goroutinesCount.query(), { x: 12, y: 53, w: 12, h: 8 }),
panels.timeSeries.withCiliumAgg('Pod Distribution', 'none', queries.podDistribution.query(), { x: 0, y: 61, w: 24, h: 8 }),
]),

g.panel.row.new('Node: $_worker_node')
+ g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+ g.panel.row.withCollapsed(true)
+ g.panel.row.withRepeat('_worker_node')
+ g.panel.row.withPanels([
panels.timeSeries.withCiliumAgg('CPU Basic: $_worker_node', 'percent', queries.CPUBasic.query(), { x: 0, y: 70, w: 12, h: 8 }),
panels.timeSeries.withCiliumAgg('System Memory: $_worker_node', 'bytes', queries.SystemMemory.query(), { x: 12, y: 70, w: 12, h: 8 }),
panels.timeSeries.withCiliumAgg('Disk throughput: $_worker_node', 'Bps', queries.DiskThroughput.query(), { x: 0, y: 78, w: 12, h: 8 }),
panels.timeSeries.withCiliumAgg('Disk IOPS: $_worker_node', 'iops', queries.DiskIOPS.query(), { x: 12, y: 78, w: 12, h: 8 }),
panels.timeSeries.withCiliumAgg('System Memory: $_worker_node', 'bytes', queries.systemMemory.query(), { x: 12, y: 70, w: 12, h: 8 }),
panels.timeSeries.withCiliumAgg('Disk throughput: $_worker_node', 'Bps', queries.diskThroughput.query(), { x: 0, y: 78, w: 12, h: 8 }),
panels.timeSeries.withCiliumAgg('Disk IOPS: $_worker_node', 'iops', queries.diskIOPS.query(), { x: 12, y: 78, w: 12, h: 8 }),
panels.timeSeries.withCiliumAgg('Network Utilization: $_worker_node', 'bps', queries.networkUtilization.query(), { x: 0, y: 86, w: 12, h: 8 }),
panels.timeSeries.withCiliumAgg('Network Packets: $_worker_node', 'pps', queries.networkPackets.query(), { x: 12, y: 86, w: 12, h: 8 }),
panels.timeSeries.withCiliumAgg('Network packets drop: $_worker_node', 'pps', queries.networkPacketDrop.query(), { x: 0, y: 94, w: 12, h: 8 }),
panels.timeSeries.withCiliumAgg('Conntrack stats: $_worker_node', '', queries.conntrackStats.query(), { x: 12, y: 94, w: 12, h: 8 }),
panels.timeSeries.withCiliumAgg('Top 10 container CPU: $_worker_node', 'percent', queries.top10ContainerCPUNode.query(), { x: 0, y: 102, w: 12, h: 8 }),
panels.timeSeries.withCiliumAgg('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainerRSSNode.query(), { x: 12, y: 102, w: 12, h: 8 }),
]),


])

0 comments on commit 602a159

Please sign in to comment.