Skip to content

Commit

Permalink
k8s Performance Dashboard
Browse files Browse the repository at this point in the history
  • Loading branch information
smandaRH committed Dec 29, 2023
1 parent aadd6c1 commit 7b63ba6
Show file tree
Hide file tree
Showing 4 changed files with 329 additions and 0 deletions.
76 changes: 76 additions & 0 deletions assets/k8s-perf/panels.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';

{
stat: {
  local statPanel = g.panel.stat,
  local opts = statPanel.options,

  // Common foundation for every stat panel in this dashboard.
  //   title   - panel title
  //   unit    - unit id applied through standardOptions
  //   targets - list of query targets rendered by the panel
  //   gridPos - object with x, y, w and h fields
  // The panel is bound to the Prometheus datasource selected by the
  // `$Datasource` dashboard variable and shows the value only (no sparkline,
  // no colouring).
  base(title, unit, targets, gridPos):
    statPanel.new(title)
    + statPanel.queryOptions.withTargets(targets)
    + statPanel.datasource.withType('prometheus')
    + statPanel.datasource.withUid('$Datasource')
    + statPanel.gridPos.withX(gridPos.x)
    + statPanel.gridPos.withY(gridPos.y)
    + statPanel.gridPos.withW(gridPos.w)
    + statPanel.gridPos.withH(gridPos.h)
    + statPanel.standardOptions.withUnit(unit)
    + statPanel.standardOptions.color.withMode('thresholds')
    + opts.withColorMode('none')
    + opts.withGraphMode('none')
    + opts.withJustifyMode('auto')
    + opts.text.withTitleSize(12),

  // Stat panel built on `base` that reduces each series with the `last` calc.
  genericStatLegendPanel(title, unit, targets, gridPos):
    self.base(title, unit, targets, gridPos)
    + opts.reduceOptions.withCalcs(['last']),
},

timeSeries: {
  local timeSeries = g.panel.timeSeries,
  local custom = timeSeries.fieldConfig.defaults.custom,
  local options = timeSeries.options,

  // Legend rendered as a table, showing the given list of calcs
  // (an empty list shows series names only).
  local tableLegend(calcs) =
    options.legend.withCalcs(calcs)
    + options.legend.withDisplayMode('table'),

  // Common foundation for every time-series panel in this dashboard.
  //   title   - panel title
  //   unit    - unit id applied through standardOptions
  //   targets - list of query targets rendered by the panel
  //   gridPos - object with x, y, w and h fields
  // The panel is bound to the Prometheus datasource selected by the
  // `$Datasource` dashboard variable.
  base(title, unit, targets, gridPos):
    timeSeries.new(title)
    + timeSeries.queryOptions.withTargets(targets)
    + timeSeries.datasource.withType('prometheus')
    + timeSeries.datasource.withUid('$Datasource')
    + timeSeries.standardOptions.withUnit(unit)
    + timeSeries.gridPos.withX(gridPos.x)
    + timeSeries.gridPos.withY(gridPos.y)
    + timeSeries.gridPos.withH(gridPos.h)
    + timeSeries.gridPos.withW(gridPos.w)
    // Thin, lightly filled, unstacked lines without point markers.
    + custom.withDrawStyle('line')
    + custom.withLineInterpolation('linear')
    + custom.withBarAlignment(0)
    + custom.withLineWidth(1)
    + custom.withFillOpacity(10)
    + custom.withGradientMode('none')
    // Applied once; the previous revision listed this mixin twice.
    + custom.withSpanNulls(false)
    + custom.withPointSize(5)
    + custom.stacking.withMode('none')
    + custom.withShowPoints('never')
    // Tooltip lists all series, sorted descending; legend sits below the graph.
    + options.tooltip.withMode('multi')
    + options.tooltip.withSort('desc')
    + options.legend.withShowLegend(true)
    + options.legend.withPlacement('bottom'),

  // Time-series panel with a table legend and no calculated columns.
  genericTimeSeriesPanel(title, unit, targets, gridPos):
    self.base(title, unit, targets, gridPos)
    + tableLegend([]),

  // Time-series panel with a table legend showing mean and max per series.
  genericTimeSeriesLegendPanel(title, unit, targets, gridPos):
    self.base(title, unit, targets, gridPos)
    + tableLegend(['mean', 'max']),
},
}
138 changes: 138 additions & 0 deletions assets/k8s-perf/queries.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
local variables = import './variables.libsonnet';

// Builds a single Prometheus target and wraps it in a one-element array, so
// that callers can join several targets with `+`.
//   query  - PromQL expression
//   legend - legend format string for the resulting series
// The target points at the datasource dashboard variable, referenced as
// `$<variable name>`.
local generateTimeSeriesQuery(query, legend) =
  local prom = g.query.prometheus;
  local datasourceRef = '$' + variables.Datasource.name;
  local target =
    prom.new(datasourceRef, query)
    + prom.withLegendFormat(legend)
    + prom.withIntervalFactor(2)
    + prom.withFormat('time_series');
  [target];

// Query catalogue for the k8s performance dashboard.
// Every entry exposes a `query()` method returning a list of Prometheus
// targets. Entries that take `nodeName` splice it into a label regex matcher.
{
  // ---------------------------------------------------------------------
  // Cluster-level queries (no parameters)
  // ---------------------------------------------------------------------

  currentNodeCount: {
    query():
      generateTimeSeriesQuery('sum(kube_node_info{})', 'Number of nodes')
      + generateTimeSeriesQuery('sum(kube_node_status_condition{status="true"}) by (condition) > 0', 'Node: {{ condition }}'),
  },

  currentNamespaceCount: {
    query():
      generateTimeSeriesQuery('sum(kube_namespace_status_phase) by (phase)', '{{ phase }}'),
  },

  currentPodCount: {
    query():
      generateTimeSeriesQuery('sum(kube_pod_status_phase{}) by (phase) > 0', '{{ phase}} Pods'),
  },

  // Same expressions as currentNodeCount.
  numberOfNodes: {
    query():
      generateTimeSeriesQuery('sum(kube_node_info{})', 'Number of nodes')
      + generateTimeSeriesQuery('sum(kube_node_status_condition{status="true"}) by (condition) > 0', 'Node: {{ condition }}'),
  },

  namespaceCount: {
    query():
      generateTimeSeriesQuery('sum(kube_namespace_status_phase) by (phase) > 0', '{{ phase }} namespaces'),
  },

  podCount: {
    query():
      generateTimeSeriesQuery('sum(kube_pod_status_phase{}) by (phase)', '{{phase}} pods'),
  },

  secretAndConfigMapCount: {
    query():
      generateTimeSeriesQuery('count(kube_secret_info{})', 'secrets')
      + generateTimeSeriesQuery('count(kube_configmap_info{})', 'Configmaps'),
  },

  deployCount: {
    query():
      generateTimeSeriesQuery('count(kube_deployment_labels{})', 'Deployments'),
  },

  serviceCount: {
    query():
      generateTimeSeriesQuery('count(kube_service_info{})', 'Services'),
  },

  top10ContainerRSS: {
    query():
      generateTimeSeriesQuery('topk(10, container_memory_rss{namespace!="",container!="POD",name!=""})', '{{ namespace }} - {{ name }}'),
  },

  top10ContainerCPU: {
    query():
      generateTimeSeriesQuery('topk(10,irate(container_cpu_usage_seconds_total{namespace!="",container!="POD",name!=""}[$interval])*100)', '{{ namespace }} - {{ name }}'),
  },

  goroutinesCount: {
    query():
      generateTimeSeriesQuery('topk(10, sum(go_goroutines{}) by (job,instance))', '{{ job }} - {{ instance }}'),
  },

  podDistribution: {
    query():
      // The aggregation keeps only `exported_node`, so the legend must use
      // that label; `{{ node }}` would render as an empty string.
      generateTimeSeriesQuery('count(kube_pod_info{}) by (exported_node)', '{{ exported_node }}'),
  },

  // ---------------------------------------------------------------------
  // Per-node queries (nodeName is used as a regex matcher value)
  // ---------------------------------------------------------------------

  basicCPU: {
    query(nodeName):
      generateTimeSeriesQuery('sum by (instance, mode)(rate(node_cpu_seconds_total{node=~"' + nodeName + '",job=~".*"}[$interval])) * 100', 'Busy {{mode}}'),
  },

  systemMemory: {
    query(nodeName):
      generateTimeSeriesQuery('node_memory_Active_bytes{node=~"' + nodeName + '"}', 'Active')
      + generateTimeSeriesQuery('node_memory_MemTotal_bytes{node=~"' + nodeName + '"}', 'Total')
      + generateTimeSeriesQuery('node_memory_Cached_bytes{node=~"' + nodeName + '"} + node_memory_Buffers_bytes{node=~"' + nodeName + '"}', 'Cached + Buffers')
      + generateTimeSeriesQuery('node_memory_MemAvailable_bytes{node=~"' + nodeName + '"}', 'Available'),
  },

  diskThroughput: {
    query(nodeName):
      generateTimeSeriesQuery('rate(node_disk_read_bytes_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', '{{ device }} - read')
      + generateTimeSeriesQuery('rate(node_disk_written_bytes_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', '{{ device }} - write'),
  },

  diskIOPS: {
    query(nodeName):
      generateTimeSeriesQuery('rate(node_disk_reads_completed_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', '{{ device }} - read')
      + generateTimeSeriesQuery('rate(node_disk_writes_completed_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', '{{ device }} - write'),
  },

  networkUtilization: {
    query(nodeName):
      // Byte rates are multiplied by 8 (bytes -> bits).
      generateTimeSeriesQuery('rate(node_network_receive_bytes_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval]) * 8', '{{instance}} - {{device}} - RX')
      + generateTimeSeriesQuery('rate(node_network_transmit_bytes_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval]) * 8', '{{instance}} - {{device}} - TX'),
  },

  networkPackets: {
    query(nodeName):
      generateTimeSeriesQuery('rate(node_network_receive_packets_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval])', '{{instance}} - {{device}} - RX')
      + generateTimeSeriesQuery('rate(node_network_transmit_packets_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval])', '{{instance}} - {{device}} - TX'),
  },

  networkDrop: {
    query(nodeName):
      generateTimeSeriesQuery('topk(10, rate(node_network_receive_drop_total{node=~"' + nodeName + '"}[$interval]))', 'rx-drop-{{ device }}')
      + generateTimeSeriesQuery('topk(10,rate(node_network_transmit_drop_total{node=~"' + nodeName + '"}[$interval]))', 'tx-drop-{{ device }}'),
  },

  conntrackStats: {
    query(nodeName):
      generateTimeSeriesQuery('node_nf_conntrack_entries{node=~"' + nodeName + '"}', 'conntrack_entries')
      + generateTimeSeriesQuery('node_nf_conntrack_entries_limit{node=~"' + nodeName + '"}', 'conntrack_limit'),
  },

  // NOTE(review): the two container queries below match the node through the
  // `instance` label, whereas the node_* queries above use `node` — confirm
  // both labels carry the same value in the target environment.
  top10ContainersCPU: {
    query(nodeName):
      generateTimeSeriesQuery('topk(10, sum(irate(container_cpu_usage_seconds_total{container!="POD",name!="",instance=~"' + nodeName + '",namespace!="",namespace=~"$namespace"}[$interval])) by (pod,container,namespace,name,service) * 100)', '{{ pod }}: {{ container }}'),
  },

  top10ContainersRSS: {
    query(nodeName):
      generateTimeSeriesQuery('topk(10, container_memory_rss{container!="POD",name!="",instance=~"' + nodeName + '",namespace!="",namespace=~"$namespace"})', '{{ pod }}: {{ container }}'),
  },
}
54 changes: 54 additions & 0 deletions assets/k8s-perf/variables.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
local var = g.dashboard.variable;

// Dashboard template variables for the k8s performance dashboard.
// NOTE(review): grafonnet documents `withSort(0)` as "ignore the integer and
// fall back to the named defaults" — confirm the rendered `sort` value of the
// query variables below is the one intended.
{
  // Prometheus datasource picker; single-valued, no "All" option.
  Datasource:
    var.datasource.new('Datasource', 'prometheus')
    + var.datasource.withRegex('')
    + var.query.withRefresh(1)
    + var.query.selectionOptions.withIncludeAll(false)
    + var.query.selectionOptions.withMulti(false),

  // Node names taken from the `exported_node` label; multi-select.
  _worker_node:
    var.query.new('_worker_node', 'label_values(kube_node_labels{}, exported_node)')
    + var.query.generalOptions.withLabel('Worker')
    + var.query.withSort(0)
    + var.query.withRefresh(2)
    + var.query.selectionOptions.withIncludeAll(false)
    + var.query.selectionOptions.withMulti(true),

  // Namespaces taken from the `exported_namespace` label; single-select with "All".
  namespace:
    var.query.new('namespace', 'label_values(kube_pod_info, exported_namespace)')
    + var.query.generalOptions.withLabel('Namespace')
    + var.query.withSort(0)
    + var.query.withRefresh(2)
    + var.query.selectionOptions.withIncludeAll(true)
    + var.query.selectionOptions.withMulti(false),

  // Block devices; the regex rejects any device name containing "dm" or "rb".
  block_device:
    var.query.new('block_device', 'label_values(node_disk_written_bytes_total,device)')
    + var.query.generalOptions.withLabel('Block device')
    + var.query.withSort(0)
    // Query variables use the query-variable regex helper (previously the
    // datasource-variable helper was borrowed here).
    + var.query.withRegex('/^(?:(?!dm|rb).)*$/')
    + var.query.withRefresh(2)
    + var.query.selectionOptions.withIncludeAll(true)
    + var.query.selectionOptions.withMulti(true),

  // Network devices; the regex keeps only names starting with "br", "en" or "et".
  net_device:
    var.query.new('net_device', 'label_values(node_network_receive_bytes_total,device)')
    + var.query.generalOptions.withLabel('Network device')
    + var.query.withSort(0)
    + var.query.withRegex('/^((br|en|et).*)$/')
    + var.query.withRefresh(2)
    + var.query.selectionOptions.withIncludeAll(true)
    + var.query.selectionOptions.withMulti(true),

  // Range-vector window used as `[$interval]` by the rate()/irate() queries.
  // NOTE(review): a datasource is attached to this interval variable through
  // the query-variable helper; it is kept to preserve the rendered JSON —
  // confirm whether it is needed.
  interval:
    var.interval.new('interval', ['2m', '3m', '4m', '5m'])
    + var.query.withDatasourceFromVariable(self.Datasource)
    + var.interval.generalOptions.withLabel('interval')
    + var.interval.withAutoOption(count=30, minInterval='10s')
    + var.query.withRefresh(2)
    + var.query.selectionOptions.withMulti(false)
    + var.query.selectionOptions.withIncludeAll(false),
}
61 changes: 61 additions & 0 deletions templates/General/k8s-perf-v2.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
local panels = import '../../assets/k8s-perf/panels.libsonnet';
local queries = import '../../assets/k8s-perf/queries.libsonnet';
local variables = import '../../assets/k8s-perf/variables.libsonnet';
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';

// "k8s Performance" dashboard: one collapsed row of cluster-wide panels plus
// one collapsed row of per-node panels repeated for each selected
// `$_worker_node` value.
g.dashboard.new('k8s Performance')
+ g.dashboard.time.withFrom('now-1h')
+ g.dashboard.time.withTo('now')
+ g.dashboard.withTimezone('utc')
+ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'])
+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'])
+ g.dashboard.withRefresh('30s')
+ g.dashboard.withEditable(false)
+ g.dashboard.graphTooltip.withSharedCrosshair()
+ g.dashboard.withVariables([
  variables.Datasource,
  variables._worker_node,
  variables.namespace,
  variables.block_device,
  variables.net_device,
  variables.interval,
])

+ g.dashboard.withPanels([
  // NOTE(review): both rows declare the same gridPos (y: 14) — confirm the
  // intended vertical order is preserved when the dashboard is rendered.
  g.panel.row.new('Cluster Details')
  + g.panel.row.withCollapsed(true)
  + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
  + g.panel.row.withPanels([
    panels.stat.genericStatLegendPanel('Current Node Count', 'none', queries.currentNodeCount.query(), { x: 0, y: 4, w: 8, h: 3 }),
    panels.stat.genericStatLegendPanel('Current namespace Count', 'none', queries.currentNamespaceCount.query(), { x: 8, y: 4, w: 8, h: 3 }),
    panels.stat.genericStatLegendPanel('Current Pod Count', 'none', queries.currentPodCount.query(), { x: 16, y: 4, w: 8, h: 3 }),
    panels.timeSeries.genericTimeSeriesPanel('Number of nodes', 'none', queries.numberOfNodes.query(), { x: 0, y: 12, w: 8, h: 8 }),
    panels.timeSeries.genericTimeSeriesPanel('Namespace count', 'none', queries.namespaceCount.query(), { x: 8, y: 12, w: 8, h: 8 }),
    panels.timeSeries.genericTimeSeriesPanel('Pod count', 'none', queries.podCount.query(), { x: 16, y: 12, w: 8, h: 8 }),
    panels.timeSeries.genericTimeSeriesPanel('Secret & configmap count', 'none', queries.secretAndConfigMapCount.query(), { x: 0, y: 20, w: 8, h: 8 }),
    panels.timeSeries.genericTimeSeriesPanel('Deployment count', 'none', queries.deployCount.query(), { x: 8, y: 20, w: 8, h: 8 }),
    panels.timeSeries.genericTimeSeriesPanel('Services count', 'none', queries.serviceCount.query(), { x: 16, y: 20, w: 8, h: 8 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container RSS', 'bytes', queries.top10ContainerRSS.query(), { x: 0, y: 28, w: 24, h: 8 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container CPU', 'percent', queries.top10ContainerCPU.query(), { x: 0, y: 36, w: 12, h: 8 }),
    panels.timeSeries.genericTimeSeriesPanel('Goroutines count', 'none', queries.goroutinesCount.query(), { x: 12, y: 36, w: 12, h: 8 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Pod Distribution', 'none', queries.podDistribution.query(), { x: 0, y: 44, w: 24, h: 8 }),
  ]),

  // Repeated once per selected value of the `_worker_node` variable.
  g.panel.row.new('Node: $_worker_node')
  + g.panel.row.withCollapsed(true)
  + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
  + g.panel.row.withRepeat('_worker_node')
  + g.panel.row.withPanels([
    panels.timeSeries.genericTimeSeriesLegendPanel('CPU Basic: $_worker_node ', 'percent', queries.basicCPU.query('$_worker_node'), { x: 0, y: 0, w: 12, h: 8 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('System Memory: $_worker_node ', 'bytes', queries.systemMemory.query('$_worker_node'), { x: 12, y: 0, w: 12, h: 8 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Disk throughput: $_worker_node ', 'Bps', queries.diskThroughput.query('$_worker_node'), { x: 0, y: 8, w: 12, h: 8 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Disk IOPS: $_worker_node', 'iops', queries.diskIOPS.query('$_worker_node'), { x: 12, y: 8, w: 12, h: 8 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Network Utilization: $_worker_node', 'bps', queries.networkUtilization.query('$_worker_node'), { x: 0, y: 16, w: 12, h: 8 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Network Packets: $_worker_node', 'pps', queries.networkPackets.query('$_worker_node'), { x: 12, y: 16, w: 12, h: 8 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Network packets drop: $_worker_node', 'pps', queries.networkDrop.query('$_worker_node'), { x: 0, y: 24, w: 12, h: 8 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Conntrack stats: $_worker_node', '', queries.conntrackStats.query('$_worker_node'), { x: 12, y: 24, w: 12, h: 8 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container CPU: $_worker_node', 'percent', queries.top10ContainersCPU.query('$_worker_node'), { x: 0, y: 32, w: 12, h: 8 }),
    // The argument is spliced verbatim into a label regex, so it must not
    // carry stray whitespace (a leading space made the matcher never match).
    panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainersRSS.query('$_worker_node'), { x: 12, y: 32, w: 12, h: 8 }),
  ]),
])

0 comments on commit 7b63ba6

Please sign in to comment.