Skip to content

Commit

Permalink
Etcd panels (#137)
Browse files Browse the repository at this point in the history
* Add some etcd duration histogram graphs
* Add etcd_pod picker
* Add etcd_pod variable and picker
---------
Signed-off-by: Andrew Collins <[email protected]>
  • Loading branch information
afcollins authored Sep 27, 2024
1 parent 1b03ca2 commit f374918
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 28 deletions.
28 changes: 26 additions & 2 deletions assets/etcd-on-cluster-dashboard/panels.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
local timeSeries = g.panel.timeSeries,
local custom = timeSeries.fieldConfig.defaults.custom,
local options = timeSeries.options,
local standardOptions = timeSeries.standardOptions,
local byRegexp = timeSeries.standardOptions.override.byRegexp,

base(title, unit, targets, gridPos):
timeSeries.new(title)
+ timeSeries.queryOptions.withTargets(targets)
+ timeSeries.datasource.withType('prometheus')
+ timeSeries.datasource.withUid('$Datasource')
+ timeSeries.standardOptions.withUnit(unit)
+ timeSeries.gridPos.withX(gridPos.x)
+ timeSeries.gridPos.withY(gridPos.y)
Expand Down Expand Up @@ -38,7 +39,30 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
'mean',
'max',
])
+ options.legend.withDisplayMode('table'),
+ options.legend.withDisplayMode('table')
+ options.legend.withSortBy('Max')
+ options.legend.withSortDesc(true),

// Time-series panel for counter-style metrics: starts from `base` and sets
// the legend calcs to first / min / max / last.
generalCounter(title, unit, targets, gridPos):
  local counterCalcs = ['first', 'min', 'max', 'last'];
  self.base(title, unit, targets, gridPos)
  + options.legend.withCalcs(counterCalcs),

// Counter panel that plots two kinds of series on separate axes.
//   leftAxis: label applied to the panel's default axis via custom.withAxisLabel.
// Fields whose name matches '.*irate.*' are overridden onto a right-hand axis
// labelled 'rate' with unit 'none'; everything else keeps the panel defaults.
// NOTE(review): withSortBy('Max') is set here without withSortDesc, while
// another legend in this file pairs it with withSortDesc(true) — confirm the
// sort direction is intended.
histogramStatsRightHand(title, unit, targets, gridPos, leftAxis):
  local irateOnRightAxis =
    byRegexp.new('.*irate.*')
    + byRegexp.withProperty('custom.axisPlacement', 'right')
    + byRegexp.withProperty('custom.axisLabel', 'rate')
    + byRegexp.withProperty('unit', 'none');
  self.generalCounter(title, unit, targets, gridPos)
  + custom.withAxisLabel(leftAxis)
  + options.legend.withDisplayMode('table')
  + options.legend.withSortBy('Max')
  + standardOptions.withOverrides([irateOnRightAxis]),

withoutCalcsAgg(title, unit, targets, gridPos):
self.base(title, unit, targets, gridPos)
Expand Down
72 changes: 48 additions & 24 deletions assets/etcd-on-cluster-dashboard/queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,46 @@ local generateTimeSeriesQuery(query, legend) = [
{
CPUUsage: {
query():
generateTimeSeriesQuery('sum(irate(container_cpu_usage_seconds_total{namespace="openshift-etcd", container="etcd"}[2m])) by (pod) * 100', '{{ pod }}'),
generateTimeSeriesQuery('sum(irate(container_cpu_usage_seconds_total{namespace="openshift-etcd", container="etcd",pod=~"$etcd_pod"}[2m])) by (pod) * 100', '{{ pod }}'),
},

memoryUsage: {
query():
generateTimeSeriesQuery('sum(avg_over_time(container_memory_working_set_bytes{container="",pod!="", namespace=~"openshift-etcd.*"}[2m])) BY (pod, namespace)', '{{ pod }}'),
generateTimeSeriesQuery('sum(avg_over_time(container_memory_working_set_bytes{container="",pod!="", namespace=~"openshift-etcd.*",pod=~"$etcd_pod"}[2m])) BY (pod, namespace)', '{{ pod }}'),
},

diskWalSyncDuration: {
query():
generateTimeSeriesQuery('histogram_quantile(0.99, sum(irate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))', '{{pod}} WAL fsync'),
generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace="openshift-etcd",pod=~"$etcd_pod"}[5m])) by (pod, le))', '{{pod}} WAL fsync'),
},

diskBackendSyncDuration: {
diskWalSyncDurationSum: {
query():
generateTimeSeriesQuery('histogram_quantile(0.99, sum(irate(etcd_disk_backend_commit_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))', '{{pod}} DB fsync'),
generateTimeSeriesQuery('irate(etcd_disk_wal_fsync_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate WAL sum {{instance}} ')
+ generateTimeSeriesQuery('etcd_disk_wal_fsync_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'WAL sum {{instance}} '),
},

// Two targets over the WAL fsync histogram's `_count` series, filtered by the
// $etcd_pod variable: the 2m irate and the raw counter value.
diskWalSyncDurationCount: {
  query():
    local walCount = 'etcd_disk_wal_fsync_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}';
    generateTimeSeriesQuery('irate(' + walCount + '[2m])', '2m irate WAL count {{instance}} ')
    + generateTimeSeriesQuery(walCount, 'WAL count {{instance}} '),
},

// Single target: 0.99 quantile of the backend commit duration histogram,
// computed from the 5m rate of its buckets and grouped by pod.
diskBackendCommitDuration: {
  query():
    local commitBuckets = 'etcd_disk_backend_commit_duration_seconds_bucket{namespace="openshift-etcd",pod=~"$etcd_pod"}';
    local p99 = 'histogram_quantile(0.99, sum(rate(' + commitBuckets + '[5m])) by (pod, le))';
    generateTimeSeriesQuery(p99, '{{pod}} DB fsync'),
},

// Two targets over the backend commit histogram's `_sum` series, filtered by
// the $etcd_pod variable: the 2m irate and the raw counter value.
// The legends name the backend commit metric (they previously said "WAL",
// copied from the WAL fsync queries). The word "irate" is kept in the first
// legend because a panel override in this dashboard matches on '.*irate.*'.
diskBackendCommitDurationSum: {
  query():
    generateTimeSeriesQuery('irate(etcd_disk_backend_commit_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate backend commit sum {{instance}} ')
    + generateTimeSeriesQuery('etcd_disk_backend_commit_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'backend commit sum {{instance}} '),
},

// Two targets over the backend commit histogram's `_count` series, filtered by
// the $etcd_pod variable: the 2m irate and the raw counter value.
// The legends name the backend commit metric (they previously said "WAL",
// copied from the WAL fsync queries). The word "irate" is kept in the first
// legend because a panel override in this dashboard matches on '.*irate.*'.
diskBackendCommitDurationCount: {
  query():
    generateTimeSeriesQuery('irate(etcd_disk_backend_commit_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate backend commit count {{instance}} ')
    + generateTimeSeriesQuery('etcd_disk_backend_commit_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'backend commit count {{instance}} '),
},

etcdContainerDiskWrites: {
Expand All @@ -37,8 +61,8 @@ local generateTimeSeriesQuery(query, legend) = [

dbSize: {
query():
generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"}', '{{pod}} DB physical size')
+ generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_use_in_bytes{namespace="openshift-etcd"}', '{{pod}} DB logical size'),
generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{pod}} DB physical size')
+ generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_use_in_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{pod}} DB logical size'),
},

containerNetworkTraffic: {
Expand All @@ -49,19 +73,19 @@ local generateTimeSeriesQuery(query, legend) = [

p99PeerToPeerLatency: {
query():
generateTimeSeriesQuery('histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{namespace="openshift-etcd"}[2m]))', '{{pod}}'),
generateTimeSeriesQuery('histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m]))', '{{pod}}'),
},

peerNetworkTraffic: {
query():
generateTimeSeriesQuery('rate(etcd_network_peer_received_bytes_total{namespace="openshift-etcd"}[2m])', 'rx {{pod}} Peer Traffic')
+ generateTimeSeriesQuery('rate(etcd_network_peer_sent_bytes_total{namespace="openshift-etcd"}[2m])', 'tx {{pod}} Peer Traffic'),
generateTimeSeriesQuery('rate(etcd_network_peer_received_bytes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', 'rx {{pod}} Peer Traffic')
+ generateTimeSeriesQuery('rate(etcd_network_peer_sent_bytes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', 'tx {{pod}} Peer Traffic'),
},

gRPCNetworkTraffic: {
query():
generateTimeSeriesQuery('rate(etcd_network_client_grpc_received_bytes_total{namespace="openshift-etcd"}[2m])', 'rx {{pod}}')
+ generateTimeSeriesQuery('rate(etcd_network_client_grpc_sent_bytes_total{namespace="openshift-etcd"}[2m])', 'tx {{pod}}'),
generateTimeSeriesQuery('rate(etcd_network_client_grpc_received_bytes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', 'rx {{pod}}')
+ generateTimeSeriesQuery('rate(etcd_network_client_grpc_sent_bytes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', 'tx {{pod}}'),
},

activeStreams: {
Expand All @@ -77,17 +101,17 @@ local generateTimeSeriesQuery(query, legend) = [

dbSpaceUsed: {
query():
generateTimeSeriesQuery('(etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"} / etcd_server_quota_backend_bytes{namespace="openshift-etcd"})*100', '{{pod}}'),
generateTimeSeriesQuery('(etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"} / etcd_server_quota_backend_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"})*100', '{{pod}}'),
},

dbLeftCapacity: {
query():
generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd"} - etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"}', '{{pod}}'),
generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"} - etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{pod}}'),
},

dbSizeLimit: {
query():
generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd"}', '{{ pod }} Quota Bytes'),
generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} Quota Bytes'),
},

raftProposals: {
Expand Down Expand Up @@ -115,34 +139,34 @@ local generateTimeSeriesQuery(query, legend) = [

keys: {
query():
generateTimeSeriesQuery('etcd_debugging_mvcc_keys_total{namespace="openshift-etcd"}', '{{ pod }} Num keys'),
generateTimeSeriesQuery('etcd_debugging_mvcc_keys_total{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} Num keys'),
},

leaderElectionsPerDay: {
query():
generateTimeSeriesQuery('changes(etcd_server_leader_changes_seen_total{namespace="openshift-etcd"}[1d])', '{{instance}} Total Leader Elections Per Day'),
generateTimeSeriesQuery('changes(etcd_server_leader_changes_seen_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[1d])', '{{instance}} Total Leader Elections Per Day'),
},

slowOperations: {
query():
generateTimeSeriesQuery('delta(etcd_server_slow_apply_total{namespace="openshift-etcd"}[2m])', '{{ pod }} slow applies')
+ generateTimeSeriesQuery('delta(etcd_server_slow_read_indexes_total{namespace="openshift-etcd"}[2m])', '{{ pod }} slow read indexes'),
generateTimeSeriesQuery('delta(etcd_server_slow_apply_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '{{ pod }} slow applies')
+ generateTimeSeriesQuery('delta(etcd_server_slow_read_indexes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '{{ pod }} slow read indexes'),
},

keyOperations: {
query():
generateTimeSeriesQuery('rate(etcd_mvcc_put_total{namespace="openshift-etcd"}[2m])', '{{ pod }} puts/s')
+ generateTimeSeriesQuery('rate(etcd_mvcc_delete_total{namespace="openshift-etcd"}[2m])', '{{ pod }} deletes/s'),
generateTimeSeriesQuery('rate(etcd_mvcc_put_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '{{ pod }} puts/s')
+ generateTimeSeriesQuery('rate(etcd_mvcc_delete_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '{{ pod }} deletes/s'),
},

heartBeatFailure: {
query():
generateTimeSeriesQuery('etcd_server_heartbeat_send_failures_total{namespace="openshift-etcd"}', '{{ pod }} heartbeat failures')
+ generateTimeSeriesQuery('etcd_server_health_failures{namespace="openshift-etcd"}', '{{ pod }} health failures'),
generateTimeSeriesQuery('etcd_server_heartbeat_send_failures_total{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} heartbeat failures')
+ generateTimeSeriesQuery('etcd_server_health_failures{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} health failures'),
},

compactedKeys: {
query():
generateTimeSeriesQuery('etcd_debugging_mvcc_db_compaction_keys_total{namespace="openshift-etcd"}', '{{ pod }} keys compacted'),
generateTimeSeriesQuery('etcd_debugging_mvcc_db_compaction_keys_total{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} keys compacted'),
},
}
11 changes: 11 additions & 0 deletions assets/etcd-on-cluster-dashboard/variables.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,15 @@ local var = g.dashboard.variable;
+ var.query.withRefresh(1)
+ var.query.selectionOptions.withMulti(false)
+ var.query.selectionOptions.withIncludeAll(false),

// Dashboard variable `etcd_pod`, referenced by the queries as $etcd_pod.
// Its options are the values of the `pod` label on `etcd_cluster_version`,
// read through the Datasource variable defined in this object.
// NOTE(review): refresh value 2 presumably means "on time range change" and
// withMulti() with no argument presumably enables multi-select — confirm
// against the grafonnet/Grafana docs.
etcd_pod:
  local podLabel = 'pod';
  local sourceMetric = 'etcd_cluster_version';
  var.query.new('etcd_pod')
  + var.query.withDatasourceFromVariable(self.Datasource)
  + var.query.queryTypes.withLabelValues(podLabel, sourceMetric)
  + var.query.withRefresh(2)
  + var.query.selectionOptions.withMulti()
  + var.query.selectionOptions.withIncludeAll(true),
}
23 changes: 21 additions & 2 deletions templates/General/etcd-on-cluster-dashboard.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ g.dashboard.new('etcd-cluster-info dashoard')
+ g.dashboard.graphTooltip.withSharedCrosshair()
+ g.dashboard.withVariables([
variables.Datasource,
variables.etcd_pod,
])

+ g.dashboard.withPanels([
Expand All @@ -24,11 +25,29 @@ g.dashboard.new('etcd-cluster-info dashoard')
panels.timeSeries.generalUsageAgg('CPU usage', 'percent', queries.CPUUsage.query(), { x: 0, y: 1, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Memory usage', 'bytes', queries.memoryUsage.query(), { x: 12, y: 1, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Disk WAL Sync Duration', 's', queries.diskWalSyncDuration.query(), { x: 0, y: 8, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Disk Backend Sync Duration', 's', queries.diskBackendSyncDuration.query(), { x: 12, y: 8, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Disk Backend Sync Duration', 's', queries.diskBackendCommitDuration.query(), { x: 12, y: 8, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Etcd container disk writes', 'Bps', queries.etcdContainerDiskWrites.query(), { x: 0, y: 16, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('DB Size', 'bytes', queries.dbSize.query(), { x: 12, y: 16, w: 12, h: 8 }),
]),

// Collapsed row with three side-by-side panels for the WAL fsync histogram:
// the p99 quantile, the `_sum` series and the `_count` series.
// `slot(col)` gives the grid position of the col-th 8-wide, 8-high panel.
local slot(col) = { x: 8 * col, y: 0, w: 8, h: 8 };
g.panel.row.new('WAL fsync Duration Detailed')
+ g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+ g.panel.row.withCollapsed(true)
+ g.panel.row.withPanels([
  panels.timeSeries.generalUsageAgg('WAL fsync Duration p99', 's', queries.diskWalSyncDuration.query(), slot(0)),
  panels.timeSeries.histogramStatsRightHand('WAL fsync Duration sum', 'none', queries.diskWalSyncDurationSum.query(), slot(1), 'sum'),
  panels.timeSeries.histogramStatsRightHand('WAL fsync Duration count', 'none', queries.diskWalSyncDurationCount.query(), slot(2), 'count'),
]),

// Collapsed row with three side-by-side panels for the backend commit
// histogram: the quantile panel, the `_sum` series and the `_count` series.
// `slot(col)` gives the grid position of the col-th 8-wide, 8-high panel.
local slot(col) = { x: 8 * col, y: 0, w: 8, h: 8 };
g.panel.row.new('Backend Commit Duration Detailed')
+ g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+ g.panel.row.withCollapsed(true)
+ g.panel.row.withPanels([
  panels.timeSeries.generalUsageAgg('Backend Commit Duration', 's', queries.diskBackendCommitDuration.query(), slot(0)),
  panels.timeSeries.histogramStatsRightHand('Backend Commit Duration sum', 'none', queries.diskBackendCommitDurationSum.query(), slot(1), 'sum'),
  panels.timeSeries.histogramStatsRightHand('Backend Commit Duration count', 'none', queries.diskBackendCommitDurationCount.query(), slot(2), 'count'),
]),

g.panel.row.new('Network Usage')
+ g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+ g.panel.row.withCollapsed(true)
Expand Down Expand Up @@ -62,7 +81,7 @@ g.dashboard.new('etcd-cluster-info dashoard')
panels.timeSeries.GeneralInfo('Leader Elections Per Day', 'short', queries.leaderElectionsPerDay.query(), { x: 0, y: 12, w: 12, h: 6 }),
panels.timeSeries.GeneralInfo('Slow Operations', 'ops', queries.slowOperations.query(), { x: 0, y: 20, w: 12, h: 8 }),
panels.timeSeries.GeneralInfo('Key Operations', 'ops', queries.keyOperations.query(), { x: 12, y: 20, w: 12, h: 8 }),
panels.timeSeries.GeneralInfo('Heartbeat Failures', 'short', queries.heartBeatFailure.query(), { x: 0, y: 28, w: 12, h: 8 }),
panels.timeSeries.generalCounter('Heartbeat Failures', 'short', queries.heartBeatFailure.query(), { x: 0, y: 28, w: 12, h: 8 }),
panels.timeSeries.GeneralInfo('Compacted Keys', 'short', queries.compactedKeys.query(), { x: 12, y: 28, w: 12, h: 8 }),
]),

Expand Down

0 comments on commit f374918

Please sign in to comment.