Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Etcd panels #137

Merged
merged 3 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions assets/etcd-on-cluster-dashboard/panels.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
local timeSeries = g.panel.timeSeries,
local custom = timeSeries.fieldConfig.defaults.custom,
local options = timeSeries.options,
local standardOptions = timeSeries.standardOptions,
local byRegexp = timeSeries.standardOptions.override.byRegexp,

base(title, unit, targets, gridPos):
timeSeries.new(title)
+ timeSeries.queryOptions.withTargets(targets)
+ timeSeries.datasource.withType('prometheus')
+ timeSeries.datasource.withUid('$Datasource')
+ timeSeries.standardOptions.withUnit(unit)
+ timeSeries.gridPos.withX(gridPos.x)
+ timeSeries.gridPos.withY(gridPos.y)
Expand Down Expand Up @@ -38,7 +39,30 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
'mean',
'max',
])
+ options.legend.withDisplayMode('table'),
+ options.legend.withDisplayMode('table')
+ options.legend.withSortBy('Max')
+ options.legend.withSortDesc(true),

// Counter-oriented panel: starts from `base` and sets the legend
// calculations to first / min / max / last.
generalCounter(title, unit, targets, gridPos):
  local counterCalcs = ['first', 'min', 'max', 'last'];
  self.base(title, unit, targets, gridPos)
  + options.legend.withCalcs(counterCalcs),

// Starts from `generalCounter`, sets the axis label to `leftAxis`, shows the
// legend as a table sorted by 'Max', and adds one override keyed on the regexp
// '.*irate.*' that sets axis placement 'right', axis label 'rate' and unit 'none'.
histogramStatsRightHand(title, unit, targets, gridPos, leftAxis):
  local irateOverride =
    byRegexp.new('.*irate.*')
    + byRegexp.withProperty('custom.axisPlacement', 'right')
    + byRegexp.withProperty('custom.axisLabel', 'rate')
    + byRegexp.withProperty('unit', 'none');
  self.generalCounter(title, unit, targets, gridPos)
  + custom.withAxisLabel(leftAxis)
  + options.legend.withDisplayMode('table')
  + options.legend.withSortBy('Max')
  + standardOptions.withOverrides([irateOverride]),

withoutCalcsAgg(title, unit, targets, gridPos):
self.base(title, unit, targets, gridPos)
Expand Down
72 changes: 48 additions & 24 deletions assets/etcd-on-cluster-dashboard/queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,46 @@ local generateTimeSeriesQuery(query, legend) = [
{
CPUUsage: {
query():
generateTimeSeriesQuery('sum(irate(container_cpu_usage_seconds_total{namespace="openshift-etcd", container="etcd"}[2m])) by (pod) * 100', '{{ pod }}'),
generateTimeSeriesQuery('sum(irate(container_cpu_usage_seconds_total{namespace="openshift-etcd", container="etcd",pod=~"$etcd_pod"}[2m])) by (pod) * 100', '{{ pod }}'),
},

memoryUsage: {
query():
generateTimeSeriesQuery('sum(avg_over_time(container_memory_working_set_bytes{container="",pod!="", namespace=~"openshift-etcd.*"}[2m])) BY (pod, namespace)', '{{ pod }}'),
generateTimeSeriesQuery('sum(avg_over_time(container_memory_working_set_bytes{container="",pod!="", namespace=~"openshift-etcd.*",pod=~"$etcd_pod"}[2m])) BY (pod, namespace)', '{{ pod }}'),
},

diskWalSyncDuration: {
query():
generateTimeSeriesQuery('histogram_quantile(0.99, sum(irate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))', '{{pod}} WAL fsync'),
generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace="openshift-etcd",pod=~"$etcd_pod"}[5m])) by (pod, le))', '{{pod}} WAL fsync'),
},

diskBackendSyncDuration: {
diskWalSyncDurationSum: {
query():
generateTimeSeriesQuery('histogram_quantile(0.99, sum(irate(etcd_disk_backend_commit_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))', '{{pod}} DB fsync'),
generateTimeSeriesQuery('irate(etcd_disk_wal_fsync_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate WAL sum {{instance}} ')
+ generateTimeSeriesQuery('etcd_disk_wal_fsync_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'WAL sum {{instance}} '),
},

// Two series for the WAL fsync histogram `_count`: its 2m irate and the raw
// value, both restricted to the pods selected by $etcd_pod.
diskWalSyncDurationCount: {
  query():
    local series = 'etcd_disk_wal_fsync_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}';
    generateTimeSeriesQuery('irate(' + series + '[2m])', '2m irate WAL count {{instance}} ')
    + generateTimeSeriesQuery(series, 'WAL count {{instance}} '),
},

// 0.99 quantile of the backend commit duration histogram, computed from the
// 5m rate of its buckets and grouped by pod.
diskBackendCommitDuration: {
  query():
    local buckets = 'etcd_disk_backend_commit_duration_seconds_bucket{namespace="openshift-etcd",pod=~"$etcd_pod"}';
    local p99 = 'histogram_quantile(0.99, sum(rate(' + buckets + '[5m])) by (pod, le))';
    generateTimeSeriesQuery(p99, '{{pod}} DB fsync'),
},

// Two series for the backend commit histogram `_sum`: its 2m irate and the raw
// value, both restricted to the pods selected by $etcd_pod.
// The legends previously said "WAL", copied from the WAL fsync queries; they
// now name the metric actually plotted.
diskBackendCommitDurationSum: {
  query():
    // Keep "irate" in the first legend: histogramStatsRightHand's override
    // matches on the regexp '.*irate.*'.
    generateTimeSeriesQuery('irate(etcd_disk_backend_commit_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate backend commit sum {{instance}} ')
    + generateTimeSeriesQuery('etcd_disk_backend_commit_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'backend commit sum {{instance}} '),
},

// Two series for the backend commit histogram `_count`: its 2m irate and the
// raw value, both restricted to the pods selected by $etcd_pod.
// The legends previously said "WAL", copied from the WAL fsync queries; they
// now name the metric actually plotted.
diskBackendCommitDurationCount: {
  query():
    // Keep "irate" in the first legend: histogramStatsRightHand's override
    // matches on the regexp '.*irate.*'.
    generateTimeSeriesQuery('irate(etcd_disk_backend_commit_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate backend commit count {{instance}} ')
    + generateTimeSeriesQuery('etcd_disk_backend_commit_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'backend commit count {{instance}} '),
},

etcdContainerDiskWrites: {
Expand All @@ -37,8 +61,8 @@ local generateTimeSeriesQuery(query, legend) = [

dbSize: {
query():
generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"}', '{{pod}} DB physical size')
+ generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_use_in_bytes{namespace="openshift-etcd"}', '{{pod}} DB logical size'),
generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{pod}} DB physical size')
+ generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_use_in_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{pod}} DB logical size'),
},

containerNetworkTraffic: {
Expand All @@ -49,19 +73,19 @@ local generateTimeSeriesQuery(query, legend) = [

p99PeerToPeerLatency: {
query():
generateTimeSeriesQuery('histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{namespace="openshift-etcd"}[2m]))', '{{pod}}'),
generateTimeSeriesQuery('histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m]))', '{{pod}}'),
},

peerNetworkTraffic: {
query():
generateTimeSeriesQuery('rate(etcd_network_peer_received_bytes_total{namespace="openshift-etcd"}[2m])', 'rx {{pod}} Peer Traffic')
+ generateTimeSeriesQuery('rate(etcd_network_peer_sent_bytes_total{namespace="openshift-etcd"}[2m])', 'tx {{pod}} Peer Traffic'),
generateTimeSeriesQuery('rate(etcd_network_peer_received_bytes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', 'rx {{pod}} Peer Traffic')
+ generateTimeSeriesQuery('rate(etcd_network_peer_sent_bytes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', 'tx {{pod}} Peer Traffic'),
},

gRPCNetworkTraffic: {
query():
generateTimeSeriesQuery('rate(etcd_network_client_grpc_received_bytes_total{namespace="openshift-etcd"}[2m])', 'rx {{pod}}')
+ generateTimeSeriesQuery('rate(etcd_network_client_grpc_sent_bytes_total{namespace="openshift-etcd"}[2m])', 'tx {{pod}}'),
generateTimeSeriesQuery('rate(etcd_network_client_grpc_received_bytes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', 'rx {{pod}}')
+ generateTimeSeriesQuery('rate(etcd_network_client_grpc_sent_bytes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', 'tx {{pod}}'),
},

activeStreams: {
Expand All @@ -77,17 +101,17 @@ local generateTimeSeriesQuery(query, legend) = [

dbSpaceUsed: {
query():
generateTimeSeriesQuery('(etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"} / etcd_server_quota_backend_bytes{namespace="openshift-etcd"})*100', '{{pod}}'),
generateTimeSeriesQuery('(etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"} / etcd_server_quota_backend_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"})*100', '{{pod}}'),
},

dbLeftCapacity: {
query():
generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd"} - etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"}', '{{pod}}'),
generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"} - etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{pod}}'),
},

dbSizeLimit: {
query():
generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd"}', '{{ pod }} Quota Bytes'),
generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} Quota Bytes'),
},

raftProposals: {
Expand Down Expand Up @@ -115,34 +139,34 @@ local generateTimeSeriesQuery(query, legend) = [

keys: {
query():
generateTimeSeriesQuery('etcd_debugging_mvcc_keys_total{namespace="openshift-etcd"}', '{{ pod }} Num keys'),
generateTimeSeriesQuery('etcd_debugging_mvcc_keys_total{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} Num keys'),
},

leaderElectionsPerDay: {
query():
generateTimeSeriesQuery('changes(etcd_server_leader_changes_seen_total{namespace="openshift-etcd"}[1d])', '{{instance}} Total Leader Elections Per Day'),
generateTimeSeriesQuery('changes(etcd_server_leader_changes_seen_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[1d])', '{{instance}} Total Leader Elections Per Day'),
},

slowOperations: {
query():
generateTimeSeriesQuery('delta(etcd_server_slow_apply_total{namespace="openshift-etcd"}[2m])', '{{ pod }} slow applies')
+ generateTimeSeriesQuery('delta(etcd_server_slow_read_indexes_total{namespace="openshift-etcd"}[2m])', '{{ pod }} slow read indexes'),
generateTimeSeriesQuery('delta(etcd_server_slow_apply_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '{{ pod }} slow applies')
+ generateTimeSeriesQuery('delta(etcd_server_slow_read_indexes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '{{ pod }} slow read indexes'),
},

keyOperations: {
query():
generateTimeSeriesQuery('rate(etcd_mvcc_put_total{namespace="openshift-etcd"}[2m])', '{{ pod }} puts/s')
+ generateTimeSeriesQuery('rate(etcd_mvcc_delete_total{namespace="openshift-etcd"}[2m])', '{{ pod }} deletes/s'),
generateTimeSeriesQuery('rate(etcd_mvcc_put_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '{{ pod }} puts/s')
+ generateTimeSeriesQuery('rate(etcd_mvcc_delete_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '{{ pod }} deletes/s'),
},

heartBeatFailure: {
query():
generateTimeSeriesQuery('etcd_server_heartbeat_send_failures_total{namespace="openshift-etcd"}', '{{ pod }} heartbeat failures')
+ generateTimeSeriesQuery('etcd_server_health_failures{namespace="openshift-etcd"}', '{{ pod }} health failures'),
generateTimeSeriesQuery('etcd_server_heartbeat_send_failures_total{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} heartbeat failures')
+ generateTimeSeriesQuery('etcd_server_health_failures{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} health failures'),
},

compactedKeys: {
query():
generateTimeSeriesQuery('etcd_debugging_mvcc_db_compaction_keys_total{namespace="openshift-etcd"}', '{{ pod }} keys compacted'),
generateTimeSeriesQuery('etcd_debugging_mvcc_db_compaction_keys_total{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} keys compacted'),
},
}
11 changes: 11 additions & 0 deletions assets/etcd-on-cluster-dashboard/variables.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,15 @@ local var = g.dashboard.variable;
+ var.query.withRefresh(1)
+ var.query.selectionOptions.withMulti(false)
+ var.query.selectionOptions.withIncludeAll(false),

// Query variable `etcd_pod`: label values of `pod` on `etcd_cluster_version`,
// read from the datasource variable, with multi-select and include-all enabled.
etcd_pod:
  local q = var.query;
  q.new('etcd_pod')
  + q.withDatasourceFromVariable(self.Datasource)
  + q.queryTypes.withLabelValues('pod', 'etcd_cluster_version')
  + q.withRefresh(2)
  + q.selectionOptions.withMulti()
  + q.selectionOptions.withIncludeAll(true),
}
23 changes: 21 additions & 2 deletions templates/General/etcd-on-cluster-dashboard.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ g.dashboard.new('etcd-cluster-info dashoard')
+ g.dashboard.graphTooltip.withSharedCrosshair()
+ g.dashboard.withVariables([
variables.Datasource,
variables.etcd_pod,
])

+ g.dashboard.withPanels([
Expand All @@ -24,11 +25,29 @@ g.dashboard.new('etcd-cluster-info dashoard')
panels.timeSeries.generalUsageAgg('CPU usage', 'percent', queries.CPUUsage.query(), { x: 0, y: 1, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Memory usage', 'bytes', queries.memoryUsage.query(), { x: 12, y: 1, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Disk WAL Sync Duration', 's', queries.diskWalSyncDuration.query(), { x: 0, y: 8, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Disk Backend Sync Duration', 's', queries.diskBackendSyncDuration.query(), { x: 12, y: 8, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Disk Backend Sync Duration', 's', queries.diskBackendCommitDuration.query(), { x: 12, y: 8, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Etcd container disk writes', 'Bps', queries.etcdContainerDiskWrites.query(), { x: 0, y: 16, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('DB Size', 'bytes', queries.dbSize.query(), { x: 12, y: 16, w: 12, h: 8 }),
]),

// Collapsed row with three panels for the WAL fsync histogram: p99, sum, count.
local row = g.panel.row;
local ts = panels.timeSeries;
row.new('WAL fsync Duration Detailed')
+ row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+ row.withCollapsed(true)
+ row.withPanels([
  ts.generalUsageAgg('WAL fsync Duration p99', 's', queries.diskWalSyncDuration.query(), { x: 0, y: 0, w: 8, h: 8 }),
  ts.histogramStatsRightHand('WAL fsync Duration sum', 'none', queries.diskWalSyncDurationSum.query(), { x: 8, y: 0, w: 8, h: 8 }, 'sum'),
  ts.histogramStatsRightHand('WAL fsync Duration count', 'none', queries.diskWalSyncDurationCount.query(), { x: 16, y: 0, w: 8, h: 8 }, 'count'),
]),

// Collapsed row with three panels for the backend commit histogram:
// quantile, sum, count.
local row = g.panel.row;
local ts = panels.timeSeries;
row.new('Backend Commit Duration Detailed')
+ row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+ row.withCollapsed(true)
+ row.withPanels([
  ts.generalUsageAgg('Backend Commit Duration', 's', queries.diskBackendCommitDuration.query(), { x: 0, y: 0, w: 8, h: 8 }),
  ts.histogramStatsRightHand('Backend Commit Duration sum', 'none', queries.diskBackendCommitDurationSum.query(), { x: 8, y: 0, w: 8, h: 8 }, 'sum'),
  ts.histogramStatsRightHand('Backend Commit Duration count', 'none', queries.diskBackendCommitDurationCount.query(), { x: 16, y: 0, w: 8, h: 8 }, 'count'),
]),

g.panel.row.new('Network Usage')
+ g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+ g.panel.row.withCollapsed(true)
Expand Down Expand Up @@ -62,7 +81,7 @@ g.dashboard.new('etcd-cluster-info dashoard')
panels.timeSeries.GeneralInfo('Leader Elections Per Day', 'short', queries.leaderElectionsPerDay.query(), { x: 0, y: 12, w: 12, h: 6 }),
panels.timeSeries.GeneralInfo('Slow Operations', 'ops', queries.slowOperations.query(), { x: 0, y: 20, w: 12, h: 8 }),
panels.timeSeries.GeneralInfo('Key Operations', 'ops', queries.keyOperations.query(), { x: 12, y: 20, w: 12, h: 8 }),
panels.timeSeries.GeneralInfo('Heartbeat Failures', 'short', queries.heartBeatFailure.query(), { x: 0, y: 28, w: 12, h: 8 }),
panels.timeSeries.generalCounter('Heartbeat Failures', 'short', queries.heartBeatFailure.query(), { x: 0, y: 28, w: 12, h: 8 }),
panels.timeSeries.GeneralInfo('Compacted Keys', 'short', queries.compactedKeys.query(), { x: 12, y: 28, w: 12, h: 8 }),
]),

Expand Down