Skip to content

Commit

Permalink
Add some etcd duration histogram graphs
Browse files Browse the repository at this point in the history
Signed-off-by: Andrew Collins <[email protected]>
  • Loading branch information
afcollins committed Sep 27, 2024
1 parent 1b03ca2 commit 86bf15f
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 6 deletions.
24 changes: 22 additions & 2 deletions assets/etcd-on-cluster-dashboard/panels.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
local timeSeries = g.panel.timeSeries,
local custom = timeSeries.fieldConfig.defaults.custom,
local options = timeSeries.options,
local standardOptions = timeSeries.standardOptions,
local byRegexp = timeSeries.standardOptions.override.byRegexp,

base(title, unit, targets, gridPos):
timeSeries.new(title)
+ timeSeries.queryOptions.withTargets(targets)
+ timeSeries.datasource.withType('prometheus')
+ timeSeries.datasource.withUid('$Datasource')
+ timeSeries.standardOptions.withUnit(unit)
+ timeSeries.gridPos.withX(gridPos.x)
+ timeSeries.gridPos.withY(gridPos.y)
Expand Down Expand Up @@ -38,7 +39,26 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
'mean',
'max',
])
+ options.legend.withDisplayMode('table'),
+ options.legend.withDisplayMode('table')
+ options.legend.withSortBy('Max')
+ options.legend.withSortDesc(true),

// Builds on generalUsageAgg: labels the default axis with `leftAxis`, shows
// first/min/max/last in the legend, and adds a field override that moves every
// series matching '.*irate.*' to a right-hand axis labelled 'rate' (unit 'none').
rightHandOverride(title, unit, targets, gridPos, leftAxis):
  local legendCalcs = ['first', 'min', 'max', 'last'];
  // Override applied to the series selected by the '.*irate.*' regexp.
  local irateOnRightAxis =
    byRegexp.new('.*irate.*')
    + byRegexp.withProperty('custom.axisPlacement', 'right')
    + byRegexp.withProperty('custom.axisLabel', 'rate')
    + byRegexp.withProperty('unit', 'none');
  self.generalUsageAgg(title, unit, targets, gridPos)
  + custom.withAxisLabel(leftAxis)
  + options.legend.withCalcs(legendCalcs)
  + options.legend.withSortBy('Max')
  + standardOptions.withOverrides([irateOnRightAxis]),

withoutCalcsAgg(title, unit, targets, gridPos):
self.base(title, unit, targets, gridPos)
Expand Down
30 changes: 27 additions & 3 deletions assets/etcd-on-cluster-dashboard/queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,36 @@ local generateTimeSeriesQuery(query, legend) = [

diskWalSyncDuration: {
query():
generateTimeSeriesQuery('histogram_quantile(0.99, sum(irate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))', '{{pod}} WAL fsync'),
generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace="openshift-etcd"}[5m])) by (pod, le))', '{{pod}} WAL fsync'),
},

diskBackendSyncDuration: {
diskWalSyncDurationSum: {
query():
generateTimeSeriesQuery('histogram_quantile(0.99, sum(irate(etcd_disk_backend_commit_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))', '{{pod}} DB fsync'),
generateTimeSeriesQuery('irate(etcd_disk_wal_fsync_duration_seconds_sum{namespace="openshift-etcd"}[2m])', '2m irate WAL sum {{instance}} ')
+ generateTimeSeriesQuery('etcd_disk_wal_fsync_duration_seconds_sum{namespace="openshift-etcd"}', 'WAL sum {{instance}} '),
},

// Two query sets for etcd_disk_wal_fsync_duration_seconds_count in the
// openshift-etcd namespace: its 2m irate, followed by the raw series.
diskWalSyncDurationCount: {
  query():
    local rateSeries = generateTimeSeriesQuery('irate(etcd_disk_wal_fsync_duration_seconds_count{namespace="openshift-etcd"}[2m])', '2m irate WAL count {{instance}} ');
    local rawSeries = generateTimeSeriesQuery('etcd_disk_wal_fsync_duration_seconds_count{namespace="openshift-etcd"}', 'WAL count {{instance}} ');
    rateSeries + rawSeries,
},

// 0.99 quantile of etcd_disk_backend_commit_duration_seconds, computed from a
// 5m rate over the histogram buckets and grouped by pod.
diskBackendCommitDuration: {
  query():
    local expr = 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{namespace="openshift-etcd"}[5m])) by (pod, le))';
    local legend = '{{pod}} DB fsync';
    generateTimeSeriesQuery(expr, legend),
},

// Two query sets for etcd_disk_backend_commit_duration_seconds_sum in the
// openshift-etcd namespace: its 2m irate, followed by the raw series.
// The legends name the backend commit metric; they previously said "WAL",
// copied from the WAL fsync queries, which mislabelled these series.
// The word 'irate' stays in the first legend because the rightHandOverride
// panel helper selects series with the regexp '.*irate.*'.
diskBackendCommitDurationSum: {
  query():
    generateTimeSeriesQuery('irate(etcd_disk_backend_commit_duration_seconds_sum{namespace="openshift-etcd"}[2m])', '2m irate backend commit sum {{instance}} ')
    + generateTimeSeriesQuery('etcd_disk_backend_commit_duration_seconds_sum{namespace="openshift-etcd"}', 'backend commit sum {{instance}} '),
},

// Two query sets for etcd_disk_backend_commit_duration_seconds_count in the
// openshift-etcd namespace: its 2m irate, followed by the raw series.
// The legends name the backend commit metric; they previously said "WAL",
// copied from the WAL fsync queries, which mislabelled these series.
// The word 'irate' stays in the first legend because the rightHandOverride
// panel helper selects series with the regexp '.*irate.*'.
diskBackendCommitDurationCount: {
  query():
    generateTimeSeriesQuery('irate(etcd_disk_backend_commit_duration_seconds_count{namespace="openshift-etcd"}[2m])', '2m irate backend commit count {{instance}} ')
    + generateTimeSeriesQuery('etcd_disk_backend_commit_duration_seconds_count{namespace="openshift-etcd"}', 'backend commit count {{instance}} '),
},

etcdContainerDiskWrites: {
Expand Down
20 changes: 19 additions & 1 deletion templates/General/etcd-on-cluster-dashboard.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,29 @@ g.dashboard.new('etcd-cluster-info dashoard')
panels.timeSeries.generalUsageAgg('CPU usage', 'percent', queries.CPUUsage.query(), { x: 0, y: 1, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Memory usage', 'bytes', queries.memoryUsage.query(), { x: 12, y: 1, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Disk WAL Sync Duration', 's', queries.diskWalSyncDuration.query(), { x: 0, y: 8, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Disk Backend Sync Duration', 's', queries.diskBackendSyncDuration.query(), { x: 12, y: 8, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Disk Backend Sync Duration', 's', queries.diskBackendCommitDuration.query(), { x: 12, y: 8, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('Etcd container disk writes', 'Bps', queries.etcdContainerDiskWrites.query(), { x: 0, y: 16, w: 12, h: 8 }),
panels.timeSeries.generalUsageAgg('DB Size', 'bytes', queries.dbSize.query(), { x: 12, y: 16, w: 12, h: 8 }),
]),

// Collapsed dashboard row with three side-by-side panels (each w: 8, h: 8) for
// the WAL fsync duration: the p99 query, then the Sum and Count query sets.
// The Sum and Count panels use rightHandOverride, passing 'sum' / 'count' as
// the left-axis label.
// NOTE(review): gridPos y: 14 is also used by the rows that follow — presumably
// Grafana lays rows out by list order; confirm.
g.panel.row.new('WAL fsync Duration Detailed')
+ g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+ g.panel.row.withCollapsed(true)
+ g.panel.row.withPanels([
panels.timeSeries.generalUsageAgg('WAL fsync Duration p99', 's', queries.diskWalSyncDuration.query(), { x: 0, y: 0, w: 8, h: 8 }),
panels.timeSeries.rightHandOverride('WAL fsync Duration sum', 'none', queries.diskWalSyncDurationSum.query(), { x: 8, y: 0, w: 8, h: 8 }, 'sum'),
panels.timeSeries.rightHandOverride('WAL fsync Duration count', 'none', queries.diskWalSyncDurationCount.query(), { x: 16, y: 0, w: 8, h: 8 }, 'count'),
]),

// Collapsed dashboard row with three side-by-side panels (each w: 8, h: 8) for
// the backend commit duration: the p99 query, then the Sum and Count query sets.
// The first panel is titled '... p99' to match the 'WAL fsync Duration p99'
// panel in the sibling row; diskBackendCommitDuration is a
// histogram_quantile(0.99, ...) query.
// NOTE(review): gridPos y: 14 is also used by the neighbouring rows —
// presumably Grafana lays rows out by list order; confirm.
g.panel.row.new('Backend Commit Duration Detailed')
+ g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+ g.panel.row.withCollapsed(true)
+ g.panel.row.withPanels([
  panels.timeSeries.generalUsageAgg('Backend Commit Duration p99', 's', queries.diskBackendCommitDuration.query(), { x: 0, y: 0, w: 8, h: 8 }),
  panels.timeSeries.rightHandOverride('Backend Commit Duration sum', 'none', queries.diskBackendCommitDurationSum.query(), { x: 8, y: 0, w: 8, h: 8 }, 'sum'),
  panels.timeSeries.rightHandOverride('Backend Commit Duration count', 'none', queries.diskBackendCommitDurationCount.query(), { x: 16, y: 0, w: 8, h: 8 }, 'count'),
]),

g.panel.row.new('Network Usage')
+ g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+ g.panel.row.withCollapsed(true)
Expand Down

0 comments on commit 86bf15f

Please sign in to comment.