From 17cda479d68374bef504b5ef303de99586e3a74a Mon Sep 17 00:00:00 2001 From: Andrew Collins Date: Tue, 24 Sep 2024 17:27:22 -0500 Subject: [PATCH] Add etcd_pod picker Signed-off-by: Andrew Collins --- Note: the backend commit sum/count legends added by this patch read "DB" rather than "WAL", since they plot etcd_disk_backend_commit_duration_seconds_* and not the WAL fsync metrics. .../panels.libsonnet | 12 +++++++---- .../queries.libsonnet | 20 +++++++++---------- .../variables.libsonnet | 11 ++++++++++ .../General/etcd-on-cluster-dashboard.jsonnet | 11 +++++----- 4 files changed, 35 insertions(+), 19 deletions(-) diff --git a/assets/etcd-on-cluster-dashboard/panels.libsonnet b/assets/etcd-on-cluster-dashboard/panels.libsonnet index e6b42fd..32097df 100644 --- a/assets/etcd-on-cluster-dashboard/panels.libsonnet +++ b/assets/etcd-on-cluster-dashboard/panels.libsonnet @@ -43,15 +43,19 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn + options.legend.withSortBy('Max') + options.legend.withSortDesc(true), - rightHandOverride(title, unit, targets, gridPos, leftAxis): - self.generalUsageAgg(title, unit, targets, gridPos) - + custom.withAxisLabel(leftAxis) + generalCounter(title, unit, targets, gridPos): + self.base(title, unit, targets, gridPos) + options.legend.withCalcs([ 'first', 'min', 'max', 'last', - ]) + ]), + + histogramStatsRightHand(title, unit, targets, gridPos, leftAxis): + self.generalCounter(title, unit, targets, gridPos) + + custom.withAxisLabel(leftAxis) + options.legend.withDisplayMode('table') + options.legend.withSortBy('Max') + standardOptions.withOverrides([ byRegexp.new('.*irate.*') diff --git a/assets/etcd-on-cluster-dashboard/queries.libsonnet b/assets/etcd-on-cluster-dashboard/queries.libsonnet index 48be962..b9060f5 100644 --- a/assets/etcd-on-cluster-dashboard/queries.libsonnet +++ b/assets/etcd-on-cluster-dashboard/queries.libsonnet @@ -22,36 +22,36 @@ local generateTimeSeriesQuery(query, legend) = [ diskWalSyncDuration: { query(): - generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace="openshift-etcd"}[5m])) by (pod, le))', 
'{{pod}} WAL fsync'), + generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace="openshift-etcd",pod=~"$etcd_pod"}[5m])) by (pod, le))', '{{pod}} WAL fsync'), }, diskWalSyncDurationSum: { query(): - generateTimeSeriesQuery('irate(etcd_disk_wal_fsync_duration_seconds_sum{namespace="openshift-etcd"}[2m])', '2m irate WAL sum {{instance}} ') - + generateTimeSeriesQuery('etcd_disk_wal_fsync_duration_seconds_sum{namespace="openshift-etcd"}', 'WAL sum {{instance}} '), + generateTimeSeriesQuery('irate(etcd_disk_wal_fsync_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate WAL sum {{instance}} ') + + generateTimeSeriesQuery('etcd_disk_wal_fsync_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'WAL sum {{instance}} '), }, diskWalSyncDurationCount: { query(): - generateTimeSeriesQuery('irate(etcd_disk_wal_fsync_duration_seconds_count{namespace="openshift-etcd"}[2m])', '2m irate WAL count {{instance}} ') - + generateTimeSeriesQuery('etcd_disk_wal_fsync_duration_seconds_count{namespace="openshift-etcd"}', 'WAL count {{instance}} '), + generateTimeSeriesQuery('irate(etcd_disk_wal_fsync_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate WAL count {{instance}} ') + + generateTimeSeriesQuery('etcd_disk_wal_fsync_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'WAL count {{instance}} '), }, diskBackendCommitDuration: { query(): - generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{namespace="openshift-etcd"}[5m])) by (pod, le))', '{{pod}} DB fsync'), + generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{namespace="openshift-etcd",pod=~"$etcd_pod"}[5m])) by (pod, le))', '{{pod}} DB fsync'), }, diskBackendCommitDurationSum: { query(): - 
generateTimeSeriesQuery('irate(etcd_disk_backend_commit_duration_seconds_sum{namespace="openshift-etcd"}[2m])', '2m irate WAL sum {{instance}} ') - + generateTimeSeriesQuery('etcd_disk_backend_commit_duration_seconds_sum{namespace="openshift-etcd"}', 'WAL sum {{instance}} '), + generateTimeSeriesQuery('irate(etcd_disk_backend_commit_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate DB sum {{instance}} ') + + generateTimeSeriesQuery('etcd_disk_backend_commit_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'DB sum {{instance}} '), }, diskBackendCommitDurationCount: { query(): - generateTimeSeriesQuery('irate(etcd_disk_backend_commit_duration_seconds_count{namespace="openshift-etcd"}[2m])', '2m irate WAL count {{instance}} ') - + generateTimeSeriesQuery('etcd_disk_backend_commit_duration_seconds_count{namespace="openshift-etcd"}', 'WAL count {{instance}} '), + generateTimeSeriesQuery('irate(etcd_disk_backend_commit_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate DB count {{instance}} ') + + generateTimeSeriesQuery('etcd_disk_backend_commit_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'DB count {{instance}} '), }, etcdContainerDiskWrites: { diff --git a/assets/etcd-on-cluster-dashboard/variables.libsonnet b/assets/etcd-on-cluster-dashboard/variables.libsonnet index 6edcc56..ec3e97d 100644 --- a/assets/etcd-on-cluster-dashboard/variables.libsonnet +++ b/assets/etcd-on-cluster-dashboard/variables.libsonnet @@ -9,4 +9,15 @@ local var = g.dashboard.variable; + var.query.withRefresh(1) + var.query.selectionOptions.withMulti(false) + var.query.selectionOptions.withIncludeAll(false), + + etcd_pod: + var.query.new('etcd_pod') + + var.query.withDatasourceFromVariable(self.Datasource) + + var.query.queryTypes.withLabelValues( + 'pod', + 'etcd_cluster_version', + ) + + var.query.withRefresh(2) + + var.query.selectionOptions.withMulti() + + 
var.query.selectionOptions.withIncludeAll(false), } diff --git a/templates/General/etcd-on-cluster-dashboard.jsonnet b/templates/General/etcd-on-cluster-dashboard.jsonnet index 78a2185..6c3aaa8 100644 --- a/templates/General/etcd-on-cluster-dashboard.jsonnet +++ b/templates/General/etcd-on-cluster-dashboard.jsonnet @@ -14,6 +14,7 @@ g.dashboard.new('etcd-cluster-info dashoard') + g.dashboard.graphTooltip.withSharedCrosshair() + g.dashboard.withVariables([ variables.Datasource, + variables.etcd_pod, ]) + g.dashboard.withPanels([ @@ -34,8 +35,8 @@ g.dashboard.new('etcd-cluster-info dashoard') + g.panel.row.withCollapsed(true) + g.panel.row.withPanels([ panels.timeSeries.generalUsageAgg('WAL fsync Duration p99', 's', queries.diskWalSyncDuration.query(), { x: 0, y: 0, w: 8, h: 8 }), - panels.timeSeries.rightHandOverride('WAL fsync Duration sum', 'none', queries.diskWalSyncDurationSum.query(), { x: 8, y: 0, w: 8, h: 8 }, 'sum'), - panels.timeSeries.rightHandOverride('WAL fsync Duration count', 'none', queries.diskWalSyncDurationCount.query(), { x: 16, y: 0, w: 8, h: 8 }, 'count'), + panels.timeSeries.histogramStatsRightHand('WAL fsync Duration sum', 'none', queries.diskWalSyncDurationSum.query(), { x: 8, y: 0, w: 8, h: 8 }, 'sum'), + panels.timeSeries.histogramStatsRightHand('WAL fsync Duration count', 'none', queries.diskWalSyncDurationCount.query(), { x: 16, y: 0, w: 8, h: 8 }, 'count'), ]), g.panel.row.new('Backend Commit Duration Detailed') @@ -43,8 +44,8 @@ g.dashboard.new('etcd-cluster-info dashoard') + g.panel.row.withCollapsed(true) + g.panel.row.withPanels([ panels.timeSeries.generalUsageAgg('Backend Commit Duration', 's', queries.diskBackendCommitDuration.query(), { x: 0, y: 0, w: 8, h: 8 }), - panels.timeSeries.rightHandOverride('Backend Commit Duration sum', 'none', queries.diskBackendCommitDurationSum.query(), { x: 8, y: 0, w: 8, h: 8 }, 'sum'), - panels.timeSeries.rightHandOverride('Backend Commit Duration count', 'none', 
queries.diskBackendCommitDurationCount.query(), { x: 16, y: 0, w: 8, h: 8 }, 'count'), + panels.timeSeries.histogramStatsRightHand('Backend Commit Duration sum', 'none', queries.diskBackendCommitDurationSum.query(), { x: 8, y: 0, w: 8, h: 8 }, 'sum'), + panels.timeSeries.histogramStatsRightHand('Backend Commit Duration count', 'none', queries.diskBackendCommitDurationCount.query(), { x: 16, y: 0, w: 8, h: 8 }, 'count'), ]), g.panel.row.new('Network Usage') @@ -80,7 +81,7 @@ g.dashboard.new('etcd-cluster-info dashoard') panels.timeSeries.GeneralInfo('Leader Elections Per Day', 'short', queries.leaderElectionsPerDay.query(), { x: 0, y: 12, w: 12, h: 6 }), panels.timeSeries.GeneralInfo('Slow Operations', 'ops', queries.slowOperations.query(), { x: 0, y: 20, w: 12, h: 8 }), panels.timeSeries.GeneralInfo('Key Operations', 'ops', queries.keyOperations.query(), { x: 12, y: 20, w: 12, h: 8 }), - panels.timeSeries.GeneralInfo('Heartbeat Failures', 'short', queries.heartBeatFailure.query(), { x: 0, y: 28, w: 12, h: 8 }), + panels.timeSeries.generalCounter('Heartbeat Failures', 'short', queries.heartBeatFailure.query(), { x: 0, y: 28, w: 12, h: 8 }), panels.timeSeries.GeneralInfo('Compacted Keys', 'short', queries.compactedKeys.query(), { x: 12, y: 28, w: 12, h: 8 }), ]),