diff --git a/assets/etcd-on-cluster-dashboard/panels.libsonnet b/assets/etcd-on-cluster-dashboard/panels.libsonnet index b23073c..32097df 100644 --- a/assets/etcd-on-cluster-dashboard/panels.libsonnet +++ b/assets/etcd-on-cluster-dashboard/panels.libsonnet @@ -5,12 +5,13 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn local timeSeries = g.panel.timeSeries, local custom = timeSeries.fieldConfig.defaults.custom, local options = timeSeries.options, + local standardOptions = timeSeries.standardOptions, + local byRegexp = timeSeries.standardOptions.override.byRegexp, base(title, unit, targets, gridPos): timeSeries.new(title) + timeSeries.queryOptions.withTargets(targets) + timeSeries.datasource.withType('prometheus') - + timeSeries.datasource.withUid('$Datasource') + timeSeries.standardOptions.withUnit(unit) + timeSeries.gridPos.withX(gridPos.x) + timeSeries.gridPos.withY(gridPos.y) @@ -38,7 +39,30 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn 'mean', 'max', ]) - + options.legend.withDisplayMode('table'), + + options.legend.withDisplayMode('table') + + options.legend.withSortBy('Max') + + options.legend.withSortDesc(true), + + generalCounter(title, unit, targets, gridPos): + self.base(title, unit, targets, gridPos) + + options.legend.withCalcs([ + 'first', + 'min', + 'max', + 'last', + ]), + + histogramStatsRightHand(title, unit, targets, gridPos, leftAxis): + self.generalCounter(title, unit, targets, gridPos) + + custom.withAxisLabel(leftAxis) + + options.legend.withDisplayMode('table') + + options.legend.withSortBy('Max') + + standardOptions.withOverrides([ + byRegexp.new('.*irate.*') + + byRegexp.withProperty('custom.axisPlacement', 'right') + + byRegexp.withProperty('custom.axisLabel', 'rate') + + byRegexp.withProperty('unit', 'none'), + ]), withoutCalcsAgg(title, unit, targets, gridPos): self.base(title, unit, targets, gridPos) diff --git a/assets/etcd-on-cluster-dashboard/queries.libsonnet 
b/assets/etcd-on-cluster-dashboard/queries.libsonnet index a5f5804..050d35a 100644 --- a/assets/etcd-on-cluster-dashboard/queries.libsonnet +++ b/assets/etcd-on-cluster-dashboard/queries.libsonnet @@ -12,22 +12,46 @@ local generateTimeSeriesQuery(query, legend) = [ { CPUUsage: { query(): - generateTimeSeriesQuery('sum(irate(container_cpu_usage_seconds_total{namespace="openshift-etcd", container="etcd"}[2m])) by (pod) * 100', '{{ pod }}'), + generateTimeSeriesQuery('sum(irate(container_cpu_usage_seconds_total{namespace="openshift-etcd", container="etcd",pod=~"$etcd_pod"}[2m])) by (pod) * 100', '{{ pod }}'), }, memoryUsage: { query(): - generateTimeSeriesQuery('sum(avg_over_time(container_memory_working_set_bytes{container="",pod!="", namespace=~"openshift-etcd.*"}[2m])) BY (pod, namespace)', '{{ pod }}'), + generateTimeSeriesQuery('sum(avg_over_time(container_memory_working_set_bytes{container="",pod!="", namespace=~"openshift-etcd.*",pod=~"$etcd_pod"}[2m])) BY (pod, namespace)', '{{ pod }}'), }, diskWalSyncDuration: { query(): - generateTimeSeriesQuery('histogram_quantile(0.99, sum(irate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))', '{{pod}} WAL fsync'), + generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace="openshift-etcd",pod=~"$etcd_pod"}[5m])) by (pod, le))', '{{pod}} WAL fsync'), }, - diskBackendSyncDuration: { + diskWalSyncDurationSum: { query(): - generateTimeSeriesQuery('histogram_quantile(0.99, sum(irate(etcd_disk_backend_commit_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))', '{{pod}} DB fsync'), + generateTimeSeriesQuery('irate(etcd_disk_wal_fsync_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate WAL sum {{instance}} ') + + generateTimeSeriesQuery('etcd_disk_wal_fsync_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'WAL sum {{instance}} '), + }, + + 
diskWalSyncDurationCount: { + query(): + generateTimeSeriesQuery('irate(etcd_disk_wal_fsync_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate WAL count {{instance}} ') + + generateTimeSeriesQuery('etcd_disk_wal_fsync_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'WAL count {{instance}} '), + }, + + diskBackendCommitDuration: { + query(): + generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{namespace="openshift-etcd",pod=~"$etcd_pod"}[5m])) by (pod, le))', '{{pod}} DB fsync'), + }, + + diskBackendCommitDurationSum: { + query(): + generateTimeSeriesQuery('irate(etcd_disk_backend_commit_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate DB commit sum {{instance}} ') + + generateTimeSeriesQuery('etcd_disk_backend_commit_duration_seconds_sum{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'DB commit sum {{instance}} '), + }, + + diskBackendCommitDurationCount: { + query(): + generateTimeSeriesQuery('irate(etcd_disk_backend_commit_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '2m irate DB commit count {{instance}} ') + + generateTimeSeriesQuery('etcd_disk_backend_commit_duration_seconds_count{namespace="openshift-etcd",pod=~"$etcd_pod"}', 'DB commit count {{instance}} '), }, etcdContainerDiskWrites: { @@ -37,8 +61,8 @@ local generateTimeSeriesQuery(query, legend) = [ dbSize: { query(): - generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"}', '{{pod}} DB physical size') - + generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_use_in_bytes{namespace="openshift-etcd"}', '{{pod}} DB logical size'), + generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{pod}} DB physical size') + + generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_use_in_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{pod}} DB logical size'), }, 
containerNetworkTraffic: { @@ -49,19 +73,19 @@ local generateTimeSeriesQuery(query, legend) = [ p99PeerToPeerLatency: { query(): - generateTimeSeriesQuery('histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{namespace="openshift-etcd"}[2m]))', '{{pod}}'), + generateTimeSeriesQuery('histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m]))', '{{pod}}'), }, peerNetworkTraffic: { query(): - generateTimeSeriesQuery('rate(etcd_network_peer_received_bytes_total{namespace="openshift-etcd"}[2m])', 'rx {{pod}} Peer Traffic') - + generateTimeSeriesQuery('rate(etcd_network_peer_sent_bytes_total{namespace="openshift-etcd"}[2m])', 'tx {{pod}} Peer Traffic'), + generateTimeSeriesQuery('rate(etcd_network_peer_received_bytes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', 'rx {{pod}} Peer Traffic') + + generateTimeSeriesQuery('rate(etcd_network_peer_sent_bytes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', 'tx {{pod}} Peer Traffic'), }, gRPCNetworkTraffic: { query(): - generateTimeSeriesQuery('rate(etcd_network_client_grpc_received_bytes_total{namespace="openshift-etcd"}[2m])', 'rx {{pod}}') - + generateTimeSeriesQuery('rate(etcd_network_client_grpc_sent_bytes_total{namespace="openshift-etcd"}[2m])', 'tx {{pod}}'), + generateTimeSeriesQuery('rate(etcd_network_client_grpc_received_bytes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', 'rx {{pod}}') + + generateTimeSeriesQuery('rate(etcd_network_client_grpc_sent_bytes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', 'tx {{pod}}'), }, activeStreams: { @@ -77,17 +101,17 @@ local generateTimeSeriesQuery(query, legend) = [ dbSpaceUsed: { query(): - generateTimeSeriesQuery('(etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"} / etcd_server_quota_backend_bytes{namespace="openshift-etcd"})*100', '{{pod}}'), + 
generateTimeSeriesQuery('(etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"} / etcd_server_quota_backend_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"})*100', '{{pod}}'), }, dbLeftCapacity: { query(): - generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd"} - etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"}', '{{pod}}'), + generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"} - etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{pod}}'), }, dbSizeLimit: { query(): - generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd"}', '{{ pod }} Quota Bytes'), + generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} Quota Bytes'), }, raftProposals: { @@ -115,34 +139,34 @@ local generateTimeSeriesQuery(query, legend) = [ keys: { query(): - generateTimeSeriesQuery('etcd_debugging_mvcc_keys_total{namespace="openshift-etcd"}', '{{ pod }} Num keys'), + generateTimeSeriesQuery('etcd_debugging_mvcc_keys_total{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} Num keys'), }, leaderElectionsPerDay: { query(): - generateTimeSeriesQuery('changes(etcd_server_leader_changes_seen_total{namespace="openshift-etcd"}[1d])', '{{instance}} Total Leader Elections Per Day'), + generateTimeSeriesQuery('changes(etcd_server_leader_changes_seen_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[1d])', '{{instance}} Total Leader Elections Per Day'), }, slowOperations: { query(): - generateTimeSeriesQuery('delta(etcd_server_slow_apply_total{namespace="openshift-etcd"}[2m])', '{{ pod }} slow applies') - + generateTimeSeriesQuery('delta(etcd_server_slow_read_indexes_total{namespace="openshift-etcd"}[2m])', '{{ pod }} slow read indexes'), + generateTimeSeriesQuery('delta(etcd_server_slow_apply_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '{{ pod }} 
slow applies') + + generateTimeSeriesQuery('delta(etcd_server_slow_read_indexes_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '{{ pod }} slow read indexes'), }, keyOperations: { query(): - generateTimeSeriesQuery('rate(etcd_mvcc_put_total{namespace="openshift-etcd"}[2m])', '{{ pod }} puts/s') - + generateTimeSeriesQuery('rate(etcd_mvcc_delete_total{namespace="openshift-etcd"}[2m])', '{{ pod }} deletes/s'), + generateTimeSeriesQuery('rate(etcd_mvcc_put_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '{{ pod }} puts/s') + + generateTimeSeriesQuery('rate(etcd_mvcc_delete_total{namespace="openshift-etcd",pod=~"$etcd_pod"}[2m])', '{{ pod }} deletes/s'), }, heartBeatFailure: { query(): - generateTimeSeriesQuery('etcd_server_heartbeat_send_failures_total{namespace="openshift-etcd"}', '{{ pod }} heartbeat failures') - + generateTimeSeriesQuery('etcd_server_health_failures{namespace="openshift-etcd"}', '{{ pod }} health failures'), + generateTimeSeriesQuery('etcd_server_heartbeat_send_failures_total{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} heartbeat failures') + + generateTimeSeriesQuery('etcd_server_health_failures{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} health failures'), }, compactedKeys: { query(): - generateTimeSeriesQuery('etcd_debugging_mvcc_db_compaction_keys_total{namespace="openshift-etcd"}', '{{ pod }} keys compacted'), + generateTimeSeriesQuery('etcd_debugging_mvcc_db_compaction_keys_total{namespace="openshift-etcd",pod=~"$etcd_pod"}', '{{ pod }} keys compacted'), }, } diff --git a/assets/etcd-on-cluster-dashboard/variables.libsonnet b/assets/etcd-on-cluster-dashboard/variables.libsonnet index 6edcc56..3936016 100644 --- a/assets/etcd-on-cluster-dashboard/variables.libsonnet +++ b/assets/etcd-on-cluster-dashboard/variables.libsonnet @@ -9,4 +9,15 @@ local var = g.dashboard.variable; + var.query.withRefresh(1) + var.query.selectionOptions.withMulti(false) + var.query.selectionOptions.withIncludeAll(false), 
+ + etcd_pod: + var.query.new('etcd_pod') + + var.query.withDatasourceFromVariable(self.Datasource) + + var.query.queryTypes.withLabelValues( + 'pod', + 'etcd_cluster_version', + ) + + var.query.withRefresh(2) + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll(true), } diff --git a/templates/General/etcd-on-cluster-dashboard.jsonnet b/templates/General/etcd-on-cluster-dashboard.jsonnet index f2d08b3..6c3aaa8 100644 --- a/templates/General/etcd-on-cluster-dashboard.jsonnet +++ b/templates/General/etcd-on-cluster-dashboard.jsonnet @@ -14,6 +14,7 @@ g.dashboard.new('etcd-cluster-info dashoard') + g.dashboard.graphTooltip.withSharedCrosshair() + g.dashboard.withVariables([ variables.Datasource, + variables.etcd_pod, ]) + g.dashboard.withPanels([ @@ -24,11 +25,29 @@ g.dashboard.new('etcd-cluster-info dashoard') panels.timeSeries.generalUsageAgg('CPU usage', 'percent', queries.CPUUsage.query(), { x: 0, y: 1, w: 12, h: 8 }), panels.timeSeries.generalUsageAgg('Memory usage', 'bytes', queries.memoryUsage.query(), { x: 12, y: 1, w: 12, h: 8 }), panels.timeSeries.generalUsageAgg('Disk WAL Sync Duration', 's', queries.diskWalSyncDuration.query(), { x: 0, y: 8, w: 12, h: 8 }), - panels.timeSeries.generalUsageAgg('Disk Backend Sync Duration', 's', queries.diskBackendSyncDuration.query(), { x: 12, y: 8, w: 12, h: 8 }), + panels.timeSeries.generalUsageAgg('Disk Backend Sync Duration', 's', queries.diskBackendCommitDuration.query(), { x: 12, y: 8, w: 12, h: 8 }), panels.timeSeries.generalUsageAgg('Etcd container disk writes', 'Bps', queries.etcdContainerDiskWrites.query(), { x: 0, y: 16, w: 12, h: 8 }), panels.timeSeries.generalUsageAgg('DB Size', 'bytes', queries.dbSize.query(), { x: 12, y: 16, w: 12, h: 8 }), ]), + g.panel.row.new('WAL fsync Duration Detailed') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.timeSeries.generalUsageAgg('WAL fsync Duration p99', 
's', queries.diskWalSyncDuration.query(), { x: 0, y: 0, w: 8, h: 8 }), + panels.timeSeries.histogramStatsRightHand('WAL fsync Duration sum', 'none', queries.diskWalSyncDurationSum.query(), { x: 8, y: 0, w: 8, h: 8 }, 'sum'), + panels.timeSeries.histogramStatsRightHand('WAL fsync Duration count', 'none', queries.diskWalSyncDurationCount.query(), { x: 16, y: 0, w: 8, h: 8 }, 'count'), + ]), + + g.panel.row.new('Backend Commit Duration Detailed') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.timeSeries.generalUsageAgg('Backend Commit Duration', 's', queries.diskBackendCommitDuration.query(), { x: 0, y: 0, w: 8, h: 8 }), + panels.timeSeries.histogramStatsRightHand('Backend Commit Duration sum', 'none', queries.diskBackendCommitDurationSum.query(), { x: 8, y: 0, w: 8, h: 8 }, 'sum'), + panels.timeSeries.histogramStatsRightHand('Backend Commit Duration count', 'none', queries.diskBackendCommitDurationCount.query(), { x: 16, y: 0, w: 8, h: 8 }, 'count'), + ]), + g.panel.row.new('Network Usage') + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + g.panel.row.withCollapsed(true) @@ -62,7 +81,7 @@ g.dashboard.new('etcd-cluster-info dashoard') panels.timeSeries.GeneralInfo('Leader Elections Per Day', 'short', queries.leaderElectionsPerDay.query(), { x: 0, y: 12, w: 12, h: 6 }), panels.timeSeries.GeneralInfo('Slow Operations', 'ops', queries.slowOperations.query(), { x: 0, y: 20, w: 12, h: 8 }), panels.timeSeries.GeneralInfo('Key Operations', 'ops', queries.keyOperations.query(), { x: 12, y: 20, w: 12, h: 8 }), - panels.timeSeries.GeneralInfo('Heartbeat Failures', 'short', queries.heartBeatFailure.query(), { x: 0, y: 28, w: 12, h: 8 }), + panels.timeSeries.generalCounter('Heartbeat Failures', 'short', queries.heartBeatFailure.query(), { x: 0, y: 28, w: 12, h: 8 }), panels.timeSeries.GeneralInfo('Compacted Keys', 'short', queries.compactedKeys.query(), { x: 12, y: 28, w: 12, h: 8 }), ]),