From e583969ebf675e18d9b4a9b05c7562dfc0cfe098 Mon Sep 17 00:00:00 2001 From: Vishnu Challa Date: Sat, 27 Jul 2024 17:15:16 -0400 Subject: [PATCH] Removed old code and keeping only latest grafonnet dashboards Signed-off-by: Vishnu Challa --- .github/workflows/ci.yml | 2 +- .github/workflows/release.yml | 2 +- Makefile | 22 +- README.md | 22 +- ...s-perf-v2.jsonnet => ingress-perf.jsonnet} | 0 ...netperf-v2.jsonnet => k8s-netperf.jsonnet} | 0 ...sonnet => kube-burner-report-mode.jsonnet} | 0 ...=> kube-burner-report-ocp-wrapper.jsonnet} | 0 .../api-performance-overview-v2.jsonnet | 50 - .../General/api-performance-overview.jsonnet | 532 +- templates/General/cilium-k8s-perf-v2.jsonnet | 70 - templates/General/cilium-k8s-perf.jsonnet | 625 +-- .../etcd-on-cluster-dashboard-v2.jsonnet | 69 - .../General/etcd-on-cluster-dashboard.jsonnet | 597 +-- .../General/hypershift-performance-v2.jsonnet | 154 - .../General/hypershift-performance.jsonnet | 2015 +------- templates/General/k8s-perf-v2.jsonnet | 61 - templates/General/k8s-perf.jsonnet | 560 +- templates/General/kube-burner.jsonnet | 4568 ----------------- templates/General/ocp-performance-v2.jsonnet | 145 - templates/General/ocp-performance.jsonnet | 855 +-- templates/General/ovn-dashboard.jsonnet | 378 +- templates/General/ovn-monitoring-v2.jsonnet | 59 - .../General/pgbench-dashboard-v2.jsonnet | 31 - templates/General/pgbench-dashboard.jsonnet | 331 +- templates/General/uperf-perf.jsonnet | 402 +- templates/General/uperf-v2.jsonnet | 32 - templates/General/vegeta-wrapper-v2.jsonnet | 31 - templates/General/vegeta-wrapper.jsonnet | 380 +- templates/General/ycsb-v2.jsonnet | 29 - templates/General/ycsb.jsonnet | 385 +- 31 files changed, 739 insertions(+), 11668 deletions(-) rename templates/CPT/{ingress-perf-v2.jsonnet => ingress-perf.jsonnet} (100%) rename templates/CPT/{k8s-netperf-v2.jsonnet => k8s-netperf.jsonnet} (100%) rename templates/CPT/{kube-burner-report-mode-v2.jsonnet => kube-burner-report-mode.jsonnet} 
(100%) rename templates/CPT/{kube-burner-report-ocp-wrapper-v2.jsonnet => kube-burner-report-ocp-wrapper.jsonnet} (100%) delete mode 100644 templates/General/api-performance-overview-v2.jsonnet delete mode 100644 templates/General/cilium-k8s-perf-v2.jsonnet delete mode 100644 templates/General/etcd-on-cluster-dashboard-v2.jsonnet delete mode 100644 templates/General/hypershift-performance-v2.jsonnet delete mode 100644 templates/General/k8s-perf-v2.jsonnet delete mode 100644 templates/General/kube-burner.jsonnet delete mode 100644 templates/General/ocp-performance-v2.jsonnet delete mode 100644 templates/General/ovn-monitoring-v2.jsonnet delete mode 100644 templates/General/pgbench-dashboard-v2.jsonnet delete mode 100644 templates/General/uperf-v2.jsonnet delete mode 100644 templates/General/vegeta-wrapper-v2.jsonnet delete mode 100644 templates/General/ycsb-v2.jsonnet diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 043ea3d..d9d0e43 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,7 +38,7 @@ jobs: - uses: actions/checkout@v4 - name: Compile dashboards - run: make v2 + run: make - name: Run grafana container run: sudo docker run -d -p 3000:3000 docker.io/grafana/grafana:9.4.3 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 365c337..47178ee 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,4 +25,4 @@ jobs: QUAY_TOKEN: ${{ secrets.QUAY_TOKEN }} - name: Build & push syncer image - run: make v2 build-syncer-image push-syncer-image + run: make build-syncer-image push-syncer-image diff --git a/Makefile b/Makefile index 3303400..7f772be 100644 --- a/Makefile +++ b/Makefile @@ -10,15 +10,9 @@ ALLDIRS = $(BINDIR) $(OUTPUTDIR) SYNCER_IMG_TAG ?= quay.io/cloud-bulldozer/dittybopper-syncer:latest PLATFORM = linux/amd64,linux/arm64,linux/ppc64le,linux/s390x -ifeq ($(filter v2,$(MAKECMDGOALS)),v2) - # Set variables and instructions for v2 - TEMPLATES := $(wildcard 
$(TEMPLATESDIR)/**/*-v2.jsonnet) - LIBRARY_PATH := $(TEMPLATESDIR)/vendor -else - # Get all templates at $(TEMPLATESDIR) - TEMPLATES := $(filter-out %-v2.jsonnet, $(wildcard $(TEMPLATESDIR)/**/*.jsonnet)) - LIBRARY_PATH := $(TEMPLATESDIR)/grafonnet-lib -endif +# Get all templates at $(TEMPLATESDIR) +TEMPLATES := $(wildcard $(TEMPLATESDIR)/**/*.jsonnet) +LIBRARY_PATH := $(TEMPLATESDIR)/vendor # Replace $(TEMPLATESDIR)/*.jsonnet by $(OUTPUTDIR)/*.json outputs := $(patsubst $(TEMPLATESDIR)/%.jsonnet, $(OUTPUTDIR)/%.json, $(TEMPLATES)) @@ -37,7 +31,7 @@ build: deps $(LIBRARY_PATH) $(outputs) clean: @echo "Cleaning up" - rm -rf $(ALLDIRS) $(TEMPLATESDIR)/vendor $(TEMPLATESDIR)/grafonnet-lib + rm -rf $(ALLDIRS) $(TEMPLATESDIR)/vendor $(BINDIR)/jsonnet: @echo "Downloading jsonnet binary" @@ -46,9 +40,6 @@ $(BINDIR)/jsonnet: curl -s -L $(JB) -o $(BINDIR)/jb chmod +x $(BINDIR)/jb -$(TEMPLATESDIR)/grafonnet-lib: - git clone --depth 1 https://github.com/grafana/grafonnet-lib.git $(TEMPLATESDIR)/grafonnet-lib - $(TEMPLATESDIR)/vendor: @echo "Downloading vendor files" cd $(TEMPLATESDIR) && ../$(BINDIR)/jb install && cd ../ @@ -59,10 +50,7 @@ $(OUTPUTDIR)/%.json: $(TEMPLATESDIR)/%.jsonnet mkdir -p $(dir $@) $(BINDIR)/jsonnet -J ./$(LIBRARY_PATH) $< > $@ -v2: all - @echo "Rendered the v2 dashboards with latest grafonnet library" - -build-syncer-image: v2 +build-syncer-image: build podman build --platform=${PLATFORM} -f Dockerfile --manifest=${SYNCER_IMG_TAG} . push-syncer-image: diff --git a/README.md b/README.md index 6770c23..6a403da 100644 --- a/README.md +++ b/README.md @@ -18,25 +18,8 @@ Render a jsonnet file is as simple as executing `jsonnet `. Th A makefile has been included to automate jsonnet formatting and rendering tasks. Executing `make` downloads the jsonnet binary and renders the templates at the *rendered* directory. i.e. 
- ``` $ make -mkdir -p bin rendered tmp -git clone --depth 1 https://github.com/grafana/grafonnet-lib.git templates/grafonnet-lib -Cloning into 'templates/grafonnet-lib'... -Downloading jsonnet binary -curl -s -L https://github.com/google/go-jsonnet/releases/download/v0.20.0/go-jsonnet_0.20.0_Linux_x86_64.tar.gz | tar xz -C bin -Formating template templates/ocp-performance.jsonnet -bin/jsonnetfmt templates/ocp-performance.jsonnet > tmp/ocp-performance.jsonnet -mv tmp/ocp-performance.jsonnet templates/ocp-performance.jsonnet -Building template templates/ocp-performance.jsonnet -bin/jsonnet templates/ocp-performance.jsonnet > rendered/ocp-performance.json -$ ls rendered -ocp-ingress-controller.json ocp-performance.json -``` -Similarly for V2, the dashboards that are built using latest grafonnet library, use -``` -$ make v2 mkdir -p bin rendered Downloading jsonnet binary curl -s -L https://github.com/google/go-jsonnet/releases/download/v0.20.0/go-jsonnet_0.20.0_Linux_x86_64.tar.gz | tar xz -C bin @@ -53,10 +36,7 @@ bin/jsonnetfmt -i templates/General/ocp-performance-v2.jsonnet Building template templates/General/ocp-performance-v2.jsonnet mkdir -p rendered/General/ bin/jsonnet -J ./templates/vendor templates/General/ocp-performance-v2.jsonnet > rendered/General/ocp-performance-v2.json -Rendered the v2 dashboards with latest grafonnet library ``` -Rest all operations remain same as before. - In order to clean up the environment execute `make clean`. In order to lint the templates using `jsonnetfmt`execute `make format` @@ -73,6 +53,7 @@ Dashboards Available after Migration to Grafonnet v10.1.0(latest): - CPT - [x] Ingress Perf Dashboard. - [x] K8s Netperf Dashboard. + - [x] Kube-burner Report Mode Dashboard. - [x] Kube Burner Report OCP Wrapper dashboard. - General - [x] API Performance Dashboard. @@ -80,7 +61,6 @@ Dashboards Available after Migration to Grafonnet v10.1.0(latest): - [x] Etcd Dashboard. - [x] Hypershift Performance Dashboard. 
- [x] K8s Performance Dashboard. - - [ ] Kube Burner Dashboard. - [x] OpenShift Performance Dashboard. - [x] OVN Dashboard. - [x] Pgbench Dashboard. diff --git a/templates/CPT/ingress-perf-v2.jsonnet b/templates/CPT/ingress-perf.jsonnet similarity index 100% rename from templates/CPT/ingress-perf-v2.jsonnet rename to templates/CPT/ingress-perf.jsonnet diff --git a/templates/CPT/k8s-netperf-v2.jsonnet b/templates/CPT/k8s-netperf.jsonnet similarity index 100% rename from templates/CPT/k8s-netperf-v2.jsonnet rename to templates/CPT/k8s-netperf.jsonnet diff --git a/templates/CPT/kube-burner-report-mode-v2.jsonnet b/templates/CPT/kube-burner-report-mode.jsonnet similarity index 100% rename from templates/CPT/kube-burner-report-mode-v2.jsonnet rename to templates/CPT/kube-burner-report-mode.jsonnet diff --git a/templates/CPT/kube-burner-report-ocp-wrapper-v2.jsonnet b/templates/CPT/kube-burner-report-ocp-wrapper.jsonnet similarity index 100% rename from templates/CPT/kube-burner-report-ocp-wrapper-v2.jsonnet rename to templates/CPT/kube-burner-report-ocp-wrapper.jsonnet diff --git a/templates/General/api-performance-overview-v2.jsonnet b/templates/General/api-performance-overview-v2.jsonnet deleted file mode 100644 index ca835c7..0000000 --- a/templates/General/api-performance-overview-v2.jsonnet +++ /dev/null @@ -1,50 +0,0 @@ -local panels = import '../../assets/api-performance-overview/panels.libsonnet'; -local queries = import '../../assets/api-performance-overview/queries.libsonnet'; -local variables = import '../../assets/api-performance-overview/variables.libsonnet'; -local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; - -g.dashboard.new('API Performance Dashboard') -+ g.dashboard.withDescription(||| - Dashboard for Api-performance-overview -|||) -+ g.dashboard.withTags('Api-performance') -+ g.dashboard.time.withFrom('now-1h') -+ g.dashboard.time.withTo('now') -+ g.dashboard.withTimezone('utc') -+ 
g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) -+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) -+ g.dashboard.withRefresh('30s') -+ g.dashboard.withEditable(false) -+ g.dashboard.graphTooltip.withSharedCrosshair() -+ g.dashboard.withVariables([ - variables.Datasource, - variables.apiserver, - variables.instance, - variables.resource, - variables.code, - variables.verb, - variables.flowSchema, - variables.priorityLevel, - variables.interval, -]) -+ g.dashboard.withPanels([ - panels.timeSeries.legendRightPlacement('request duration - 99th quantile', 'short', queries.request_duration_99th_quantile.query(), { x: 0, y: 0, w: 12, h: 8 }), - panels.timeSeries.legendRightPlacement('request rate - by instance', 'short', queries.requestRateByInstance.query(), { x: 12, y: 0, w: 12, h: 8 }), - panels.timeSeries.legendRightPlacement('request duration - 99th quantile - by resource', 'short', queries.requestDuarationByResource.query(), { x: 0, y: 8, w: 12, h: 8 }), - panels.timeSeries.legendRightPlacement('request duration - 99th quantile', 'short', queries.requestDurationBy99Quatile.query(), { x: 12, y: 8, w: 12, h: 8 }), - panels.timeSeries.legendBottomPlacement('request duration - read vs write', 'short', queries.requestDurationReadWrite.query(), { x: 0, y: 16, w: 12, h: 8 }), - panels.timeSeries.legendBottomPlacement('request rate - read vs write', 'short', queries.requestRateReadWrite.query(), { x: 12, y: 16, w: 12, h: 8 }), - panels.timeSeries.legendBottomPlacement('requests dropped rate', 'short', queries.requestRateDropped.query(), { x: 0, y: 24, w: 12, h: 8 }), - panels.timeSeries.legendBottomPlacement('requests terminated rate', 'short', queries.requestRateTerminated.query(), { x: 12, y: 24, w: 12, h: 8 }), - panels.timeSeries.legendRightPlacement('requests status rate', 'short', queries.requestRateStatus.query(), { x: 0, y: 32, w: 12, h: 8 }), - 
panels.timeSeries.legendRightPlacement('long running requests', 'short', queries.requestsLongRunning.query(), { x: 12, y: 32, w: 12, h: 8 }), - panels.timeSeries.legendRightPlacement('request in flight', 'short', queries.requestInFlight.query(), { x: 0, y: 40, w: 12, h: 8 }), - panels.timeSeries.legendRightPlacement('p&f - requests rejected', 'short', queries.requestRejectPandF.query(), { x: 12, y: 40, w: 12, h: 8 }), - panels.timeSeries.legendRightPlacement('response size - 99th quantile', 'short', queries.responseSize99Quatile.query(), { x: 0, y: 48, w: 12, h: 8 }), - panels.timeSeries.legendRightPlacement('p&f - request queue length', 'short', queries.requestQueueLengthPandF.query(), { x: 12, y: 48, w: 12, h: 8 }), - panels.timeSeries.withRequestWaitDurationAggregations('p&f - request wait duration - 99th quantile', 'short', queries.requestWaitDuration99QuatilePandF.query(), { x: 0, y: 56, w: 24, h: 8 }), - panels.timeSeries.legendRightPlacement('p&f - request dispatch rate', 'short', queries.requestDispatchRatePandF.query(), { x: 0, y: 64, w: 12, h: 8 }), - panels.timeSeries.legendRightPlacement('p&f - request execution duration', 'short', queries.requestExecutionDurationPandF.query(), { x: 12, y: 64, w: 12, h: 8 }), - panels.timeSeries.legendRightPlacement('p&f - pending in queue', 'short', queries.pendingInQueuePandF.query(), { x: 0, y: 72, w: 12, h: 8 }), - panels.timeSeries.legendRightPlacement('p&f - concurrency limit by kube-apiserver', 'short', queries.concurrencyLimitByKubeapiserverPandF.query(), { x: 12, y: 72, w: 12, h: 8 }), -]) diff --git a/templates/General/api-performance-overview.jsonnet b/templates/General/api-performance-overview.jsonnet index 246e9ff..ca835c7 100644 --- a/templates/General/api-performance-overview.jsonnet +++ b/templates/General/api-performance-overview.jsonnet @@ -1,482 +1,50 @@ -local grafana = import '../grafonnet-lib/grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; - -//Panel definitions - -local 
request_duration_99th_quantile = grafana.graphPanel.new( - title='request duration - 99th quantile', - datasource='$datasource', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"$apiserver",instance=~"$instance",resource=~"$resource",subresource!="log",verb!~"WATCH|WATCHLIST|PROXY"}[$interval])) by(verb,le))', - legendFormat='{{verb}}', - ) -); - -local request_rate_by_instance = grafana.graphPanel.new( - title='request rate - by instance', - datasource='$datasource', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(rate(apiserver_request_total{apiserver=~"$apiserver",instance=~"$instance",resource=~"$resource",code=~"$code",verb=~"$verb"}[$interval])) by(instance)', - legendFormat='{{instance}}', - ) -); - -local request_duration_99th_quantile_by_resource = grafana.graphPanel.new( - title='request duration - 99th quantile - by resource', - datasource='$datasource', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"$apiserver",instance=~"$instance",resource=~"$resource",subresource!="log",verb!~"WATCH|WATCHLIST|PROXY"}[$interval])) by(resource,le))', - legendFormat='{{resource}}', - ) -); - -local request_rate_by_resource = grafana.graphPanel.new( - title='request duration - 99th quantile', - datasource='$datasource', - 
legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(rate(apiserver_request_total{apiserver=~"$apiserver",instance=~"$instance",resource=~"$resource",code=~"$code",verb=~"$verb"}[$interval])) by(resource)', - legendFormat='{{resource}}', - ) -); - -local request_duration_read_write = grafana.graphPanel.new( - title='request duration - read vs write', - datasource='$datasource', -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"$apiserver",instance=~"$instance",resource=~"$resource",verb=~"LIST|GET"}[$interval])) by(le))', - legendFormat='read', - ) -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"$apiserver",instance=~"$instance",resource=~"$resource",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[$interval])) by(le))', - legendFormat='write', - ) -); - - -local request_rate_read_write = grafana.graphPanel.new( - title='request rate - read vs write', - datasource='$datasource', -).addTarget( - prometheus.target( - 'sum(rate(apiserver_request_total{apiserver=~"$apiserver",instance=~"$instance",resource=~"$resource",verb=~"LIST|GET"}[$interval]))', - legendFormat='read', - ) -).addTarget( - prometheus.target( - 'sum(rate(apiserver_request_total{apiserver=~"$apiserver",instance=~"$instance",resource=~"$resource",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[$interval]))', - legendFormat='write', - ) -); - - -local requests_dropped_rate = grafana.graphPanel.new( - title='requests dropped rate', - datasource='$datasource', - description='Number of requests dropped with "Try again later" response', -).addTarget( - prometheus.target( - 'sum(rate(apiserver_dropped_requests_total{instance=~"$instance"}[$interval])) by (requestKind)', - ) -); - - -local 
requests_terminated_rate = grafana.graphPanel.new( - title='requests terminated rate', - datasource='$datasource', - description='Number of requests which apiserver terminated in self-defense', -).addTarget( - prometheus.target( - 'sum(rate(apiserver_request_terminations_total{instance=~"$instance",resource=~"$resource",code=~"$code"}[$interval])) by(component)', - ) -); - -local requests_status_rate = grafana.graphPanel.new( - title='requests status rate', - datasource='$datasource', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(rate(apiserver_request_total{apiserver=~"$apiserver",instance=~"$instance",resource=~"$resource",verb=~"$verb",code=~"$code"}[$interval])) by(code)', - legendFormat='{{code}}' - ) -); - -local long_running_requests = grafana.graphPanel.new( - title='long running requests', - datasource='$datasource', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(apiserver_longrunning_gauge{instance=~"$instance",resource=~"$resource",verb=~"$verb"}) by(instance)', - legendFormat='{{instance}}' - ) -); - -local request_in_flight = grafana.graphPanel.new( - title='request in flight', - datasource='$datasource', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(apiserver_current_inflight_requests{instance=~"$instance"}) by (instance,requestKind)', - legendFormat='{{requestKind}}-{{instance}}', - ) -); - -local pf_requests_rejected = grafana.graphPanel.new( - title='p&f - requests rejected', 
- datasource='$datasource', - description='Number of requests rejected by API Priority and Fairness system', -).addTarget( - prometheus.target( - 'sum(rate(apiserver_flowcontrol_rejected_requests_total{instance=~"$instance",flowSchema=~"$flowSchema",priorityLevel=~"$priorityLevel"}[$interval])) by (reason)', - ) -); - -local response_size_99th_quartile = grafana.graphPanel.new( - title='response size - 99th quantile', - datasource='$datasource', - description='Response size distribution in bytes for each group, version, verb, resource, subresource, scope and component', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_response_sizes_bucket{instance=~"$instance",resource=~"$resource",verb=~"$verb"}[$interval])) by(instance,le))', - legendFormat='{{instance}}', - ) -); - -local pf_request_queue_length = grafana.graphPanel.new( - title='p&f - request queue length', - datasource='$datasource', - description='Length of queue in the API Priority and Fairness system, as seen by each request after it is enqueued', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_queue_length_after_enqueue_bucket{instance=~"$instance",flowSchema=~"$flowSchema",priorityLevel=~"$priorityLevel"}[$interval])) by(flowSchema, priorityLevel, le))', - legendFormat='{{flowSchema}}:{{priorityLevel}}', - ) -); - -local pf_request_wait_duration_99th_quartile = grafana.graphPanel.new( - title='p&f - request wait duration - 99th quantile', - datasource='$datasource', - description='Length of time a request spent waiting in its 
queue', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - legend_max=true, - legend_avg=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_wait_duration_seconds_bucket{instance=~"$instance"}[5m])) by(flow_schema, priority_level, le))', - ) -); - -local pf_request_execution_duration = grafana.graphPanel.new( - title='p&f - request execution duration', - datasource='$datasource', - description='Duration of request execution in the API Priority and Fairness system', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_execution_seconds_bucket{instance=~"$instance",flowSchema=~"$flowSchema",priorityLevel=~"$priorityLevel"}[$interval])) by(flowSchema, priorityLevel, le))', - legendFormat='{{flowSchema}}:{{priorityLevel}}', - ) -); - -local pf_request_dispatch_rate = grafana.graphPanel.new( - title='p&f - request dispatch rate', - datasource='$datasource', - description='Number of requests released by API Priority and Fairness system for service', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(rate(apiserver_flowcontrol_dispatched_requests_total{instance=~"$instance",flowSchema=~"$flowSchema",priorityLevel=~"$priorityLevel"}[$interval])) by(flowSchema,priorityLevel)', - legendFormat='{{flowSchema}}:{{priorityLevel}}', - ) -); - -local pf_concurrency_limit = grafana.graphPanel.new( - title='p&f - concurrency limit by kube-apiserver', 
- datasource='$datasource', - description='Shared concurrency limit in the API Priority and Fairness system', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, -).addTarget( - prometheus.target( - 'sum(apiserver_flowcontrol_request_concurrency_limit{instance=~".*:6443",priorityLevel=~"$priorityLevel"}) by (instance,priorityLevel)', - ) -); - -local pf_pending_in_queue = grafana.graphPanel.new( - title='p&f - pending in queue', - datasource='$datasource', - description='Number of requests currently pending in queues of the API Priority and Fairness system', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(apiserver_flowcontrol_current_inqueue_requests{instance=~"$instance",flowSchema=~"$flowSchema",priorityLevel=~"$priorityLevel"}) by (flowSchema,priorityLevel)', - legendFormat='{{flowSchema}}:{{priorityLevel}}', - ) -); - -//Dashboard + Templates - -grafana.dashboard.new( - 'API Performance', - description='', - timezone='utc', - time_from='now-1h', - refresh='30s', - editable='true', -) - -.addTemplate( - grafana.template.datasource( - 'datasource', - 'prometheus', - '', - label='datasource' - ) -) - -.addTemplate( - grafana.template.new( - 'apiserver', - '$datasource', - 'label_values(apiserver_request_duration_seconds_bucket, apiserver)', - refresh='time', - label='apiserver' - ) { - type: 'query', - multi: false, - includeAll: true, - } -) - -.addTemplate( - grafana.template.new( - 'instance', - '$datasource', - 'label_values(apiserver_request_total, instance)', - refresh='time', - label='instance' - ) { - type: 'query', - multi: false, - includeAll: true, - } -) - -.addTemplate( - grafana.template.new( - 'resource', - '$datasource', - 
'label_values(apiserver_request_duration_seconds_bucket, resource)', - refresh='time', - label='resource' - ) { - type: 'query', - multi: false, - includeAll: true, - }, -) - -.addTemplate( - grafana.template.new( - 'code', - '$datasource', - 'label_values(code)', - refresh='time', - label='code', - ) { - type: 'query', - multi: false, - includeAll: true, - }, -) - - -.addTemplate( - grafana.template.new( - 'verb', - '$datasource', - 'label_values(verb)', - refresh='time', - label='verb', - ) { - type: 'query', - multi: false, - includeAll: true, - }, -) - - -.addTemplate( - grafana.template.new( - 'flowSchema', - '$datasource', - 'label_values(flowSchema)', - refresh='time', - label='flow-schema' - ) { - type: 'query', - multi: false, - includeAll: true, - }, -) - -.addTemplate( - grafana.template.new( - 'priorityLevel', - '$datasource', - 'label_values(priorityLevel)', - refresh='time', - label='priority-level' - ) { - type: 'query', - multi: false, - includeAll: true, - }, -) - -.addTemplate( - grafana.template.new( - 'interval', - '$datasource', - '$__auto_interval_period', - label='interval', - refresh='time', - ) { - type: 'interval', - query: '1m,5m', - multi: false, - includeAll: true, - auto: true, - auto_count: 30, - auto_min: '10s', - }, -) - -.addPanel(request_duration_99th_quantile, gridPos={ x: 0, y: 0, w: 12, h: 8 }) -.addPanel(request_rate_by_instance, gridPos={ x: 12, y: 0, w: 12, h: 8 }) -.addPanel(request_duration_99th_quantile_by_resource, gridPos={ x: 0, y: 8, w: 12, h: 8 }) -.addPanel(request_rate_by_resource, gridPos={ x: 12, y: 8, w: 12, h: 8 }) -.addPanel(request_duration_read_write, gridPos={ x: 0, y: 16, w: 12, h: 8 }) -.addPanel(request_rate_read_write, gridPos={ x: 12, y: 16, w: 12, h: 8 }) -.addPanel(requests_dropped_rate, gridPos={ x: 0, y: 24, w: 12, h: 8 }) -.addPanel(requests_terminated_rate, gridPos={ x: 12, y: 24, w: 12, h: 8 }) -.addPanel(requests_status_rate, gridPos={ x: 0, y: 32, w: 12, h: 8 }) 
-.addPanel(long_running_requests, gridPos={ x: 12, y: 32, w: 12, h: 8 }) -.addPanel(request_in_flight, gridPos={ x: 0, y: 40, w: 12, h: 8 }) -.addPanel(pf_requests_rejected, gridPos={ x: 12, y: 40, w: 12, h: 8 }) -.addPanel(response_size_99th_quartile, gridPos={ x: 0, y: 48, w: 12, h: 8 }) -.addPanel(pf_request_queue_length, gridPos={ x: 12, y: 48, w: 12, h: 8 }) -.addPanel(pf_request_wait_duration_99th_quartile, gridPos={ x: 0, y: 56, w: 24, h: 8 }) -.addPanel(pf_request_execution_duration, gridPos={ x: 12, y: 56, w: 12, h: 8 }) -.addPanel(pf_request_dispatch_rate, gridPos={ x: 0, y: 64, w: 12, h: 8 }) -.addPanel(pf_concurrency_limit, gridPos={ x: 12, y: 64, w: 12, h: 8 }) -.addPanel(pf_pending_in_queue, gridPos={ x: 0, y: 72, w: 12, h: 8 }) +local panels = import '../../assets/api-performance-overview/panels.libsonnet'; +local queries = import '../../assets/api-performance-overview/queries.libsonnet'; +local variables = import '../../assets/api-performance-overview/variables.libsonnet'; +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +g.dashboard.new('API Performance Dashboard') ++ g.dashboard.withDescription(||| + Dashboard for Api-performance-overview +|||) ++ g.dashboard.withTags('Api-performance') ++ g.dashboard.time.withFrom('now-1h') ++ g.dashboard.time.withTo('now') ++ g.dashboard.withTimezone('utc') ++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) ++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) ++ g.dashboard.withRefresh('30s') ++ g.dashboard.withEditable(false) ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ g.dashboard.withVariables([ + variables.Datasource, + variables.apiserver, + variables.instance, + variables.resource, + variables.code, + variables.verb, + variables.flowSchema, + variables.priorityLevel, + variables.interval, +]) ++ g.dashboard.withPanels([ + 
panels.timeSeries.legendRightPlacement('request duration - 99th quantile', 'short', queries.request_duration_99th_quantile.query(), { x: 0, y: 0, w: 12, h: 8 }), + panels.timeSeries.legendRightPlacement('request rate - by instance', 'short', queries.requestRateByInstance.query(), { x: 12, y: 0, w: 12, h: 8 }), + panels.timeSeries.legendRightPlacement('request duration - 99th quantile - by resource', 'short', queries.requestDuarationByResource.query(), { x: 0, y: 8, w: 12, h: 8 }), + panels.timeSeries.legendRightPlacement('request duration - 99th quantile', 'short', queries.requestDurationBy99Quatile.query(), { x: 12, y: 8, w: 12, h: 8 }), + panels.timeSeries.legendBottomPlacement('request duration - read vs write', 'short', queries.requestDurationReadWrite.query(), { x: 0, y: 16, w: 12, h: 8 }), + panels.timeSeries.legendBottomPlacement('request rate - read vs write', 'short', queries.requestRateReadWrite.query(), { x: 12, y: 16, w: 12, h: 8 }), + panels.timeSeries.legendBottomPlacement('requests dropped rate', 'short', queries.requestRateDropped.query(), { x: 0, y: 24, w: 12, h: 8 }), + panels.timeSeries.legendBottomPlacement('requests terminated rate', 'short', queries.requestRateTerminated.query(), { x: 12, y: 24, w: 12, h: 8 }), + panels.timeSeries.legendRightPlacement('requests status rate', 'short', queries.requestRateStatus.query(), { x: 0, y: 32, w: 12, h: 8 }), + panels.timeSeries.legendRightPlacement('long running requests', 'short', queries.requestsLongRunning.query(), { x: 12, y: 32, w: 12, h: 8 }), + panels.timeSeries.legendRightPlacement('request in flight', 'short', queries.requestInFlight.query(), { x: 0, y: 40, w: 12, h: 8 }), + panels.timeSeries.legendRightPlacement('p&f - requests rejected', 'short', queries.requestRejectPandF.query(), { x: 12, y: 40, w: 12, h: 8 }), + panels.timeSeries.legendRightPlacement('response size - 99th quantile', 'short', queries.responseSize99Quatile.query(), { x: 0, y: 48, w: 12, h: 8 }), + 
panels.timeSeries.legendRightPlacement('p&f - request queue length', 'short', queries.requestQueueLengthPandF.query(), { x: 12, y: 48, w: 12, h: 8 }), + panels.timeSeries.withRequestWaitDurationAggregations('p&f - request wait duration - 99th quantile', 'short', queries.requestWaitDuration99QuatilePandF.query(), { x: 0, y: 56, w: 24, h: 8 }), + panels.timeSeries.legendRightPlacement('p&f - request dispatch rate', 'short', queries.requestDispatchRatePandF.query(), { x: 0, y: 64, w: 12, h: 8 }), + panels.timeSeries.legendRightPlacement('p&f - request execution duration', 'short', queries.requestExecutionDurationPandF.query(), { x: 12, y: 64, w: 12, h: 8 }), + panels.timeSeries.legendRightPlacement('p&f - pending in queue', 'short', queries.pendingInQueuePandF.query(), { x: 0, y: 72, w: 12, h: 8 }), + panels.timeSeries.legendRightPlacement('p&f - concurrency limit by kube-apiserver', 'short', queries.concurrencyLimitByKubeapiserverPandF.query(), { x: 12, y: 72, w: 12, h: 8 }), +]) diff --git a/templates/General/cilium-k8s-perf-v2.jsonnet b/templates/General/cilium-k8s-perf-v2.jsonnet deleted file mode 100644 index 67045cb..0000000 --- a/templates/General/cilium-k8s-perf-v2.jsonnet +++ /dev/null @@ -1,70 +0,0 @@ -local panels = import '../../assets/cilium-k8s-perf/panels.libsonnet'; -local queries = import '../../assets/cilium-k8s-perf/queries.libsonnet'; -local variables = import '../../assets/cilium-k8s-perf/variables.libsonnet'; -local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; - -g.dashboard.new('Cilium k8s Performance dashboard') -+ g.dashboard.time.withFrom('now-1h') -+ g.dashboard.time.withTo('now') -+ g.dashboard.withTimezone('utc') -+ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) -+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) -+ g.dashboard.withRefresh('30s') -+ g.dashboard.withEditable(false) -+ 
g.dashboard.graphTooltip.withSharedCrosshair() -+ g.dashboard.withVariables([ - variables.Datasource, - variables._worker_node, - variables.namespace, - variables.block_device, - variables.net_device, - variables.interval, -]) - -+ g.dashboard.withPanels([ - g.panel.row.new('Cilium Details') - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.timeSeries.withCiliumAgg('Cilium Controller Failures', 'none', queries.ciliumControllerFailures.query(), { x: 0, y: 1, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Cilium IP Address Allocation', 'none', queries.ciliumIPAddressAllocation.query(), { x: 12, y: 1, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Cilium Container CPU', 'percent', queries.ciliumContainerCPU.query(), { x: 0, y: 9, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Cilium Container Memory', 'bytes', queries.ciliumConatinerMemory.query(), { x: 12, y: 9, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Cilium Network Polices Per Agent', 'none', queries.ciliumNetworkPolicesPerAgent.query(), { x: 0, y: 17, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Cilium BPF Operations', 'none', queries.ciliumBPFOperations.query(), { x: 12, y: 17, w: 12, h: 8 }), - ]), - g.panel.row.new('Cluster Details') - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.stat.withclusterAgg('Current Node Count', 'none', queries.currentNodeCount.query(), { x: 0, y: 26, w: 8, h: 3 }), - panels.stat.withclusterAgg('Current namespace Count', 'none', queries.currentNamespaceCount.query(), { x: 8, y: 26, w: 8, h: 3 }), - panels.stat.withclusterAgg('Current Pod Count', 'none', queries.currentPodCount.query(), { x: 16, y: 26, w: 8, h: 3 }), - panels.timeSeries.withClusterAgg('Number of nodes', 'none', queries.numberOfNodes.query(), { x: 0, y: 29, w: 8, h: 8 }), - panels.timeSeries.withClusterAgg('Namespace count', 
'none', queries.namespaceCount.query(), { x: 8, y: 29, w: 8, h: 8 }), - panels.timeSeries.withClusterAgg('Pod count', 'none', queries.podCount.query(), { x: 16, y: 29, w: 8, h: 8 }), - panels.timeSeries.withClusterAgg('Secret & configmap count', 'none', queries.secretConfigmapCount.query(), { x: 0, y: 37, w: 8, h: 8 }), - panels.timeSeries.withClusterAgg('Deployment count', 'none', queries.deploymentCount.query(), { x: 8, y: 37, w: 8, h: 8 }), - panels.timeSeries.withClusterAgg('Services count', 'none', queries.serviceCount.query(), { x: 16, y: 37, w: 8, h: 8 }), - panels.timeSeries.withCiliumAgg('Top 10 container RSS', 'bytes', queries.top10ContainerRSS.query(), { x: 0, y: 45, w: 24, h: 8 }), - panels.timeSeries.withCiliumAgg('Top 10 container CPU', 'percent', queries.top10ContainerCPU.query(), { x: 0, y: 53, w: 12, h: 8 }), - panels.timeSeries.withClusterAgg('Goroutines count', 'none', queries.goroutinesCount.query(), { x: 12, y: 53, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Pod Distribution', 'none', queries.podDistribution.query(), { x: 0, y: 61, w: 24, h: 8 }), - ]), - g.panel.row.new('Node: $_worker_node') - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withRepeat('_worker_node') - + g.panel.row.withPanels([ - panels.timeSeries.withCiliumAgg('CPU Basic: $_worker_node', 'percent', queries.CPUBasic.query(), { x: 0, y: 70, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('System Memory: $_worker_node', 'bytes', queries.systemMemory.query(), { x: 12, y: 70, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Disk throughput: $_worker_node', 'Bps', queries.diskThroughput.query(), { x: 0, y: 78, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Disk IOPS: $_worker_node', 'iops', queries.diskIOPS.query(), { x: 12, y: 78, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Network Utilization: $_worker_node', 'bps', queries.networkUtilization.query(), { x: 0, y: 86, w: 12, h: 8 }), - 
panels.timeSeries.withCiliumAgg('Network Packets: $_worker_node', 'pps', queries.networkPackets.query(), { x: 12, y: 86, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Network packets drop: $_worker_node', 'pps', queries.networkPacketDrop.query(), { x: 0, y: 94, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Conntrack stats: $_worker_node', '', queries.conntrackStats.query(), { x: 12, y: 94, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Top 10 container CPU: $_worker_node', 'percent', queries.top10ContainerCPUNode.query(), { x: 0, y: 102, w: 12, h: 8 }), - panels.timeSeries.withCiliumAgg('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainerRSSNode.query(), { x: 12, y: 102, w: 12, h: 8 }), - ]), -]) diff --git a/templates/General/cilium-k8s-perf.jsonnet b/templates/General/cilium-k8s-perf.jsonnet index 90c21f0..67045cb 100644 --- a/templates/General/cilium-k8s-perf.jsonnet +++ b/templates/General/cilium-k8s-perf.jsonnet @@ -1,555 +1,70 @@ -local grafana = import '../grafonnet-lib/grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; - - -// Helper functions - -local genericGraphPanel(title, format) = grafana.graphPanel.new( - title=title, - datasource='$datasource', - format=format, - nullPointMode='null as zero', - sort='decreasing', - legend_alignAsTable=true, -); - -local genericGraphLegendPanel(title, format) = grafana.graphPanel.new( - title=title, - datasource='$datasource', - format=format, - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='min', - nullPointMode='null as zero', - sort='decreasing', -); - - -local nodeMemory(nodeName) = genericGraphLegendPanel('System Memory: ' + nodeName, 'bytes').addTarget( - prometheus.target( - 'node_memory_Active_bytes{node=~"' + nodeName + '"}', - legendFormat='Active', - ) -).addTarget( - prometheus.target( - 'node_memory_MemTotal_bytes{node=~"' + nodeName + '"}', - 
legendFormat='Total', - ) -).addTarget( - prometheus.target( - 'node_memory_Cached_bytes{node=~"' + nodeName + '"} + node_memory_Buffers_bytes{node=~"' + nodeName + '"}', - legendFormat='Cached + Buffers', - ) -).addTarget( - prometheus.target( - 'node_memory_MemAvailable_bytes{node=~"' + nodeName + '"}', - legendFormat='Available', - ) -); - - -local nodeCPU(nodeName) = genericGraphLegendPanel('CPU Basic: ' + nodeName, 'percent').addTarget( - prometheus.target( - 'sum by (instance, mode)(rate(node_cpu_seconds_total{node=~"' + nodeName + '",job=~".*"}[$interval])) * 100', - legendFormat='Busy {{mode}}', - ) -); - - -local diskThroughput(nodeName) = genericGraphLegendPanel('Disk throughput: ' + nodeName, 'Bps').addTarget( - prometheus.target( - 'rate(node_disk_read_bytes_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', - legendFormat='{{ device }} - read', - ) -).addTarget( - prometheus.target( - 'rate(node_disk_written_bytes_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', - legendFormat='{{ device }} - write', - ) -); - -local diskIOPS(nodeName) = genericGraphLegendPanel('Disk IOPS: ' + nodeName, 'iops').addTarget( - prometheus.target( - 'rate(node_disk_reads_completed_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', - legendFormat='{{ device }} - read', - ) -).addTarget( - prometheus.target( - 'rate(node_disk_writes_completed_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', - legendFormat='{{ device }} - write', - ) -); - -local networkUtilization(nodeName) = genericGraphLegendPanel('Network Utilization: ' + nodeName, 'bps').addTarget( - prometheus.target( - 'rate(node_network_receive_bytes_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval]) * 8', - legendFormat='{{instance}} - {{device}} - RX', - ) -).addTarget( - prometheus.target( - 'rate(node_network_transmit_bytes_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval]) * 8', - 
legendFormat='{{instance}} - {{device}} - TX', - ) -); - -local networkPackets(nodeName) = genericGraphLegendPanel('Network Packets: ' + nodeName, 'pps').addTarget( - prometheus.target( - 'rate(node_network_receive_packets_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval])', - legendFormat='{{instance}} - {{device}} - RX', - ) -).addTarget( - prometheus.target( - 'rate(node_network_transmit_packets_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval])', - legendFormat='{{instance}} - {{device}} - TX', - ) -); - -local networkDrop(nodeName) = genericGraphLegendPanel('Network packets drop: ' + nodeName, 'pps').addTarget( - prometheus.target( - 'topk(10, rate(node_network_receive_drop_total{node=~"' + nodeName + '"}[$interval]))', - legendFormat='rx-drop-{{ device }}', - ) -).addTarget( - prometheus.target( - 'topk(10,rate(node_network_transmit_drop_total{node=~"' + nodeName + '"}[$interval]))', - legendFormat='tx-drop-{{ device }}', - ) -); - -local conntrackStats(nodeName) = genericGraphLegendPanel('Conntrack stats: ' + nodeName, '') - { - seriesOverrides: [{ - alias: 'conntrack_limit', - yaxis: 2, - }], - yaxes: [{ show: true }, { show: true }], -} - .addTarget( - prometheus.target( - 'node_nf_conntrack_entries{node=~"' + nodeName + '"}', - legendFormat='conntrack_entries', - ) -).addTarget( - prometheus.target( - 'node_nf_conntrack_entries_limit{node=~"' + nodeName + '"}', - legendFormat='conntrack_limit', - ) -); - -local top10ContainerCPU(nodeName) = genericGraphLegendPanel('Top 10 container CPU: ' + nodeName, 'percent').addTarget( - prometheus.target( - 'topk(10, sum(irate(container_cpu_usage_seconds_total{container!="POD",name!="",instance=~"' + nodeName + '",namespace!="",namespace=~"$namespace"}[$interval])) by (pod,container,namespace,name,service) * 100)', - legendFormat='{{ pod }}: {{ container }}', - ) -); - -local top10ContainerRSS(nodeName) = genericGraphLegendPanel('Top 10 container RSS: ' + nodeName, 'bytes').addTarget( - 
prometheus.target( - 'topk(10, container_memory_rss{container!="POD",name!="",instance=~"' + nodeName + '",namespace!="",namespace=~"$namespace"})', - legendFormat='{{ pod }}: {{ container }}', - ) -); - -local containerWriteBytes(nodeName) = genericGraphLegendPanel('Container fs write rate: ' + nodeName, 'Bps').addTarget( - prometheus.target( - 'sum(rate(container_fs_writes_bytes_total{device!~".+dm.+", node=~"' + nodeName + '", container!=""}[$interval])) by (device, container)', - legendFormat='{{ container }}: {{ device }}', - ) -); - -// Individual panel definitions - -// Monitoring Stack - -local promReplMemUsage = genericGraphLegendPanel('Prometheus Replica Memory usage', 'bytes').addTarget( - prometheus.target( - 'sum(container_memory_rss{pod="prometheus-k8s-1",namespace!="",name!="",container="prometheus"}) by (pod)', - legendFormat='{{pod}}', - ) -).addTarget( - prometheus.target( - 'sum(container_memory_rss{pod="prometheus-k8s-0",namespace!="",name!="",container="prometheus"}) by (pod)', - legendFormat='{{pod}}', - ) -); - -// Kubelet - -local kubeletCPU = genericGraphLegendPanel('Top 10 Kubelet CPU usage', 'percent').addTarget( - prometheus.target( - 'topk(10,rate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[$interval])*100)', - legendFormat='kubelet - {{node}}', - ) -); - -local crioCPU = genericGraphLegendPanel('Top 10 crio CPU usage', 'percent').addTarget( - prometheus.target( - 'topk(10,rate(process_cpu_seconds_total{service="kubelet",job="crio"}[$interval])*100)', - legendFormat='crio - {{node}}', - ) -); - -local kubeletMemory = genericGraphLegendPanel('Top 10 Kubelet memory usage', 'bytes').addTarget( - prometheus.target( - 'topk(10,process_resident_memory_bytes{service="kubelet",job="kubelet"})', - legendFormat='kubelet - {{node}}', - ) -); - -local crioMemory = genericGraphLegendPanel('Top 10 crio memory usage', 'bytes').addTarget( - prometheus.target( - 'topk(10,process_resident_memory_bytes{service="kubelet",job="crio"})', - 
legendFormat='crio - {{node}}', - ) -); - -// Cluster details - -local current_node_count = grafana.statPanel.new( - title='Current Node Count', - datasource='$datasource', - reducerFunction='last', -).addTarget( - prometheus.target( - 'sum(kube_node_info{})', - legendFormat='Number of nodes', - ) -).addTarget( - prometheus.target( - 'sum(kube_node_status_condition{status="true"}) by (condition) > 0', - legendFormat='Node: {{ condition }}', - ) -); - -local current_namespace_count = grafana.statPanel.new( - title='Current namespace Count', - datasource='$datasource', - reducerFunction='last', -).addTarget( - prometheus.target( - 'sum(kube_namespace_status_phase) by (phase)', - legendFormat='{{ phase }}', - ) -); - -local current_pod_count = grafana.statPanel.new( - title='Current Pod Count', - reducerFunction='last', - datasource='$datasource', -).addTarget( - prometheus.target( - 'sum(kube_pod_status_phase{}) by (phase) > 0', - legendFormat='{{ phase}} Pods', - ) -); - -local nodeCount = genericGraphPanel('Number of nodes', 'none').addTarget( - prometheus.target( - 'sum(kube_node_info{})', - legendFormat='Number of nodes', - ) -).addTarget( - prometheus.target( - 'sum(kube_node_status_condition{status="true"}) by (condition) > 0', - legendFormat='Node: {{ condition }}', - ) -); - -local nsCount = genericGraphPanel('Namespace count', 'none').addTarget( - prometheus.target( - 'sum(kube_namespace_status_phase) by (phase) > 0', - legendFormat='{{ phase }} namespaces', - ) -); - -local podCount = genericGraphPanel('Pod count', 'none').addTarget( - prometheus.target( - 'sum(kube_pod_status_phase{}) by (phase)', - legendFormat='{{phase}} pods', - ) -); - -local secretCmCount = genericGraphPanel('Secret & configmap count', 'none').addTarget( - prometheus.target( - 'count(kube_secret_info{})', - legendFormat='secrets', - ) -).addTarget( - prometheus.target( - 'count(kube_configmap_info{})', - legendFormat='Configmaps', - ) -); - -local deployCount = 
genericGraphPanel('Deployment count', 'none').addTarget( - prometheus.target( - 'count(kube_deployment_labels{})', - legendFormat='Deployments', - ) -); - - -local servicesCount = genericGraphPanel('Services count', 'none').addTarget( - prometheus.target( - 'count(kube_service_info{})', - legendFormat='Services', - ) -); - -local alerts = genericGraphPanel('Alerts', 'none').addTarget( - prometheus.target( - 'topk(10,sum(ALERTS{severity!="none"}) by (alertname, severity))', - legendFormat='{{severity}}: {{alertname}}', - ) -); - -local top10ContMem = genericGraphLegendPanel('Top 10 container RSS', 'bytes').addTarget( - prometheus.target( - 'topk(10, container_memory_rss{namespace!="",container!="POD",name!=""})', - legendFormat='{{ namespace }} - {{ name }}', - ) -); - -local podDistribution = genericGraphLegendPanel('Pod Distribution', 'none').addTarget( - prometheus.target( - 'count(kube_pod_info{}) by (exported_node)', - legendFormat='{{ node }}', - ) -); - -local top10ContCPU = genericGraphLegendPanel('Top 10 container CPU', 'percent').addTarget( - prometheus.target( - 'topk(10,irate(container_cpu_usage_seconds_total{namespace!="",container!="POD",name!=""}[$interval])*100)', - legendFormat='{{ namespace }} - {{ name }}', - ) -); - - -local goroutines_count = genericGraphPanel('Goroutines count', 'none').addTarget( - prometheus.target( - 'topk(10, sum(go_goroutines{}) by (job,instance))', - legendFormat='{{ job }} - {{ instance }}', - ) -); - -// Cilium Panels - -local cilium_container_cpu = genericGraphLegendPanel('Cilium Container CPU', 'percent').addTarget( - prometheus.target( - 'sum(irate(container_cpu_usage_seconds_total{container=~"cilium.*",container!="cilium-operator.*",namespace!=""}[$interval])) by (instance,pod,container,namespace,name,service) * 100', - legendFormat='{{ instance }} - {{ pod }}' - ) -); - -local cilium_container_mem = genericGraphLegendPanel('Cilium Container Memory', 'bytes').addTarget( - prometheus.target( - 
'container_memory_rss{container=~"cilium.*",namespace!=""}', - legendFormat='{{ instance }} - {{ pod }}' - ) -); - -local cilium_ip_addresses = genericGraphLegendPanel('Cilium IP Address Allocation', 'none').addTarget( - prometheus.target( - 'cilium_ip_addresses', - legendFormat='{{ pod }} - {{ family }}' - ) -); - -local cilium_netpol = genericGraphLegendPanel('Cilium Network Polices Per Agent', 'none').addTarget( - prometheus.target( - 'cilium_policy', - legendFormat='{{ instance }} - {{ pod }}' - ) -); - -local cilium_bpf_op = genericGraphLegendPanel('Cilium BPF Operations', 'none').addTarget( - prometheus.target( - 'sum by (instance,map_name,operation,outcome)(rate(cilium_bpf_map_ops_total[2m]))', - legendFormat='{{instance}} - {{map_name}} - {{operation}}' - ) -); - -local cilium_failing_control = genericGraphLegendPanel('Cilium Controller Failures', 'none').addTarget( - prometheus.target( - 'cilium_controllers_failing', - legendFormat='{{ instance }} - {{ pod }}' - ) -); - - -// Cluster operators - -local clusterOperatorsOverview = grafana.statPanel.new( - datasource='$datasource', - title='Cluster operators overview', -).addTarget( - prometheus.target( - 'sum by (condition)(cluster_operator_conditions{condition!=""})', - legendFormat='{{ condition }}', - ) -); - -local clusterOperatorsInformation = genericGraphLegendPanel('Cluster operators information', 'none').addTarget( - prometheus.target( - 'cluster_operator_conditions{name!="",reason!=""}', - legendFormat='{{name}} - {{reason}}', - ) -); - -local clusterOperatorsDegraded = genericGraphLegendPanel('Cluster operators degraded', 'none').addTarget( - prometheus.target( - 'cluster_operator_conditions{condition="Degraded",name!="",reason!=""}', - legendFormat='{{name}} - {{reason}}', - ) -); - - -// Dashboard - -grafana.dashboard.new( - 'Cilium k8s Performance', - description='Performance dashboard for k8s w/ Cilium as the CNI', - time_from='now-1h', - timezone='utc', - refresh='30s', - editable='true', -) - 
- -// Templates - -.addTemplate( - grafana.template.datasource( - 'datasource', - 'prometheus', - '', - ) -) - -.addTemplate( - grafana.template.new( - '_worker_node', - '$datasource', - 'label_values(kube_node_labels{}, exported_node)', - '', - refresh=2, - ) { - label: 'Worker', - type: 'query', - multi: true, - includeAll: false, - }, -) - -.addTemplate( - grafana.template.new( - 'namespace', - '$datasource', - 'label_values(kube_pod_info, exported_namespace)', - '', - refresh=2, - ) { - label: 'Namespace', - type: 'query', - multi: false, - includeAll: true, - }, -) - - -.addTemplate( - grafana.template.new( - 'block_device', - '$datasource', - 'label_values(node_disk_written_bytes_total,device)', - '', - regex='/^(?:(?!dm|rb).)*$/', - refresh=2, - ) { - label: 'Block device', - type: 'query', - multi: true, - includeAll: true, - }, -) - - -.addTemplate( - grafana.template.new( - 'net_device', - '$datasource', - 'label_values(node_network_receive_bytes_total,device)', - '', - regex='/^((br|en|et).*)$/', - refresh=2, - ) { - label: 'Network device', - type: 'query', - multi: true, - includeAll: true, - }, -) - -.addTemplate( - grafana.template.new( - 'interval', - '$datasource', - '$__auto_interval_period', - label='interval', - refresh='time', - ) { - type: 'interval', - query: '2m,3m,4m,5m', - auto: false, - }, -) - -// Dashboard definition - -.addPanel(grafana.row.new(title='Cilium Details', collapse=true).addPanels( - [ - cilium_failing_control { gridPos: { x: 0, y: 4, w: 12, h: 8 } }, - cilium_ip_addresses { gridPos: { x: 12, y: 4, w: 12, h: 8 } }, - cilium_container_cpu { gridPos: { x: 0, y: 4, w: 12, h: 8 } }, - cilium_container_mem { gridPos: { x: 12, y: 4, w: 12, h: 8 } }, - cilium_netpol { gridPos: { x: 0, y: 4, w: 12, h: 8 } }, - cilium_bpf_op { gridPos: { x: 12, y: 12, w: 12, h: 8 } }, - ] -), { gridPos: { x: 0, y: 3, w: 24, h: 1 } }) - -.addPanel(grafana.row.new(title='Cluster Details', collapse=true).addPanels( - [ - current_node_count { gridPos: { 
x: 0, y: 4, w: 8, h: 3 } }, - current_namespace_count { gridPos: { x: 8, y: 4, w: 8, h: 3 } }, - current_pod_count { gridPos: { x: 16, y: 4, w: 8, h: 3 } }, - nodeCount { gridPos: { x: 0, y: 12, w: 8, h: 8 } }, - nsCount { gridPos: { x: 8, y: 12, w: 8, h: 8 } }, - podCount { gridPos: { x: 16, y: 12, w: 8, h: 8 } }, - secretCmCount { gridPos: { x: 0, y: 20, w: 8, h: 8 } }, - deployCount { gridPos: { x: 8, y: 20, w: 8, h: 8 } }, - servicesCount { gridPos: { x: 16, y: 20, w: 8, h: 8 } }, - top10ContMem { gridPos: { x: 0, y: 28, w: 24, h: 8 } }, - top10ContCPU { gridPos: { x: 0, y: 36, w: 12, h: 8 } }, - goroutines_count { gridPos: { x: 12, y: 36, w: 12, h: 8 } }, - podDistribution { gridPos: { x: 0, y: 44, w: 24, h: 8 } }, - ] -), { gridPos: { x: 0, y: 3, w: 24, h: 1 } }) - -.addPanel(grafana.row.new(title='Node: $_worker_node', collapse=true, repeat='_worker_node').addPanels( - [ - nodeCPU('$_worker_node') { gridPos: { x: 0, y: 0, w: 12, h: 8 } }, - nodeMemory('$_worker_node') { gridPos: { x: 12, y: 0, w: 12, h: 8 } }, - diskThroughput('$_worker_node') { gridPos: { x: 0, y: 8, w: 12, h: 8 } }, - diskIOPS('$_worker_node') { gridPos: { x: 12, y: 8, w: 12, h: 8 } }, - networkUtilization('$_worker_node') { gridPos: { x: 0, y: 16, w: 12, h: 8 } }, - networkPackets('$_worker_node') { gridPos: { x: 12, y: 16, w: 12, h: 8 } }, - networkDrop('$_worker_node') { gridPos: { x: 0, y: 24, w: 12, h: 8 } }, - conntrackStats('$_worker_node') { gridPos: { x: 12, y: 24, w: 12, h: 8 } }, - top10ContainerCPU('$_worker_node') { gridPos: { x: 0, y: 32, w: 12, h: 8 } }, - top10ContainerRSS('$_worker_node') { gridPos: { x: 12, y: 32, w: 12, h: 8 } }, - ], -), { gridPos: { x: 0, y: 1, w: 0, h: 8 } }) +local panels = import '../../assets/cilium-k8s-perf/panels.libsonnet'; +local queries = import '../../assets/cilium-k8s-perf/queries.libsonnet'; +local variables = import '../../assets/cilium-k8s-perf/variables.libsonnet'; +local g = import 
'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +g.dashboard.new('Cilium k8s Performance dashboard') ++ g.dashboard.time.withFrom('now-1h') ++ g.dashboard.time.withTo('now') ++ g.dashboard.withTimezone('utc') ++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) ++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) ++ g.dashboard.withRefresh('30s') ++ g.dashboard.withEditable(false) ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ g.dashboard.withVariables([ + variables.Datasource, + variables._worker_node, + variables.namespace, + variables.block_device, + variables.net_device, + variables.interval, +]) + ++ g.dashboard.withPanels([ + g.panel.row.new('Cilium Details') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.timeSeries.withCiliumAgg('Cilium Controller Failures', 'none', queries.ciliumControllerFailures.query(), { x: 0, y: 1, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Cilium IP Address Allocation', 'none', queries.ciliumIPAddressAllocation.query(), { x: 12, y: 1, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Cilium Container CPU', 'percent', queries.ciliumContainerCPU.query(), { x: 0, y: 9, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Cilium Container Memory', 'bytes', queries.ciliumConatinerMemory.query(), { x: 12, y: 9, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Cilium Network Polices Per Agent', 'none', queries.ciliumNetworkPolicesPerAgent.query(), { x: 0, y: 17, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Cilium BPF Operations', 'none', queries.ciliumBPFOperations.query(), { x: 12, y: 17, w: 12, h: 8 }), + ]), + g.panel.row.new('Cluster Details') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.stat.withclusterAgg('Current 
Node Count', 'none', queries.currentNodeCount.query(), { x: 0, y: 26, w: 8, h: 3 }), + panels.stat.withclusterAgg('Current namespace Count', 'none', queries.currentNamespaceCount.query(), { x: 8, y: 26, w: 8, h: 3 }), + panels.stat.withclusterAgg('Current Pod Count', 'none', queries.currentPodCount.query(), { x: 16, y: 26, w: 8, h: 3 }), + panels.timeSeries.withClusterAgg('Number of nodes', 'none', queries.numberOfNodes.query(), { x: 0, y: 29, w: 8, h: 8 }), + panels.timeSeries.withClusterAgg('Namespace count', 'none', queries.namespaceCount.query(), { x: 8, y: 29, w: 8, h: 8 }), + panels.timeSeries.withClusterAgg('Pod count', 'none', queries.podCount.query(), { x: 16, y: 29, w: 8, h: 8 }), + panels.timeSeries.withClusterAgg('Secret & configmap count', 'none', queries.secretConfigmapCount.query(), { x: 0, y: 37, w: 8, h: 8 }), + panels.timeSeries.withClusterAgg('Deployment count', 'none', queries.deploymentCount.query(), { x: 8, y: 37, w: 8, h: 8 }), + panels.timeSeries.withClusterAgg('Services count', 'none', queries.serviceCount.query(), { x: 16, y: 37, w: 8, h: 8 }), + panels.timeSeries.withCiliumAgg('Top 10 container RSS', 'bytes', queries.top10ContainerRSS.query(), { x: 0, y: 45, w: 24, h: 8 }), + panels.timeSeries.withCiliumAgg('Top 10 container CPU', 'percent', queries.top10ContainerCPU.query(), { x: 0, y: 53, w: 12, h: 8 }), + panels.timeSeries.withClusterAgg('Goroutines count', 'none', queries.goroutinesCount.query(), { x: 12, y: 53, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Pod Distribution', 'none', queries.podDistribution.query(), { x: 0, y: 61, w: 24, h: 8 }), + ]), + g.panel.row.new('Node: $_worker_node') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withRepeat('_worker_node') + + g.panel.row.withPanels([ + panels.timeSeries.withCiliumAgg('CPU Basic: $_worker_node', 'percent', queries.CPUBasic.query(), { x: 0, y: 70, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('System 
Memory: $_worker_node', 'bytes', queries.systemMemory.query(), { x: 12, y: 70, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Disk throughput: $_worker_node', 'Bps', queries.diskThroughput.query(), { x: 0, y: 78, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Disk IOPS: $_worker_node', 'iops', queries.diskIOPS.query(), { x: 12, y: 78, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Network Utilization: $_worker_node', 'bps', queries.networkUtilization.query(), { x: 0, y: 86, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Network Packets: $_worker_node', 'pps', queries.networkPackets.query(), { x: 12, y: 86, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Network packets drop: $_worker_node', 'pps', queries.networkPacketDrop.query(), { x: 0, y: 94, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Conntrack stats: $_worker_node', '', queries.conntrackStats.query(), { x: 12, y: 94, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Top 10 container CPU: $_worker_node', 'percent', queries.top10ContainerCPUNode.query(), { x: 0, y: 102, w: 12, h: 8 }), + panels.timeSeries.withCiliumAgg('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainerRSSNode.query(), { x: 12, y: 102, w: 12, h: 8 }), + ]), +]) diff --git a/templates/General/etcd-on-cluster-dashboard-v2.jsonnet b/templates/General/etcd-on-cluster-dashboard-v2.jsonnet deleted file mode 100644 index f2d08b3..0000000 --- a/templates/General/etcd-on-cluster-dashboard-v2.jsonnet +++ /dev/null @@ -1,69 +0,0 @@ -local panels = import '../../assets/etcd-on-cluster-dashboard/panels.libsonnet'; -local queries = import '../../assets/etcd-on-cluster-dashboard/queries.libsonnet'; -local variables = import '../../assets/etcd-on-cluster-dashboard/variables.libsonnet'; -local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; - -g.dashboard.new('etcd-cluster-info dashoard') -+ g.dashboard.time.withFrom('now-1h') -+ g.dashboard.time.withTo('now') -+ 
g.dashboard.withTimezone('utc') -+ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) -+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) -+ g.dashboard.withRefresh('') -+ g.dashboard.withEditable(false) -+ g.dashboard.graphTooltip.withSharedCrosshair() -+ g.dashboard.withVariables([ - variables.Datasource, -]) - -+ g.dashboard.withPanels([ - g.panel.row.new('General Resource Usage') - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.timeSeries.generalUsageAgg('CPU usage', 'percent', queries.CPUUsage.query(), { x: 0, y: 1, w: 12, h: 8 }), - panels.timeSeries.generalUsageAgg('Memory usage', 'bytes', queries.memoryUsage.query(), { x: 12, y: 1, w: 12, h: 8 }), - panels.timeSeries.generalUsageAgg('Disk WAL Sync Duration', 's', queries.diskWalSyncDuration.query(), { x: 0, y: 8, w: 12, h: 8 }), - panels.timeSeries.generalUsageAgg('Disk Backend Sync Duration', 's', queries.diskBackendSyncDuration.query(), { x: 12, y: 8, w: 12, h: 8 }), - panels.timeSeries.generalUsageAgg('Etcd container disk writes', 'Bps', queries.etcdContainerDiskWrites.query(), { x: 0, y: 16, w: 12, h: 8 }), - panels.timeSeries.generalUsageAgg('DB Size', 'bytes', queries.dbSize.query(), { x: 12, y: 16, w: 12, h: 8 }), - ]), - - g.panel.row.new('Network Usage') - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.timeSeries.generalUsageAgg('Container network traffic', 'Bps', queries.containerNetworkTraffic.query(), { x: 0, y: 1, w: 12, h: 8 }), - panels.timeSeries.generalUsageAgg('p99 peer to peer latency', 's', queries.p99PeerToPeerLatency.query(), { x: 12, y: 1, w: 12, h: 8 }), - panels.timeSeries.generalUsageAgg('Peer network traffic', 'Bps', queries.peerNetworkTraffic.query(), { x: 0, y: 8, w: 12, h: 8 }), - 
panels.timeSeries.generalUsageAgg('gRPC network traffic', 'Bps', queries.gRPCNetworkTraffic.query(), { x: 12, y: 8, w: 12, h: 8 }), - panels.timeSeries.withoutCalcsAgg('Active Streams', '', queries.activeStreams.query(), { x: 0, y: 16, w: 12, h: 8 }), - panels.timeSeries.withoutCalcsAgg('Snapshot duration', 's', queries.snapshotDuration.query(), { x: 12, y: 16, w: 12, h: 8 }), - ]), - - g.panel.row.new('DB Info per Member') - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.timeSeries.withoutCalcsAgg('% DB Space Used', 'percent', queries.dbSpaceUsed.query(), { x: 0, y: 8, w: 8, h: 8 }), - panels.timeSeries.withoutCalcsAgg('DB Left capacity (with fragmented space)', 'bytes', queries.dbLeftCapacity.query(), { x: 8, y: 8, w: 8, h: 8 }), - panels.timeSeries.withoutCalcsAgg('DB Size Limit (Backend-bytes)', 'bytes', queries.dbSizeLimit.query(), { x: 16, y: 8, w: 8, h: 8 }), - ]), - - g.panel.row.new('General Info') - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.timeSeries.GeneralInfo('Raft Proposals', '', queries.raftProposals.query(), { x: 0, y: 1, w: 12, h: 8 }), - panels.timeSeries.GeneralInfo('Number of leader changes seen', '', queries.numberOfLeaderChangesSeen.query(), { x: 12, y: 1, w: 12, h: 8 }), - panels.stat.etcdLeader('Etcd has a leader?', 'none', queries.etcdHasALeader.query(), { x: 0, y: 8, w: 6, h: 2 }), - panels.stat.failedProposalsSeen('Total number of failed proposals seen', 'none', queries.totalNumberOfProposalsSeen.query(), { x: 6, y: 8, w: 6, h: 2 }), - panels.timeSeries.GeneralInfo('Keys', 'short', queries.keys.query(), { x: 12, y: 12, w: 12, h: 8 }), - panels.timeSeries.GeneralInfo('Leader Elections Per Day', 'short', queries.leaderElectionsPerDay.query(), { x: 0, y: 12, w: 12, h: 6 }), - panels.timeSeries.GeneralInfo('Slow Operations', 'ops', queries.slowOperations.query(), { x: 0, y: 
20, w: 12, h: 8 }), - panels.timeSeries.GeneralInfo('Key Operations', 'ops', queries.keyOperations.query(), { x: 12, y: 20, w: 12, h: 8 }), - panels.timeSeries.GeneralInfo('Heartbeat Failures', 'short', queries.heartBeatFailure.query(), { x: 0, y: 28, w: 12, h: 8 }), - panels.timeSeries.GeneralInfo('Compacted Keys', 'short', queries.compactedKeys.query(), { x: 12, y: 28, w: 12, h: 8 }), - ]), - -]) diff --git a/templates/General/etcd-on-cluster-dashboard.jsonnet b/templates/General/etcd-on-cluster-dashboard.jsonnet index 68bbc9d..f2d08b3 100644 --- a/templates/General/etcd-on-cluster-dashboard.jsonnet +++ b/templates/General/etcd-on-cluster-dashboard.jsonnet @@ -1,528 +1,69 @@ -local grafana = import '../grafonnet-lib/grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; - -// Panel definitions - -// First sections -local fs_writes = grafana.graphPanel.new( - title='Etcd container disk writes', - datasource='$datasource', - format='Bps', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'rate(container_fs_writes_bytes_total{namespace="openshift-etcd",container="etcd",device!~".+dm.+"}[2m])', - legendFormat='{{ pod }}: {{ device }}', - ) -); - -local ptp = grafana.graphPanel.new( - title='p99 peer to peer latency', - datasource='$datasource', - format='s', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{namespace="openshift-etcd"}[2m]))', - legendFormat='{{pod}}', - ) -); - -local disk_wal_sync_duration = grafana.graphPanel.new( - title='Disk WAL Sync Duration', - 
datasource='$datasource', - format='s', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(irate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))', - legendFormat='{{pod}} WAL fsync', - ) -); - -local disk_backend_sync_duration = grafana.graphPanel.new( - title='Disk Backend Sync Duration', - datasource='$datasource', - format='s', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(irate(etcd_disk_backend_commit_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))', - legendFormat='{{pod}} DB fsync', - ) -); - -local db_size = grafana.graphPanel.new( - title='DB Size', - datasource='$datasource', - format='bytes', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"}', - legendFormat='{{pod}} DB physical size' - ) -).addTarget( - prometheus.target( - 'etcd_mvcc_db_total_size_in_use_in_bytes{namespace="openshift-etcd"}', - legendFormat='{{pod}} DB logical size', - ) -); - - -local cpu_usage = grafana.graphPanel.new( - title='CPU usage', - datasource='$datasource', - format='percent', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - 
prometheus.target( - 'sum(irate(container_cpu_usage_seconds_total{namespace="openshift-etcd", container="etcd"}[2m])) by (pod) * 100', - legendFormat='{{ pod }}', - ) -); - -local mem_usage = grafana.graphPanel.new( - title='Memory usage', - datasource='$datasource', - format='bytes', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'sum(avg_over_time(container_memory_working_set_bytes{container="",pod!="", namespace=~"openshift-etcd.*"}[2m])) BY (pod, namespace)', - legendFormat='{{ pod }}', - ) -); - -local network_traffic = grafana.graphPanel.new( - title='Container network traffic', - datasource='$datasource', - format='Bps', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'sum(rate(container_network_receive_bytes_total{ container="etcd", namespace=~"openshift-etcd.*"}[2m])) BY (namespace, pod)', - legendFormat='rx {{ pod }}' - ) -).addTarget( - prometheus.target( - 'sum(rate(container_network_transmit_bytes_total{ container="etcd", namespace=~"openshift-etcd.*"}[2m])) BY (namespace, pod)', - legendFormat='tx {{ pod }}', - ) -); - - -local grpc_traffic = grafana.graphPanel.new( - title='gRPC network traffic', - datasource='$datasource', - format='Bps', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'rate(etcd_network_client_grpc_received_bytes_total{namespace="openshift-etcd"}[2m])', - legendFormat='rx {{pod}}' - ) -).addTarget( - prometheus.target( - 
'rate(etcd_network_client_grpc_sent_bytes_total{namespace="openshift-etcd"}[2m])', - legendFormat='tx {{pod}}', - ) -); - -local peer_traffic = grafana.graphPanel.new( - title='Peer network traffic', - datasource='$datasource', - format='Bps', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'rate(etcd_network_peer_received_bytes_total{namespace="openshift-etcd"}[2m])', - legendFormat='rx {{pod}} Peer Traffic' - ) -).addTarget( - prometheus.target( - 'rate(etcd_network_peer_sent_bytes_total{namespace="openshift-etcd"}[2m])', - legendFormat='tx {{pod}} Peer Traffic', - ) -); - - -local active_streams = grafana.graphPanel.new( - title='Active Streams', - datasource='$datasource', -).addTarget( - prometheus.target( - 'sum(grpc_server_started_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})', - legendFormat='Watch Streams', - ) -).addTarget( - prometheus.target( - 'sum(grpc_server_started_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})', - legendFormat='Lease Streams', - ) -); - -local snapshot_duration = grafana.graphPanel.new( - title='Snapshot duration', - datasource='$datasource', - format='s', -).addTarget( - prometheus.target( - 'sum(rate(etcd_debugging_snap_save_total_duration_seconds_sum{namespace="openshift-etcd"}[2m]))', - legendFormat='the total latency distributions of save called by snapshot', - ) -); - -//DB Info per Member - -local percent_db_used = grafana.graphPanel.new( - title='% DB Space Used', - datasource='$datasource', - 
format='percent', -).addTarget( - prometheus.target( - '(etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"} / etcd_server_quota_backend_bytes{namespace="openshift-etcd"})*100', - legendFormat='{{pod}}', - ) -); - -local db_capacity_left = grafana.graphPanel.new( - title='DB Left capacity (with fragmented space)', - datasource='$datasource', - format='bytes', -).addTarget( - prometheus.target( - 'etcd_server_quota_backend_bytes{namespace="openshift-etcd"} - etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"}', - legendFormat='{{pod}}', - ) -); - -local db_size_limit = grafana.graphPanel.new( - title='DB Size Limit (Backend-bytes)', - datasource='$datasource', - format='bytes' -).addTarget( - prometheus.target( - 'etcd_server_quota_backend_bytes{namespace="openshift-etcd"}', - legendFormat='{{ pod }} Quota Bytes', - ) -); - -// Proposals, leaders, and keys section - -local keys = grafana.graphPanel.new( - title='Keys', - datasource='$datasource', -).addTarget( - prometheus.target( - 'etcd_debugging_mvcc_keys_total{namespace="openshift-etcd"}', - legendFormat='{{ pod }} Num keys', - ) -); - -local compacted_keys = grafana.graphPanel.new( - title='Compacted Keys', - datasource='$datasource', -).addTarget( - prometheus.target( - 'etcd_debugging_mvcc_db_compaction_keys_total{namespace="openshift-etcd"}', - legendFormat='{{ pod }} keys compacted', - ) -); - -local heartbeat_failures = grafana.graphPanel.new( - title='Heartbeat Failures', - datasource='$datasource', -).addTarget( - prometheus.target( - 'etcd_server_heartbeat_send_failures_total{namespace="openshift-etcd"}', - legendFormat='{{ pod }} heartbeat failures', - ) -).addTarget( - prometheus.target( - 'etcd_server_health_failures{namespace="openshift-etcd"}', - legendFormat='{{ pod }} health failures', - ) -); - - -local key_operations = grafana.graphPanel.new( - title='Key Operations', - datasource='$datasource', - format='ops', -) { - yaxes: [ - { - format: 'ops', - show: 'true', - }, - { - 
format: 'short', - show: 'false', - }, - ], -}.addTarget( - prometheus.target( - 'rate(etcd_mvcc_put_total{namespace="openshift-etcd"}[2m])', - legendFormat='{{ pod }} puts/s', - ) -).addTarget( - prometheus.target( - 'rate(etcd_mvcc_delete_total{namespace="openshift-etcd"}[2m])', - legendFormat='{{ pod }} deletes/s', - ) -); - -local slow_operations = grafana.graphPanel.new( - title='Slow Operations', - datasource='$datasource', - format='ops', -) { - yaxes: [ - { - format: 'ops', - show: 'true', - }, - { - format: 'short', - show: 'false', - }, - ], -}.addTarget( - prometheus.target( - 'delta(etcd_server_slow_apply_total{namespace="openshift-etcd"}[2m])', - legendFormat='{{ pod }} slow applies', - ) -).addTarget( - prometheus.target( - 'delta(etcd_server_slow_read_indexes_total{namespace="openshift-etcd"}[2m])', - legendFormat='{{ pod }} slow read indexes', - ) -); - -local raft_proposals = grafana.graphPanel.new( - title='Raft Proposals', - datasource='$datasource', -).addTarget( - prometheus.target( - 'sum(rate(etcd_server_proposals_failed_total{namespace="openshift-etcd"}[2m]))', - legendFormat='Proposal Failure Rate', - ) -).addTarget( - prometheus.target( - 'sum(etcd_server_proposals_pending{namespace="openshift-etcd"})', - legendFormat='Proposal Pending Total', - ) -).addTarget( - prometheus.target( - 'sum(rate(etcd_server_proposals_committed_total{namespace="openshift-etcd"}[2m]))', - legendFormat='Proposal Commit Rate', - ) -).addTarget( - prometheus.target( - 'sum(rate(etcd_server_proposals_applied_total{namespace="openshift-etcd"}[2m]))', - legendFormat='Proposal Apply Rate', - ) -); - -local leader_elections_per_day = grafana.graphPanel.new( - title='Leader Elections Per Day', - datasource='$datasource', -).addTarget( - prometheus.target( - 'changes(etcd_server_leader_changes_seen_total{namespace="openshift-etcd"}[1d])', - legendFormat='{{instance}} Total Leader Elections Per Day', - ) -); - -local etcd_has_leader = grafana.singlestat.new( - 
title='Etcd has a leader?', - datasource='$datasource', - valueMaps=[ - { - op: '=', - text: 'YES', - value: '1', - }, - { - op: '=', - text: 'NO', - value: '0', - }, - ] -).addTarget( - prometheus.target( - 'max(etcd_server_has_leader{namespace="openshift-etcd"})', - ) -); - -local num_leader_changes = grafana.graphPanel.new( - title='Number of leader changes seen', - datasource='$datasource', -).addTarget( - prometheus.target( - 'sum(rate(etcd_server_leader_changes_seen_total{namespace="openshift-etcd"}[2m]))', - ) -); - -local num_failed_proposals = grafana.singlestat.new( - title='Total number of failed proposals seen', - datasource='$datasource', -).addTarget( - prometheus.target( - 'max(etcd_server_proposals_committed_total{namespace="openshift-etcd"})', - ) -); - - -// Creating the dashboard from the panels described above. - -grafana.dashboard.new( - 'etcd-cluster-info', - description='', - timezone='utc', - time_from='now-1h', - editable='true' -) - -.addTemplate( - grafana.template.datasource( - 'datasource', - 'prometheus', - '', - label='datasource' - ) -) - -.addPanel( - grafana.row.new(title='General Resource Usage', collapse=true).addPanels( - [ - cpu_usage { gridPos: { x: 0, y: 1, w: 12, h: 8 } }, - mem_usage { gridPos: { x: 12, y: 1, w: 12, h: 8 } }, - disk_wal_sync_duration { gridPos: { x: 0, y: 8, w: 12, h: 8 } }, - disk_backend_sync_duration { gridPos: { x: 12, y: 8, w: 12, h: 8 } }, - fs_writes { gridPos: { x: 0, y: 16, w: 12, h: 8 } }, - db_size { gridPos: { x: 12, y: 16, w: 12, h: 8 } }, - ] - ), { gridPos: { x: 0, y: 0, w: 24, h: 1 } } -) - -.addPanel( - grafana.row.new(title='Network Usage', collapse=true).addPanels( - [ - network_traffic { gridPos: { x: 0, y: 1, w: 12, h: 8 } }, - ptp { gridPos: { x: 12, y: 1, w: 12, h: 8 } }, - peer_traffic { gridPos: { x: 0, y: 8, w: 12, h: 8 } }, - grpc_traffic { gridPos: { x: 12, y: 8, w: 12, h: 8 } }, - active_streams { gridPos: { x: 0, y: 16, w: 12, h: 8 } }, - snapshot_duration { gridPos: { x: 12, 
y: 16, w: 12, h: 8 } }, - ] - ), { gridPos: { x: 0, y: 1, w: 24, h: 1 } } -) - -.addPanel( - grafana.row.new(title='DB Info per Member', collapse=true).addPanels( - [ - percent_db_used { gridPos: { x: 0, y: 8, w: 8, h: 8 } }, - db_capacity_left { gridPos: { x: 8, y: 8, w: 8, h: 8 } }, - db_size_limit { gridPos: { x: 16, y: 8, w: 8, h: 8 } }, - ] - ), { gridPos: { x: 0, y: 2, w: 24, h: 1 } } -) - -.addPanel( - grafana.row.new(title='General Info', collapse=true).addPanels( - [ - raft_proposals { gridPos: { x: 0, y: 1, w: 12, h: 8 } }, - num_leader_changes { gridPos: { x: 12, y: 1, w: 12, h: 8 } }, - etcd_has_leader { gridPos: { x: 0, y: 8, w: 6, h: 2 } }, - num_failed_proposals { gridPos: { x: 6, y: 8, w: 6, h: 2 } }, - leader_elections_per_day { gridPos: { x: 0, y: 12, w: 12, h: 6 } }, - keys { gridPos: { x: 12, y: 12, w: 12, h: 8 } }, - slow_operations { gridPos: { x: 0, y: 20, w: 12, h: 8 } }, - key_operations { gridPos: { x: 12, y: 20, w: 12, h: 8 } }, - heartbeat_failures { gridPos: { x: 0, y: 28, w: 12, h: 8 } }, - compacted_keys { gridPos: { x: 12, y: 28, w: 12, h: 8 } }, - ] - ), { gridPos: { x: 0, y: 3, w: 24, h: 1 } } -) +local panels = import '../../assets/etcd-on-cluster-dashboard/panels.libsonnet'; +local queries = import '../../assets/etcd-on-cluster-dashboard/queries.libsonnet'; +local variables = import '../../assets/etcd-on-cluster-dashboard/variables.libsonnet'; +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +g.dashboard.new('etcd-cluster-info dashboard') ++ g.dashboard.time.withFrom('now-1h') ++ g.dashboard.time.withTo('now') ++ g.dashboard.withTimezone('utc') ++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) ++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) ++ g.dashboard.withRefresh('') ++ g.dashboard.withEditable(false) ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ 
g.dashboard.withVariables([ + variables.Datasource, +]) + ++ g.dashboard.withPanels([ + g.panel.row.new('General Resource Usage') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.timeSeries.generalUsageAgg('CPU usage', 'percent', queries.CPUUsage.query(), { x: 0, y: 1, w: 12, h: 8 }), + panels.timeSeries.generalUsageAgg('Memory usage', 'bytes', queries.memoryUsage.query(), { x: 12, y: 1, w: 12, h: 8 }), + panels.timeSeries.generalUsageAgg('Disk WAL Sync Duration', 's', queries.diskWalSyncDuration.query(), { x: 0, y: 8, w: 12, h: 8 }), + panels.timeSeries.generalUsageAgg('Disk Backend Sync Duration', 's', queries.diskBackendSyncDuration.query(), { x: 12, y: 8, w: 12, h: 8 }), + panels.timeSeries.generalUsageAgg('Etcd container disk writes', 'Bps', queries.etcdContainerDiskWrites.query(), { x: 0, y: 16, w: 12, h: 8 }), + panels.timeSeries.generalUsageAgg('DB Size', 'bytes', queries.dbSize.query(), { x: 12, y: 16, w: 12, h: 8 }), + ]), + + g.panel.row.new('Network Usage') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.timeSeries.generalUsageAgg('Container network traffic', 'Bps', queries.containerNetworkTraffic.query(), { x: 0, y: 1, w: 12, h: 8 }), + panels.timeSeries.generalUsageAgg('p99 peer to peer latency', 's', queries.p99PeerToPeerLatency.query(), { x: 12, y: 1, w: 12, h: 8 }), + panels.timeSeries.generalUsageAgg('Peer network traffic', 'Bps', queries.peerNetworkTraffic.query(), { x: 0, y: 8, w: 12, h: 8 }), + panels.timeSeries.generalUsageAgg('gRPC network traffic', 'Bps', queries.gRPCNetworkTraffic.query(), { x: 12, y: 8, w: 12, h: 8 }), + panels.timeSeries.withoutCalcsAgg('Active Streams', '', queries.activeStreams.query(), { x: 0, y: 16, w: 12, h: 8 }), + panels.timeSeries.withoutCalcsAgg('Snapshot duration', 's', queries.snapshotDuration.query(), { x: 12, y: 16, w: 12, h: 8 }), + ]), + + 
g.panel.row.new('DB Info per Member') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.timeSeries.withoutCalcsAgg('% DB Space Used', 'percent', queries.dbSpaceUsed.query(), { x: 0, y: 8, w: 8, h: 8 }), + panels.timeSeries.withoutCalcsAgg('DB Left capacity (with fragmented space)', 'bytes', queries.dbLeftCapacity.query(), { x: 8, y: 8, w: 8, h: 8 }), + panels.timeSeries.withoutCalcsAgg('DB Size Limit (Backend-bytes)', 'bytes', queries.dbSizeLimit.query(), { x: 16, y: 8, w: 8, h: 8 }), + ]), + + g.panel.row.new('General Info') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.timeSeries.GeneralInfo('Raft Proposals', '', queries.raftProposals.query(), { x: 0, y: 1, w: 12, h: 8 }), + panels.timeSeries.GeneralInfo('Number of leader changes seen', '', queries.numberOfLeaderChangesSeen.query(), { x: 12, y: 1, w: 12, h: 8 }), + panels.stat.etcdLeader('Etcd has a leader?', 'none', queries.etcdHasALeader.query(), { x: 0, y: 8, w: 6, h: 2 }), + panels.stat.failedProposalsSeen('Total number of failed proposals seen', 'none', queries.totalNumberOfProposalsSeen.query(), { x: 6, y: 8, w: 6, h: 2 }), + panels.timeSeries.GeneralInfo('Keys', 'short', queries.keys.query(), { x: 12, y: 12, w: 12, h: 8 }), + panels.timeSeries.GeneralInfo('Leader Elections Per Day', 'short', queries.leaderElectionsPerDay.query(), { x: 0, y: 12, w: 12, h: 6 }), + panels.timeSeries.GeneralInfo('Slow Operations', 'ops', queries.slowOperations.query(), { x: 0, y: 20, w: 12, h: 8 }), + panels.timeSeries.GeneralInfo('Key Operations', 'ops', queries.keyOperations.query(), { x: 12, y: 20, w: 12, h: 8 }), + panels.timeSeries.GeneralInfo('Heartbeat Failures', 'short', queries.heartBeatFailure.query(), { x: 0, y: 28, w: 12, h: 8 }), + panels.timeSeries.GeneralInfo('Compacted Keys', 'short', queries.compactedKeys.query(), { x: 12, y: 28, w: 12, h: 8 }), 
+ ]), + +]) diff --git a/templates/General/hypershift-performance-v2.jsonnet b/templates/General/hypershift-performance-v2.jsonnet deleted file mode 100644 index 505d1bb..0000000 --- a/templates/General/hypershift-performance-v2.jsonnet +++ /dev/null @@ -1,154 +0,0 @@ -local panels = import '../../assets/hypershift-perf-dashboard/panels.libsonnet'; -local queries = import '../../assets/hypershift-perf-dashboard/queries.libsonnet'; -local variables = import '../../assets/hypershift-perf-dashboard/variables.libsonnet'; -local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; - -local cluster_prometheus = 'PF55DCC5EC58ABF5A'; -local OBO = 'P1BA917A37525EDF3'; - -g.dashboard.new('Hypershift Performance Dashboard') -+ g.dashboard.withDescription(||| - Dashboard for Api-performance-overview -|||) -+ g.dashboard.withTags('') -+ g.dashboard.time.withFrom('now-6h') -+ g.dashboard.time.withTo('now') -+ g.dashboard.withTimezone('utc') -+ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) -+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) -+ g.dashboard.withRefresh('') -+ g.dashboard.withEditable(true) -+ g.dashboard.graphTooltip.withSharedCrosshair() -+ g.dashboard.withVariables([ - variables.Namespace, - variables.Resource, - variables.Code, - variables.Verb, -]) -+ g.dashboard.withPanels([ - g.panel.row.new('Management cluster stats') - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.stat.m_infrastructure('Management Cloud Infrastructure', '', queries.m_infrastructure.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 0, w: 6, h: 4 }), - panels.stat.m_region('Management Cloud Region', '', queries.m_region.query(), 'PF55DCC5EC58ABF5A', { x: 6, y: 0, w: 6, h: 4 }), - panels.stat.m_ocp_version('Management OCP Version', '', queries.m_ocp_version.query(), 
'PF55DCC5EC58ABF5A', { x: 12, y: 0, w: 6, h: 4 }), - panels.stat.num_hosted_cluster('Number of HostedCluster', '', queries.num_hosted_cluster.query(), 'PF55DCC5EC58ABF5A', { x: 18, y: 0, w: 6, h: 4 }), - panels.stat.current_namespace_count('Current namespace Count', '', queries.current_namespace_count.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 5, w: 8, h: 4 }), - panels.stat.current_node_count('Current Node Count', '', queries.current_node_count.query(), 'PF55DCC5EC58ABF5A', { x: 8, y: 5, w: 8, h: 4 }), - panels.stat.current_pod_count('Current Pod Count', '', queries.current_pod_count.query(), 'PF55DCC5EC58ABF5A', { x: 16, y: 5, w: 8, h: 4 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Hosted Clusters container CPU', 'percent', queries.top10ContCPUHosted.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 8, w: 12, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Hosted Clusters container RSS', 'bytes', queries.top10ContMemHosted.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 8, w: 12, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster container CPU', 'percent', queries.top10ContCPUManagement.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 20, w: 12, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster container RSS', 'bytes', queries.top10ContMemManagement.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 20, w: 12, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster OBO NS Pods CPU', 'percent', queries.top10ContCPUOBOManagement.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 28, w: 12, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster OBO NS Pods RSS', 'bytes', queries.top10ContMemOBOManagement.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 28, w: 12, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster Hypershift NS Pods 
CPU', 'percent', queries.top10ContCPUHypershiftManagement.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 36, w: 12, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster Hypershift NS Pods RSS', 'bytes', queries.top10ContMemHypershiftManagement.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 36, w: 12, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Active Gate Memory Usage', 'bytes', queries.dynaactivegateMem.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Active Gate CPU Usage', 'percent', queries.dynaactivegateCPU.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Opentelemetry CPU Usage', 'percent', queries.opentelemetryCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Opentelemetry Memory Usage', 'bytes', queries.opentelemetryMem.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Number of nodes', 'none', queries.nodeCount.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 44, w: 6, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Machine Set Replicas', 'none', queries.current_machine_set_replica_count.query(), 'PF55DCC5EC58ABF5A', { x: 6, y: 44, w: 6, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Namespace count', 'none', queries.nsCount.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 44, w: 6, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Pod count', 'none', queries.podCount.query(), 'PF55DCC5EC58ABF5A', { x: 18, y: 44, w: 6, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Cluster operators information', 'none', queries.clusterOperatorsInformation.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 52, w: 8, h: 8 }), - 
panels.timeSeries.managementClustersStatsTimeseriesSettings('Cluster operators degraded', 'none', queries.clusterOperatorsDegraded.query(), 'PF55DCC5EC58ABF5A', { x: 8, y: 52, w: 8, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Failed pods', 'none', queries.FailedPods.query(), 'PF55DCC5EC58ABF5A', { x: 16, y: 52, w: 8, h: 8 }), - panels.timeSeries.managementClustersStatsTimeseriesSettings('Alerts', 'none', queries.alerts.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 60, w: 24, h: 8 }), - ]), - g.panel.row.new('Management cluster Etcd stats') - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.timeSeries.mgmt('Disk WAL Sync Duration', 's', queries.mgmt_disk_wal_sync_duration.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 2, w: 12, h: 8 }), - panels.timeSeries.mgmt('Disk Backend Sync Duration', 's', queries.mgmt_disk_backend_sync_duration.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 2, w: 12, h: 8 }), - panels.timeSeries.DBPanelsSettings('% DB Space Used', 'percent', queries.mgmt_percent_db_used.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 10, w: 8, h: 8 }), - panels.timeSeries.DBPanelsSettings('DB Left capacity (with fragmented space)', 'bytes', queries.mgmt_db_capacity_left.query(), 'PF55DCC5EC58ABF5A', { x: 8, y: 10, w: 8, h: 8 }), - panels.timeSeries.DBPanelsSettings('DB Size Limit (Backend-bytes)', 'bytes', queries.mgmt_db_size_limit.query(), 'PF55DCC5EC58ABF5A', { x: 16, y: 10, w: 8, h: 8 }), - panels.timeSeries.mgmt('DB Size', 'bytes', queries.mgmt_db_size.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), - panels.timeSeries.mgmt('gRPC network traffic', 'Bps', queries.mgmt_grpc_traffic.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), - panels.timeSeries.DBPanelsSettings('Active Streams', '', queries.mgmt_active_streams.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 26, w: 12, h: 8 }), - panels.timeSeries.DBPanelsSettings('Snapshot duration', 's', 
queries.mgmt_snapshot_duration.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 26, w: 12, h: 8 }), - panels.timeSeries.DBPanelsSettings('Raft Proposals', '', queries.mgmt_raft_proposals.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 1, w: 12, h: 8 }), - panels.timeSeries.DBPanelsSettings('Number of leader changes seen', '', queries.mgmt_num_leader_changes.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 1, w: 12, h: 8 }), - panels.stat.etcd_has_leader('Etcd has a leader?', '', queries.mgmt_etcd_has_leader.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 8, w: 6, h: 2 }), - panels.stat.mgmt_num_failed_proposals('Total number of failed proposals seen', '', queries.mgmt_num_failed_proposals.query(), 'PF55DCC5EC58ABF5A', { x: 6, y: 8, w: 6, h: 2 }), - panels.timeSeries.DBPanelsSettings('Leader Elections Per Day', '', queries.mgmt_leader_elections_per_day.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 12, w: 12, h: 6 }), - panels.timeSeries.DBPanelsSettings('Keys', '', queries.mgmt_keys.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 12, w: 12, h: 8 }), - panels.timeSeries.DBPanelsSettings('Slow Operations', 'ops', queries.mgmt_slow_operations.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 20, w: 12, h: 8 }), - panels.timeSeries.DBPanelsSettings('Key Operations', 'ops', queries.mgmt_key_operations.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 20, w: 12, h: 8 }), - panels.timeSeries.DBPanelsSettings('Heartbeat Failures', '', queries.mgmt_heartbeat_failures.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 28, w: 12, h: 8 }), - panels.timeSeries.DBPanelsSettings('Compacted Keys', '', queries.mgmt_compacted_keys.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 28, w: 12, h: 8 }), - ]), - - g.panel.row.new('Hosted Clusters Serving Node stats - $namespace') - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withRepeat('namespace') - + g.panel.row.withPanels([ - panels.timeSeries.genericGraphLegendPanel('Serving Node CPU Basic', 'percent', queries.nodeCPU.query(), 
'PF55DCC5EC58ABF5A', { x: 0, y: 2, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanel('Serving Node Memory', 'bytes', queries.nodeMemory.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 2, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanel('Suricata CPU(Running on Serving node)', 'percent', queries.suricataCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanel('Suricata Memory(Running on Serving node)', 'bytes', queries.suricataMemory.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanel('OneAgent CPU Usage', 'percent', queries.dynaactivegateCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanel('OneAgent Memory Usage', 'bytes', queries.dynaoneagentMem.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), - ]), - - g.panel.row.new('Hosted Clusters Serving Node stats - $namespace') - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withRepeat('namespace') - + g.panel.row.withPanels([ - panels.stat.hostedControlPlaneStats('Hosted Cluster Cloud Infrastructure', '', queries.infrastructure.query(), 'P1BA917A37525EDF3', { x: 0, y: 0, w: 8, h: 4 }), - panels.stat.hostedControlPlaneStats('Hosted Cluster Cloud Region', '', queries.region.query(), 'P1BA917A37525EDF3', { x: 8, y: 0, w: 8, h: 4 }), - panels.stat.hostedControlPlaneStats('Hosted Cluster OCP Version', '', queries.ocp_version.query(), 'P1BA917A37525EDF3', { x: 16, y: 0, w: 8, h: 4 }), - panels.timeSeries.genericGraphLegendPanel('Hosted Control Plane CPU', 'percent', queries.hostedControlPlaneCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 12, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanel('Hosted Control Plane Memory', 'bytes', queries.hostedControlPlaneMemory.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 12, w: 12, h: 8 }), - 
panels.timeSeries.genericGraphLegendPanelRightSide('request duration - 99th quantile', '', queries.request_duration_99th_quantile.query(), OBO, { x: 0, y: 20, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelRightSide('request rate - by instance', '', queries.request_rate_by_instance.query(), OBO, { x: 8, y: 20, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelRightSide('request duration - 99th quantile - by resource', '', queries.request_duration_99th_quantile_by_resource.query(), OBO, { x: 16, y: 20, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelRightSide('request duration - 99th quantile', '', queries.request_rate_by_resource.query(), OBO, { x: 0, y: 30, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('request duration - read vs write', '', queries.request_duration_read_write.query(), OBO, { x: 8, y: 30, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('request rate - read vs write', '', queries.request_rate_read_write.query(), OBO, { x: 16, y: 30, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('requests dropped rate', '', queries.requests_dropped_rate.query(), OBO, { x: 0, y: 40, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('requests terminated rate', '', queries.requests_terminated_rate.query(), OBO, { x: 8, y: 40, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('requests status rate', '', queries.requests_status_rate.query(), OBO, { x: 16, y: 40, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('long running requests', '', queries.long_running_requests.query(), OBO, { x: 0, y: 50, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelRightSide('requests in flight', '', queries.request_in_flight.query(), OBO, { x: 8, y: 50, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('p&f - requests rejected', '', queries.pf_requests_rejected.query(), OBO, { x: 16, y: 50, w: 8, h: 8 }), - 
panels.timeSeries.genericGraphLegendPanelRightSide('response size - 99th quantile', '', queries.response_size_99th_quartile.query(), OBO, { x: 0, y: 60, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelRightSide('p&f - request queue length', '', queries.pf_request_queue_length.query(), OBO, { x: 8, y: 60, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelRightSide('p&f - request wait duration - 99th quantile', '', queries.pf_request_wait_duration_99th_quartile.query(), OBO, { x: 16, y: 60, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelRightSide('p&f - request execution duration', '', queries.pf_request_execution_duration.query(), OBO, { x: 0, y: 70, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelRightSide('p&f - request dispatch rate', '', queries.pf_request_dispatch_rate.query(), OBO, { x: 8, y: 70, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('p&f - concurrency limit by priority level', '', queries.pf_concurrency_limit.query(), OBO, { x: 16, y: 70, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelRightSide('p&f - pending in queue', '', queries.pf_pending_in_queue.query(), OBO, { x: 0, y: 80, w: 8, h: 8 }), - ]), - g.panel.row.new('Hosted Clusters ETCD General Resource Usage - $namespace') - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withRepeat('namespace') - + g.panel.row.withPanels([ - panels.timeSeries.genericGraphLegendPanel('Disk WAL Sync Duration', 's', queries.disk_wal_sync_duration.query(), OBO, { x: 0, y: 2, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanel('Disk Backend Sync Duration', 's', queries.disk_backend_sync_duration.query(), OBO, { x: 12, y: 2, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('% DB Space Used', 'percent', queries.percent_db_used.query(), OBO, { x: 0, y: 10, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('DB Left capacity (with fragmented space)', 'bytes', 
queries.db_capacity_left.query(), OBO, { x: 8, y: 10, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('DB Size Limit (Backend-bytes)', 'bytes', queries.db_size_limit.query(), OBO, { x: 16, y: 10, w: 8, h: 8 }), - panels.timeSeries.genericGraphLegendPanel('DB Size', 'bytes', queries.db_size.query(), OBO, { x: 0, y: 18, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanel('gRPC network traffic', 'Bps', queries.grpc_traffic.query(), OBO, { x: 12, y: 18, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('Active Streams', '', queries.active_streams.query(), OBO, { x: 0, y: 26, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('Snapshot duration', 's', queries.snapshot_duration.query(), OBO, { x: 12, y: 26, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('Raft Proposals', '', queries.raft_proposals.query(), OBO, { x: 0, y: 34, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('Number of leader changes seen', '', queries.num_leader_changes.query(), OBO, { x: 12, y: 34, w: 12, h: 8 }), - panels.stat.etcd_has_leader('Etcd has a leader?', '', queries.etcd_has_leader.query(), OBO, { x: 0, y: 42, w: 6, h: 2 }), - panels.stat.mgmt_num_failed_proposals('Total number of failed proposals seen', '', queries.num_failed_proposals.query(), OBO, { x: 6, y: 42, w: 6, h: 2 }), - panels.timeSeries.genericGraphLegendPanelList('Leader Elections Per Day', '', queries.leader_elections_per_day.query(), OBO, { x: 0, y: 44, w: 12, h: 6 }), - panels.timeSeries.genericGraphLegendPanelList('Keys', '', queries.keys.query(), OBO, { x: 12, y: 44, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('Slow Operations', 'ops', queries.slow_operations.query(), OBO, { x: 0, y: 52, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('Key Operations', 'ops', queries.key_operations.query(), OBO, { x: 12, y: 52, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('Heartbeat Failures', '', 
queries.heartbeat_failures.query(), OBO, { x: 0, y: 60, w: 12, h: 8 }), - panels.timeSeries.genericGraphLegendPanelList('Compacted Keys', '', queries.compacted_keys.query(), OBO, { x: 12, y: 60, w: 12, h: 8 }), - ]), -]) diff --git a/templates/General/hypershift-performance.jsonnet b/templates/General/hypershift-performance.jsonnet index c427a73..505d1bb 100644 --- a/templates/General/hypershift-performance.jsonnet +++ b/templates/General/hypershift-performance.jsonnet @@ -1,1861 +1,154 @@ -local grafana = import '../grafonnet-lib/grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; -local stat = grafana.statPanel; - -// Panel definitions - -// Hypershift Hosted Cluster Components - -local genericGraphLegendPanel(title, datasource, format) = grafana.graphPanel.new( - title=title, - datasource='Cluster Prometheus', - format=format, - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - nullPointMode='null as zero', - sort='decreasing', -); - -local hostedControlPlaneCPU = genericGraphLegendPanel('Hosted Control Plane CPU', 'Cluster Prometheus', 'percent').addTarget( - prometheus.target( - 'topk(10,irate(container_cpu_usage_seconds_total{namespace=~"$namespace",container!="POD",name!=""}[2m])*100)', - legendFormat='{{pod}}/{{container}}', - ) -); - -local hostedControlPlaneMemory = genericGraphLegendPanel('Hosted Control Plane Memory', 'Cluster Prometheus', 'bytes').addTarget( - prometheus.target( - 'topk(10, container_memory_rss{namespace=~"$namespace",container!="POD",name!=""})', - legendFormat='{{pod}}/{{container}}', - ) -); - -// Serving node stats and other daemons - -local nodeMemory = genericGraphLegendPanel('Serving Node Memory', 'Cluster Prometheus', 'bytes').addTarget( - prometheus.target( - 'node_memory_Active_bytes and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", 
"$1", "node", "(.+)")', - legendFormat='{{instance}} - Active', - ) -).addTarget( - prometheus.target( - 'node_memory_MemTotal_bytes and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", "$1", "node", "(.+)")', - legendFormat='{{instance}} - Total', - ) -).addTarget( - prometheus.target( - '(node_memory_Cached_bytes + node_memory_Buffers_bytes) and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", "$1", "node", "(.+)")', - legendFormat='{{instance}} - Cached + Buffers', - ) -).addTarget( - prometheus.target( - 'node_memory_MemAvailable_bytes and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", "$1", "node", "(.+)")', - legendFormat='{{instance}} - Available', - ) -).addTarget( - prometheus.target( - '(node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", "$1", "node", "(.+)")', - legendFormat='{{instance}} - Used', - ) -); - - -local nodeCPU = genericGraphLegendPanel('Serving Node CPU Basic', 'Cluster Prometheus', 'percent').addTarget( - prometheus.target( - 'sum by (instance, mode)(irate(node_cpu_seconds_total{job=~".*"}[2m])) * 100 and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", "$1", "node", "(.+)")', - legendFormat='{{instance}} - {{mode}}', - ) -); - -local suricataCPU = genericGraphLegendPanel('Suricata CPU(Running on Serving node)', 'Cluster Prometheus', 'percent').addTarget( - prometheus.target( - 'sum(irate(container_cpu_usage_seconds_total{namespace=~"openshift-suricata",container!="POD",name!=""}[2m])*100) by (node) and on (node) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, 
"node", "$1", "node", "(.+)")', - legendFormat='{{node}}', - ) -); - -local suricataMemory = genericGraphLegendPanel('Suricata Memory(Running on Serving node)', 'Cluster Prometheus', 'bytes').addTarget( - prometheus.target( - 'sum(container_memory_rss{namespace=~"openshift-suricata",container!="POD",name!=""}) by (node) and on (node) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "node", "$1", "node", "(.+)")', - legendFormat='{{node}}', - ) -); - -local dynaoneagentMem = genericGraphLegendPanel('OneAgent Memory Usage', 'Cluster Prometheus', 'bytes').addTarget( - prometheus.target( - 'sum(container_memory_rss{namespace=~"dynatrace",pod=~".*-oneagent-.*",container!=""}) by (node, namespace, pod)', - legendFormat='{{ node }}: {{ namespace }} : {{ pod }}', - ) -); - -local dynaoneagentCPU = genericGraphLegendPanel('OneAgent CPU Usage', 'Cluster Prometheus', 'percent').addTarget( - prometheus.target( - 'sum(irate(container_cpu_usage_seconds_total{namespace=~"dynatrace", pod=~".*-oneagent-.*", container!~"POD|"}[2m])*100) by (node, namespace, pod)', - legendFormat='{{ node }}: {{ namespace }} : {{ pod }}', - ) -); - -// Overall stats on the management cluster - -// Cluster Operators details and status - -local clusterOperatorsInformation = genericGraphLegendPanel('Cluster operators information', 'Cluster Prometheus', 'none').addTarget( - prometheus.target( - 'cluster_operator_conditions{name!="",reason!=""}', - legendFormat='{{name}} - {{reason}}', - ) -); - -local clusterOperatorsDegraded = genericGraphLegendPanel('Cluster operators degraded', 'Cluster Prometheus', 'none').addTarget( - prometheus.target( - 'cluster_operator_conditions{condition="Degraded",name!="",reason!=""}', - legendFormat='{{name}} - {{reason}}', - ) -); - - -// Management cluster alerts - -local alerts = genericGraphLegendPanel('Alerts', 'Cluster Prometheus', 'none').addTarget( - prometheus.target( - 'topk(10,sum(ALERTS{severity!="none"}) by (alertname, 
severity))', - legendFormat='{{severity}}: {{alertname}}', - ) -); - - -// Cluster info - -local num_hosted_cluster = stat.new( - title='Number of HostedCluster', - datasource='Cluster Prometheus', - graphMode='none', - reducerFunction='max', -).addTarget( - prometheus.target( - 'count(kube_namespace_labels{namespace=~"^ocm-.*"})', - instant=true, - ) -).addThresholds([ - { color: 'green', value: null }, -]); - -local m_ocp_version = stat.new( - title='Management OCP Version', - datasource='Cluster Prometheus', - textMode='name', - graphMode='none', -).addTarget( - prometheus.target( - 'cluster_version{type="completed",version!="",namespace="openshift-cluster-version"}', - legendFormat='{{version}}', - instant=true, - ) -).addThresholds([ - { color: 'green', value: null }, -]); - -local ocp_version = stat.new( - title='Hosted Cluster OCP Version', - datasource='OBO', - textMode='name', - graphMode='none', -).addTarget( - prometheus.target( - 'cluster_version{type="completed",version!="",namespace=~"$namespace"}', - legendFormat='{{version}}', - instant=true, - ) -).addThresholds([ - { color: 'green', value: null }, -]); - -local infrastructure = stat.new( - title='Hosted Cluster Cloud Infrastructure', - datasource='OBO', - textMode='name', - graphMode='none', -).addTarget( - prometheus.target( - 'cluster_infrastructure_provider{namespace=~"$namespace"}', - legendFormat='{{type}}', - instant=true, - ) -).addThresholds([ - { color: 'green', value: null }, -]); - -local region = stat.new( - title='Hosted Cluster Cloud Region', - datasource='OBO', - textMode='name', - graphMode='none', -).addTarget( - prometheus.target( - 'cluster_infrastructure_provider{namespace=~"$namespace"}', - legendFormat='{{region}}', - instant=true, - ) -).addThresholds([ - { color: 'green', value: null }, -]); - -local m_infrastructure = stat.new( - title='Management Cloud Infrastructure', - datasource='Cluster Prometheus', - textMode='name', - graphMode='none', -).addTarget( - 
prometheus.target( - 'cluster_infrastructure_provider{namespace="openshift-kube-apiserver-operator"}', - instant=true, - legendFormat='{{type}}', - ) -).addThresholds([ - { color: 'green', value: null }, -]); - -local m_region = stat.new( - title='Management Cloud Region', - datasource='Cluster Prometheus', - textMode='name', - graphMode='none', -).addTarget( - prometheus.target( - 'cluster_infrastructure_provider{namespace="openshift-kube-apiserver-operator"}', - legendFormat='{{region}}', - instant=true, - ) -).addThresholds([ - { color: 'green', value: null }, -]); - -local top10ContMemHosted = genericGraphLegendPanel('Top 10 Hosted Clusters container RSS', 'Cluster Prometheus', 'bytes').addTarget( - prometheus.target( - 'topk(10, container_memory_rss{namespace=~"^ocm-.*",container!="POD",name!=""})', - legendFormat='{{ namespace }} - {{ name }}', - ) -); - -local top10ContCPUHosted = genericGraphLegendPanel('Top 10 Hosted Clusters container CPU', 'Cluster Prometheus', 'percent').addTarget( - prometheus.target( - 'topk(10,irate(container_cpu_usage_seconds_total{namespace=~"^ocm-.*",container!="POD",name!=""}[2m])*100)', - legendFormat='{{ namespace }} - {{ name }}', - ) -); - -local top10ContMemManagement = genericGraphLegendPanel('Top 10 Management Cluster container RSS', 'Cluster Prometheus', 'bytes').addTarget( - prometheus.target( - 'topk(10, container_memory_rss{namespace!="",container!="POD",name!=""})', - legendFormat='{{ namespace }} - {{ name }}', - ) -); - -local top10ContCPUManagement = genericGraphLegendPanel('Top 10 Management Cluster container CPU', 'Cluster Prometheus', 'percent').addTarget( - prometheus.target( - 'topk(10,irate(container_cpu_usage_seconds_total{namespace!="",container!="POD",name!=""}[2m])*100)', - legendFormat='{{ namespace }} - {{ name }}', - ) -); - -local top10ContMemOBOManagement = genericGraphLegendPanel('Top 10 Management Cluster OBO NS Pods RSS', 'Cluster Prometheus', 'bytes').addTarget( - prometheus.target( - 'topk(10, 
container_memory_rss{namespace="openshift-observability-operator",container!="POD",name!=""})', - legendFormat='{{ pod }}/{{ container }}', - ) -); - -local top10ContCPUOBOManagement = genericGraphLegendPanel('Top 10 Management Cluster OBO NS Pods CPU', 'Cluster Prometheus', 'percent').addTarget( - prometheus.target( - 'topk(10,irate(container_cpu_usage_seconds_total{namespace="openshift-observability-operator",container!="POD",name!=""}[2m])*100)', - legendFormat='{{ pod }}/{{ container }}', - ) -); - -local top10ContMemHypershiftManagement = genericGraphLegendPanel('Top 10 Management Cluster Hypershift NS Pods RSS', 'Cluster Prometheus', 'bytes').addTarget( - prometheus.target( - 'topk(10, container_memory_rss{namespace="hypershift",container!="POD",name!=""})', - legendFormat='{{ pod }}/{{ container }}', - ) -); - -local top10ContCPUHypershiftManagement = genericGraphLegendPanel('Top 10 Management Cluster Hypershift NS Pods CPU', 'Cluster Prometheus', 'percent').addTarget( - prometheus.target( - 'topk(10,irate(container_cpu_usage_seconds_total{namespace="hypershift",container!="POD",name!=""}[2m])*100)', - legendFormat='{{ pod }}/{{ container }}', - ) -); - -local current_node_count = grafana.statPanel.new( - title='Current Node Count', - datasource='Cluster Prometheus', - reducerFunction='last', -).addTarget( - prometheus.target( - 'sum(kube_node_info{})', - legendFormat='Number of nodes', - instant='true', - ) -).addTarget( - prometheus.target( - 'sum(kube_node_status_condition{status="true"}) by (condition) > 0', - legendFormat='Node: {{ condition }}', - ) -).addTarget( - prometheus.target( - 'sum(kube_node_role{}) by (role)', - legendFormat='Role: {{ role }}', - ) -); - -local current_machine_set_replica_count = genericGraphLegendPanel('Machine Set Replicas', 'Cluster Prometheus', 'none').addTarget( - prometheus.target( - 'mapi_machine_set_status_replicas{name=~".*worker.*"}', - legendFormat='Replicas: {{ name }}', - ) -).addTarget( - prometheus.target( - 
'mapi_machine_set_status_replicas_available{name=~".*worker.*"}', - legendFormat='Available: {{ name }}', - ) -).addTarget( - prometheus.target( - 'mapi_machine_set_status_replicas_ready{name=~".*worker.*"}', - legendFormat='Ready: {{ name }}', - ) -); - -local current_namespace_count = grafana.statPanel.new( - title='Current namespace Count', - datasource='Cluster Prometheus', - reducerFunction='last', -).addTarget( - prometheus.target( - 'sum(kube_namespace_status_phase) by (phase)', - legendFormat='{{ phase }}', - instant=true, - ) -); - -local current_pod_count = grafana.statPanel.new( - title='Current Pod Count', - reducerFunction='last', - datasource='Cluster Prometheus', -).addTarget( - prometheus.target( - 'sum(kube_pod_status_phase{}) by (phase) > 0', - legendFormat='{{ phase}} Pods', - instant=true, - ) -); - -local nodeCount = genericGraphLegendPanel('Number of nodes', 'Cluster Prometheus', 'none').addTarget( - prometheus.target( - 'sum(kube_node_info{})', - legendFormat='Number of nodes', - ) -).addTarget( - prometheus.target( - 'sum(kube_node_status_condition{status="true"}) by (node,condition) > 0', - legendFormat='{{node}}: {{ condition }}', - ) -); - -local nsCount = genericGraphLegendPanel('Namespace count', 'Cluster Prometheus', 'none').addTarget( - prometheus.target( - 'sum(kube_namespace_status_phase) by (phase) > 0', - legendFormat='{{ phase }} namespaces', - ) -); - -local podCount = genericGraphLegendPanel('Pod count', 'Cluster Prometheus', 'none').addTarget( - prometheus.target( - 'sum(kube_pod_status_phase{}) by (phase)', - legendFormat='{{phase}} pods', - ) -); - -local FailedPods = genericGraphLegendPanel('Failed pods', 'Cluster Prometheus', 'none').addTarget( - prometheus.target( - 'kube_pod_status_phase{phase="Failed"}', - legendFormat='{{namespace}}/{{ pod }}:{{ phase }}', - ) -).addTarget( - prometheus.target( - 'count(kube_pod_status_phase{phase="Failed"})', - legendFormat='{{phase}} pods', - ) -); - -// API 99th percentile request 
duration by resource, namespace -local request_duration_99th_quantile_by_resource = grafana.graphPanel.new( - title='request duration - 99th quantile - by resource', - datasource='OBO', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{namespace=~"$namespace",resource=~"$resource",subresource!="log",verb!~"WATCH|WATCHLIST|PROXY"}[2m])) by(resource, namespace, verb, le))', - legendFormat='{{verb}}:{{resource}}/{{namespace}}', - ) -); - -// Dynatrace on the management cluster -local dynaactivegateMem = genericGraphLegendPanel('Active Gate Memory Usage', 'Cluster Prometheus', 'bytes').addTarget( - prometheus.target( - 'sum(container_memory_rss{namespace=~"dynatrace",pod=~".*-activegate-.*",container!=""}) by (node, namespace, pod)', - legendFormat='{{ node }}: {{ namespace }} : {{ pod }}', - ) -); - -local dynaactivegateCPU = genericGraphLegendPanel('Active Gate CPU Usage', 'Cluster Prometheus', 'percent').addTarget( - prometheus.target( - 'sum(irate(container_cpu_usage_seconds_total{namespace=~"dynatrace", pod=~".*-activegate-.*", container!~"POD|"}[2m])*100) by (node, namespace, pod)', - legendFormat='{{ node }}: {{ namespace }} : {{ pod }}', - ) -); - -local opentelemetryMem = genericGraphLegendPanel('Opentelemetry Memory Usage', 'Cluster Prometheus', 'bytes').addTarget( - prometheus.target( - 'sum(container_memory_rss{namespace=~"dynatrace",pod=~"opentelemetry-.*",container!=""}) by (node, namespace, pod)', - legendFormat='{{ node }}: {{ namespace }} : {{ pod }}', - ) -); - -local opentelemetryCPU = genericGraphLegendPanel('Opentelemetry CPU Usage', 'Cluster Prometheus', 'percent').addTarget( - prometheus.target( - 'sum(irate(container_cpu_usage_seconds_total{namespace=~"dynatrace", 
pod=~"opentelemetry-.*", container!~"POD|"}[2m])*100) by (node, namespace, pod)', - legendFormat='{{ node }}: {{ namespace }} : {{ pod }}', - ) -); - -// Management cluster metrics - -local mgmt_fs_writes = grafana.graphPanel.new( - title='Etcd container disk writes', - datasource='Cluster Prometheus', - format='Bps', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'rate(container_fs_writes_bytes_total{namespace=~"openshift-etcd",container="etcd",device!~".+dm.+"}[2m])', - legendFormat='{{namespace}} - {{ pod }}: {{ device }}', - ) -); - -local mgmt_ptp = grafana.graphPanel.new( - title='p99 peer to peer latency', - datasource='Cluster Prometheus', - format='s', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{namespace=~"openshift-etcd"}[2m]))', - legendFormat='{{namespace}} - {{pod}}', - ) -); - -local mgmt_disk_wal_sync_duration = grafana.graphPanel.new( - title='Disk WAL Sync Duration', - datasource='Cluster Prometheus', - format='s', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(irate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace=~"openshift-etcd"}[2m])) by (namespace, pod, le))', - legendFormat='{{namespace}} - {{pod}} WAL fsync', - ) -); - -local mgmt_disk_backend_sync_duration = grafana.graphPanel.new( - title='Disk Backend Sync Duration', - 
datasource='Cluster Prometheus', - format='s', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(irate(etcd_disk_backend_commit_duration_seconds_bucket{namespace=~"openshift-etcd"}[2m])) by (namespace, pod, le))', - legendFormat='{{namespace}} - {{pod}} DB fsync', - ) -); - -local mgmt_db_size = grafana.graphPanel.new( - title='DB Size', - datasource='Cluster Prometheus', - format='bytes', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'etcd_mvcc_db_total_size_in_bytes{namespace=~"openshift-etcd"}', - legendFormat='{{namespace}} - {{pod}} DB physical size' - ) -).addTarget( - prometheus.target( - 'etcd_mvcc_db_total_size_in_use_in_bytes{namespace=~"openshift-etcd"}', - legendFormat='{{namespace}} - {{pod}} DB logical size', - ) -); - - -local mgmt_cpu_usage = grafana.graphPanel.new( - title='CPU usage', - datasource='Cluster Prometheus', - format='percent', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'sum(irate(container_cpu_usage_seconds_total{namespace=~"openshift-etcd", container="etcd"}[2m])) by (namespace, pod) * 100', - legendFormat='{{namespace}} - {{ pod }}', - ) -); - -local mgmt_mem_usage = grafana.graphPanel.new( - title='Memory usage', - datasource='Cluster Prometheus', - format='bytes', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - 
legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'sum(avg_over_time(container_memory_working_set_bytes{container="",pod!="", namespace=~"openshift-etcd"}[2m])) BY (pod, namespace)', - legendFormat='{{namespace}} - {{ pod }}', - ) -); - -local mgmt_network_traffic = grafana.graphPanel.new( - title='Container network traffic', - datasource='Cluster Prometheus', - format='Bps', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'sum(rate(container_network_receive_bytes_total{ container="etcd", namespace=~"openshift-etcd"}[2m])) BY (namespace, pod)', - legendFormat='rx {{namespace}} - {{ pod }}' - ) -).addTarget( - prometheus.target( - 'sum(rate(container_network_transmit_bytes_total{ container="etcd", namespace=~"openshift-etcd"}[2m])) BY (namespace, pod)', - legendFormat='tx {{namespace}} - {{ pod }}', - ) -); - - -local mgmt_grpc_traffic = grafana.graphPanel.new( - title='gRPC network traffic', - datasource='Cluster Prometheus', - format='Bps', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'rate(etcd_network_client_grpc_received_bytes_total{namespace=~"openshift-etcd"}[2m])', - legendFormat='rx {{namespace}} - {{pod}}' - ) -).addTarget( - prometheus.target( - 'rate(etcd_network_client_grpc_sent_bytes_total{namespace=~"openshift-etcd"}[2m])', - legendFormat='tx {{namespace}} - {{pod}}', - ) -); - -local mgmt_peer_traffic = grafana.graphPanel.new( - title='Peer network traffic', - datasource='Cluster Prometheus', - format='Bps', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - 
legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'rate(etcd_network_peer_received_bytes_total{namespace=~"openshift-etcd"}[2m])', - legendFormat='rx {{namespace}} - {{pod}} Peer Traffic' - ) -).addTarget( - prometheus.target( - 'rate(etcd_network_peer_sent_bytes_total{namespace=~"openshift-etcd"}[2m])', - legendFormat='tx {{namespace}} - {{pod}} Peer Traffic', - ) -); - - -local mgmt_active_streams = grafana.graphPanel.new( - title='Active Streams', - datasource='Cluster Prometheus', -).addTarget( - prometheus.target( - 'sum(grpc_server_started_total{namespace=~"openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace=~"openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})', - legendFormat='{{namespace}} - Watch Streams', - ) -).addTarget( - prometheus.target( - 'sum(grpc_server_started_total{namespace=~"openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace=~"openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})', - legendFormat='{{namespace}} - Lease Streams', - ) -); - -local mgmt_snapshot_duration = grafana.graphPanel.new( - title='Snapshot duration', - datasource='Cluster Prometheus', - format='s', -).addTarget( - prometheus.target( - 'sum(rate(etcd_debugging_snap_save_total_duration_seconds_sum{namespace=~"openshift-etcd"}[2m]))', - legendFormat='the total latency distributions of save called by snapshot', - ) -); - -//DB Info per Member - -local mgmt_percent_db_used = grafana.graphPanel.new( - title='% DB Space Used', - datasource='Cluster Prometheus', - format='percent', -).addTarget( - prometheus.target( - '(etcd_mvcc_db_total_size_in_bytes{namespace=~"openshift-etcd"} / etcd_server_quota_backend_bytes{namespace=~"openshift-etcd"})*100', - legendFormat='{{namespace}} - {{pod}}', - ) 
-); - -local mgmt_db_capacity_left = grafana.graphPanel.new( - title='DB Left capacity (with fragmented space)', - datasource='Cluster Prometheus', - format='bytes', -).addTarget( - prometheus.target( - 'etcd_server_quota_backend_bytes{namespace=~"openshift-etcd"} - etcd_mvcc_db_total_size_in_bytes{namespace=~"openshift-etcd"}', - legendFormat='{{namespace}} - {{pod}}', - ) -); - -local mgmt_db_size_limit = grafana.graphPanel.new( - title='DB Size Limit (Backend-bytes)', - datasource='Cluster Prometheus', - format='bytes' -).addTarget( - prometheus.target( - 'etcd_server_quota_backend_bytes{namespace=~"openshift-etcd"}', - legendFormat='{{namespace}} - {{ pod }} Quota Bytes', - ) -); - -// Proposals, leaders, and keys section - -local mgmt_keys = grafana.graphPanel.new( - title='Keys', - datasource='Cluster Prometheus', -).addTarget( - prometheus.target( - 'etcd_debugging_mvcc_keys_total{namespace=~"openshift-etcd"}', - legendFormat='{{namespace}} - {{ pod }} Num keys', - ) -); - -local mgmt_compacted_keys = grafana.graphPanel.new( - title='Compacted Keys', - datasource='Cluster Prometheus', -).addTarget( - prometheus.target( - 'etcd_debugging_mvcc_db_compaction_keys_total{namespace=~"openshift-etcd"}', - legendFormat='{{namespace}} - {{ pod }} keys compacted', - ) -); - -local mgmt_heartbeat_failures = grafana.graphPanel.new( - title='Heartbeat Failures', - datasource='Cluster Prometheus', -).addTarget( - prometheus.target( - 'etcd_server_heartbeat_send_failures_total{namespace=~"openshift-etcd"}', - legendFormat='{{namespace}} - {{ pod }} heartbeat failures', - ) -).addTarget( - prometheus.target( - 'etcd_server_health_failures{namespace=~"openshift-etcd"}', - legendFormat='{{namespace}} - {{ pod }} health failures', - ) -); - - -local mgmt_key_operations = grafana.graphPanel.new( - title='Key Operations', - datasource='Cluster Prometheus', - format='ops', -) { - yaxes: [ - { - format: 'ops', - show: 'true', - }, - { - format: 'short', - show: 'false', - }, - ], 
-}.addTarget( - prometheus.target( - 'rate(etcd_mvcc_put_total{namespace=~"openshift-etcd"}[2m])', - legendFormat='{{namespace}} - {{ pod }} puts/s', - ) -).addTarget( - prometheus.target( - 'rate(etcd_mvcc_delete_total{namespace=~"openshift-etcd"}[2m])', - legendFormat='{{namespace}} - {{ pod }} deletes/s', - ) -); - -local mgmt_slow_operations = grafana.graphPanel.new( - title='Slow Operations', - datasource='Cluster Prometheus', - format='ops', -) { - yaxes: [ - { - format: 'ops', - show: 'true', - }, - { - format: 'short', - show: 'false', - }, - ], -}.addTarget( - prometheus.target( - 'delta(etcd_server_slow_apply_total{namespace=~"openshift-etcd"}[2m])', - legendFormat='{{namespace}} - {{ pod }} slow applies', - ) -).addTarget( - prometheus.target( - 'delta(etcd_server_slow_read_indexes_total{namespace=~"openshift-etcd"}[2m])', - legendFormat='{{namespace}} - {{ pod }} slow read indexes', - ) -); - -local mgmt_raft_proposals = grafana.graphPanel.new( - title='Raft Proposals', - datasource='Cluster Prometheus', -).addTarget( - prometheus.target( - 'sum(rate(etcd_server_proposals_failed_total{namespace=~"openshift-etcd"}[2m]))', - legendFormat='{{namespace}} - Proposal Failure Rate', - ) -).addTarget( - prometheus.target( - 'sum(etcd_server_proposals_pending{namespace=~"openshift-etcd"})', - legendFormat='{{namespace}} - Proposal Pending Total', - ) -).addTarget( - prometheus.target( - 'sum(rate(etcd_server_proposals_committed_total{namespace=~"openshift-etcd"}[2m]))', - legendFormat='{{namespace}} - Proposal Commit Rate', - ) -).addTarget( - prometheus.target( - 'sum(rate(etcd_server_proposals_applied_total{namespace=~"openshift-etcd"}[2m]))', - legendFormat='{{namespace}} - Proposal Apply Rate', - ) -); - -local mgmt_leader_elections_per_day = grafana.graphPanel.new( - title='Leader Elections Per Day', - datasource='Cluster Prometheus', -).addTarget( - prometheus.target( - 'changes(etcd_server_leader_changes_seen_total{namespace=~"openshift-etcd"}[1d])', - 
legendFormat='{{namespace}} - {{instance}} Total Leader Elections Per Day', - ) -); - -local mgmt_etcd_has_leader = grafana.singlestat.new( - title='Etcd has a leader?', - datasource='Cluster Prometheus', - valueMaps=[ - { - op: '=', - text: 'YES', - value: '1', - }, - { - op: '=', - text: 'NO', - value: '0', - }, - ] -).addTarget( - prometheus.target( - 'max(etcd_server_has_leader{namespace=~"openshift-etcd"})', - instant=true, - ) -); - -local mgmt_num_leader_changes = grafana.graphPanel.new( - title='Number of leader changes seen', - datasource='Cluster Prometheus', -).addTarget( - prometheus.target( - 'sum(rate(etcd_server_leader_changes_seen_total{namespace=~"openshift-etcd"}[2m]))', - ) -); - -local mgmt_num_failed_proposals = grafana.singlestat.new( - title='Total number of failed proposals seen', - datasource='Cluster Prometheus', -).addTarget( - prometheus.target( - 'max(etcd_server_proposals_committed_total{namespace=~"openshift-etcd"})', - instant=true, - ) -); - - -// Hosted ETCD metrics - -local fs_writes = grafana.graphPanel.new( - title='Etcd container disk writes', - datasource='OBO', - format='Bps', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'rate(container_fs_writes_bytes_total{namespace=~"$namespace",container="etcd",device!~".+dm.+"}[2m])', - legendFormat='{{namespace}} - {{ pod }}: {{ device }}', - ) -); - -local ptp = grafana.graphPanel.new( - title='p99 peer to peer latency', - datasource='OBO', - format='s', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, 
rate(etcd_network_peer_round_trip_time_seconds_bucket{namespace=~"$namespace"}[2m]))', - legendFormat='{{namespace}} - {{pod}}', - ) -); - -local disk_wal_sync_duration = grafana.graphPanel.new( - title='Disk WAL Sync Duration', - datasource='OBO', - format='s', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(irate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace=~"$namespace"}[2m])) by (namespace, pod, le))', - legendFormat='{{namespace}} - {{pod}} WAL fsync', - ) -); - -local disk_backend_sync_duration = grafana.graphPanel.new( - title='Disk Backend Sync Duration', - datasource='OBO', - format='s', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(irate(etcd_disk_backend_commit_duration_seconds_bucket{namespace=~"$namespace"}[2m])) by (namespace, pod, le))', - legendFormat='{{namespace}} - {{pod}} DB fsync', - ) -); - -local db_size = grafana.graphPanel.new( - title='DB Size', - datasource='OBO', - format='bytes', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'etcd_mvcc_db_total_size_in_bytes{namespace=~"$namespace"}', - legendFormat='{{namespace}} - {{pod}} DB physical size' - ) -).addTarget( - prometheus.target( - 'etcd_mvcc_db_total_size_in_use_in_bytes{namespace=~"$namespace"}', - legendFormat='{{namespace}} - {{pod}} DB logical size', - ) -); - - -local cpu_usage = grafana.graphPanel.new( - title='CPU 
usage', - datasource='OBO', - format='percent', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'sum(irate(container_cpu_usage_seconds_total{namespace=~"$namespace", container="etcd"}[2m])) by (namespace, pod) * 100', - legendFormat='{{namespace}} - {{ pod }}', - ) -); - -local mem_usage = grafana.graphPanel.new( - title='Memory usage', - datasource='OBO', - format='bytes', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'sum(avg_over_time(container_memory_working_set_bytes{container="",pod!="", namespace=~"$namespace"}[2m])) BY (pod, namespace)', - legendFormat='{{namespace}} - {{ pod }}', - ) -); - -local network_traffic = grafana.graphPanel.new( - title='Container network traffic', - datasource='OBO', - format='Bps', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'sum(rate(container_network_receive_bytes_total{ container="etcd", namespace=~"$namespace"}[2m])) BY (namespace, pod)', - legendFormat='rx {{namespace}} - {{ pod }}' - ) -).addTarget( - prometheus.target( - 'sum(rate(container_network_transmit_bytes_total{ container="etcd", namespace=~"$namespace"}[2m])) BY (namespace, pod)', - legendFormat='tx {{namespace}} - {{ pod }}', - ) -); - - -local grpc_traffic = grafana.graphPanel.new( - title='gRPC network traffic', - datasource='OBO', - format='Bps', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - 
legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'rate(etcd_network_client_grpc_received_bytes_total{namespace=~"$namespace"}[2m])', - legendFormat='rx {{namespace}} - {{pod}}' - ) -).addTarget( - prometheus.target( - 'rate(etcd_network_client_grpc_sent_bytes_total{namespace=~"$namespace"}[2m])', - legendFormat='tx {{namespace}} - {{pod}}', - ) -); - -local peer_traffic = grafana.graphPanel.new( - title='Peer network traffic', - datasource='OBO', - format='Bps', - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - sort='decreasing', - nullPointMode='null as zero', -).addTarget( - prometheus.target( - 'rate(etcd_network_peer_received_bytes_total{namespace=~"$namespace"}[2m])', - legendFormat='rx {{namespace}} - {{pod}} Peer Traffic' - ) -).addTarget( - prometheus.target( - 'rate(etcd_network_peer_sent_bytes_total{namespace=~"$namespace"}[2m])', - legendFormat='tx {{namespace}} - {{pod}} Peer Traffic', - ) -); - - -local active_streams = grafana.graphPanel.new( - title='Active Streams', - datasource='OBO', -).addTarget( - prometheus.target( - 'sum(grpc_server_started_total{namespace=~"$namespace",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace=~"$namespace",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})', - legendFormat='{{namespace}} - Watch Streams', - ) -).addTarget( - prometheus.target( - 'sum(grpc_server_started_total{namespace=~"$namespace",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace=~"$namespace",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})', - legendFormat='{{namespace}} - Lease Streams', - ) -); - -local snapshot_duration = grafana.graphPanel.new( - title='Snapshot duration', - datasource='OBO', - format='s', -).addTarget( - 
prometheus.target( - 'sum(rate(etcd_debugging_snap_save_total_duration_seconds_sum{namespace=~"$namespace"}[2m]))', - legendFormat='the total latency distributions of save called by snapshot', - ) -); - -//DB Info per Member - -local percent_db_used = grafana.graphPanel.new( - title='% DB Space Used', - datasource='OBO', - format='percent', -).addTarget( - prometheus.target( - '(etcd_mvcc_db_total_size_in_bytes{namespace=~"$namespace"} / etcd_server_quota_backend_bytes{namespace=~"$namespace"})*100', - legendFormat='{{namespace}} - {{pod}}', - ) -); - -local db_capacity_left = grafana.graphPanel.new( - title='DB Left capacity (with fragmented space)', - datasource='OBO', - format='bytes', -).addTarget( - prometheus.target( - 'etcd_server_quota_backend_bytes{namespace=~"$namespace"} - etcd_mvcc_db_total_size_in_bytes{namespace=~"$namespace"}', - legendFormat='{{namespace}} - {{pod}}', - ) -); - -local db_size_limit = grafana.graphPanel.new( - title='DB Size Limit (Backend-bytes)', - datasource='OBO', - format='bytes' -).addTarget( - prometheus.target( - 'etcd_server_quota_backend_bytes{namespace=~"$namespace"}', - legendFormat='{{namespace}} - {{ pod }} Quota Bytes', - ) -); - -// Proposals, leaders, and keys section - -local keys = grafana.graphPanel.new( - title='Keys', - datasource='OBO', -).addTarget( - prometheus.target( - 'etcd_debugging_mvcc_keys_total{namespace=~"$namespace"}', - legendFormat='{{namespace}} - {{ pod }} Num keys', - ) -); - -local compacted_keys = grafana.graphPanel.new( - title='Compacted Keys', - datasource='OBO', -).addTarget( - prometheus.target( - 'etcd_debugging_mvcc_db_compaction_keys_total{namespace=~"$namespace"}', - legendFormat='{{namespace}} - {{ pod }} keys compacted', - ) -); - -local heartbeat_failures = grafana.graphPanel.new( - title='Heartbeat Failures', - datasource='OBO', -).addTarget( - prometheus.target( - 'etcd_server_heartbeat_send_failures_total{namespace=~"$namespace"}', - legendFormat='{{namespace}} - {{ pod }} 
heartbeat failures', - ) -).addTarget( - prometheus.target( - 'etcd_server_health_failures{namespace=~"$namespace"}', - legendFormat='{{namespace}} - {{ pod }} health failures', - ) -); - - -local key_operations = grafana.graphPanel.new( - title='Key Operations', - datasource='OBO', - format='ops', -) { - yaxes: [ - { - format: 'ops', - show: 'true', - }, - { - format: 'short', - show: 'false', - }, - ], -}.addTarget( - prometheus.target( - 'rate(etcd_mvcc_put_total{namespace=~"$namespace"}[2m])', - legendFormat='{{namespace}} - {{ pod }} puts/s', - ) -).addTarget( - prometheus.target( - 'rate(etcd_mvcc_delete_total{namespace=~"$namespace"}[2m])', - legendFormat='{{namespace}} - {{ pod }} deletes/s', - ) -); - -local slow_operations = grafana.graphPanel.new( - title='Slow Operations', - datasource='OBO', - format='ops', -) { - yaxes: [ - { - format: 'ops', - show: 'true', - }, - { - format: 'short', - show: 'false', - }, - ], -}.addTarget( - prometheus.target( - 'delta(etcd_server_slow_apply_total{namespace=~"$namespace"}[2m])', - legendFormat='{{namespace}} - {{ pod }} slow applies', - ) -).addTarget( - prometheus.target( - 'delta(etcd_server_slow_read_indexes_total{namespace=~"$namespace"}[2m])', - legendFormat='{{namespace}} - {{ pod }} slow read indexes', - ) -); - -local raft_proposals = grafana.graphPanel.new( - title='Raft Proposals', - datasource='OBO', -).addTarget( - prometheus.target( - 'sum(rate(etcd_server_proposals_failed_total{namespace=~"$namespace"}[2m]))', - legendFormat='{{namespace}} - Proposal Failure Rate', - ) -).addTarget( - prometheus.target( - 'sum(etcd_server_proposals_pending{namespace=~"$namespace"})', - legendFormat='{{namespace}} - Proposal Pending Total', - ) -).addTarget( - prometheus.target( - 'sum(rate(etcd_server_proposals_committed_total{namespace=~"$namespace"}[2m]))', - legendFormat='{{namespace}} - Proposal Commit Rate', - ) -).addTarget( - prometheus.target( - 
'sum(rate(etcd_server_proposals_applied_total{namespace=~"$namespace"}[2m]))', - legendFormat='{{namespace}} - Proposal Apply Rate', - ) -); - -local leader_elections_per_day = grafana.graphPanel.new( - title='Leader Elections Per Day', - datasource='OBO', -).addTarget( - prometheus.target( - 'changes(etcd_server_leader_changes_seen_total{namespace=~"$namespace"}[1d])', - legendFormat='{{namespace}} - {{instance}} Total Leader Elections Per Day', - ) -); - -local etcd_has_leader = grafana.singlestat.new( - title='Etcd has a leader?', - datasource='OBO', - valueMaps=[ - { - op: '=', - text: 'YES', - value: '1', - }, - { - op: '=', - text: 'NO', - value: '0', - }, - ] -).addTarget( - prometheus.target( - 'max(etcd_server_has_leader{namespace=~"$namespace"})', - instant=true, - ) -); - -local num_leader_changes = grafana.graphPanel.new( - title='Number of leader changes seen', - datasource='OBO', -).addTarget( - prometheus.target( - 'sum(rate(etcd_server_leader_changes_seen_total{namespace=~"$namespace"}[2m]))', - ) -); - -local num_failed_proposals = grafana.singlestat.new( - title='Total number of failed proposals seen', - datasource='OBO', -).addTarget( - prometheus.target( - 'max(etcd_server_proposals_committed_total{namespace=~"$namespace"})', - instant=true, - ) -); - -// API metrics - -local request_duration_99th_quantile = grafana.graphPanel.new( - title='request duration - 99th quantile', - datasource='OBO', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{namespace=~"$namespace",resource=~"$resource",subresource!="log",verb!~"WATCH|WATCHLIST|PROXY"}[2m])) by(verb,le))', - legendFormat='{{verb}}', - ) -); - -local request_rate_by_instance = grafana.graphPanel.new( - title='request rate - by 
instance', - datasource='OBO', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(rate(apiserver_request_total{namespace=~"$namespace",resource=~"$resource",code=~"$code",verb=~"$verb"}[2m])) by(instance)', - legendFormat='{{instance}}', - ) -); - -local request_duration_99th_quantile_by_resource = grafana.graphPanel.new( - title='request duration - 99th quantile - by resource', - datasource='OBO', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{namespace=~"$namespace",resource=~"$resource",subresource!="log",verb!~"WATCH|WATCHLIST|PROXY"}[2m])) by(resource,le))', - legendFormat='{{resource}}', - ) -); - -local request_rate_by_resource = grafana.graphPanel.new( - title='request duration - 99th quantile', - datasource='OBO', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(rate(apiserver_request_total{namespace=~"$namespace",resource=~"$resource",code=~"$code",verb=~"$verb"}[2m])) by(resource)', - legendFormat='{{resource}}', - ) -); - -local request_duration_read_write = grafana.graphPanel.new( - title='request duration - read vs write', - datasource='OBO', -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{namespace=~"$namespace",resource=~"$resource",verb=~"LIST|GET"}[2m])) by(le))', - legendFormat='read', - ) -).addTarget( - prometheus.target( - 
'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{namespace=~"$namespace",resource=~"$resource",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[2m])) by(le))', - legendFormat='write', - ) -); - - -local request_rate_read_write = grafana.graphPanel.new( - title='request rate - read vs write', - datasource='OBO', -).addTarget( - prometheus.target( - 'sum(rate(apiserver_request_total{namespace=~"$namespace",resource=~"$resource",verb=~"LIST|GET"}[2m]))', - legendFormat='read', - ) -).addTarget( - prometheus.target( - 'sum(rate(apiserver_request_total{namespace=~"$namespace",resource=~"$resource",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[2m]))', - legendFormat='write', - ) -); - - -local requests_dropped_rate = grafana.graphPanel.new( - title='requests dropped rate', - datasource='OBO', - description='Number of requests dropped with "Try again later" response', -).addTarget( - prometheus.target( - 'sum(rate(apiserver_dropped_requests_total{namespace=~"$namespace"}[2m])) by (requestKind)', - ) -); - - -local requests_terminated_rate = grafana.graphPanel.new( - title='requests terminated rate', - datasource='OBO', - description='Number of requests which apiserver terminated in self-defense', -).addTarget( - prometheus.target( - 'sum(rate(apiserver_request_terminations_total{namespace=~"$namespace",resource=~"$resource",code=~"$code"}[2m])) by(component)', - ) -); - -local requests_status_rate = grafana.graphPanel.new( - title='requests status rate', - datasource='OBO', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(rate(apiserver_request_total{namespace=~"$namespace",resource=~"$resource",verb=~"$verb",code=~"$code"}[2m])) by(code)', - legendFormat='{{code}}' - ) -); - -local long_running_requests = grafana.graphPanel.new( - title='long running requests', - 
datasource='OBO', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(apiserver_longrunning_gauge{namespace=~"$namespace",resource=~"$resource",verb=~"$verb"}) by(instance)', - legendFormat='{{instance}}' - ) -); - -local request_in_flight = grafana.graphPanel.new( - title='requests in flight', - datasource='OBO', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(apiserver_current_inflight_requests{namespace=~"$namespace"}) by (instance,requestKind)', - legendFormat='{{requestKind}}-{{instance}}', - ) -); - -local pf_requests_rejected = grafana.graphPanel.new( - title='p&f - requests rejected', - datasource='OBO', - description='Number of requests rejected by API Priority and Fairness system', -).addTarget( - prometheus.target( - 'sum(rate(apiserver_flowcontrol_rejected_requests_total{namespace=~"$namespace"}[2m])) by (reason)', - ) -); - -local response_size_99th_quartile = grafana.graphPanel.new( - title='response size - 99th quantile', - datasource='OBO', - description='Response size distribution in bytes for each group, version, verb, resource, subresource, scope and component', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_response_sizes_bucket{namespace=~"$namespace",resource=~"$resource",verb=~"$verb"}[2m])) by(instance,le))', - legendFormat='{{instance}}', - ) -); - -local pf_request_queue_length = grafana.graphPanel.new( - title='p&f - request queue 
length', - datasource='OBO', - description='Length of queue in the API Priority and Fairness system, as seen by each request after it is enqueued', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_queue_length_after_enqueue_bucket{namespace=~"$namespace"}[2m])) by(flowSchema, priorityLevel, le))', - legendFormat='{{flowSchema}}:{{priorityLevel}}', - ) -); - -local pf_request_wait_duration_99th_quartile = grafana.graphPanel.new( - title='p&f - request wait duration - 99th quantile', - datasource='OBO', - description='Length of time a request spent waiting in its queue', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_wait_duration_seconds_bucket{namespace=~"$namespace"}[2m])) by(flowSchema, priorityLevel, le))', - legendFormat='{{flowSchema}}:{{priorityLevel}}', - ) -); - -local pf_request_execution_duration = grafana.graphPanel.new( - title='p&f - request execution duration', - datasource='OBO', - description='Duration of request execution in the API Priority and Fairness system', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_execution_seconds_bucket{namespace=~"$namespace"}[2m])) by(flowSchema, priorityLevel, le))', - legendFormat='{{flowSchema}}:{{priorityLevel}}', - ) -); - -local pf_request_dispatch_rate = 
grafana.graphPanel.new( - title='p&f - request dispatch rate', - datasource='OBO', - description='Number of requests released by API Priority and Fairness system for service', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(rate(apiserver_flowcontrol_dispatched_requests_total{namespace=~"$namespace"}[2m])) by(flowSchema,priorityLevel)', - legendFormat='{{flowSchema}}:{{priorityLevel}}', - ) -); - -local pf_concurrency_limit = grafana.graphPanel.new( - title='p&f - concurrency limit by priority level', - datasource='OBO', - description='Shared concurrency limit in the API Priority and Fairness system', -).addTarget( - prometheus.target( - 'sum(apiserver_flowcontrol_request_concurrency_limit{namespace=~"$namespace"}) by (priorityLevel)', - legendFormat='{{priorityLevel}}' - ) -); - -local pf_pending_in_queue = grafana.graphPanel.new( - title='p&f - pending in queue', - datasource='OBO', - description='Number of requests currently pending in queues of the API Priority and Fairness system', - legend_values=true, - legend_alignAsTable=true, - legend_current=true, - legend_rightSide=true, - legend_sort='max', - legend_sortDesc=true, - nullPointMode='null as zero', - legend_hideZero=true, -).addTarget( - prometheus.target( - 'sum(apiserver_flowcontrol_current_inqueue_requests{namespace=~"$namespace"}) by (flowSchema,priorityLevel)', - legendFormat='{{flowSchema}}:{{priorityLevel}}', - ) -); - - -// Creating the dashboard from the panels described above. 
- -grafana.dashboard.new( - 'Hypershift Performance', - description='', - timezone='utc', - time_from='now-6h', - editable='true' -) - -.addTemplate( - grafana.template.new( - 'namespace', - 'Cluster Prometheus', - 'label_values(kube_pod_info, namespace)', - '', - regex='/^ocm/', - refresh=2, - ) { - label: 'Namespace', - type: 'query', - multi: true, - includeAll: true, - }, -) - -.addTemplate( - grafana.template.new( - 'resource', - 'Cluster Prometheus', - 'label_values(apiserver_request_duration_seconds_bucket, resource)', - refresh='time', - label='resource' - ) { - type: 'query', - multi: true, - includeAll: true, - }, -) - -.addTemplate( - grafana.template.new( - 'code', - 'Cluster Prometheus', - 'label_values(code)', - refresh='time', - label='code', - ) { - type: 'query', - multi: true, - includeAll: true, - }, -) - - -.addTemplate( - grafana.template.new( - 'verb', - 'Cluster Prometheus', - 'label_values(verb)', - refresh='time', - label='verb', - ) { - type: 'query', - multi: true, - includeAll: true, - }, -) - -.addPanel(grafana.row.new(title='Management cluster stats', collapse=true).addPanels( - [ - m_infrastructure { gridPos: { x: 0, y: 0, w: 6, h: 4 } }, - m_region { gridPos: { x: 6, y: 0, w: 6, h: 4 } }, - m_ocp_version { gridPos: { x: 12, y: 0, w: 6, h: 4 } }, - num_hosted_cluster { gridPos: { x: 18, y: 0, w: 6, h: 4 } }, - current_namespace_count { gridPos: { x: 0, y: 5, w: 8, h: 4 } }, - current_node_count { gridPos: { x: 8, y: 5, w: 8, h: 4 } }, - current_pod_count { gridPos: { x: 16, y: 5, w: 8, h: 4 } }, - top10ContCPUHosted { gridPos: { x: 0, y: 8, w: 12, h: 8 } }, - top10ContMemHosted { gridPos: { x: 12, y: 8, w: 12, h: 8 } }, - top10ContCPUManagement { gridPos: { x: 0, y: 20, w: 12, h: 8 } }, - top10ContMemManagement { gridPos: { x: 12, y: 20, w: 12, h: 8 } }, - top10ContCPUOBOManagement { gridPos: { x: 0, y: 28, w: 12, h: 8 } }, - top10ContMemOBOManagement { gridPos: { x: 12, y: 28, w: 12, h: 8 } }, - top10ContCPUHypershiftManagement { 
gridPos: { x: 0, y: 36, w: 12, h: 8 } }, - top10ContMemHypershiftManagement { gridPos: { x: 12, y: 36, w: 12, h: 8 } }, - nodeCount { gridPos: { x: 0, y: 44, w: 6, h: 8 } }, - current_machine_set_replica_count { gridPos: { x: 6, y: 44, w: 6, h: 8 } }, - nsCount { gridPos: { x: 12, y: 44, w: 6, h: 8 } }, - podCount { gridPos: { x: 18, y: 44, w: 6, h: 8 } }, - clusterOperatorsInformation { gridPos: { x: 0, y: 52, w: 8, h: 8 } }, - clusterOperatorsDegraded { gridPos: { x: 8, y: 52, w: 8, h: 8 } }, - FailedPods { gridPos: { x: 16, y: 52, w: 8, h: 8 } }, - alerts { gridPos: { x: 0, y: 60, w: 24, h: 8 } }, - dynaactivegateMem { gridPos: { x: 0, y: 18, w: 12, h: 8 } }, - dynaactivegateCPU { gridPos: { x: 12, y: 18, w: 12, h: 8 } }, - opentelemetryCPU { gridPos: { x: 0, y: 18, w: 12, h: 8 } }, - opentelemetryMem { gridPos: { x: 12, y: 18, w: 12, h: 8 } }, - ], -), { gridPos: { x: 0, y: 4, w: 24, h: 1 } }) - -.addPanel(grafana.row.new(title='Management cluster Etcd stats', collapse=true).addPanels( - [ - mgmt_disk_wal_sync_duration { gridPos: { x: 0, y: 2, w: 12, h: 8 } }, - mgmt_disk_backend_sync_duration { gridPos: { x: 12, y: 2, w: 12, h: 8 } }, - mgmt_percent_db_used { gridPos: { x: 0, y: 10, w: 8, h: 8 } }, - mgmt_db_capacity_left { gridPos: { x: 8, y: 10, w: 8, h: 8 } }, - mgmt_db_size_limit { gridPos: { x: 16, y: 10, w: 8, h: 8 } }, - mgmt_db_size { gridPos: { x: 0, y: 18, w: 12, h: 8 } }, - mgmt_grpc_traffic { gridPos: { x: 12, y: 18, w: 12, h: 8 } }, - mgmt_active_streams { gridPos: { x: 0, y: 26, w: 12, h: 8 } }, - mgmt_snapshot_duration { gridPos: { x: 12, y: 26, w: 12, h: 8 } }, - mgmt_raft_proposals { gridPos: { x: 0, y: 1, w: 12, h: 8 } }, - mgmt_num_leader_changes { gridPos: { x: 12, y: 1, w: 12, h: 8 } }, - mgmt_etcd_has_leader { gridPos: { x: 0, y: 8, w: 6, h: 2 } }, - mgmt_num_failed_proposals { gridPos: { x: 6, y: 8, w: 6, h: 2 } }, - mgmt_leader_elections_per_day { gridPos: { x: 0, y: 12, w: 12, h: 6 } }, - mgmt_keys { gridPos: { x: 12, y: 12, w: 12, h: 
8 } }, - mgmt_slow_operations { gridPos: { x: 0, y: 20, w: 12, h: 8 } }, - mgmt_key_operations { gridPos: { x: 12, y: 20, w: 12, h: 8 } }, - mgmt_heartbeat_failures { gridPos: { x: 0, y: 28, w: 12, h: 8 } }, - mgmt_compacted_keys { gridPos: { x: 12, y: 28, w: 12, h: 8 } }, - ], -), { gridPos: { x: 0, y: 4, w: 24, h: 1 } }) - -.addPanel( - grafana.row.new(title='Hosted Clusters Serving Node stats - $namespace', collapse=true, repeat='namespace').addPanels( - [ - nodeCPU { gridPos: { x: 0, y: 2, w: 12, h: 8 } }, - nodeMemory { gridPos: { x: 12, y: 2, w: 12, h: 8 } }, - suricataCPU { gridPos: { x: 0, y: 18, w: 12, h: 8 } }, - suricataMemory { gridPos: { x: 12, y: 18, w: 12, h: 8 } }, - dynaoneagentCPU { gridPos: { x: 0, y: 18, w: 12, h: 8 } }, - dynaoneagentMem { gridPos: { x: 12, y: 18, w: 12, h: 8 } }, - ] - ), { gridPos: { x: 0, y: 4, w: 24, h: 1 } } -) - -.addPanel(grafana.row.new(title='HostedControlPlane stats - $namespace', collapse=true, repeat='namespace').addPanels( - [ - infrastructure { gridPos: { x: 0, y: 0, w: 8, h: 4 } }, - region { gridPos: { x: 8, y: 0, w: 8, h: 4 } }, - ocp_version { gridPos: { x: 16, y: 0, w: 8, h: 4 } }, - hostedControlPlaneCPU { gridPos: { x: 0, y: 12, w: 12, h: 8 } }, - hostedControlPlaneMemory { gridPos: { x: 12, y: 12, w: 12, h: 8 } }, - request_duration_99th_quantile { gridPos: { x: 0, y: 20, w: 8, h: 8 } }, - request_rate_by_instance { gridPos: { x: 8, y: 20, w: 8, h: 8 } }, - request_duration_99th_quantile_by_resource { gridPos: { x: 16, y: 20, w: 8, h: 8 } }, - request_rate_by_resource { gridPos: { x: 0, y: 30, w: 8, h: 8 } }, - request_duration_read_write { gridPos: { x: 8, y: 30, w: 8, h: 8 } }, - request_rate_read_write { gridPos: { x: 16, y: 30, w: 8, h: 8 } }, - requests_dropped_rate { gridPos: { x: 0, y: 40, w: 8, h: 8 } }, - requests_terminated_rate { gridPos: { x: 8, y: 40, w: 8, h: 8 } }, - requests_status_rate { gridPos: { x: 16, y: 40, w: 8, h: 8 } }, - long_running_requests { gridPos: { x: 0, y: 50, w: 8, h: 8 } 
}, - request_in_flight { gridPos: { x: 8, y: 50, w: 8, h: 8 } }, - pf_requests_rejected { gridPos: { x: 16, y: 50, w: 8, h: 8 } }, - response_size_99th_quartile { gridPos: { x: 0, y: 60, w: 8, h: 8 } }, - pf_request_queue_length { gridPos: { x: 8, y: 60, w: 8, h: 8 } }, - pf_request_wait_duration_99th_quartile { gridPos: { x: 16, y: 60, w: 8, h: 8 } }, - pf_request_execution_duration { gridPos: { x: 0, y: 70, w: 8, h: 8 } }, - pf_request_dispatch_rate { gridPos: { x: 8, y: 70, w: 8, h: 8 } }, - pf_concurrency_limit { gridPos: { x: 16, y: 70, w: 8, h: 8 } }, - pf_pending_in_queue { gridPos: { x: 0, y: 80, w: 8, h: 8 } }, - ], -), { gridPos: { x: 0, y: 4, w: 24, h: 1 } }) - -.addPanel( - grafana.row.new(title='Hosted Clusters ETCD General Resource Usage - $namespace', collapse=true, repeat='namespace').addPanels( - [ - disk_wal_sync_duration { gridPos: { x: 0, y: 2, w: 12, h: 8 } }, - disk_backend_sync_duration { gridPos: { x: 12, y: 2, w: 12, h: 8 } }, - percent_db_used { gridPos: { x: 0, y: 10, w: 8, h: 8 } }, - db_capacity_left { gridPos: { x: 8, y: 10, w: 8, h: 8 } }, - db_size_limit { gridPos: { x: 16, y: 10, w: 8, h: 8 } }, - db_size { gridPos: { x: 0, y: 18, w: 12, h: 8 } }, - grpc_traffic { gridPos: { x: 12, y: 18, w: 12, h: 8 } }, - active_streams { gridPos: { x: 0, y: 26, w: 12, h: 8 } }, - snapshot_duration { gridPos: { x: 12, y: 26, w: 12, h: 8 } }, - - raft_proposals { gridPos: { x: 0, y: 34, w: 12, h: 8 } }, - num_leader_changes { gridPos: { x: 12, y: 34, w: 12, h: 8 } }, - etcd_has_leader { gridPos: { x: 0, y: 42, w: 6, h: 2 } }, - num_failed_proposals { gridPos: { x: 6, y: 42, w: 6, h: 2 } }, - leader_elections_per_day { gridPos: { x: 0, y: 44, w: 12, h: 6 } }, - keys { gridPos: { x: 12, y: 44, w: 12, h: 8 } }, - slow_operations { gridPos: { x: 0, y: 52, w: 12, h: 8 } }, - key_operations { gridPos: { x: 12, y: 52, w: 12, h: 8 } }, - heartbeat_failures { gridPos: { x: 0, y: 60, w: 12, h: 8 } }, - compacted_keys { gridPos: { x: 12, y: 60, w: 12, h: 8 } 
}, - ] - ), { gridPos: { x: 0, y: 0, w: 24, h: 1 } } -) +local panels = import '../../assets/hypershift-perf-dashboard/panels.libsonnet'; +local queries = import '../../assets/hypershift-perf-dashboard/queries.libsonnet'; +local variables = import '../../assets/hypershift-perf-dashboard/variables.libsonnet'; +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +local cluster_prometheus = 'PF55DCC5EC58ABF5A'; +local OBO = 'P1BA917A37525EDF3'; + +g.dashboard.new('Hypershift Performance Dashboard') ++ g.dashboard.withDescription(||| + Dashboard for Api-performance-overview +|||) ++ g.dashboard.withTags('') ++ g.dashboard.time.withFrom('now-6h') ++ g.dashboard.time.withTo('now') ++ g.dashboard.withTimezone('utc') ++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) ++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) ++ g.dashboard.withRefresh('') ++ g.dashboard.withEditable(true) ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ g.dashboard.withVariables([ + variables.Namespace, + variables.Resource, + variables.Code, + variables.Verb, +]) ++ g.dashboard.withPanels([ + g.panel.row.new('Management cluster stats') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.stat.m_infrastructure('Management Cloud Infrastructure', '', queries.m_infrastructure.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 0, w: 6, h: 4 }), + panels.stat.m_region('Management Cloud Region', '', queries.m_region.query(), 'PF55DCC5EC58ABF5A', { x: 6, y: 0, w: 6, h: 4 }), + panels.stat.m_ocp_version('Management OCP Version', '', queries.m_ocp_version.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 0, w: 6, h: 4 }), + panels.stat.num_hosted_cluster('Number of HostedCluster', '', queries.num_hosted_cluster.query(), 'PF55DCC5EC58ABF5A', { x: 18, y: 0, w: 6, h: 4 }), + 
panels.stat.current_namespace_count('Current namespace Count', '', queries.current_namespace_count.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 5, w: 8, h: 4 }), + panels.stat.current_node_count('Current Node Count', '', queries.current_node_count.query(), 'PF55DCC5EC58ABF5A', { x: 8, y: 5, w: 8, h: 4 }), + panels.stat.current_pod_count('Current Pod Count', '', queries.current_pod_count.query(), 'PF55DCC5EC58ABF5A', { x: 16, y: 5, w: 8, h: 4 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Hosted Clusters container CPU', 'percent', queries.top10ContCPUHosted.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 8, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Hosted Clusters container RSS', 'bytes', queries.top10ContMemHosted.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 8, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster container CPU', 'percent', queries.top10ContCPUManagement.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 20, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster container RSS', 'bytes', queries.top10ContMemManagement.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 20, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster OBO NS Pods CPU', 'percent', queries.top10ContCPUOBOManagement.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 28, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster OBO NS Pods RSS', 'bytes', queries.top10ContMemOBOManagement.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 28, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster Hypershift NS Pods CPU', 'percent', queries.top10ContCPUHypershiftManagement.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 36, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management 
Cluster Hypershift NS Pods RSS', 'bytes', queries.top10ContMemHypershiftManagement.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 36, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Active Gate Memory Usage', 'bytes', queries.dynaactivegateMem.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Active Gate CPU Usage', 'percent', queries.dynaactivegateCPU.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Opentelemetry CPU Usage', 'percent', queries.opentelemetryCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Opentelemetry Memory Usage', 'bytes', queries.opentelemetryMem.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Number of nodes', 'none', queries.nodeCount.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 44, w: 6, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Machine Set Replicas', 'none', queries.current_machine_set_replica_count.query(), 'PF55DCC5EC58ABF5A', { x: 6, y: 44, w: 6, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Namespace count', 'none', queries.nsCount.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 44, w: 6, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Pod count', 'none', queries.podCount.query(), 'PF55DCC5EC58ABF5A', { x: 18, y: 44, w: 6, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Cluster operators information', 'none', queries.clusterOperatorsInformation.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 52, w: 8, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Cluster operators degraded', 'none', queries.clusterOperatorsDegraded.query(), 'PF55DCC5EC58ABF5A', { x: 8, y: 52, w: 8, h: 8 }), + 
panels.timeSeries.managementClustersStatsTimeseriesSettings('Failed pods', 'none', queries.FailedPods.query(), 'PF55DCC5EC58ABF5A', { x: 16, y: 52, w: 8, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Alerts', 'none', queries.alerts.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 60, w: 24, h: 8 }), + ]), + g.panel.row.new('Management cluster Etcd stats') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.timeSeries.mgmt('Disk WAL Sync Duration', 's', queries.mgmt_disk_wal_sync_duration.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 2, w: 12, h: 8 }), + panels.timeSeries.mgmt('Disk Backend Sync Duration', 's', queries.mgmt_disk_backend_sync_duration.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 2, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('% DB Space Used', 'percent', queries.mgmt_percent_db_used.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 10, w: 8, h: 8 }), + panels.timeSeries.DBPanelsSettings('DB Left capacity (with fragmented space)', 'bytes', queries.mgmt_db_capacity_left.query(), 'PF55DCC5EC58ABF5A', { x: 8, y: 10, w: 8, h: 8 }), + panels.timeSeries.DBPanelsSettings('DB Size Limit (Backend-bytes)', 'bytes', queries.mgmt_db_size_limit.query(), 'PF55DCC5EC58ABF5A', { x: 16, y: 10, w: 8, h: 8 }), + panels.timeSeries.mgmt('DB Size', 'bytes', queries.mgmt_db_size.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), + panels.timeSeries.mgmt('gRPC network traffic', 'Bps', queries.mgmt_grpc_traffic.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Active Streams', '', queries.mgmt_active_streams.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 26, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Snapshot duration', 's', queries.mgmt_snapshot_duration.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 26, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Raft Proposals', '', queries.mgmt_raft_proposals.query(), 
'PF55DCC5EC58ABF5A', { x: 0, y: 1, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Number of leader changes seen', '', queries.mgmt_num_leader_changes.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 1, w: 12, h: 8 }), + panels.stat.etcd_has_leader('Etcd has a leader?', '', queries.mgmt_etcd_has_leader.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 8, w: 6, h: 2 }), + panels.stat.mgmt_num_failed_proposals('Total number of failed proposals seen', '', queries.mgmt_num_failed_proposals.query(), 'PF55DCC5EC58ABF5A', { x: 6, y: 8, w: 6, h: 2 }), + panels.timeSeries.DBPanelsSettings('Leader Elections Per Day', '', queries.mgmt_leader_elections_per_day.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 12, w: 12, h: 6 }), + panels.timeSeries.DBPanelsSettings('Keys', '', queries.mgmt_keys.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 12, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Slow Operations', 'ops', queries.mgmt_slow_operations.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 20, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Key Operations', 'ops', queries.mgmt_key_operations.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 20, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Heartbeat Failures', '', queries.mgmt_heartbeat_failures.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 28, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Compacted Keys', '', queries.mgmt_compacted_keys.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 28, w: 12, h: 8 }), + ]), + + g.panel.row.new('Hosted Clusters Serving Node stats - $namespace') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withRepeat('namespace') + + g.panel.row.withPanels([ + panels.timeSeries.genericGraphLegendPanel('Serving Node CPU Basic', 'percent', queries.nodeCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 2, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('Serving Node Memory', 'bytes', queries.nodeMemory.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 2, w: 12, h: 8 }), 
+ panels.timeSeries.genericGraphLegendPanel('Suricata CPU(Running on Serving node)', 'percent', queries.suricataCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('Suricata Memory(Running on Serving node)', 'bytes', queries.suricataMemory.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('OneAgent CPU Usage', 'percent', queries.dynaactivegateCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('OneAgent Memory Usage', 'bytes', queries.dynaoneagentMem.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), + ]), + + g.panel.row.new('Hosted Clusters Serving Node stats - $namespace') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withRepeat('namespace') + + g.panel.row.withPanels([ + panels.stat.hostedControlPlaneStats('Hosted Cluster Cloud Infrastructure', '', queries.infrastructure.query(), 'P1BA917A37525EDF3', { x: 0, y: 0, w: 8, h: 4 }), + panels.stat.hostedControlPlaneStats('Hosted Cluster Cloud Region', '', queries.region.query(), 'P1BA917A37525EDF3', { x: 8, y: 0, w: 8, h: 4 }), + panels.stat.hostedControlPlaneStats('Hosted Cluster OCP Version', '', queries.ocp_version.query(), 'P1BA917A37525EDF3', { x: 16, y: 0, w: 8, h: 4 }), + panels.timeSeries.genericGraphLegendPanel('Hosted Control Plane CPU', 'percent', queries.hostedControlPlaneCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 12, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('Hosted Control Plane Memory', 'bytes', queries.hostedControlPlaneMemory.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 12, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('request duration - 99th quantile', '', queries.request_duration_99th_quantile.query(), OBO, { x: 0, y: 20, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('request rate - by 
instance', '', queries.request_rate_by_instance.query(), OBO, { x: 8, y: 20, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('request duration - 99th quantile - by resource', '', queries.request_duration_99th_quantile_by_resource.query(), OBO, { x: 16, y: 20, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('request duration - 99th quantile', '', queries.request_rate_by_resource.query(), OBO, { x: 0, y: 30, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('request duration - read vs write', '', queries.request_duration_read_write.query(), OBO, { x: 8, y: 30, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('request rate - read vs write', '', queries.request_rate_read_write.query(), OBO, { x: 16, y: 30, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('requests dropped rate', '', queries.requests_dropped_rate.query(), OBO, { x: 0, y: 40, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('requests terminated rate', '', queries.requests_terminated_rate.query(), OBO, { x: 8, y: 40, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('requests status rate', '', queries.requests_status_rate.query(), OBO, { x: 16, y: 40, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('long running requests', '', queries.long_running_requests.query(), OBO, { x: 0, y: 50, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('requests in flight', '', queries.request_in_flight.query(), OBO, { x: 8, y: 50, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('p&f - requests rejected', '', queries.pf_requests_rejected.query(), OBO, { x: 16, y: 50, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('response size - 99th quantile', '', queries.response_size_99th_quartile.query(), OBO, { x: 0, y: 60, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('p&f - request queue length', '', 
queries.pf_request_queue_length.query(), OBO, { x: 8, y: 60, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('p&f - request wait duration - 99th quantile', '', queries.pf_request_wait_duration_99th_quartile.query(), OBO, { x: 16, y: 60, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('p&f - request execution duration', '', queries.pf_request_execution_duration.query(), OBO, { x: 0, y: 70, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('p&f - request dispatch rate', '', queries.pf_request_dispatch_rate.query(), OBO, { x: 8, y: 70, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('p&f - concurrency limit by priority level', '', queries.pf_concurrency_limit.query(), OBO, { x: 16, y: 70, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('p&f - pending in queue', '', queries.pf_pending_in_queue.query(), OBO, { x: 0, y: 80, w: 8, h: 8 }), + ]), + g.panel.row.new('Hosted Clusters ETCD General Resource Usage - $namespace') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withRepeat('namespace') + + g.panel.row.withPanels([ + panels.timeSeries.genericGraphLegendPanel('Disk WAL Sync Duration', 's', queries.disk_wal_sync_duration.query(), OBO, { x: 0, y: 2, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('Disk Backend Sync Duration', 's', queries.disk_backend_sync_duration.query(), OBO, { x: 12, y: 2, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('% DB Space Used', 'percent', queries.percent_db_used.query(), OBO, { x: 0, y: 10, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('DB Left capacity (with fragmented space)', 'bytes', queries.db_capacity_left.query(), OBO, { x: 8, y: 10, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('DB Size Limit (Backend-bytes)', 'bytes', queries.db_size_limit.query(), OBO, { x: 16, y: 10, w: 8, h: 8 }), + 
panels.timeSeries.genericGraphLegendPanel('DB Size', 'bytes', queries.db_size.query(), OBO, { x: 0, y: 18, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('gRPC network traffic', 'Bps', queries.grpc_traffic.query(), OBO, { x: 12, y: 18, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Active Streams', '', queries.active_streams.query(), OBO, { x: 0, y: 26, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Snapshot duration', 's', queries.snapshot_duration.query(), OBO, { x: 12, y: 26, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Raft Proposals', '', queries.raft_proposals.query(), OBO, { x: 0, y: 34, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Number of leader changes seen', '', queries.num_leader_changes.query(), OBO, { x: 12, y: 34, w: 12, h: 8 }), + panels.stat.etcd_has_leader('Etcd has a leader?', '', queries.etcd_has_leader.query(), OBO, { x: 0, y: 42, w: 6, h: 2 }), + panels.stat.mgmt_num_failed_proposals('Total number of failed proposals seen', '', queries.num_failed_proposals.query(), OBO, { x: 6, y: 42, w: 6, h: 2 }), + panels.timeSeries.genericGraphLegendPanelList('Leader Elections Per Day', '', queries.leader_elections_per_day.query(), OBO, { x: 0, y: 44, w: 12, h: 6 }), + panels.timeSeries.genericGraphLegendPanelList('Keys', '', queries.keys.query(), OBO, { x: 12, y: 44, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Slow Operations', 'ops', queries.slow_operations.query(), OBO, { x: 0, y: 52, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Key Operations', 'ops', queries.key_operations.query(), OBO, { x: 12, y: 52, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Heartbeat Failures', '', queries.heartbeat_failures.query(), OBO, { x: 0, y: 60, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Compacted Keys', '', queries.compacted_keys.query(), OBO, { x: 12, y: 60, w: 12, h: 8 }), + ]), +]) diff --git 
a/templates/General/k8s-perf-v2.jsonnet b/templates/General/k8s-perf-v2.jsonnet deleted file mode 100644 index fd0f28d..0000000 --- a/templates/General/k8s-perf-v2.jsonnet +++ /dev/null @@ -1,61 +0,0 @@ -local panels = import '../../assets/k8s-perf/panels.libsonnet'; -local queries = import '../../assets/k8s-perf/queries.libsonnet'; -local variables = import '../../assets/k8s-perf/variables.libsonnet'; -local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; - -g.dashboard.new('k8s Performance dashboard') -+ g.dashboard.time.withFrom('now-1h') -+ g.dashboard.time.withTo('now') -+ g.dashboard.withTimezone('utc') -+ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) -+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) -+ g.dashboard.withRefresh('30s') -+ g.dashboard.withEditable(false) -+ g.dashboard.graphTooltip.withSharedCrosshair() -+ g.dashboard.withVariables([ - variables.Datasource, - variables._worker_node, - variables.namespace, - variables.block_device, - variables.net_device, - variables.interval, -]) - -+ g.dashboard.withPanels([ - g.panel.row.new('Cluster Details') - + g.panel.row.withCollapsed(true) - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withPanels([ - panels.stat.genericStatLegendPanel('Current Node Count', 'none', queries.currentNodeCount.query(), { x: 0, y: 4, w: 8, h: 3 }), - panels.stat.genericStatLegendPanel('Current namespace Count', 'none', queries.currentNamespaceCount.query(), { x: 8, y: 4, w: 8, h: 3 }), - panels.stat.genericStatLegendPanel('Current Pod Count', 'none', queries.currentPodCount.query(), { x: 16, y: 4, w: 8, h: 3 }), - panels.timeSeries.genericTimeSeriesPanel('Number of nodes', 'none', queries.numberOfNodes.query(), { x: 0, y: 12, w: 8, h: 8 }), - panels.timeSeries.genericTimeSeriesPanel('Namespace count', 'none', queries.namespaceCount.query(), { x: 8, y: 12, 
w: 8, h: 8 }), - panels.timeSeries.genericTimeSeriesPanel('Pod count', 'none', queries.podCount.query(), { x: 16, y: 12, w: 8, h: 8 }), - panels.timeSeries.genericTimeSeriesPanel('Secret & configmap count', 'none', queries.secretAndConfigMapCount.query(), { x: 0, y: 20, w: 8, h: 8 }), - panels.timeSeries.genericTimeSeriesPanel('Deployment count', 'none', queries.deployCount.query(), { x: 8, y: 20, w: 8, h: 8 }), - panels.timeSeries.genericTimeSeriesPanel('Services count', 'none', queries.serviceCount.query(), { x: 16, y: 20, w: 8, h: 8 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container RSS', 'bytes', queries.top10ContainerRSS.query(), { x: 0, y: 28, w: 24, h: 8 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container CPU', 'percent', queries.top10ContainerCPU.query(), { x: 0, y: 36, w: 12, h: 8 }), - panels.timeSeries.genericTimeSeriesPanel('Goroutines count', 'none', queries.goroutinesCount.query(), { x: 12, y: 36, w: 12, h: 8 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Pod Distribution', 'none', queries.podDistribution.query(), { x: 0, y: 44, w: 24, h: 8 }), - ]), - - g.panel.row.new('Node: $_worker_node') - + g.panel.row.withCollapsed(true) - + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) - + g.panel.row.withRepeat('_worker_node') - + g.panel.row.withPanels([ - panels.timeSeries.genericTimeSeriesLegendPanel('CPU Basic: $_worker_node ', 'percent', queries.basicCPU.query('$_worker_node'), { x: 0, y: 0, w: 12, h: 8 }), - panels.timeSeries.genericTimeSeriesLegendPanel('System Memory: $_worker_node ', 'bytes', queries.systemMemory.query('$_worker_node'), { x: 12, y: 0, w: 12, h: 8 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Disk throughput: $_worker_node ', 'Bps', queries.diskThroughput.query('$_worker_node'), { x: 0, y: 8, w: 12, h: 8 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Disk IOPS: $_worker_node', 'iops', queries.diskIOPS.query('$_worker_node'), { x: 12, y: 8, w: 12, h: 8 }), - 
panels.timeSeries.genericTimeSeriesLegendPanel('Network Utilization: $_worker_node', 'bps', queries.networkUtilization.query('$_worker_node'), { x: 0, y: 16, w: 12, h: 8 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Network Packets: $_worker_node', 'pps', queries.networkPackets.query('$_worker_node'), { x: 12, y: 16, w: 12, h: 8 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Network packets drop: $_worker_node', 'pps', queries.networkDrop.query('$_worker_node'), { x: 0, y: 24, w: 12, h: 8 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Conntrack stats: $_worker_node', '', queries.conntrackStats.query('$_worker_node'), { x: 12, y: 24, w: 12, h: 8 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container CPU: $_worker_node', 'percent', queries.top10ContainersCPU.query('$_worker_node'), { x: 0, y: 32, w: 12, h: 8 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainersRSS.query(' $_worker_node'), { x: 12, y: 32, w: 12, h: 8 }), - - ]), -]) diff --git a/templates/General/k8s-perf.jsonnet b/templates/General/k8s-perf.jsonnet index 7308819..fd0f28d 100644 --- a/templates/General/k8s-perf.jsonnet +++ b/templates/General/k8s-perf.jsonnet @@ -1,499 +1,61 @@ -local grafana = import '../grafonnet-lib/grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; - - -// Helper functions - -local genericGraphPanel(title, format) = grafana.graphPanel.new( - title=title, - datasource='$datasource', - format=format, - nullPointMode='null as zero', - sort='decreasing', - legend_alignAsTable=true, -); - -local genericGraphLegendPanel(title, format) = grafana.graphPanel.new( - title=title, - datasource='$datasource', - format=format, - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - nullPointMode='null as zero', - sort='decreasing', -); - - -local nodeMemory(nodeName) 
= genericGraphLegendPanel('System Memory: ' + nodeName, 'bytes').addTarget( - prometheus.target( - 'node_memory_Active_bytes{node=~"' + nodeName + '"}', - legendFormat='Active', - ) -).addTarget( - prometheus.target( - 'node_memory_MemTotal_bytes{node=~"' + nodeName + '"}', - legendFormat='Total', - ) -).addTarget( - prometheus.target( - 'node_memory_Cached_bytes{node=~"' + nodeName + '"} + node_memory_Buffers_bytes{node=~"' + nodeName + '"}', - legendFormat='Cached + Buffers', - ) -).addTarget( - prometheus.target( - 'node_memory_MemAvailable_bytes{node=~"' + nodeName + '"}', - legendFormat='Available', - ) -); - - -local nodeCPU(nodeName) = genericGraphLegendPanel('CPU Basic: ' + nodeName, 'percent').addTarget( - prometheus.target( - 'sum by (instance, mode)(rate(node_cpu_seconds_total{node=~"' + nodeName + '",job=~".*"}[$interval])) * 100', - legendFormat='Busy {{mode}}', - ) -); - - -local diskThroughput(nodeName) = genericGraphLegendPanel('Disk throughput: ' + nodeName, 'Bps').addTarget( - prometheus.target( - 'rate(node_disk_read_bytes_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', - legendFormat='{{ device }} - read', - ) -).addTarget( - prometheus.target( - 'rate(node_disk_written_bytes_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', - legendFormat='{{ device }} - write', - ) -); - -local diskIOPS(nodeName) = genericGraphLegendPanel('Disk IOPS: ' + nodeName, 'iops').addTarget( - prometheus.target( - 'rate(node_disk_reads_completed_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', - legendFormat='{{ device }} - read', - ) -).addTarget( - prometheus.target( - 'rate(node_disk_writes_completed_total{device=~"$block_device",node=~"' + nodeName + '"}[$interval])', - legendFormat='{{ device }} - write', - ) -); - -local networkUtilization(nodeName) = genericGraphLegendPanel('Network Utilization: ' + nodeName, 'bps').addTarget( - prometheus.target( - 'rate(node_network_receive_bytes_total{node=~"' + 
nodeName + '",device=~"$net_device"}[$interval]) * 8', - legendFormat='{{instance}} - {{device}} - RX', - ) -).addTarget( - prometheus.target( - 'rate(node_network_transmit_bytes_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval]) * 8', - legendFormat='{{instance}} - {{device}} - TX', - ) -); - -local networkPackets(nodeName) = genericGraphLegendPanel('Network Packets: ' + nodeName, 'pps').addTarget( - prometheus.target( - 'rate(node_network_receive_packets_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval])', - legendFormat='{{instance}} - {{device}} - RX', - ) -).addTarget( - prometheus.target( - 'rate(node_network_transmit_packets_total{node=~"' + nodeName + '",device=~"$net_device"}[$interval])', - legendFormat='{{instance}} - {{device}} - TX', - ) -); - -local networkDrop(nodeName) = genericGraphLegendPanel('Network packets drop: ' + nodeName, 'pps').addTarget( - prometheus.target( - 'topk(10, rate(node_network_receive_drop_total{node=~"' + nodeName + '"}[$interval]))', - legendFormat='rx-drop-{{ device }}', - ) -).addTarget( - prometheus.target( - 'topk(10,rate(node_network_transmit_drop_total{node=~"' + nodeName + '"}[$interval]))', - legendFormat='tx-drop-{{ device }}', - ) -); - -local conntrackStats(nodeName) = genericGraphLegendPanel('Conntrack stats: ' + nodeName, '') - { - seriesOverrides: [{ - alias: 'conntrack_limit', - yaxis: 2, - }], - yaxes: [{ show: true }, { show: true }], -} - .addTarget( - prometheus.target( - 'node_nf_conntrack_entries{node=~"' + nodeName + '"}', - legendFormat='conntrack_entries', - ) -).addTarget( - prometheus.target( - 'node_nf_conntrack_entries_limit{node=~"' + nodeName + '"}', - legendFormat='conntrack_limit', - ) -); - -local top10ContainerCPU(nodeName) = genericGraphLegendPanel('Top 10 container CPU: ' + nodeName, 'percent').addTarget( - prometheus.target( - 'topk(10, sum(irate(container_cpu_usage_seconds_total{container!="POD",name!="",instance=~"' + nodeName + 
'",namespace!="",namespace=~"$namespace"}[$interval])) by (pod,container,namespace,name,service) * 100)', - legendFormat='{{ pod }}: {{ container }}', - ) -); - -local top10ContainerRSS(nodeName) = genericGraphLegendPanel('Top 10 container RSS: ' + nodeName, 'bytes').addTarget( - prometheus.target( - 'topk(10, container_memory_rss{container!="POD",name!="",instance=~"' + nodeName + '",namespace!="",namespace=~"$namespace"})', - legendFormat='{{ pod }}: {{ container }}', - ) -); - -local containerWriteBytes(nodeName) = genericGraphLegendPanel('Container fs write rate: ' + nodeName, 'Bps').addTarget( - prometheus.target( - 'sum(rate(container_fs_writes_bytes_total{device!~".+dm.+", node=~"' + nodeName + '", container!=""}[$interval])) by (device, container)', - legendFormat='{{ container }}: {{ device }}', - ) -); - -// Individual panel definitions - -// Monitoring Stack - -local promReplMemUsage = genericGraphLegendPanel('Prometheus Replica Memory usage', 'bytes').addTarget( - prometheus.target( - 'sum(container_memory_rss{pod="prometheus-k8s-1",namespace!="",name!="",container="prometheus"}) by (pod)', - legendFormat='{{pod}}', - ) -).addTarget( - prometheus.target( - 'sum(container_memory_rss{pod="prometheus-k8s-0",namespace!="",name!="",container="prometheus"}) by (pod)', - legendFormat='{{pod}}', - ) -); - -// Kubelet - -local kubeletCPU = genericGraphLegendPanel('Top 10 Kubelet CPU usage', 'percent').addTarget( - prometheus.target( - 'topk(10,rate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[$interval])*100)', - legendFormat='kubelet - {{node}}', - ) -); - -local crioCPU = genericGraphLegendPanel('Top 10 crio CPU usage', 'percent').addTarget( - prometheus.target( - 'topk(10,rate(process_cpu_seconds_total{service="kubelet",job="crio"}[$interval])*100)', - legendFormat='crio - {{node}}', - ) -); - -local kubeletMemory = genericGraphLegendPanel('Top 10 Kubelet memory usage', 'bytes').addTarget( - prometheus.target( - 
'topk(10,process_resident_memory_bytes{service="kubelet",job="kubelet"})', - legendFormat='kubelet - {{node}}', - ) -); - -local crioMemory = genericGraphLegendPanel('Top 10 crio memory usage', 'bytes').addTarget( - prometheus.target( - 'topk(10,process_resident_memory_bytes{service="kubelet",job="crio"})', - legendFormat='crio - {{node}}', - ) -); - -// Cluster details - -local current_node_count = grafana.statPanel.new( - title='Current Node Count', - datasource='$datasource', - reducerFunction='last', -).addTarget( - prometheus.target( - 'sum(kube_node_info{})', - legendFormat='Number of nodes', - ) -).addTarget( - prometheus.target( - 'sum(kube_node_status_condition{status="true"}) by (condition) > 0', - legendFormat='Node: {{ condition }}', - ) -); - -local current_namespace_count = grafana.statPanel.new( - title='Current namespace Count', - datasource='$datasource', - reducerFunction='last', -).addTarget( - prometheus.target( - 'sum(kube_namespace_status_phase) by (phase)', - legendFormat='{{ phase }}', - ) -); - -local current_pod_count = grafana.statPanel.new( - title='Current Pod Count', - reducerFunction='last', - datasource='$datasource', -).addTarget( - prometheus.target( - 'sum(kube_pod_status_phase{}) by (phase) > 0', - legendFormat='{{ phase}} Pods', - ) -); - -local nodeCount = genericGraphPanel('Number of nodes', 'none').addTarget( - prometheus.target( - 'sum(kube_node_info{})', - legendFormat='Number of nodes', - ) -).addTarget( - prometheus.target( - 'sum(kube_node_status_condition{status="true"}) by (condition) > 0', - legendFormat='Node: {{ condition }}', - ) -); - -local nsCount = genericGraphPanel('Namespace count', 'none').addTarget( - prometheus.target( - 'sum(kube_namespace_status_phase) by (phase) > 0', - legendFormat='{{ phase }} namespaces', - ) -); - -local podCount = genericGraphPanel('Pod count', 'none').addTarget( - prometheus.target( - 'sum(kube_pod_status_phase{}) by (phase)', - legendFormat='{{phase}} pods', - ) -); - -local 
secretCmCount = genericGraphPanel('Secret & configmap count', 'none').addTarget( - prometheus.target( - 'count(kube_secret_info{})', - legendFormat='secrets', - ) -).addTarget( - prometheus.target( - 'count(kube_configmap_info{})', - legendFormat='Configmaps', - ) -); - -local deployCount = genericGraphPanel('Deployment count', 'none').addTarget( - prometheus.target( - 'count(kube_deployment_labels{})', - legendFormat='Deployments', - ) -); - - -local servicesCount = genericGraphPanel('Services count', 'none').addTarget( - prometheus.target( - 'count(kube_service_info{})', - legendFormat='Services', - ) -); - -local alerts = genericGraphPanel('Alerts', 'none').addTarget( - prometheus.target( - 'topk(10,sum(ALERTS{severity!="none"}) by (alertname, severity))', - legendFormat='{{severity}}: {{alertname}}', - ) -); - -local top10ContMem = genericGraphLegendPanel('Top 10 container RSS', 'bytes').addTarget( - prometheus.target( - 'topk(10, container_memory_rss{namespace!="",container!="POD",name!=""})', - legendFormat='{{ namespace }} - {{ name }}', - ) -); - -local podDistribution = genericGraphLegendPanel('Pod Distribution', 'none').addTarget( - prometheus.target( - 'count(kube_pod_info{}) by (exported_node)', - legendFormat='{{ node }}', - ) -); - -local top10ContCPU = genericGraphLegendPanel('Top 10 container CPU', 'percent').addTarget( - prometheus.target( - 'topk(10,irate(container_cpu_usage_seconds_total{namespace!="",container!="POD",name!=""}[$interval])*100)', - legendFormat='{{ namespace }} - {{ name }}', - ) -); - - -local goroutines_count = genericGraphPanel('Goroutines count', 'none').addTarget( - prometheus.target( - 'topk(10, sum(go_goroutines{}) by (job,instance))', - legendFormat='{{ job }} - {{ instance }}', - ) -); - -// Cluster operators - -local clusterOperatorsOverview = grafana.statPanel.new( - datasource='$datasource', - title='Cluster operators overview', -).addTarget( - prometheus.target( - 'sum by 
(condition)(cluster_operator_conditions{condition!=""})', - legendFormat='{{ condition }}', - ) -); - -local clusterOperatorsInformation = genericGraphLegendPanel('Cluster operators information', 'none').addTarget( - prometheus.target( - 'cluster_operator_conditions{name!="",reason!=""}', - legendFormat='{{name}} - {{reason}}', - ) -); - -local clusterOperatorsDegraded = genericGraphLegendPanel('Cluster operators degraded', 'none').addTarget( - prometheus.target( - 'cluster_operator_conditions{condition="Degraded",name!="",reason!=""}', - legendFormat='{{name}} - {{reason}}', - ) -); - - -// Dashboard - -grafana.dashboard.new( - 'k8s Performance', - description='Performance dashboard for Red Hat k8s', - time_from='now-1h', - timezone='utc', - refresh='30s', - editable='true', -) - - -// Templates - -.addTemplate( - grafana.template.datasource( - 'datasource', - 'prometheus', - '', - ) -) - -.addTemplate( - grafana.template.new( - '_worker_node', - '$datasource', - 'label_values(kube_node_labels{}, exported_node)', - '', - refresh=2, - ) { - label: 'Worker', - type: 'query', - multi: true, - includeAll: false, - }, -) - -.addTemplate( - grafana.template.new( - 'namespace', - '$datasource', - 'label_values(kube_pod_info, exported_namespace)', - '', - refresh=2, - ) { - label: 'Namespace', - type: 'query', - multi: false, - includeAll: true, - }, -) - - -.addTemplate( - grafana.template.new( - 'block_device', - '$datasource', - 'label_values(node_disk_written_bytes_total,device)', - '', - regex='/^(?:(?!dm|rb).)*$/', - refresh=2, - ) { - label: 'Block device', - type: 'query', - multi: true, - includeAll: true, - }, -) - - -.addTemplate( - grafana.template.new( - 'net_device', - '$datasource', - 'label_values(node_network_receive_bytes_total,device)', - '', - regex='/^((br|en|et).*)$/', - refresh=2, - ) { - label: 'Network device', - type: 'query', - multi: true, - includeAll: true, - }, -) - -.addTemplate( - grafana.template.new( - 'interval', - '$datasource', - 
'$__auto_interval_period', - label='interval', - refresh='time', - ) { - type: 'interval', - query: '2m,3m,4m,5m', - auto: false, - }, -) - -// Dashboard definition - -.addPanel(grafana.row.new(title='Cluster Details', collapse=true).addPanels( - [ - current_node_count { gridPos: { x: 0, y: 4, w: 8, h: 3 } }, - current_namespace_count { gridPos: { x: 8, y: 4, w: 8, h: 3 } }, - current_pod_count { gridPos: { x: 16, y: 4, w: 8, h: 3 } }, - nodeCount { gridPos: { x: 0, y: 12, w: 8, h: 8 } }, - nsCount { gridPos: { x: 8, y: 12, w: 8, h: 8 } }, - podCount { gridPos: { x: 16, y: 12, w: 8, h: 8 } }, - secretCmCount { gridPos: { x: 0, y: 20, w: 8, h: 8 } }, - deployCount { gridPos: { x: 8, y: 20, w: 8, h: 8 } }, - servicesCount { gridPos: { x: 16, y: 20, w: 8, h: 8 } }, - top10ContMem { gridPos: { x: 0, y: 28, w: 24, h: 8 } }, - top10ContCPU { gridPos: { x: 0, y: 36, w: 12, h: 8 } }, - goroutines_count { gridPos: { x: 12, y: 36, w: 12, h: 8 } }, - podDistribution { gridPos: { x: 0, y: 44, w: 24, h: 8 } }, - ] -), { gridPos: { x: 0, y: 3, w: 24, h: 1 } }) - -.addPanel(grafana.row.new(title='Node: $_worker_node', collapse=true, repeat='_worker_node').addPanels( - [ - nodeCPU('$_worker_node') { gridPos: { x: 0, y: 0, w: 12, h: 8 } }, - nodeMemory('$_worker_node') { gridPos: { x: 12, y: 0, w: 12, h: 8 } }, - diskThroughput('$_worker_node') { gridPos: { x: 0, y: 8, w: 12, h: 8 } }, - diskIOPS('$_worker_node') { gridPos: { x: 12, y: 8, w: 12, h: 8 } }, - networkUtilization('$_worker_node') { gridPos: { x: 0, y: 16, w: 12, h: 8 } }, - networkPackets('$_worker_node') { gridPos: { x: 12, y: 16, w: 12, h: 8 } }, - networkDrop('$_worker_node') { gridPos: { x: 0, y: 24, w: 12, h: 8 } }, - conntrackStats('$_worker_node') { gridPos: { x: 12, y: 24, w: 12, h: 8 } }, - top10ContainerCPU('$_worker_node') { gridPos: { x: 0, y: 32, w: 12, h: 8 } }, - top10ContainerRSS('$_worker_node') { gridPos: { x: 12, y: 32, w: 12, h: 8 } }, - ], -), { gridPos: { x: 0, y: 1, w: 0, h: 8 } }) +local panels 
= import '../../assets/k8s-perf/panels.libsonnet'; +local queries = import '../../assets/k8s-perf/queries.libsonnet'; +local variables = import '../../assets/k8s-perf/variables.libsonnet'; +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +g.dashboard.new('k8s Performance dashboard') ++ g.dashboard.time.withFrom('now-1h') ++ g.dashboard.time.withTo('now') ++ g.dashboard.withTimezone('utc') ++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) ++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) ++ g.dashboard.withRefresh('30s') ++ g.dashboard.withEditable(false) ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ g.dashboard.withVariables([ + variables.Datasource, + variables._worker_node, + variables.namespace, + variables.block_device, + variables.net_device, + variables.interval, +]) + ++ g.dashboard.withPanels([ + g.panel.row.new('Cluster Details') + + g.panel.row.withCollapsed(true) + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withPanels([ + panels.stat.genericStatLegendPanel('Current Node Count', 'none', queries.currentNodeCount.query(), { x: 0, y: 4, w: 8, h: 3 }), + panels.stat.genericStatLegendPanel('Current namespace Count', 'none', queries.currentNamespaceCount.query(), { x: 8, y: 4, w: 8, h: 3 }), + panels.stat.genericStatLegendPanel('Current Pod Count', 'none', queries.currentPodCount.query(), { x: 16, y: 4, w: 8, h: 3 }), + panels.timeSeries.genericTimeSeriesPanel('Number of nodes', 'none', queries.numberOfNodes.query(), { x: 0, y: 12, w: 8, h: 8 }), + panels.timeSeries.genericTimeSeriesPanel('Namespace count', 'none', queries.namespaceCount.query(), { x: 8, y: 12, w: 8, h: 8 }), + panels.timeSeries.genericTimeSeriesPanel('Pod count', 'none', queries.podCount.query(), { x: 16, y: 12, w: 8, h: 8 }), + panels.timeSeries.genericTimeSeriesPanel('Secret & configmap count', 'none', 
queries.secretAndConfigMapCount.query(), { x: 0, y: 20, w: 8, h: 8 }), + panels.timeSeries.genericTimeSeriesPanel('Deployment count', 'none', queries.deployCount.query(), { x: 8, y: 20, w: 8, h: 8 }), + panels.timeSeries.genericTimeSeriesPanel('Services count', 'none', queries.serviceCount.query(), { x: 16, y: 20, w: 8, h: 8 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container RSS', 'bytes', queries.top10ContainerRSS.query(), { x: 0, y: 28, w: 24, h: 8 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container CPU', 'percent', queries.top10ContainerCPU.query(), { x: 0, y: 36, w: 12, h: 8 }), + panels.timeSeries.genericTimeSeriesPanel('Goroutines count', 'none', queries.goroutinesCount.query(), { x: 12, y: 36, w: 12, h: 8 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Pod Distribution', 'none', queries.podDistribution.query(), { x: 0, y: 44, w: 24, h: 8 }), + ]), + + g.panel.row.new('Node: $_worker_node') + + g.panel.row.withCollapsed(true) + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withRepeat('_worker_node') + + g.panel.row.withPanels([ + panels.timeSeries.genericTimeSeriesLegendPanel('CPU Basic: $_worker_node ', 'percent', queries.basicCPU.query('$_worker_node'), { x: 0, y: 0, w: 12, h: 8 }), + panels.timeSeries.genericTimeSeriesLegendPanel('System Memory: $_worker_node ', 'bytes', queries.systemMemory.query('$_worker_node'), { x: 12, y: 0, w: 12, h: 8 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Disk throughput: $_worker_node ', 'Bps', queries.diskThroughput.query('$_worker_node'), { x: 0, y: 8, w: 12, h: 8 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Disk IOPS: $_worker_node', 'iops', queries.diskIOPS.query('$_worker_node'), { x: 12, y: 8, w: 12, h: 8 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Network Utilization: $_worker_node', 'bps', queries.networkUtilization.query('$_worker_node'), { x: 0, y: 16, w: 12, h: 8 }), + 
panels.timeSeries.genericTimeSeriesLegendPanel('Network Packets: $_worker_node', 'pps', queries.networkPackets.query('$_worker_node'), { x: 12, y: 16, w: 12, h: 8 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Network packets drop: $_worker_node', 'pps', queries.networkDrop.query('$_worker_node'), { x: 0, y: 24, w: 12, h: 8 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Conntrack stats: $_worker_node', '', queries.conntrackStats.query('$_worker_node'), { x: 12, y: 24, w: 12, h: 8 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container CPU: $_worker_node', 'percent', queries.top10ContainersCPU.query('$_worker_node'), { x: 0, y: 32, w: 12, h: 8 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainersRSS.query(' $_worker_node'), { x: 12, y: 32, w: 12, h: 8 }), + + ]), +]) diff --git a/templates/General/kube-burner.jsonnet b/templates/General/kube-burner.jsonnet deleted file mode 100644 index cdb5160..0000000 --- a/templates/General/kube-burner.jsonnet +++ /dev/null @@ -1,4568 +0,0 @@ -local grafana = import '../grafonnet-lib/grafonnet/grafana.libsonnet'; -local es = grafana.elasticsearch; - -local worker_count = grafana.statPanel.new( - title='Node count', - datasource='$datasource1', - justifyMode='center' -).addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "nodeRoles"', - timeField='timestamp', - metrics=[{ - field: 'coun', - id: '1', - meta: {}, - settings: {}, - type: 'count', - }], - bucketAggs=[ - { - field: 'labels.role.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -).addThresholds([ - { color: 'green', value: null }, - { color: 'red', value: 80 }, -]); - - -local metric_count_panel = 
grafana.statPanel.new( - datasource='$datasource1', - justifyMode='center', - title=null -).addTarget( - // Namespaces count - es.target( - query='uuid.keyword: $uuid AND metricName: "namespaceCount" AND labels.phase: "Active"', - alias='Namespaces', - timeField='timestamp', - metrics=[{ - field: 'value', - id: '1', - meta: {}, - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -).addTarget( - // Services count - es.target( - query='uuid.keyword: $uuid AND metricName: "serviceCount"', - alias='Services', - timeField='timestamp', - metrics=[{ - field: 'value', - id: '1', - meta: {}, - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -).addTarget( - // Deployments count - es.target( - query='uuid.keyword: $uuid AND metricName: "deploymentCount"', - alias='Services', - timeField='timestamp', - metrics=[{ - field: 'value', - id: '1', - meta: {}, - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -).addTarget( - // Secrets count - es.target( - query='uuid.keyword: $uuid AND metricName: "secretCount"', - alias='Services', - timeField='timestamp', - metrics=[{ - field: 'value', - id: '1', - meta: {}, - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -).addTarget( - // ConfigMap count - es.target( - query='uuid.keyword: $uuid AND metricName: "configmapCount"', - alias='ConfigMaps', - timeField='timestamp', - metrics=[{ - field: 'value', - id: '1', - meta: {}, 
- settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -).addThresholds([ - { color: 'green', value: null }, - { color: 'red', value: 80 }, -]); - -local openshift_version_panel = grafana.statPanel.new( - title='OpenShift version', - datasource='$datasource1', - justifyMode='center', - reducerFunction='lastNotNull', - fields='/^labels\\.version$/' -).addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "clusterVersion"', - timeField='timestamp', - metrics=[{ - id: '1', - settings: { - size: '500', - }, - type: 'raw_data', - }], - ) -); - -local etcd_version_panel = grafana.statPanel.new( - title='Etcd version', - datasource='$datasource1', - justifyMode='center', - reducerFunction='lastNotNull', - fields='labels.cluster_version' -).addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "etcdVersion"', - timeField='timestamp', - metrics=[{ - id: '1', - settings: { - size: '500', - }, - type: 'raw_data', - }], - ) -); - - -// Next line -// TODO: Convert to new table format once jsonnet supports it. -// That would fix the text wrapping problem. 
-local summary_panel_1 = grafana.tablePanel.new( - datasource='$datasource1', - title=null, - styles=[ - { - pattern: 'uuid', - alias: 'UUID', - type: 'string', - }, - { - pattern: 'jobConfig.name', - alias: 'Name', - type: 'hidden', - }, - { - pattern: 'jobConfig.qps', - alias: 'QPS', - type: 'number', - }, - { - pattern: 'jobConfig.burst', - alias: 'Burst', - type: 'number', - }, - { - pattern: 'elapsedTime', - alias: 'Elapsed time', - type: 'number', - unit: 's', - }, - { - pattern: 'jobConfig.jobIterations', - alias: 'Iterations', - type: 'number', - }, - { - pattern: 'jobConfig.jobType', - alias: 'Job Type', - type: 'string', - }, - { - pattern: 'jobConfig.podWait', - alias: 'podWait', - type: 'hidden', - }, - { - pattern: 'jobConfig.namespacedIterations', - alias: 'Namespaced iterations', - type: 'hidden', - }, - { - pattern: 'jobConfig.preLoadImages', - alias: 'Preload Images', - type: 'boolean', - }, - { - pattern: '_id', - alias: '_id', - type: 'hidden', - }, - { - pattern: '_index', - alias: '_index', - type: 'hidden', - }, - { - pattern: '_type', - alias: '_type', - type: 'hidden', - }, - { - pattern: 'highlight', - alias: 'highlight', - type: 'hidden', - }, - { - pattern: '_type', - alias: '_type', - type: 'hidden', - }, - { - pattern: 'jobConfig.cleanup', - type: 'hidden', - }, - { - pattern: 'jobConfig.errorOnVerify', - alias: 'errorOnVerify', - type: 'hidden', - }, - { - pattern: 'jobConfig.jobIterationDelay', - alias: 'jobIterationDelay', - type: 'hidden', - unit: 's', - }, - { - pattern: 'jobConfig.jobPause', - alias: 'jobPause', - type: 'hidden', - unit: 's', - }, - { - pattern: 'jobConfig.maxWaitTimeout', - alias: 'maxWaitTimeout', - type: 'hidden', - unit: 's', - }, - { - pattern: 'jobConfig.namespace', - alias: 'namespacePrefix', - type: 'hidden', - }, - { - pattern: 'jobConfig.namespaced', - alias: 'jobConfig.namespaced', - type: 'hidden', - }, - { - pattern: 'jobConfig.objects', - alias: 'jobConfig.objects', - type: 'hidden', - }, - { - 
pattern: 'jobConfig.preLoadPeriod', - alias: 'jobConfig.preLoadPeriod', - type: 'hidden', - }, - { - pattern: 'jobConfig.verifyObjects', - alias: 'jobConfig.verifyObjects', - type: 'hidden', - }, - { - pattern: 'metricName', - alias: 'metricName', - type: 'hidden', - }, - { - pattern: 'timestamp', - alias: 'timestamp', - type: 'hidden', - }, - { - pattern: 'jobConfig.waitFor', - alias: 'jobConfig.waitFor', - type: 'hidden', - }, - { - pattern: 'jobConfig.waitForDeletion', - alias: 'jobConfig.waitForDeletion', - type: 'hidden', - }, - { - pattern: 'jobConfig.waitWhenFinished', - alias: 'jobConfig.waitWhenFinished', - type: 'hidden', - }, - { - pattern: 'sort', - alias: 'sort', - type: 'hidden', - }, - ] -).addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "jobSummary"', - timeField='timestamp', - metrics=[{ - id: '1', - settings: { - size: '500', - }, - type: 'raw_data', - }], - ) -).addTransformation( - grafana.transformation.new('organize', options={ - indexByName: { - _id: 1, - _index: 2, - _type: 3, - elapsedTime: 8, - 'jobConfig.burst': 7, - 'jobConfig.cleanup': 12, - 'jobConfig.errorOnVerify': 13, - 'jobConfig.jobIterationDelay': 14, - 'jobConfig.jobIterations': 9, - 'jobConfig.jobPause': 15, - 'jobConfig.jobType': 10, - 'jobConfig.maxWaitTimeout': 16, - 'jobConfig.name': 5, - 'jobConfig.namespace': 17, - 'jobConfig.namespacedIterations': 18, - 'jobConfig.objects': 19, - 'jobConfig.podWait': 11, - 'jobConfig.qps': 6, - 'jobConfig.verifyObjects': 20, - 'jobConfig.waitFor': 21, - 'jobConfig.waitForDeletion': 22, - 'jobConfig.waitWhenFinished': 23, - metricName: 24, - timestamp: 0, - uuid: 4, - }, - }) -); - - -// TODO: Convert to new table format once jsonnet supports it. -// That would fix the text wrapping problem. 
-local summary_panel_2 = grafana.tablePanel.new( - datasource='$datasource1', - title=null, - styles=[ - { - pattern: 'k8s_version', - alias: 'k8s version', - type: 'string', - }, - { - pattern: 'result', - alias: 'Result', - type: 'string', - }, - { - pattern: 'sdn_type', - alias: 'SDN', - type: 'string', - }, - { - pattern: 'total_nodes', - alias: 'Total nodes', - type: 'number', - }, - { - pattern: 'master_nodes_count', - alias: 'Master nodes', - type: 'number', - }, - { - pattern: 'worker_nodes_count', - alias: 'Worker nodes', - type: 'number', - }, - { - pattern: 'infra_nodes_count', - alias: 'Infra nodes', - type: 'number', - }, - { - pattern: 'master_nodes_type', - alias: 'Masters flavor', - type: 'string', - }, - { - pattern: '_id', - alias: '_id', - type: 'hidden', - }, - { - pattern: '_index', - alias: '_index', - type: 'hidden', - }, - { - pattern: '_type', - alias: '_type', - type: 'hidden', - }, - { - pattern: 'benchmark', - alias: 'benchmark', - type: 'hidden', - }, - { - pattern: 'clustertype', - alias: 'clustertype', - type: 'hidden', - }, - { - pattern: 'end_date', - alias: 'end_date', - type: 'hidden', - }, - { - pattern: 'highlight', - alias: 'highlight', - type: 'hidden', - }, - { - pattern: 'jobConfig.cleanup', - alias: 'jobConfig.cleanup', - type: 'hidden', - }, - { - pattern: 'jobConfig.errorOnVerify', - alias: 'errorOnVerify', - type: 'hidden', - }, - { - pattern: 'jobConfig.jobIterationDelay', - alias: 'jobIterationDelay', - type: 'hidden', - unit: 's', - }, - { - pattern: 'jobConfig.jobPause', - alias: 'jobPause', - type: 'hidden', - unit: 's', - }, - { - pattern: 'jobConfig.maxWaitTimeout', - alias: 'maxWaitTimeout', - type: 'hidden', - unit: 's', - }, - { - pattern: 'jobConfig.namespace', - alias: 'namespacePrefix', - type: 'hidden', - }, - { - pattern: 'jobConfig.namespaced', - alias: 'jobConfig.namespaced', - type: 'hidden', - }, - { - pattern: 'jobConfig.objects', - alias: 'jobConfig.objects', - type: 'hidden', - }, - { - pattern: 
'jobConfig.preLoadPeriod', - alias: 'jobConfig.preLoadPeriod', - type: 'hidden', - }, - { - pattern: 'jobConfig.verifyObjects', - alias: 'jobConfig.verifyObjects', - type: 'hidden', - }, - { - pattern: 'jobConfig.waitFor', - alias: 'jobConfig.waitFor', - type: 'hidden', - }, - { - pattern: 'jobConfig.waitForDeletion', - alias: 'jobConfig.waitForDeletion', - type: 'hidden', - }, - { - pattern: 'jobConfig.waitWhenFinished', - alias: 'jobConfig.waitWhenFinished', - type: 'hidden', - }, - { - pattern: 'metricName', - alias: 'metricName', - type: 'hidden', - }, - { - pattern: 'ocp_version', - alias: 'ocp_version', - type: 'hidden', - }, - { - pattern: 'ocp_version', - alias: 'ocp_version', - type: 'hidden', - }, - { - pattern: 'sort', - alias: 'sort', - type: 'hidden', - }, - { - pattern: 'timestamp', - alias: 'timestamp', - type: 'hidden', - }, - { - pattern: 'uuid', - alias: 'uuid', - type: 'hidden', - }, - { - pattern: 'workload', - alias: 'workload', - type: 'hidden', - }, - { - pattern: 'worker_nodes_type', - alias: 'worker_nodes_type', - type: 'hidden', - }, - { - pattern: 'infra_nodes_type', - alias: 'infra_nodes_type', - type: 'hidden', - }, - { - pattern: 'platform', - alias: 'platform', - type: 'hidden', - }, - { - pattern: 'workload_nodes_count', - alias: 'workload_nodes_count', - type: 'hidden', - }, - { - pattern: 'workload_nodes_type', - alias: 'workload_nodes_type', - type: 'hidden', - }, - ] -).addTarget( - es.target( - query='uuid.keyword: $uuid AND result.keyword: *', - timeField='timestamp', - metrics=[{ - id: '1', - settings: { - size: '500', - }, - type: 'raw_data', - }], - ) -).addTransformation( - grafana.transformation.new('organize', options={ - indexByName: { - _id: 4, - _index: 5, - _type: 15, - benchmark: 17, - clustertype: 18, - end_date: 19, - highlight: 20, - infra_nodes_count: 9, - infra_nodes_type: 14, - k8s_version: 1, - master_nodes_count: 7, - master_nodes_type: 11, - ocp_version: 21, - platform: 22, - result: 2, - sdn_type: 3, - 
sort: 23, - timestamp: 0, - total_nodes: 6, - uuid: 16, - worker_nodes_count: 8, - worker_nodes_type: 12, - workload: 24, - workload_nodes_count: 10, - workload_nodes_type: 13, - }, - }) -); - -// First row: Cluster status -local masters_cpu = grafana.graphPanel.new( - title='Masters CPU utilization', - datasource='$datasource1', - legend_alignAsTable=true, - legend_avg=true, - legend_max=true, - percentage=true, - legend_values=true, - format='percent', -).addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeCPU-Masters" AND NOT labels.mode.keyword: idle AND NOT labels.mode.keyword: steal', - timeField='timestamp', - alias='{{labels.instance.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: { - script: '_value * 100', - }, - type: 'sum', - }], - bucketAggs=[ - { - field: 'labels.instance.keyword', - fake: true, - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -); - -local masters_memory = grafana.graphPanel.new( - title='Masters Memory utilization', - datasource='$datasource1', - legend_alignAsTable=true, - legend_avg=true, - legend_max=true, - legend_values=true, - format='bytes' -).addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeMemoryAvailable-Masters"', - timeField='timestamp', - alias='Available {{labels.instance.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'sum', - }], - bucketAggs=[ - { - field: 'labels.instance.keyword', - fake: true, - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -); - -local 
node_status_summary = grafana.graphPanel.new( - title='Node Status Summary', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_current=true, - legend_values=true, - legend_rightSide=true, -).addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeStatus"', - timeField='timestamp', - alias='{{labels.condition.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'labels.condition.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -local pod_status_summary = grafana.graphPanel.new( - title='Pod Status Summary', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_current=true, - legend_values=true, -).addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "podStatusCount"', - timeField='timestamp', - alias='{{labels.phase.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'labels.phase.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local kube_api_cpu = grafana.graphPanel.new( - title='Kube-apiserver CPU', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU" AND labels.container.keyword: kube-apiserver', - timeField='timestamp', - 
alias='{{labels.namespace.keyword}}-{{labels.pod.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - fake: true, - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU-Masters" AND labels.container.keyword: kube-apiserver', - timeField='timestamp', - alias='{{labels.namespace.keyword}}-{{labels.pod.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - fake: true, - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU" AND labels.container.keyword: kube-apiserver', - timeField='timestamp', - 
alias='Avg CPU {{labels.container.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); -// TODO: When the feature is added to grafannet, style the average differently. - - -local kube_api_memory = grafana.graphPanel.new( - title='Kube-apiserver Memory', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerMemory" AND labels.container.keyword: kube-apiserver', - timeField='timestamp', - alias='Rss {{labels.namespace.keyword}}-{{labels.pod.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - fake: true, - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -).addTarget( - es.target( - query='uuid.keyword: $uuid AND 
metricName: "containerMemory-Masters" AND labels.container.keyword: kube-apiserver', - timeField='timestamp', - alias='Rss {{labels.namespace.keyword}}-{{labels.pod.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - fake: true, - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerMemory" AND labels.container.keyword: kube-apiserver', - timeField='timestamp', - alias='Avg Rss {{labels.container.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); -// TODO: When the feature is added to grafannet, style the average differently. 
- - -local active_controller_manager_cpu = grafana.graphPanel.new( - title='Active Kube-controller-manager CPU', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU" AND labels.container.keyword: kube-controller-manager', - timeField='timestamp', - alias='{{labels.namespace.keyword}}-{{labels.pod.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '1', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - fake: true, - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU-Masters" AND labels.container.keyword: kube-controller-manager', - timeField='timestamp', - alias='{{labels.container.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '1', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - fake: true, - id: '5', - settings: { - 
min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -local active_controller_manager_memory = grafana.graphPanel.new( - title='Active Kube-controller-manager memory', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerMemory" AND labels.container.keyword: kube-controller-manager', - timeField='timestamp', - alias='{{labels.namespace.keyword}}-{{labels.pod.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '1', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - fake: true, - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerMemory-Masters" AND labels.container.keyword: kube-controller-manager', - timeField='timestamp', - alias='{{labels.container.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '1', - }, - type: 'terms', - }, - { - 
field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - fake: true, - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -local kube_scheduler_cpu = grafana.graphPanel.new( - title='Kube-scheduler CPU', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU" AND labels.container.keyword: kube-scheduler', - timeField='timestamp', - alias='{{labels.namespace.keyword}}-{{labels.pod.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU-Masters" AND labels.container.keyword: kube-scheduler', - timeField='timestamp', - alias='{{labels.container.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - 
bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -local kube_scheduler_memory = grafana.graphPanel.new( - title='Kube-scheduler memory', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerMemory" AND labels.container.keyword: kube-scheduler', - timeField='timestamp', - alias='{{labels.namespace.keyword}}-{{labels.pod.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerMemory-Masters" AND 
labels.container.keyword: kube-scheduler', - timeField='timestamp', - alias='Rss {{labels.container.keyword}}', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -local hypershift_controlplane_cpu = grafana.graphPanel.new( - title='Hypershift Controlplane CPU Usage', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU-Controlplane"', - timeField='timestamp', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'labels.pod.keyword', - id: '2', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '20', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '20', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '4', - settings: { - interval: '30s', - min_doc_count: '1', - timeZone: 'utc', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -); - - -local hypershift_controlplane_memory = grafana.graphPanel.new( - title='Hypershift Controlplane RSS memory Usage', - datasource='$datasource1', - 
legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerMemory-Controlplane"', - timeField='timestamp', - metrics=[{ - field: 'value', - id: '1', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'labels.pod.keyword', - id: '2', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '20', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '20', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '4', - settings: { - interval: '30s', - min_doc_count: '1', - timeZone: 'utc', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -); - -// Pod latencies section -local average_pod_latency = grafana.graphPanel.new( - title='Average pod latency', - datasource='$datasource1', - legend_alignAsTable=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='ms', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: podLatencyMeasurement', - timeField='timestamp', - alias='{{field}}', - metrics=[ - { - field: 'podReadyLatency', - id: '1', - meta: {}, - settings: {}, - type: 'avg', - }, - { - field: 'schedulingLatency', - id: '3', - meta: {}, - settings: {}, - type: 'avg', - }, - { - field: 'initializedLatency', - id: '4', - meta: {}, - settings: {}, - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -); - -local pod_latencies_summary = grafana.statPanel.new( - datasource='$datasource1', - justifyMode='center', - title='Pod latencies summary $latencyPercentile', - unit='ms', - colorMode='value', // Note: There isn't currently a way to set the color palette. 
-).addTarget( - // Namespaces count - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: podLatencyQuantilesMeasurement', - alias='$latencyPercentile {{term quantileName.keyword}}', - timeField='timestamp', - metrics=[{ - field: '$latencyPercentile', - id: '1', - meta: {}, - settings: {}, - type: 'max', - }], - bucketAggs=[ - { - fake: true, - field: 'quantileName.keyword', - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '0', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -); - -local pod_conditions_latency = grafana.tablePanel.new( - title='Pod conditions latency', - datasource='$datasource1', - transform='table', - styles=[ - { - pattern: 'Average containersReadyLatency', - alias: 'ContainersReady', - type: 'number', - unit: 'ms', - }, - { - pattern: 'Average initializedLatency', - alias: 'Initialized', - type: 'number', - unit: 'ms', - }, - { - pattern: 'Average podReadyLatency', - alias: 'Ready', - type: 'number', - unit: 'ms', - }, - { - pattern: 'Average schedulingLatency', - alias: 'Scheduling', - type: 'number', - unit: 'ms', - }, - { - pattern: 'namespace.keyword', - alias: 'Namespace', - type: 'string', - }, - { - pattern: 'podName.keyword', - alias: 'Pod', - type: 'string', - }, - { - pattern: 'nodeName.keyword', - alias: 'Node', - type: 'string', - }, - ], -).addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: podLatencyMeasurement', - timeField='timestamp', - metrics=[ - { - field: 'schedulingLatency', - id: '1', - meta: {}, - settings: {}, - type: 'avg', - }, - { - field: 'initializedLatency', - id: '3', - meta: {}, - settings: {}, - type: 'avg', - }, - { - field: 'containersReadyLatency', - id: '4', - meta: {}, - settings: {}, - type: 'avg', - }, - { - field: 'podReadyLatency', - id: '5', - meta: {}, - settings: {}, - type: 'avg', - }, - 
], - bucketAggs=[ - { - fake: true, - field: 'namespace.keyword', - id: '6', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '5', - size: '100', - }, - type: 'terms', - }, - { - fake: true, - field: 'nodeName.keyword', - id: '7', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '100', - }, - type: 'terms', - }, - { - field: 'podName.keyword', - id: '2', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '5', - size: '100', - }, - type: 'terms', - }, - ], - ) -); - -local setup_latency = grafana.graphPanel.new( - title='Top 10 Container runtime network setup latency', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='µs', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: containerNetworkSetupLatency', - timeField='timestamp', - alias='{{labels.node.keyword}}', - metrics=[ - { - field: 'value', - id: '1', - meta: {}, - settings: {}, - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.node.keyword', - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local scheduling_throughput = grafana.graphPanel.new( - title='Scheduling throughput', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='reqps', -) - .addTarget( - es.target( - query='uuid: $uuid AND metricName.keyword: schedulingThroughput', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - meta: {}, - settings: {}, - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - 
) -); - -// OVN section -local ovnkube_master_cpu = grafana.graphPanel.new( - title='ovnkube-master CPU usage', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU" AND labels.pod.keyword: /ovnkube-master.*/', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'labels.pod.keyword', - id: '2', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '3', - settings: { - interval: '30s', - min_doc_count: '1', - timeZone: 'utc', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -); - - -local ovnkube_master_memory = grafana.graphPanel.new( - title='ovnkube-master Memory usage', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerMemory" AND labels.pod.keyword: /ovnkube-master.*/', - timeField='timestamp', - alias='{{labels.pod.keyword}}', - metrics=[ - { - field: 'value', - id: '1', - type: 'sum', - }, - ], - bucketAggs=[ - { - field: 'labels.pod.keyword', - id: '2', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '3', - settings: { - interval: '30s', - min_doc_count: '1', - timeZone: 'utc', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -); - -local ovnkube_controller_cpu = grafana.graphPanel.new( - title='ovn-controller CPU usage', - datasource='$datasource1', - legend_alignAsTable=true, 
- legend_max=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU" AND labels.namespace.keyword: "openshift-ovn-kubernetes" AND labels.pod.keyword: /ovnkube-node.*/ AND labels.container.keyword: "ovn-controller"', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'labels.pod.keyword', - id: '2', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '3', - settings: { - interval: '30s', - min_doc_count: '1', - timeZone: 'utc', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -); - - -local ovnkube_controller_memory = grafana.graphPanel.new( - title='ovn-controller Memory usage', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerMemory" AND labels.namespace.keyword: "openshift-ovn-kubernetes" AND labels.pod.keyword: /ovnkube-node.*/ AND labels.container.keyword: "ovn-controller"', - timeField='timestamp', - alias='{{labels.pod.keyword}}', - metrics=[ - { - field: 'value', - id: '1', - type: 'sum', - }, - ], - bucketAggs=[ - { - field: 'labels.pod.keyword', - id: '2', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '3', - settings: { - interval: '30s', - min_doc_count: '1', - timeZone: 'utc', - trimEdges: '0', - }, - type: 'date_histogram', - }, - ], - ) -); - - -// ETCD section -local etcd_fsync_latency = grafana.graphPanel.new( - title='etcd 99th disk WAL fsync latency', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='s', -) - .addTarget( - es.target( 
- query='uuid.keyword: $uuid AND metricName: "99thEtcdDiskWalFsyncDurationSeconds"', - timeField='timestamp', - alias='{{labels.pod.keyword}}', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'labels.pod.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local etcd_commit_latency = grafana.graphPanel.new( - title='etcd 99th disk backend commit latency', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='s', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "99thEtcdDiskBackendCommitDurationSeconds"', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'labels.pod.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -local etcd_leader_changes = grafana.graphPanel.new( - title='Etcd leader changes', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_values=true, - min=0, - format='s', -) - .addTarget( - es.target( - query='uuid: $uuid AND metricName.keyword: etcdLeaderChangesRate', - alias='Etcd leader changes', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'timestamp', - id: '1', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -local etcd_peer_roundtrip_time = 
grafana.graphPanel.new( - title='Etcd 99th network peer roundtrip time', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='s', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: 99thEtcdRoundTripTimeSeconds', - alias='{{labels.pod.keyword}} to {{labels.To.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'labels.pod.keyword', - fake: true, - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - fake: true, - field: 'labels.To.keyword', - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local etcd_cpu = grafana.graphPanel.new( - title='Etcd CPU utilization', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU" AND labels.container.keyword: etcd', - alias='{{labels.namespace.keyword}}-{{labels.pod.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - fake: true, - field: 'labels.container.keyword', - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 
'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -local etcd_memory = grafana.graphPanel.new( - title='Etcd memory utilization', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerMemory" AND labels.container.keyword: etcd', - alias='{{labels.namespace.keyword}}-{{labels.pod.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - fake: true, - field: 'labels.container.keyword', - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.namespace.keyword', - id: '5', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -// API an Kubeproxy section - -local api_latency_read_only_resource = grafana.graphPanel.new( - title='Read Only API request P99 latency - resource scoped', - datasource='$datasource1', - legend_alignAsTable=true, - format='s', - legend_max=true, - legend_avg=true, - legend_values=true, -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: readOnlyAPICallsLatency AND labels.scope.keyword: resource', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.verb.keyword', - id: '3', - settings: { - min_doc_count: 0, - 
order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'labels.resource.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local api_latency_read_only_namespace = grafana.graphPanel.new( - title='Read Only API request P99 latency - namespace scoped', - datasource='$datasource1', - legend_alignAsTable=true, - format='s', - legend_max=true, - legend_avg=true, - legend_values=true, -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: readOnlyAPICallsLatency AND labels.scope.keyword: namespace', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.verb.keyword', - id: '3', - settings: { - min_doc_count: 0, - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local api_latency_read_only_cluster = grafana.graphPanel.new( - title='Read Only API request P99 latency - cluster scoped', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='s', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: readOnlyAPICallsLatency AND labels.scope.keyword: cluster', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.verb.keyword', - id: '3', - settings: { - min_doc_count: 0, - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', 
- min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local api_latency_mutating = grafana.graphPanel.new( - title='Mutating API request P99 latency', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='s', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: mutatingAPICallsLatency', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.verb.keyword', - id: '3', - settings: { - min_doc_count: 0, - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -local api_request_rate = grafana.graphPanel.new( - title='API request rate', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='s', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: APIRequestRate', - alias='{{labels.verb.keyword}} {{labels.resource.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.resource.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '0', - }, - type: 'terms', - }, - { - fake: true, - field: 'labels.verb.keyword', - id: '3', - settings: { - min_doc_count: 0, - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -local service_sync_latency = grafana.graphPanel.new( - title='Service sync latency', - datasource='$datasource1', - 
legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='s', -) - .addTarget( - es.target( - query='uuid: $uuid AND metricName.keyword: kubeproxyP99ProgrammingLatency', - alias='Latency', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'labels.instance.keyword', - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid: $uuid AND metricName.keyword: serviceSyncLatency', - alias='Latency', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -// Cluster Kubelet & CRI-O section -local kubelet_process_cpu = grafana.graphPanel.new( - title='Kubelet process CPU usage', - datasource='$datasource1', - legend_alignAsTable=true, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: kubeletCPU', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'labels.node.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local kubelet_process_memory = grafana.graphPanel.new( - title='Kubelet process RSS memory usage', - datasource='$datasource1', - 
legend_alignAsTable=true, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: kubeletMemory', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'labels.node.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local cri_o_process_cpu = grafana.graphPanel.new( - title='CRI-O process CPU usage', - datasource='$datasource1', - legend_alignAsTable=true, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: crioCPU', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'labels.node.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -local cri_o_process_memory = grafana.graphPanel.new( - title='CRI-O RSS memory usage', - datasource='$datasource1', - legend_alignAsTable=true, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: crioMemory', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'labels.node.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: 
'1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -// Master Node section - -local container_cpu_master = grafana.graphPanel.new( - title='Container CPU usage $master', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU" AND labels.node.keyword: $master AND labels.namespace.keyword: $namespace', - alias='{{labels.pod.keyword}} {{labels.container.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local container_memory_master = grafana.graphPanel.new( - title='Container RSS memory $master', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerMemory" AND labels.node.keyword: $master AND labels.namespace.keyword: $namespace', - alias='{{labels.pod.keyword}} {{labels.container.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: 
'1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local cpu_master = grafana.graphPanel.new( - title='CPU $master', - datasource='$datasource1', - legend_alignAsTable=true, - legend_rightSide=true, - legend_max=true, - legend_min=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeCPU-Masters" AND labels.instance.keyword: $master', - alias='{{labels.mode.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - settings: { - script: { - inline: '_value*100', - }, - }, - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.mode.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local memory_master = grafana.graphPanel.new( - title='Memory $master', - datasource='$datasource1', - legend_alignAsTable=true, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeMemoryAvailable-Masters" AND labels.instance.keyword: $master', - alias='Available', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - 
], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeMemoryTotal-Masters" AND labels.instance.keyword: $master', - alias='Total', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeMemoryUtilization-Masters" AND labels.instance.keyword: $master', - alias='Utilization', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -// Worker Node section - -local container_cpu_worker = grafana.graphPanel.new( - title='Container CPU usage $worker', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU" AND labels.node.keyword: $worker AND labels.namespace.keyword: $namespace', - alias='{{labels.pod.keyword}} {{labels.container.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local 
container_memory_worker = grafana.graphPanel.new( - title='Container RSS memory $worker', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerMemory" AND labels.node.keyword: $worker AND labels.namespace.keyword: $namespace', - alias='{{labels.pod.keyword}} {{labels.container.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local cpu_worker = grafana.graphPanel.new( - title='CPU $worker', - datasource='$datasource1', - legend_alignAsTable=true, - legend_rightSide=true, - legend_max=true, - legend_min=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeCPU-Workers" AND labels.instance.keyword: $worker', - alias='{{labels.mode.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - settings: { - script: { - inline: '_value*100', - }, - }, - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.mode.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - 
-local memory_worker = grafana.graphPanel.new( - title='Memory $worker', - datasource='$datasource1', - legend_alignAsTable=true, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeMemoryAvailable-Workers" AND labels.instance.keyword: $worker', - alias='Available', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeMemoryTotal-Workers" AND labels.instance.keyword: $worker', - alias='Total', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeMemoryUtilization-Workers" AND labels.instance.keyword: $worker', - alias='Utilization', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -// Infra Node section - -local container_cpu_infra = grafana.graphPanel.new( - title='Container CPU usage $infra', - datasource='$datasource1', - legend_alignAsTable=true, - legend_avg=true, - legend_max=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerCPU" AND labels.node.keyword: $infra AND labels.namespace.keyword: $namespace', - alias='{{labels.pod.keyword}} 
{{labels.container.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local container_memory_infra = grafana.graphPanel.new( - title='Container RSS memory $infra', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName: "containerMemory" AND labels.node.keyword: $infra AND labels.namespace.keyword: $namespace', - alias='{{labels.pod.keyword}} {{labels.container.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'labels.container.keyword', - fake: true, - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '0', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local cpu_infra = grafana.graphPanel.new( - title='CPU $infra', - datasource='$datasource1', - legend_alignAsTable=true, - legend_rightSide=true, - legend_max=true, - legend_min=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - 
query='uuid.keyword: $uuid AND metricName.keyword: "nodeCPU-Infra" AND labels.instance.keyword: $infra', - alias='{{labels.mode.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - settings: { - script: { - inline: '_value*100', - }, - }, - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.mode.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local memory_infra = grafana.graphPanel.new( - title='Memory $infra', - datasource='$datasource1', - legend_alignAsTable=true, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeMemoryAvailable-Infra" AND labels.instance.keyword: $infra', - alias='Available', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeMemoryTotal-Infra" AND labels.instance.keyword: $infra', - alias='Total', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeMemoryUtilization-Infra" AND labels.instance.keyword: $infra', - alias='Utilization', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], 
- bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: '30s', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -// Aggregated worker node usage section -local agg_avg_cpu = grafana.graphPanel.new( - title='Avg CPU usage', - datasource='$datasource1', - legend_alignAsTable=true, - legend_rightSide=true, - legend_avg=true, - legend_max=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeCPU-AggregatedWorkers"', - alias='{{labels.mode.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - settings: { - script: { - inline: '_value*100', - }, - }, - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.mode.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local agg_avg_mem = grafana.graphPanel.new( - title='Avg Memory', - datasource='$datasource1', - legend_alignAsTable=true, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeMemoryAvailable-AggregatedWorkers"', - alias='Available', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "nodeMemoryTotal-AggregatedWorkers"', - alias='Total', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - field: 
'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -local agg_container_cpu = grafana.graphPanel.new( - title='Container CPU usage', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='percent', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "containerCPU-AggregatedWorkers" AND labels.namespace.keyword: $namespace', - alias='{{labels.pod.keyword}}: {{labels.container.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.container.keyword', - id: '3', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '10', - }, - type: 'terms', - }, - { - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local agg_container_mem = grafana.graphPanel.new( - title='Container memory RSS', - datasource='$datasource1', - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_values=true, - format='bytes', -) - .addTarget( - es.target( - query='uuid.keyword: $uuid AND metricName.keyword: "containerMemory-AggregatedWorkers" AND labels.namespace.keyword: $namespace', - alias='{{labels.pod.keyword}}: {{labels.container.keyword}}', - timeField='timestamp', - metrics=[ - { - field: 'value', - id: '1', - type: 'avg', - }, - ], - bucketAggs=[ - { - fake: true, - field: 'labels.pod.keyword', - id: '4', - settings: { - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '10', - }, - type: 'terms', - }, - { - fake: true, - field: 'labels.container.keyword', - id: '3', - settings: 
{ - min_doc_count: '1', - order: 'desc', - orderBy: '1', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: '1', - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - - -//Dashboard & Templates - -grafana.dashboard.new( - 'Kube-burner report v2', - description='', - editable='true', - time_from='now/y', - time_to='now', - timezone='utc', -) -.addTemplate( - grafana.template.datasource( - 'datasource1', - 'elasticsearch', - 'AWS Dev - ripsaw-kube-burner', - label='Datasource', - regex='/.*kube-burner.*/' - ) -) -.addTemplate( - grafana.template.new( - label='Platform', - name='platform', - current='All', - query='{"find": "terms", "field": "platform.keyword"}', - refresh=2, - multi=true, - includeAll=true, - datasource='$datasource1', - ) -) -.addTemplate( - grafana.template.new( - label='SDN type', - name='sdn', - current='All', - query='{"find": "terms", "field": "sdn_type.keyword"}', - refresh=2, - multi=true, - includeAll=true, - datasource='$datasource1', - ) -) -.addTemplate( - grafana.template.new( - label='Workload', - multi=true, - query='{"find": "terms", "field": "workload.keyword", "query": "platform.keyword: $platform AND sdn_type.keyword: $sdn"}', - refresh=1, - name='workload', - includeAll=false, - datasource='$datasource1' - ) -) -.addTemplate( - grafana.template.new( - label='Worker count', - multi=true, - query='{"find": "terms", "field": "worker_nodes_count", "query": "platform.keyword: $platform AND sdn_type.keyword: $sdn AND workload.keyword: $workload"}', - refresh=1, - name='worker_count', - includeAll=true, - datasource='$datasource1' - ) -) -.addTemplate( - grafana.template.new( - label='UUID', - multi=false, - query='{"find": "terms", "field": "uuid.keyword", "query": "platform.keyword: $platform AND sdn_type.keyword: $sdn AND workload.keyword: $workload AND worker_nodes_count: $worker_count"}', - refresh=2, - name='uuid', - includeAll=false, - 
datasource='$datasource1' - ) -) -.addTemplate( - grafana.template.new( - label='Master nodes', - multi=true, - query='{ "find" : "terms", "field": "labels.node.keyword", "query": "metricName.keyword: nodeRoles AND labels.role.keyword: master AND uuid.keyword: $uuid"}', - refresh=2, - name='master', - includeAll=false, - datasource='$datasource1' - ) -) -.addTemplate( - grafana.template.new( - label='Worker nodes', - multi=true, - query='{ "find" : "terms", "field": "labels.node.keyword", "query": "metricName.keyword: nodeRoles AND labels.role.keyword: worker AND uuid.keyword: $uuid"}', - refresh=2, - name='worker', - includeAll=false, - datasource='$datasource1' - ) -) -.addTemplate( - grafana.template.new( - label='Infra nodes', - multi=true, - query='{ "find" : "terms", "field": "labels.node.keyword", "query": "metricName.keyword: nodeRoles AND labels.role.keyword: infra AND uuid.keyword: $uuid"}', - refresh=2, - name='infra', - includeAll=false, - datasource='$datasource1' - ) -) -.addTemplate( - grafana.template.new( - label='Namespace', - multi=true, - query='{ "find" : "terms", "field": "labels.namespace.keyword", "query": "labels.namespace.keyword: /openshift-.*/ AND uuid.keyword: $uuid"}', - refresh=2, - name='namespace', - includeAll=true, - datasource='$datasource1' - ) -) -.addTemplate( - grafana.template.custom( - label='Latency percentile', - name='latencyPercentile', - current='P99', - query='P99, P95, P50', - multi=false, - includeAll=false, - ) -) -.addPanels( - [ - worker_count { gridPos: { x: 0, y: 0, w: 4, h: 3 } }, - metric_count_panel { gridPos: { x: 4, y: 0, w: 12, h: 3 } }, - openshift_version_panel { gridPos: { x: 16, y: 0, w: 6, h: 3 } }, - etcd_version_panel { gridPos: { x: 22, y: 0, w: 2, h: 3 } }, - summary_panel_1 { gridPos: { x: 0, y: 3, h: 2, w: 24 } }, - summary_panel_2 { gridPos: { x: 0, y: 5, h: 2, w: 24 } }, - ], -) -.addPanel( - grafana.row.new(title='Cluster status', collapse=true).addPanels( - [ - masters_cpu { gridPos: { x: 
0, y: 8, w: 12, h: 9 } }, - masters_memory { gridPos: { x: 12, y: 8, w: 12, h: 9 } }, - node_status_summary { gridPos: { x: 0, y: 17, w: 12, h: 8 } }, - pod_status_summary { gridPos: { x: 12, y: 17, w: 12, h: 8 } }, - kube_api_cpu { gridPos: { x: 0, y: 25, w: 12, h: 9 } }, - kube_api_memory { gridPos: { x: 12, y: 25, w: 12, h: 9 } }, - active_controller_manager_cpu { gridPos: { x: 0, y: 34, w: 12, h: 9 } }, - active_controller_manager_memory { gridPos: { x: 12, y: 34, w: 12, h: 9 } }, - kube_scheduler_cpu { gridPos: { x: 0, y: 43, w: 12, h: 9 } }, - kube_scheduler_memory { gridPos: { x: 12, y: 43, w: 12, h: 9 } }, - hypershift_controlplane_cpu { gridPos: { x: 0, y: 52, w: 12, h: 9 } }, - hypershift_controlplane_memory { gridPos: { x: 12, y: 52, w: 12, h: 9 } }, - ] - ), { x: 0, y: 7, w: 24, h: 1 } -) -.addPanel( - // Panels below for uncollapsed row. - grafana.row.new(title='Pod latency stats', collapse=false), { x: 0, y: 8, w: 24, h: 1 } -) -.addPanels( - [ - average_pod_latency { gridPos: { x: 0, y: 9, w: 12, h: 8 } }, - pod_latencies_summary { gridPos: { x: 12, y: 9, w: 12, h: 8 } }, - pod_conditions_latency { gridPos: { x: 0, y: 17, w: 24, h: 10 } }, - setup_latency { gridPos: { x: 0, y: 27, w: 12, h: 9 } }, - scheduling_throughput { gridPos: { x: 12, y: 27, w: 12, h: 9 } }, - ] -) -.addPanel( - grafana.row.new(title='OVNKubernetes', collapse=true).addPanels( - [ - ovnkube_master_cpu { gridPos: { x: 0, y: 80, w: 12, h: 8 } }, - ovnkube_master_memory { gridPos: { x: 12, y: 80, w: 12, h: 8 } }, - ovnkube_controller_cpu { gridPos: { x: 0, y: 88, w: 12, h: 8 } }, - ovnkube_controller_memory { gridPos: { x: 12, y: 88, w: 12, h: 8 } }, - ] - ), { x: 0, y: 36, w: 24, h: 1 } -) -.addPanel( - grafana.row.new(title='etcd', collapse=false), { x: 0, y: 37, w: 24, h: 1 } -) -.addPanels( - [ - etcd_fsync_latency { gridPos: { x: 0, y: 38, w: 12, h: 9 } }, - etcd_commit_latency { gridPos: { x: 12, y: 38, w: 12, h: 9 } }, - etcd_leader_changes { gridPos: { x: 0, y: 47, w: 12, 
h: 9 } }, - etcd_peer_roundtrip_time { gridPos: { x: 12, y: 47, w: 12, h: 9 } }, - etcd_cpu { gridPos: { x: 0, y: 56, w: 12, h: 9 } }, - etcd_memory { gridPos: { x: 12, y: 56, w: 12, h: 9 } }, - ], -) -.addPanel( - grafana.row.new(title='API and Kubeproxy', collapse=false), { x: 0, y: 65, w: 24, h: 1 } -) -.addPanels( - [ - api_latency_read_only_resource { gridPos: { x: 0, y: 66, w: 12, h: 9 } }, - api_latency_read_only_namespace { gridPos: { x: 12, y: 66, w: 12, h: 9 } }, - api_latency_read_only_cluster { gridPos: { x: 0, y: 75, w: 12, h: 9 } }, - api_latency_mutating { gridPos: { x: 12, y: 75, w: 12, h: 9 } }, - api_request_rate { gridPos: { x: 0, y: 84, w: 12, h: 9 } }, - service_sync_latency { gridPos: { x: 12, y: 84, w: 12, h: 9 } }, - ], -) - -.addPanel( - grafana.row.new(title='Cluster Kubelet & CRI-O', collapse=false), { x: 0, y: 93, w: 24, h: 1 } -) -.addPanels( - [ - kubelet_process_cpu { gridPos: { x: 0, y: 94, w: 12, h: 8 } }, - kubelet_process_memory { gridPos: { x: 12, y: 94, w: 12, h: 8 } }, - cri_o_process_cpu { gridPos: { x: 0, y: 103, w: 12, h: 8 } }, - cri_o_process_memory { gridPos: { x: 12, y: 103, w: 12, h: 8 } }, - ], -) - -.addPanel( - grafana.row.new(title='Master: $master', collapse=true, repeat='$master').addPanels( - [ - container_cpu_master { gridPos: { x: 0, y: 112, w: 12, h: 9 } }, - container_memory_master { gridPos: { x: 12, y: 112, w: 12, h: 9 } }, - cpu_master { gridPos: { x: 0, y: 121, w: 12, h: 9 } }, - memory_master { gridPos: { x: 12, y: 121, w: 12, h: 9 } }, - ] - ), { x: 0, y: 111, w: 24, h: 1 } -) - -.addPanel( - grafana.row.new(title='Worker: $worker', collapse=true, repeat='$worker').addPanels( - [ - container_cpu_worker { gridPos: { x: 0, y: 112, w: 12, h: 9 } }, - container_memory_worker { gridPos: { x: 12, y: 112, w: 12, h: 9 } }, - cpu_worker { gridPos: { x: 0, y: 121, w: 12, h: 9 } }, - memory_worker { gridPos: { x: 12, y: 121, w: 12, h: 9 } }, - ] - ), { x: 0, y: 111, w: 24, h: 1 } -) - -.addPanel( - 
grafana.row.new(title='Infra: $infra', collapse=true, repeat='$infra').addPanels( - [ - container_cpu_infra { gridPos: { x: 0, y: 131, w: 12, h: 9 } }, - container_memory_infra { gridPos: { x: 12, y: 131, w: 12, h: 9 } }, - cpu_infra { gridPos: { x: 0, y: 140, w: 12, h: 9 } }, - memory_infra { gridPos: { x: 12, y: 140, w: 12, h: 9 } }, - ] - ), { x: 0, y: 130, w: 24, h: 1 } -) - -.addPanel( - grafana.row.new(title='Aggregated worker nodes usage (only in aggregated metrics profile)', collapse=true).addPanels( - [ - agg_avg_cpu { gridPos: { x: 0, y: 150, w: 12, h: 9 } }, - agg_avg_mem { gridPos: { x: 12, y: 150, w: 12, h: 9 } }, - agg_container_cpu { gridPos: { x: 0, y: 159, w: 12, h: 9 } }, - agg_container_mem { gridPos: { x: 12, y: 159, w: 12, h: 9 } }, - ] - ), { x: 0, y: 149, w: 24, h: 1 } -) diff --git a/templates/General/ocp-performance-v2.jsonnet b/templates/General/ocp-performance-v2.jsonnet deleted file mode 100644 index c0734d2..0000000 --- a/templates/General/ocp-performance-v2.jsonnet +++ /dev/null @@ -1,145 +0,0 @@ -local panels = import '../../assets/ocp-performance/panels.libsonnet'; -local queries = import '../../assets/ocp-performance/queries.libsonnet'; -local variables = import '../../assets/ocp-performance/variables.libsonnet'; -local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; - -g.dashboard.new('Openshift Performance') -+ g.dashboard.withDescription(||| - Performance dashboard for Red Hat Openshift -|||) -+ g.dashboard.time.withFrom('now-1h') -+ g.dashboard.time.withTo('now') -+ g.dashboard.withTimezone('utc') -+ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) -+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) -+ g.dashboard.withRefresh('30s') -+ g.dashboard.withEditable(true) -+ g.dashboard.graphTooltip.withSharedCrosshair() -+ g.dashboard.withVariables([ - variables.datasource, - 
variables.master_node, - variables.worker_node, - variables.infra_node, - variables.namespace, - variables.block_device, - variables.net_device, - variables.interval, -]) -+ g.dashboard.withPanels([ - g.panel.row.new('OVN') - + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.timeSeries.generic('99% Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 1, w: 24, h: 4 }), - panels.timeSeries.generic('99% CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 0, y: 13, w: 12, h: 8 }), - panels.timeSeries.generic('99% CNI Request DEL Latency', 's', queries.ovnCNIDel.query(), { x: 12, y: 13, w: 12, h: 8 }), - panels.timeSeries.genericLegend('ovnkube-master CPU Usage', 'percent', queries.ovnKubeMasterCPU.query(), { x: 0, y: 21, w: 12, h: 8 }), - panels.timeSeries.genericLegend('ovnkube-master Memory Usage', 'bytes', queries.ovnKubeMasterMem.query(), { x: 12, y: 21, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 28, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 28, w: 12, h: 8 }), - ]), - g.panel.row.new('Monitoring stack') - + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.timeSeries.genericLegend('Prometheus Replica CPU', 'percent', queries.promReplCpuUsage.query(), { x: 0, y: 2, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Prometheus Replica RSS', 'bytes', queries.promReplMemUsage.query(), { x: 12, y: 2, w: 12, h: 8 }), - panels.timeSeries.genericLegend('metrics-server/prom-adapter CPU', 'percent', queries.metricsServerCpuUsage.query(), { x: 0, y: 10, w: 12, h: 8 }), - panels.timeSeries.genericLegend('metrics-server/prom-adapter RSS', 'bytes', queries.metricsServerMemUsage.query(), 
{ x: 12, y: 10, w: 12, h: 8 }), - ]), - g.panel.row.new('Stackrox') - + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.timeSeries.genericLegend('Top 25 stackrox container RSS bytes', 'bytes', queries.stackroxMem.query(), { x: 0, y: 2, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Top 25 stackrox container CPU percent', 'percent', queries.stackroxCPU.query(), { x: 12, y: 2, w: 12, h: 8 }), - ]), - g.panel.row.new('Cluster Kubelet') - + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.timeSeries.genericLegend('Top 10 Kubelet CPU usage', 'percent', queries.kubeletCPU.query(), { x: 0, y: 3, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Top 10 crio CPU usage', 'percent', queries.crioCPU.query(), { x: 12, y: 3, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Top 10 Kubelet memory usage', 'bytes', queries.kubeletMemory.query(), { x: 0, y: 11, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Top 10 crio memory usage', 'bytes', queries.crioMemory.query(), { x: 12, y: 11, w: 12, h: 8 }), - panels.timeSeries.genericLegend('inodes usage in /var/run', 'percent', queries.crioINodes.query(), { x: 0, y: 19, w: 24, h: 8 }), - ]), - g.panel.row.new('Cluster Details') - + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.stat.base('Current Node Count', queries.currentNodeCount.query(), { x: 0, y: 4, w: 8, h: 3 }), - panels.stat.base('Current Namespace Count', queries.currentNamespaceCount.query(), { x: 8, y: 4, w: 8, h: 3 }), - panels.stat.base('Current Pod Count', queries.currentPodCount.query(), { x: 16, y: 4, w: 8, h: 3 }), - panels.timeSeries.generic('Number of nodes', 'none', queries.currentNodeCount.query(), { x: 0, y: 12, w: 8, h: 8 }), - panels.timeSeries.generic('Namespace count', 'none', queries.nsCount.query(), { x: 8, y: 12, 
w: 8, h: 8 }), - panels.timeSeries.generic('Pod count', 'none', queries.podCount.query(), { x: 16, y: 12, w: 8, h: 8 }), - panels.timeSeries.generic('Secret & configmap count', 'none', queries.secretCmCount.query(), { x: 0, y: 20, w: 8, h: 8 }), - panels.timeSeries.generic('Deployment count', 'none', queries.deployCount.query(), { x: 8, y: 20, w: 8, h: 8 }), - panels.timeSeries.generic('Services count', 'none', queries.servicesCount.query(), { x: 16, y: 20, w: 8, h: 8 }), - panels.timeSeries.generic('Routes count', 'none', queries.routesCount.query(), { x: 0, y: 20, w: 8, h: 8 }), - panels.timeSeries.generic('Alerts', 'none', queries.alerts.query(), { x: 8, y: 20, w: 8, h: 8 }), - panels.timeSeries.genericLegend('Pod Distribution', 'none', queries.podDistribution.query(), { x: 16, y: 20, w: 8, h: 8 }), - panels.timeSeries.genericLegend('Top 10 container RSS', 'bytes', queries.top10ContMem.query(), { x: 0, y: 28, w: 24, h: 8 }), - panels.timeSeries.genericLegend('container RSS system.slice', 'bytes', queries.contMemRSSSystemSlice.query(), { x: 12, y: 28, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Top 10 container CPU', 'percent', queries.top10ContCPU.query(), { x: 0, y: 36, w: 12, h: 8 }), - panels.timeSeries.generic('Goroutines count', 'none', queries.goroutinesCount.query(), { x: 12, y: 36, w: 12, h: 8 }), - ]), - g.panel.row.new('Cluster Operators Details') - + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withPanels([ - panels.stat.base('Cluster operators overview', queries.clusterOperatorsOverview.query(), { x: 0, y: 4, w: 24, h: 3 }), - panels.timeSeries.genericLegend('Cluster operators information', 'none', queries.clusterOperatorsInformation.query(), { x: 0, y: 4, w: 8, h: 8 }), - panels.timeSeries.genericLegend('Cluster operators degraded', 'none', queries.clusterOperatorsDegraded.query(), { x: 8, y: 4, w: 8, h: 8 }), - ]), - g.panel.row.new('Master: $_master_node') - + 
g.panel.row.withGridPos({ x: 0, y: 0, w: 0, h: 8 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withRepeat('_master_node') - + g.panel.row.withPanels([ - panels.timeSeries.genericLegend('CPU Basic: $_master_node', 'percent', queries.nodeCPU.query('$_master_node'), { x: 0, y: 0, w: 12, h: 8 }), - panels.timeSeries.genericLegend('System Memory: $_master_node', 'bytes', queries.nodeMemory.query('$_master_node'), { x: 12, y: 0, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Disk throughput: $_master_node', 'Bps', queries.diskThroughput.query('$_master_node'), { x: 0, y: 8, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Disk IOPS: $_master_node', 'iops', queries.diskIOPS.query('$_master_node'), { x: 12, y: 8, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Network Utilization: $_master_node', 'bps', queries.networkUtilization.query('$_master_node'), { x: 0, y: 16, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Network Packets: $_master_node', 'pps', queries.networkPackets.query('$_master_node'), { x: 12, y: 16, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Network packets drop: $_master_node', 'pps', queries.networkDrop.query('$_master_node'), { x: 0, y: 24, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Conntrack stats: $_master_node', '', queries.conntrackStats.query('$_master_node'), { x: 12, y: 24, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Top 10 container CPU: $_master_node', 'percent', queries.top10ContainerCPU.query('$_master_node'), { x: 0, y: 24, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Top 10 container RSS: $_master_node', 'bytes', queries.top10ContainerRSS.query('$_master_node'), { x: 12, y: 24, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Container fs write rate: $_master_node', 'Bps', queries.containerWriteBytes.query('$_master_node'), { x: 0, y: 32, w: 12, h: 8 }), - ]), - g.panel.row.new('Worker: $_worker_node') - + g.panel.row.withGridPos({ x: 0, y: 0, w: 0, h: 8 }) - + 
g.panel.row.withCollapsed(true) - + g.panel.row.withRepeat('_worker_node') - + g.panel.row.withPanels([ - panels.timeSeries.genericLegend('CPU Basic: $_worker_node', 'percent', queries.nodeCPU.query('$_worker_node'), { x: 0, y: 0, w: 12, h: 8 }), - panels.timeSeries.genericLegend('System Memory: $_worker_node', 'bytes', queries.nodeMemory.query('$_worker_node'), { x: 12, y: 0, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Disk throughput: $_worker_node', 'Bps', queries.diskThroughput.query('$_worker_node'), { x: 0, y: 8, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Disk IOPS: $_worker_node', 'iops', queries.diskIOPS.query('$_worker_node'), { x: 12, y: 8, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Network Utilization: $_worker_node', 'bps', queries.networkUtilization.query('$_worker_node'), { x: 0, y: 16, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Network Packets: $_worker_node', 'pps', queries.networkPackets.query('$_worker_node'), { x: 12, y: 16, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Network packets drop: $_worker_node', 'pps', queries.networkDrop.query('$_worker_node'), { x: 0, y: 24, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Conntrack stats: $_worker_node', '', queries.conntrackStats.query('$_worker_node'), { x: 12, y: 24, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Top 10 container CPU: $_worker_node', 'percent', queries.top10ContainerCPU.query('$_worker_node'), { x: 0, y: 24, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainerRSS.query('$_worker_node'), { x: 12, y: 24, w: 12, h: 8 }), - ]), - g.panel.row.new('Infra: $_infra_node') - + g.panel.row.withGridPos({ x: 0, y: 0, w: 0, h: 8 }) - + g.panel.row.withCollapsed(true) - + g.panel.row.withRepeat('_infra_node') - + g.panel.row.withPanels([ - panels.timeSeries.genericLegend('CPU Basic: $_infra_node', 'percent', queries.nodeCPU.query('$_infra_node'), { x: 0, y: 0, w: 12, h: 8 }), - 
panels.timeSeries.genericLegend('System Memory: $_infra_node', 'bytes', queries.nodeMemory.query('$_infra_node'), { x: 12, y: 0, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Disk throughput: $_infra_node', 'Bps', queries.diskThroughput.query('$_infra_node'), { x: 0, y: 8, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Disk IOPS: $_infra_node', 'iops', queries.diskIOPS.query('$_infra_node'), { x: 12, y: 8, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Network Utilization: $_infra_node', 'bps', queries.networkUtilization.query('$_infra_node'), { x: 0, y: 16, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Network Packets: $_infra_node', 'pps', queries.networkPackets.query('$_infra_node'), { x: 12, y: 16, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Network packets drop: $_infra_node', 'pps', queries.networkDrop.query('$_infra_node'), { x: 0, y: 24, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Conntrack stats: $_infra_node', '', queries.conntrackStats.query('$_infra_node'), { x: 12, y: 24, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Top 10 container CPU: $_infra_node', 'percent', queries.top10ContainerCPU.query('$_infra_node'), { x: 0, y: 24, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Top 10 container RSS: $_infra_node', 'bytes', queries.top10ContainerRSS.query('$_infra_node'), { x: 12, y: 24, w: 12, h: 8 }), - ]), -]) diff --git a/templates/General/ocp-performance.jsonnet b/templates/General/ocp-performance.jsonnet index 96d1702..c0734d2 100644 --- a/templates/General/ocp-performance.jsonnet +++ b/templates/General/ocp-performance.jsonnet @@ -1,710 +1,145 @@ -local grafana = import '../grafonnet-lib/grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; - - -// Helper functions - -local genericGraphPanel(title, format) = grafana.graphPanel.new( - title=title, - datasource='$datasource', - format=format, - nullPointMode='null as zero', - sort='decreasing', - legend_alignAsTable=true, -); - -local 
genericGraphLegendPanel(title, format) = grafana.graphPanel.new( - title=title, - datasource='$datasource', - format=format, - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_min=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - legend_sortDesc='true', - nullPointMode='null as zero', - sort='decreasing', -); - - -local nodeMemory(nodeName) = genericGraphLegendPanel('System Memory: ' + nodeName, 'bytes').addTarget( - prometheus.target( - 'node_memory_Active_bytes{instance=~"' + nodeName + '"}', - legendFormat='Active', - ) -).addTarget( - prometheus.target( - 'node_memory_MemTotal_bytes{instance=~"' + nodeName + '"}', - legendFormat='Total', - ) -).addTarget( - prometheus.target( - 'node_memory_Cached_bytes{instance=~"' + nodeName + '"} + node_memory_Buffers_bytes{instance=~"' + nodeName + '"}', - legendFormat='Cached + Buffers', - ) -).addTarget( - prometheus.target( - 'node_memory_MemAvailable_bytes{instance=~"' + nodeName + '"}', - legendFormat='Available', - ) -).addTarget( - prometheus.target( - '(node_memory_MemTotal_bytes{instance=~"' + nodeName + '"} - (node_memory_MemFree_bytes{instance=~"' + nodeName + '"} + node_memory_Buffers_bytes{instance=~"' + nodeName + '"} + node_memory_Cached_bytes{instance=~"' + nodeName + '"}))', - legendFormat='Used', - ) -); - - -local nodeCPU(nodeName) = genericGraphLegendPanel('CPU Basic: ' + nodeName, 'percent').addTarget( - prometheus.target( - 'sum by (instance, mode)(irate(node_cpu_seconds_total{instance=~"' + nodeName + '",job=~".*"}[$interval])) * 100', - legendFormat='Busy {{mode}}', - ) -); - - -local diskThroughput(nodeName) = genericGraphLegendPanel('Disk throughput: ' + nodeName, 'Bps').addTarget( - prometheus.target( - 'rate(node_disk_read_bytes_total{device=~"$block_device",instance=~"' + nodeName + '"}[$interval])', - legendFormat='{{ device }} - read', - ) -).addTarget( - prometheus.target( - 
'rate(node_disk_written_bytes_total{device=~"$block_device",instance=~"' + nodeName + '"}[$interval])', - legendFormat='{{ device }} - write', - ) -); - -local diskIOPS(nodeName) = genericGraphLegendPanel('Disk IOPS: ' + nodeName, 'iops').addTarget( - prometheus.target( - 'rate(node_disk_reads_completed_total{device=~"$block_device",instance=~"' + nodeName + '"}[$interval])', - legendFormat='{{ device }} - read', - ) -).addTarget( - prometheus.target( - 'rate(node_disk_writes_completed_total{device=~"$block_device",instance=~"' + nodeName + '"}[$interval])', - legendFormat='{{ device }} - write', - ) -); - -local networkUtilization(nodeName) = genericGraphLegendPanel('Network Utilization: ' + nodeName, 'bps').addTarget( - prometheus.target( - 'rate(node_network_receive_bytes_total{instance=~"' + nodeName + '",device=~"$net_device"}[$interval]) * 8', - legendFormat='{{instance}} - {{device}} - RX', - ) -).addTarget( - prometheus.target( - 'rate(node_network_transmit_bytes_total{instance=~"' + nodeName + '",device=~"$net_device"}[$interval]) * 8', - legendFormat='{{instance}} - {{device}} - TX', - ) -); - -local networkPackets(nodeName) = genericGraphLegendPanel('Network Packets: ' + nodeName, 'pps').addTarget( - prometheus.target( - 'rate(node_network_receive_packets_total{instance=~"' + nodeName + '",device=~"$net_device"}[$interval])', - legendFormat='{{instance}} - {{device}} - RX', - ) -).addTarget( - prometheus.target( - 'rate(node_network_transmit_packets_total{instance=~"' + nodeName + '",device=~"$net_device"}[$interval])', - legendFormat='{{instance}} - {{device}} - TX', - ) -); - -local networkDrop(nodeName) = genericGraphLegendPanel('Network packets drop: ' + nodeName, 'pps').addTarget( - prometheus.target( - 'topk(10, rate(node_network_receive_drop_total{instance=~"' + nodeName + '"}[$interval]))', - legendFormat='rx-drop-{{ device }}', - ) -).addTarget( - prometheus.target( - 'topk(10,rate(node_network_transmit_drop_total{instance=~"' + nodeName + 
'"}[$interval]))', - legendFormat='tx-drop-{{ device }}', - ) -); - -local conntrackStats(nodeName) = genericGraphLegendPanel('Conntrack stats: ' + nodeName, '') - { - seriesOverrides: [{ - alias: 'conntrack_limit', - yaxis: 2, - }], - yaxes: [{ show: true }, { show: true }], -} - .addTarget( - prometheus.target( - 'node_nf_conntrack_entries{instance=~"' + nodeName + '"}', - legendFormat='conntrack_entries', - ) -).addTarget( - prometheus.target( - 'node_nf_conntrack_entries_limit{instance=~"' + nodeName + '"}', - legendFormat='conntrack_limit', - ) -); - -local top10ContainerCPU(nodeName) = genericGraphLegendPanel('Top 10 container CPU: ' + nodeName, 'percent').addTarget( - prometheus.target( - 'topk(10, sum(irate(container_cpu_usage_seconds_total{container!="POD",name!="",node=~"' + nodeName + '",namespace!="",namespace=~"$namespace"}[$interval])) by (pod,container,namespace,name,service) * 100)', - legendFormat='{{ pod }}: {{ container }}', - ) -); - -local top10ContainerRSS(nodeName) = genericGraphLegendPanel('Top 10 container RSS: ' + nodeName, 'bytes').addTarget( - prometheus.target( - 'topk(10, container_memory_rss{container!="POD",name!="",node=~"' + nodeName + '",namespace!="",namespace=~"$namespace"})', - legendFormat='{{ pod }}: {{ container }}', - ) -); - -local containerWriteBytes(nodeName) = genericGraphLegendPanel('Container fs write rate: ' + nodeName, 'Bps').addTarget( - prometheus.target( - 'sum(rate(container_fs_writes_bytes_total{device!~".+dm.+", node=~"' + nodeName + '", container!=""}[$interval])) by (device, container)', - legendFormat='{{ container }}: {{ device }}', - ) -); - -// Individual panel definitions - -// Stackrox -local stackroxCPU = genericGraphLegendPanel('Top 25 stackrox container CPU percent', 'percent').addTarget( - prometheus.target( - 'topk(25, sum(irate(container_cpu_usage_seconds_total{container!="POD",name!="",namespace!="",namespace=~"stackrox"}[$interval])) by (pod,container,namespace,name,service) * 100)', - 
legendFormat='{{ pod }}: {{ container }}', - ) -); -local stackroxMem = genericGraphLegendPanel('Top 25 stackrox container RSS bytes', 'bytes').addTarget( - prometheus.target( - 'topk(25, container_memory_rss{container!="POD",name!="",namespace!="",namespace=~"stackrox"})', - legendFormat='{{ pod }}: {{ container }}', - ) -); - -// OVN -local ovnAnnotationLatency = genericGraphPanel('99% Pod Annotation Latency', 's').addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum by (pod, le) (rate(ovnkube_controller_pod_creation_latency_seconds_bucket[$interval]))) > 0', - legendFormat='{{pod}}', - ) -); - -local ovnCNIAdd = genericGraphPanel('99% CNI Request ADD Latency', 's').addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(ovnkube_node_cni_request_duration_seconds_bucket{command="ADD"}[$interval])) by (pod,le)) > 0', - legendFormat='{{pod}}', - ) -); - -local ovnCNIDel = genericGraphPanel('99% CNI Request DEL Latency', 's').addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(ovnkube_node_cni_request_duration_seconds_bucket{command="DEL"}[$interval])) by (pod,le)) > 0', - legendFormat='{{pod}}', - ) -); - -local ovnKubeMasterMem = genericGraphLegendPanel('ovnkube-master Memory Usage', 'bytes').addTarget( - prometheus.target( - 'container_memory_rss{pod=~"ovnkube-master-.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}', - legendFormat='{{container}}-{{pod}}-{{node}}', - ) -); - -local ovnKubeMasterCPU = genericGraphLegendPanel('ovnkube-master CPU Usage', 'percent').addTarget( - prometheus.target( - 'irate(container_cpu_usage_seconds_total{pod=~"ovnkube-master.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}[$interval])*100', - legendFormat='{{container}}-{{pod}}-{{node}}', - ) -); - -local topOvnControllerCPU = genericGraphLegendPanel('Top 10 ovn-controller CPU Usage', 'percent').addTarget( - prometheus.target( - 'topk(10, 
irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[$interval])*100)', - legendFormat='{{node}}', - ) -); - -local topOvnControllerMem = genericGraphLegendPanel('Top 10 ovn-controller Memory Usage', 'bytes').addTarget( - prometheus.target( - 'topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (node))', - legendFormat='{{node}}', - ) -); - - -// Monitoring Stack - -local promReplMemUsage = genericGraphLegendPanel('Prometheus Replica Memory usage', 'bytes').addTarget( - prometheus.target( - 'sum(container_memory_rss{pod="prometheus-k8s-1",namespace!="",name!="",container="prometheus"}) by (pod)', - legendFormat='{{pod}}', - ) -).addTarget( - prometheus.target( - 'sum(container_memory_rss{pod="prometheus-k8s-0",namespace!="",name!="",container="prometheus"}) by (pod)', - legendFormat='{{pod}}', - ) -); - -// Kubelet - -local kubeletCPU = genericGraphLegendPanel('Top 10 Kubelet CPU usage', 'percent').addTarget( - prometheus.target( - 'topk(10,irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[$interval])*100)', - legendFormat='kubelet - {{node}}', - ) -); - -local crioCPU = genericGraphLegendPanel('Top 10 crio CPU usage', 'percent').addTarget( - prometheus.target( - 'topk(10,irate(process_cpu_seconds_total{service="kubelet",job="crio"}[$interval])*100)', - legendFormat='crio - {{node}}', - ) -); - -local kubeletMemory = genericGraphLegendPanel('Top 10 Kubelet memory usage', 'bytes').addTarget( - prometheus.target( - 'topk(10,process_resident_memory_bytes{service="kubelet",job="kubelet"})', - legendFormat='kubelet - {{node}}', - ) -); - -local crioMemory = genericGraphLegendPanel('Top 10 crio memory usage', 'bytes').addTarget( - prometheus.target( - 'topk(10,process_resident_memory_bytes{service="kubelet",job="crio"})', - legendFormat='crio - {{node}}', - ) -); - -local crioINodes = 
genericGraphLegendPanel('inodes usage in /var/run', 'percent').addTarget( - prometheus.target( - '(1 - node_filesystem_files_free{fstype!="",mountpoint="/run"} / node_filesystem_files{fstype!="",mountpoint="/run"}) * 100', - legendFormat='/var/run - {{instance}}', - ) -); - -// Cluster details - -local current_node_count = grafana.statPanel.new( - title='Current Node Count', - datasource='$datasource', - reducerFunction='last', -).addTarget( - prometheus.target( - 'sum(kube_node_info{})', - legendFormat='Number of nodes', - ) -).addTarget( - prometheus.target( - 'sum(kube_node_status_condition{status="true"}) by (condition) > 0', - legendFormat='Node: {{ condition }}', - ) -); - -local current_namespace_count = grafana.statPanel.new( - title='Current namespace Count', - datasource='$datasource', - reducerFunction='last', -).addTarget( - prometheus.target( - 'sum(kube_namespace_status_phase) by (phase)', - legendFormat='{{ phase }}', - ) -); - -local current_pod_count = grafana.statPanel.new( - title='Current Pod Count', - reducerFunction='last', - datasource='$datasource', -).addTarget( - prometheus.target( - 'sum(kube_pod_status_phase{}) by (phase) > 0', - legendFormat='{{ phase}} Pods', - ) -); - -local nodeCount = genericGraphPanel('Number of nodes', 'none').addTarget( - prometheus.target( - 'sum(kube_node_info{})', - legendFormat='Number of nodes', - ) -).addTarget( - prometheus.target( - 'sum(kube_node_status_condition{status="true"}) by (condition) > 0', - legendFormat='Node: {{ condition }}', - ) -); - -local nsCount = genericGraphPanel('Namespace count', 'none').addTarget( - prometheus.target( - 'sum(kube_namespace_status_phase) by (phase) > 0', - legendFormat='{{ phase }} namespaces', - ) -); - -local podCount = genericGraphPanel('Pod count', 'none').addTarget( - prometheus.target( - 'sum(kube_pod_status_phase{}) by (phase)', - legendFormat='{{phase}} pods', - ) -); - -local secretCmCount = genericGraphPanel('Secret & configmap count', 'none').addTarget( - 
prometheus.target( - 'count(kube_secret_info{})', - legendFormat='secrets', - ) -).addTarget( - prometheus.target( - 'count(kube_configmap_info{})', - legendFormat='Configmaps', - ) -); - -local deployCount = genericGraphPanel('Deployment count', 'none').addTarget( - prometheus.target( - 'count(kube_deployment_labels{})', - legendFormat='Deployments', - ) -); - - -local servicesCount = genericGraphPanel('Services count', 'none').addTarget( - prometheus.target( - 'count(kube_service_info{})', - legendFormat='Services', - ) -); - -local routesCount = genericGraphPanel('Routes count', 'none').addTarget( - prometheus.target( - 'count(openshift_route_info{})', - legendFormat='Routes', - ) -); - -local alerts = genericGraphPanel('Alerts', 'none').addTarget( - prometheus.target( - 'topk(10,sum(ALERTS{severity!="none"}) by (alertname, severity))', - legendFormat='{{severity}}: {{alertname}}', - ) -); - -local top10ContMem = genericGraphLegendPanel('Top 10 container RSS', 'bytes').addTarget( - prometheus.target( - 'topk(10, container_memory_rss{namespace!="",container!="POD",name!=""})', - legendFormat='{{ namespace }} - {{ name }}', - ) -); - -local contMemRSSSystemSlice = genericGraphLegendPanel('container RSS system.slice', 'bytes').addTarget( - prometheus.target( - 'sum by (node)(container_memory_rss{id="/system.slice"})', - legendFormat='system.slice - {{ node }}', - ) -); - -local podDistribution = genericGraphLegendPanel('Pod Distribution', 'none').addTarget( - prometheus.target( - 'count(kube_pod_info{}) by (node)', - legendFormat='{{ node }}', - ) -); - -local top10ContCPU = genericGraphLegendPanel('Top 10 container CPU', 'percent').addTarget( - prometheus.target( - 'topk(10,irate(container_cpu_usage_seconds_total{namespace!="",container!="POD",name!=""}[$interval])*100)', - legendFormat='{{ namespace }} - {{ name }}', - ) -); - - -local goroutines_count = genericGraphPanel('Goroutines count', 'none').addTarget( - prometheus.target( - 'topk(10, sum(go_goroutines{}) 
by (job,instance))', - legendFormat='{{ job }} - {{ instance }}', - ) -); - -// Cluster operators - -local clusterOperatorsOverview = grafana.statPanel.new( - datasource='$datasource', - title='Cluster operators overview', -).addTarget( - prometheus.target( - 'sum by (condition)(cluster_operator_conditions{condition!=""})', - legendFormat='{{ condition }}', - ) -); - -local clusterOperatorsInformation = genericGraphLegendPanel('Cluster operators information', 'none').addTarget( - prometheus.target( - 'cluster_operator_conditions{name!="",reason!=""}', - legendFormat='{{name}} - {{reason}}', - ) -); - -local clusterOperatorsDegraded = genericGraphLegendPanel('Cluster operators degraded', 'none').addTarget( - prometheus.target( - 'cluster_operator_conditions{condition="Degraded",name!="",reason!=""}', - legendFormat='{{name}} - {{reason}}', - ) -); - - -// Dashboard - -grafana.dashboard.new( - 'OpenShift Performance', - description='Performance dashboard for Red Hat OpenShift', - time_from='now-1h', - timezone='utc', - refresh='30s', - editable='true', -) - - -// Templates - -.addTemplate( - grafana.template.datasource( - 'datasource', - 'prometheus', - '', - regex='/^Cluster Prometheus$/', - ) -) - -.addTemplate( - grafana.template.new( - '_master_node', - '$datasource', - 'label_values(kube_node_role{role="master"}, node)', - '', - refresh=2, - ) { - label: 'Master', - type: 'query', - multi: true, - includeAll: false, - } -) - -.addTemplate( - grafana.template.new( - '_worker_node', - '$datasource', - 'label_values(kube_node_role{role=~"work.*"}, node)', - '', - refresh=2, - ) { - label: 'Worker', - type: 'query', - multi: true, - includeAll: false, - }, -) - -.addTemplate( - grafana.template.new( - '_infra_node', - '$datasource', - 'label_values(kube_node_role{role="infra"}, node)', - '', - refresh=2, - ) { - label: 'Infra', - type: 'query', - multi: true, - includeAll: false, - }, -) - - -.addTemplate( - grafana.template.new( - 'namespace', - '$datasource', - 
'label_values(kube_pod_info{namespace!="(cluster-density.*|node-density-.*)"}, namespace)', - '', - regex='', - refresh=2, - ) { - label: 'Namespace', - type: 'query', - multi: false, - includeAll: true, - }, -) - - -.addTemplate( - grafana.template.new( - 'block_device', - '$datasource', - 'label_values(node_disk_written_bytes_total,device)', - '', - regex='/^(?:(?!dm|rb).)*$/', - refresh=2, - ) { - label: 'Block device', - type: 'query', - multi: true, - includeAll: true, - }, -) - - -.addTemplate( - grafana.template.new( - 'net_device', - '$datasource', - 'label_values(node_network_receive_bytes_total,device)', - '', - regex='/^((br|en|et).*)$/', - refresh=2, - ) { - label: 'Network device', - type: 'query', - multi: true, - includeAll: true, - }, -) - -.addTemplate( - grafana.template.new( - 'interval', - '$datasource', - '$__auto_interval_period', - label='interval', - refresh='time', - ) { - type: 'interval', - query: '2m,3m,4m,5m', - auto: false, - }, -) - -// Dashboard definition - -.addPanel( - grafana.row.new(title='OVN', collapse=true).addPanels( - [ - ovnAnnotationLatency { gridPos: { x: 0, y: 1, w: 24, h: 12 } }, - ovnCNIAdd { gridPos: { x: 0, y: 13, w: 12, h: 8 } }, - ovnCNIDel { gridPos: { x: 12, y: 13, w: 12, h: 8 } }, - ovnKubeMasterCPU { gridPos: { x: 0, y: 21, w: 12, h: 8 } }, - ovnKubeMasterMem { gridPos: { x: 12, y: 21, w: 12, h: 8 } }, - topOvnControllerCPU { gridPos: { x: 0, y: 28, w: 12, h: 8 } }, - topOvnControllerMem { gridPos: { x: 12, y: 28, w: 12, h: 8 } }, - ] - ), { gridPos: { x: 0, y: 0, w: 24, h: 1 } } -) - -.addPanel( - grafana.row.new(title='Monitoring stack', collapse=true) - .addPanel(promReplMemUsage, gridPos={ x: 0, y: 2, w: 24, h: 12 }) - , { gridPos: { x: 0, y: 1, w: 24, h: 1 } } -) - -.addPanel( - grafana.row.new(title='Stackrox', collapse=true).addPanels( - [ - stackroxMem { gridPos: { x: 0, y: 2, w: 24, h: 12 } }, - stackroxCPU { gridPos: { x: 0, y: 2, w: 24, h: 12 } }, - ] - ) - , { gridPos: { x: 0, y: 1, w: 24, h: 1 } } 
-) - - -.addPanel( - grafana.row.new(title='Cluster Kubelet', collapse=true).addPanels( - [ - kubeletCPU { gridPos: { x: 0, y: 3, w: 12, h: 8 } }, - crioCPU { gridPos: { x: 12, y: 3, w: 12, h: 8 } }, - kubeletMemory { gridPos: { x: 0, y: 11, w: 12, h: 8 } }, - crioMemory { gridPos: { x: 12, y: 11, w: 12, h: 8 } }, - crioINodes { gridPos: { x: 0, y: 19, w: 24, h: 8 } }, - ] - ), { gridPos: { x: 0, y: 2, w: 24, h: 1 } } -) - -.addPanel(grafana.row.new(title='Cluster Details', collapse=true).addPanels( - [ - current_node_count { gridPos: { x: 0, y: 4, w: 8, h: 3 } }, - current_namespace_count { gridPos: { x: 8, y: 4, w: 8, h: 3 } }, - current_pod_count { gridPos: { x: 16, y: 4, w: 8, h: 3 } }, - nodeCount { gridPos: { x: 0, y: 12, w: 8, h: 8 } }, - nsCount { gridPos: { x: 8, y: 12, w: 8, h: 8 } }, - podCount { gridPos: { x: 16, y: 12, w: 8, h: 8 } }, - secretCmCount { gridPos: { x: 0, y: 20, w: 8, h: 8 } }, - deployCount { gridPos: { x: 8, y: 20, w: 8, h: 8 } }, - servicesCount { gridPos: { x: 16, y: 20, w: 8, h: 8 } }, - routesCount { gridPos: { x: 0, y: 20, w: 8, h: 8 } }, - alerts { gridPos: { x: 8, y: 20, w: 8, h: 8 } }, - podDistribution { gridPos: { x: 16, y: 20, w: 8, h: 8 } }, - top10ContMem { gridPos: { x: 0, y: 28, w: 12, h: 8 } }, - contMemRSSSystemSlice { gridPos: { x: 12, y: 28, w: 12, h: 8 } }, - top10ContCPU { gridPos: { x: 0, y: 36, w: 12, h: 8 } }, - goroutines_count { gridPos: { x: 12, y: 36, w: 12, h: 8 } }, - ] -), { gridPos: { x: 0, y: 3, w: 24, h: 1 } }) - -.addPanel(grafana.row.new(title='Cluster Operators Details', collapse=true).addPanels( - [ - clusterOperatorsOverview { gridPos: { x: 0, y: 4, w: 24, h: 3 } }, - clusterOperatorsInformation { gridPos: { x: 0, y: 4, w: 8, h: 8 } }, - clusterOperatorsDegraded { gridPos: { x: 8, y: 4, w: 8, h: 8 } }, - ], -), { gridPos: { x: 0, y: 4, w: 24, h: 1 } }) - -.addPanel(grafana.row.new(title='Master: $_master_node', collapse=true, repeat='_master_node').addPanels( - [ - nodeCPU('$_master_node') { 
gridPos: { x: 0, y: 0, w: 12, h: 8 } }, - nodeMemory('$_master_node') { gridPos: { x: 12, y: 0, w: 12, h: 8 } }, - diskThroughput('$_master_node') { gridPos: { x: 0, y: 8, w: 12, h: 8 } }, - diskIOPS('$_master_node') { gridPos: { x: 12, y: 8, w: 12, h: 8 } }, - networkUtilization('$_master_node') { gridPos: { x: 0, y: 16, w: 12, h: 8 } }, - networkPackets('$_master_node') { gridPos: { x: 12, y: 16, w: 12, h: 8 } }, - networkDrop('$_master_node') { gridPos: { x: 0, y: 24, w: 12, h: 8 } }, - conntrackStats('$_master_node') { gridPos: { x: 12, y: 24, w: 12, h: 8 } }, - top10ContainerCPU('$_master_node') { gridPos: { x: 0, y: 24, w: 12, h: 8 } }, - top10ContainerRSS('$_master_node') { gridPos: { x: 12, y: 24, w: 12, h: 8 } }, - containerWriteBytes('$_master_node') { gridPos: { x: 0, y: 32, w: 12, h: 8 } }, - ], -), { gridPos: { x: 0, y: 1, w: 0, h: 8 } }) - -.addPanel(grafana.row.new(title='Worker: $_worker_node', collapse=true, repeat='_worker_node').addPanels( - [ - nodeCPU('$_worker_node') { gridPos: { x: 0, y: 0, w: 12, h: 8 } }, - nodeMemory('$_worker_node') { gridPos: { x: 12, y: 0, w: 12, h: 8 } }, - diskThroughput('$_worker_node') { gridPos: { x: 0, y: 8, w: 12, h: 8 } }, - diskIOPS('$_worker_node') { gridPos: { x: 12, y: 8, w: 12, h: 8 } }, - networkUtilization('$_worker_node') { gridPos: { x: 0, y: 16, w: 12, h: 8 } }, - networkPackets('$_worker_node') { gridPos: { x: 12, y: 16, w: 12, h: 8 } }, - networkDrop('$_worker_node') { gridPos: { x: 0, y: 24, w: 12, h: 8 } }, - conntrackStats('$_worker_node') { gridPos: { x: 12, y: 24, w: 12, h: 8 } }, - top10ContainerCPU('$_worker_node') { gridPos: { x: 0, y: 32, w: 12, h: 8 } }, - top10ContainerRSS('$_worker_node') { gridPos: { x: 12, y: 32, w: 12, h: 8 } }, - ], -), { gridPos: { x: 0, y: 1, w: 0, h: 8 } }) - -.addPanel(grafana.row.new(title='Infra: $_infra_node', collapse=true, repeat='_infra_node').addPanels( - [ - nodeCPU('$_infra_node') { gridPos: { x: 0, y: 0, w: 12, h: 8 } }, - nodeMemory('$_infra_node') { 
gridPos: { x: 12, y: 0, w: 12, h: 8 } }, - diskThroughput('$_infra_node') { gridPos: { x: 0, y: 8, w: 12, h: 8 } }, - diskIOPS('$_infra_node') { gridPos: { x: 12, y: 8, w: 12, h: 8 } }, - networkUtilization('$_infra_node') { gridPos: { x: 0, y: 16, w: 12, h: 8 } }, - networkPackets('$_infra_node') { gridPos: { x: 12, y: 16, w: 12, h: 8 } }, - networkDrop('$_infra_node') { gridPos: { x: 0, y: 24, w: 12, h: 8 } }, - conntrackStats('$_infra_node') { gridPos: { x: 12, y: 24, w: 12, h: 8 } }, - top10ContainerCPU('$_infra_node') { gridPos: { x: 0, y: 24, w: 12, h: 8 } }, - top10ContainerRSS('$_infra_node') { gridPos: { x: 12, y: 24, w: 12, h: 8 } }, - ], -), { gridPos: { x: 0, y: 1, w: 0, h: 8 } }) +local panels = import '../../assets/ocp-performance/panels.libsonnet'; +local queries = import '../../assets/ocp-performance/queries.libsonnet'; +local variables = import '../../assets/ocp-performance/variables.libsonnet'; +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +g.dashboard.new('Openshift Performance') ++ g.dashboard.withDescription(||| + Performance dashboard for Red Hat Openshift +|||) ++ g.dashboard.time.withFrom('now-1h') ++ g.dashboard.time.withTo('now') ++ g.dashboard.withTimezone('utc') ++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) ++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) ++ g.dashboard.withRefresh('30s') ++ g.dashboard.withEditable(true) ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ g.dashboard.withVariables([ + variables.datasource, + variables.master_node, + variables.worker_node, + variables.infra_node, + variables.namespace, + variables.block_device, + variables.net_device, + variables.interval, +]) ++ g.dashboard.withPanels([ + g.panel.row.new('OVN') + + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + 
panels.timeSeries.generic('99% Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 1, w: 24, h: 4 }), + panels.timeSeries.generic('99% CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 0, y: 13, w: 12, h: 8 }), + panels.timeSeries.generic('99% CNI Request DEL Latency', 's', queries.ovnCNIDel.query(), { x: 12, y: 13, w: 12, h: 8 }), + panels.timeSeries.genericLegend('ovnkube-master CPU Usage', 'percent', queries.ovnKubeMasterCPU.query(), { x: 0, y: 21, w: 12, h: 8 }), + panels.timeSeries.genericLegend('ovnkube-master Memory Usage', 'bytes', queries.ovnKubeMasterMem.query(), { x: 12, y: 21, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 28, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 28, w: 12, h: 8 }), + ]), + g.panel.row.new('Monitoring stack') + + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.timeSeries.genericLegend('Prometheus Replica CPU', 'percent', queries.promReplCpuUsage.query(), { x: 0, y: 2, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Prometheus Replica RSS', 'bytes', queries.promReplMemUsage.query(), { x: 12, y: 2, w: 12, h: 8 }), + panels.timeSeries.genericLegend('metrics-server/prom-adapter CPU', 'percent', queries.metricsServerCpuUsage.query(), { x: 0, y: 10, w: 12, h: 8 }), + panels.timeSeries.genericLegend('metrics-server/prom-adapter RSS', 'bytes', queries.metricsServerMemUsage.query(), { x: 12, y: 10, w: 12, h: 8 }), + ]), + g.panel.row.new('Stackrox') + + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.timeSeries.genericLegend('Top 25 stackrox container RSS bytes', 'bytes', queries.stackroxMem.query(), { x: 0, y: 2, w: 12, h: 8 }), + 
panels.timeSeries.genericLegend('Top 25 stackrox container CPU percent', 'percent', queries.stackroxCPU.query(), { x: 12, y: 2, w: 12, h: 8 }), + ]), + g.panel.row.new('Cluster Kubelet') + + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.timeSeries.genericLegend('Top 10 Kubelet CPU usage', 'percent', queries.kubeletCPU.query(), { x: 0, y: 3, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Top 10 crio CPU usage', 'percent', queries.crioCPU.query(), { x: 12, y: 3, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Top 10 Kubelet memory usage', 'bytes', queries.kubeletMemory.query(), { x: 0, y: 11, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Top 10 crio memory usage', 'bytes', queries.crioMemory.query(), { x: 12, y: 11, w: 12, h: 8 }), + panels.timeSeries.genericLegend('inodes usage in /var/run', 'percent', queries.crioINodes.query(), { x: 0, y: 19, w: 24, h: 8 }), + ]), + g.panel.row.new('Cluster Details') + + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.stat.base('Current Node Count', queries.currentNodeCount.query(), { x: 0, y: 4, w: 8, h: 3 }), + panels.stat.base('Current Namespace Count', queries.currentNamespaceCount.query(), { x: 8, y: 4, w: 8, h: 3 }), + panels.stat.base('Current Pod Count', queries.currentPodCount.query(), { x: 16, y: 4, w: 8, h: 3 }), + panels.timeSeries.generic('Number of nodes', 'none', queries.currentNodeCount.query(), { x: 0, y: 12, w: 8, h: 8 }), + panels.timeSeries.generic('Namespace count', 'none', queries.nsCount.query(), { x: 8, y: 12, w: 8, h: 8 }), + panels.timeSeries.generic('Pod count', 'none', queries.podCount.query(), { x: 16, y: 12, w: 8, h: 8 }), + panels.timeSeries.generic('Secret & configmap count', 'none', queries.secretCmCount.query(), { x: 0, y: 20, w: 8, h: 8 }), + panels.timeSeries.generic('Deployment count', 'none', queries.deployCount.query(), { 
x: 8, y: 20, w: 8, h: 8 }), + panels.timeSeries.generic('Services count', 'none', queries.servicesCount.query(), { x: 16, y: 20, w: 8, h: 8 }), + panels.timeSeries.generic('Routes count', 'none', queries.routesCount.query(), { x: 0, y: 20, w: 8, h: 8 }), + panels.timeSeries.generic('Alerts', 'none', queries.alerts.query(), { x: 8, y: 20, w: 8, h: 8 }), + panels.timeSeries.genericLegend('Pod Distribution', 'none', queries.podDistribution.query(), { x: 16, y: 20, w: 8, h: 8 }), + panels.timeSeries.genericLegend('Top 10 container RSS', 'bytes', queries.top10ContMem.query(), { x: 0, y: 28, w: 24, h: 8 }), + panels.timeSeries.genericLegend('container RSS system.slice', 'bytes', queries.contMemRSSSystemSlice.query(), { x: 12, y: 28, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Top 10 container CPU', 'percent', queries.top10ContCPU.query(), { x: 0, y: 36, w: 12, h: 8 }), + panels.timeSeries.generic('Goroutines count', 'none', queries.goroutinesCount.query(), { x: 12, y: 36, w: 12, h: 8 }), + ]), + g.panel.row.new('Cluster Operators Details') + + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.stat.base('Cluster operators overview', queries.clusterOperatorsOverview.query(), { x: 0, y: 4, w: 24, h: 3 }), + panels.timeSeries.genericLegend('Cluster operators information', 'none', queries.clusterOperatorsInformation.query(), { x: 0, y: 4, w: 8, h: 8 }), + panels.timeSeries.genericLegend('Cluster operators degraded', 'none', queries.clusterOperatorsDegraded.query(), { x: 8, y: 4, w: 8, h: 8 }), + ]), + g.panel.row.new('Master: $_master_node') + + g.panel.row.withGridPos({ x: 0, y: 0, w: 0, h: 8 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withRepeat('_master_node') + + g.panel.row.withPanels([ + panels.timeSeries.genericLegend('CPU Basic: $_master_node', 'percent', queries.nodeCPU.query('$_master_node'), { x: 0, y: 0, w: 12, h: 8 }), + panels.timeSeries.genericLegend('System Memory: 
$_master_node', 'bytes', queries.nodeMemory.query('$_master_node'), { x: 12, y: 0, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Disk throughput: $_master_node', 'Bps', queries.diskThroughput.query('$_master_node'), { x: 0, y: 8, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Disk IOPS: $_master_node', 'iops', queries.diskIOPS.query('$_master_node'), { x: 12, y: 8, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Network Utilization: $_master_node', 'bps', queries.networkUtilization.query('$_master_node'), { x: 0, y: 16, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Network Packets: $_master_node', 'pps', queries.networkPackets.query('$_master_node'), { x: 12, y: 16, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Network packets drop: $_master_node', 'pps', queries.networkDrop.query('$_master_node'), { x: 0, y: 24, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Conntrack stats: $_master_node', '', queries.conntrackStats.query('$_master_node'), { x: 12, y: 24, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Top 10 container CPU: $_master_node', 'percent', queries.top10ContainerCPU.query('$_master_node'), { x: 0, y: 24, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Top 10 container RSS: $_master_node', 'bytes', queries.top10ContainerRSS.query('$_master_node'), { x: 12, y: 24, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Container fs write rate: $_master_node', 'Bps', queries.containerWriteBytes.query('$_master_node'), { x: 0, y: 32, w: 12, h: 8 }), + ]), + g.panel.row.new('Worker: $_worker_node') + + g.panel.row.withGridPos({ x: 0, y: 0, w: 0, h: 8 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withRepeat('_worker_node') + + g.panel.row.withPanels([ + panels.timeSeries.genericLegend('CPU Basic: $_worker_node', 'percent', queries.nodeCPU.query('$_worker_node'), { x: 0, y: 0, w: 12, h: 8 }), + panels.timeSeries.genericLegend('System Memory: $_worker_node', 'bytes', queries.nodeMemory.query('$_worker_node'), { x: 12, y: 0, 
w: 12, h: 8 }), + panels.timeSeries.genericLegend('Disk throughput: $_worker_node', 'Bps', queries.diskThroughput.query('$_worker_node'), { x: 0, y: 8, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Disk IOPS: $_worker_node', 'iops', queries.diskIOPS.query('$_worker_node'), { x: 12, y: 8, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Network Utilization: $_worker_node', 'bps', queries.networkUtilization.query('$_worker_node'), { x: 0, y: 16, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Network Packets: $_worker_node', 'pps', queries.networkPackets.query('$_worker_node'), { x: 12, y: 16, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Network packets drop: $_worker_node', 'pps', queries.networkDrop.query('$_worker_node'), { x: 0, y: 24, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Conntrack stats: $_worker_node', '', queries.conntrackStats.query('$_worker_node'), { x: 12, y: 24, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Top 10 container CPU: $_worker_node', 'percent', queries.top10ContainerCPU.query('$_worker_node'), { x: 0, y: 24, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainerRSS.query('$_worker_node'), { x: 12, y: 24, w: 12, h: 8 }), + ]), + g.panel.row.new('Infra: $_infra_node') + + g.panel.row.withGridPos({ x: 0, y: 0, w: 0, h: 8 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withRepeat('_infra_node') + + g.panel.row.withPanels([ + panels.timeSeries.genericLegend('CPU Basic: $_infra_node', 'percent', queries.nodeCPU.query('$_infra_node'), { x: 0, y: 0, w: 12, h: 8 }), + panels.timeSeries.genericLegend('System Memory: $_infra_node', 'bytes', queries.nodeMemory.query('$_infra_node'), { x: 12, y: 0, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Disk throughput: $_infra_node', 'Bps', queries.diskThroughput.query('$_infra_node'), { x: 0, y: 8, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Disk IOPS: $_infra_node', 'iops', 
queries.diskIOPS.query('$_infra_node'), { x: 12, y: 8, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Network Utilization: $_infra_node', 'bps', queries.networkUtilization.query('$_infra_node'), { x: 0, y: 16, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Network Packets: $_infra_node', 'pps', queries.networkPackets.query('$_infra_node'), { x: 12, y: 16, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Network packets drop: $_infra_node', 'pps', queries.networkDrop.query('$_infra_node'), { x: 0, y: 24, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Conntrack stats: $_infra_node', '', queries.conntrackStats.query('$_infra_node'), { x: 12, y: 24, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Top 10 container CPU: $_infra_node', 'percent', queries.top10ContainerCPU.query('$_infra_node'), { x: 0, y: 24, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Top 10 container RSS: $_infra_node', 'bytes', queries.top10ContainerRSS.query('$_infra_node'), { x: 12, y: 24, w: 12, h: 8 }), + ]), +]) diff --git a/templates/General/ovn-dashboard.jsonnet b/templates/General/ovn-dashboard.jsonnet index 93cd51e..081e169 100644 --- a/templates/General/ovn-dashboard.jsonnet +++ b/templates/General/ovn-dashboard.jsonnet @@ -1,319 +1,59 @@ -local grafana = import '../grafonnet-lib/grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; -local stat = grafana.statPanel; - -// Helper functions - -local genericGraphLegendPanel(title, format) = grafana.graphPanel.new( - title=title, - datasource='$datasource', - format=format, - legend_values=true, - legend_alignAsTable=true, - legend_max=true, - legend_avg=true, - legend_hideEmpty=true, - legend_hideZero=true, - legend_sort='max', - nullPointMode='null as zero', - sort='decreasing', -); - -local statPanel(title) = stat.new( - title=title, - datasource='$datasource', - reducerFunction='last', - graphMode='area', - textMode='name', -).addThresholds([ - { color: 'green', value: null }, - { color: 'orange', value: 
0 }, - { color: 'green', value: 1 }, -]); - -// Panel definitions - -local num_onv_controller = stat.new( - title='OVN controller', - datasource='$datasource', - reducerFunction='last', -).addTarget( - prometheus.target( - 'count(ovn_controller_monitor_all) by (namespace)', - ) -).addThresholds([ - { color: 'green', value: null }, -]); - - -local ovn_nbdb_leader = statPanel('OVN NBDB leader').addTarget( - prometheus.target( - 'ovn_db_cluster_server_role{server_role="leader",db_name="OVN_Northbound"}', - legendFormat='{{pod}}' - ) -); - -local ovn_sbdb_leader = statPanel('OVN SBDB leader').addTarget( - prometheus.target( - 'ovn_db_cluster_server_role{server_role="leader",db_name="OVN_Southbound"}', - legendFormat='{{pod}}' - ) -); - -local ovn_northd = statPanel('OVN Northd Status').addTarget( - prometheus.target( - 'ovn_northd_status', - legendFormat='{{pod}}' - ) -); - -local ovn_master_leader = statPanel('OVNKube Master').addTarget( - prometheus.target( - 'ovnkube_master_leader', - legendFormat='{{pod}}' - ) -); - - -local ovnKubeMasterMem = genericGraphLegendPanel('ovnkube-master Memory Usage', 'bytes').addTarget( - prometheus.target( - 'container_memory_rss{pod=~"ovnkube-master-.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}', - legendFormat='{{container}}-{{pod}}-{{node}}', - ) -); - -local ovnKubeMasterCPU = genericGraphLegendPanel('ovnkube-master CPU Usage', 'percent').addTarget( - prometheus.target( - 'irate(container_cpu_usage_seconds_total{pod=~"ovnkube-master.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}[2m])*100', - legendFormat='{{container}}-{{pod}}-{{node}}', - ) -); - -local topOvnControllerCPU = genericGraphLegendPanel('Top 10 ovn-controller CPU Usage', 'percent').addTarget( - prometheus.target( - 'topk(10, irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[2m])*100)', - legendFormat='{{node}}', - ) -); - -local topOvnControllerMem = 
genericGraphLegendPanel('Top 10 ovn-controller Memory Usage', 'bytes').addTarget( - prometheus.target( - 'topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (node))', - legendFormat='{{node}}', - ) -); - -local pod_latency = genericGraphLegendPanel('Pod creation Latency', 's').addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(ovnkube_master_pod_lsp_created_port_binding_duration_seconds_bucket[2m])) by (pod,le))', - legendFormat='{{pod}} - LSP created', - ) -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(ovnkube_master_pod_port_binding_port_binding_chassis_duration_seconds_bucket[2m])) by (pod,le))', - legendFormat='{{pod}} - Port Binding', - ) -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(ovnkube_master_pod_port_binding_chassis_port_binding_up_duration_seconds_bucket[2m])) by (pod,le))', - legendFormat='{{pod}} - Port Binding Up', - ) -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(ovnkube_master_pod_first_seen_lsp_created_duration_seconds_bucket[2m])) by (pod,le))', - legendFormat='{{pod}} - Pod First seen', - ) -); - -local sync_latency = genericGraphLegendPanel('Sync Service Latency', 's').addTarget( - prometheus.target( - 'rate(ovnkube_master_sync_service_latency_seconds_sum[2m])', - legendFormat='{{pod}} - Sync service latency', - ) -); - -local ovnkube_node_ready_latency = genericGraphLegendPanel('OVNKube Node Ready Latency', 's').addTarget( - prometheus.target( - 'ovnkube_node_ready_duration_seconds{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}', - legendFormat='{{pod}}', - ) -); - -local work_queue = genericGraphLegendPanel('OVNKube Master workqueue', 'short').addTarget( - prometheus.target( - 'rate(ovnkube_master_workqueue_adds_total[2m])', - legendFormat='{{pod}} - Rate of handled adds', - ) -); - -local work_queue_depth = genericGraphLegendPanel('OVNKube 
Master workqueue Depth', 'short').addTarget( - prometheus.target( - 'ovnkube_master_workqueue_depth', - legendFormat='{{pod}} - Depth of workqueue', - ) -); - -local work_queue_latency = genericGraphLegendPanel('OVNKube Master workqueue duration', 's').addTarget( - prometheus.target( - 'ovnkube_master_workqueue_longest_running_processor_seconds', - legendFormat='{{pod}} - Longest processor duration', - ) -); -local work_queue_unfinished_latency = genericGraphLegendPanel('OVNKube Master workqueue - Unfinished', 's').addTarget( - prometheus.target( - 'ovnkube_master_workqueue_unfinished_work_seconds', - legendFormat='{{pod}} - Unfinished work duration', - ) -); - -local ovnAnnotationLatency = genericGraphLegendPanel('Pod Annotation Latency', 's').addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum by (pod, le) (rate(ovnkube_controller_pod_creation_latency_seconds_bucket[2m]))) > 0', - legendFormat='{{pod}} - Pod Annotation latency', - ) -); - -local ovnCNIAdd = genericGraphLegendPanel('CNI Request ADD Latency', 's').addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(ovnkube_node_cni_request_duration_seconds_bucket{command="ADD"}[2m])) by (pod,le)) > 0', - legendFormat='{{pod}}', - ) -); - -local ovnCNIDel = genericGraphLegendPanel('CNI Request DEL Latency', 's').addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(ovnkube_node_cni_request_duration_seconds_bucket{command="DEL"}[2m])) by (pod,le)) > 0', - legendFormat='{{pod}}', - ) -); - -local ovnLatencyCalculate = genericGraphLegendPanel('Duration for OVN to apply network configuration', 's').addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket[2m])) by (pod, le))', - legendFormat='{{pod}} - Kind Pod', - ) -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket[2m])) by (service, le))', - legendFormat='{{service}} - Kind 
Service', - ) -); - -// Creating the dashboard from the panels described above. - -grafana.dashboard.new( - 'OVN Monitoring', - description='', - timezone='utc', - time_from='now-1h', - editable='true' -) - -.addTemplate( - grafana.template.datasource( - 'datasource', - 'prometheus', - '', - label='datasource' - ) -) - -.addTemplate( - grafana.template.new( - '_master_node', - '$datasource', - 'label_values(kube_node_role{role="master"}, node)', - '', - refresh=2, - ) { - label: 'Master', - type: 'query', - multi: true, - includeAll: false, - } -) - -.addTemplate( - grafana.template.new( - '_worker_node', - '$datasource', - 'label_values(kube_node_role{role=~"work.*"}, node)', - '', - refresh=2, - ) { - label: 'Worker', - type: 'query', - multi: true, - includeAll: false, - }, -) - -.addTemplate( - grafana.template.new( - 'master_pod', - '$datasource', - 'label_values({pod=~"ovnkube-master.*", namespace=~"openshift-ovn-kubernetes"}, pod)', - refresh=1, - ) { - label: 'OVNKube-Master', - type: 'query', - multi: true, - includeAll: false, - } -) - -.addTemplate( - grafana.template.new( - 'kubenode_pod', - '$datasource', - 'label_values({pod=~"ovnkube-node.*", namespace=~"openshift-ovn-kubernetes"}, pod)', - refresh=1, - ) { - label: 'OVNKube-Node', - type: 'query', - multi: true, - includeAll: false, - } -) - - -.addPanel( - grafana.row.new(title='OVN Resource Monitoring', collapse=true).addPanels( - [ - ovn_master_leader { gridPos: { x: 0, y: 0, w: 4, h: 4 } }, - ovn_northd { gridPos: { x: 4, y: 0, w: 4, h: 4 } }, - ovn_nbdb_leader { gridPos: { x: 8, y: 0, w: 4, h: 4 } }, - ovn_sbdb_leader { gridPos: { x: 12, y: 0, w: 4, h: 4 } }, - num_onv_controller { gridPos: { x: 16, y: 0, w: 4, h: 4 } }, - ovnKubeMasterCPU { gridPos: { x: 0, y: 4, w: 12, h: 10 } }, - ovnKubeMasterMem { gridPos: { x: 12, y: 4, w: 12, h: 10 } }, - topOvnControllerCPU { gridPos: { x: 0, y: 12, w: 12, h: 10 } }, - topOvnControllerMem { gridPos: { x: 12, y: 12, w: 12, h: 10 } }, - ] - ), { gridPos: 
{ x: 0, y: 0, w: 24, h: 1 } } -) - - -.addPanel( - grafana.row.new(title='Latency Monitoring', collapse=true).addPanels( - [ - ovnAnnotationLatency { gridPos: { x: 0, y: 0, w: 12, h: 10 } }, - ovnCNIAdd { gridPos: { x: 12, y: 0, w: 12, h: 10 } }, - pod_latency { gridPos: { x: 0, y: 8, w: 24, h: 10 } }, - sync_latency { gridPos: { x: 0, y: 16, w: 24, h: 10 } }, - ovnLatencyCalculate { gridPos: { x: 0, y: 24, w: 24, h: 10 } }, - ovnkube_node_ready_latency { gridPos: { x: 0, y: 32, w: 24, h: 10 } }, - ] - ), { gridPos: { x: 0, y: 0, w: 24, h: 1 } } -) - -.addPanel( - grafana.row.new(title='WorkQueue Monitoring', collapse=true).addPanels( - [ - work_queue { gridPos: { x: 0, y: 0, w: 12, h: 10 } }, - work_queue_depth { gridPos: { x: 12, y: 0, w: 12, h: 10 } }, - work_queue_latency { gridPos: { x: 0, y: 8, w: 12, h: 10 } }, - work_queue_unfinished_latency { gridPos: { x: 12, y: 8, w: 12, h: 10 } }, - ] - ), { gridPos: { x: 0, y: 0, w: 24, h: 1 } } -) +local panels = import '../../assets/ovn-monitoring/panels.libsonnet'; +local queries = import '../../assets/ovn-monitoring/queries.libsonnet'; +local variables = import '../../assets/ovn-monitoring/variables.libsonnet'; +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +g.dashboard.new('OVN-Monitoring-dashboard') ++ g.dashboard.time.withFrom('now-1h') ++ g.dashboard.time.withTo('now') ++ g.dashboard.withTimezone('utc') ++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) ++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) ++ g.dashboard.withRefresh('') ++ g.dashboard.withEditable(false) ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ g.dashboard.withVariables([ + variables.Datasource, + variables._master_node, + variables._worker_node, + variables.master_pod, + variables.kubenode_pod, +]) + + ++ g.dashboard.withPanels([ + g.panel.row.new('OVN Resource Monitoring') + + 
g.panel.row.withCollapsed(true) + + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) + + g.panel.row.withPanels([ + panels.stat.genericstatThresoldPanel('OVNKube Master', 'none', queries.ovnMasterLeader.query(), { x: 0, y: 0, w: 4, h: 4 }), + panels.stat.genericstatThresoldPanel('OVN Northd Status', 'none', queries.ovnNorthd.query(), { x: 4, y: 0, w: 4, h: 4 }), + panels.stat.genericstatThresoldPanel('OVN NBDB leader', 'none', queries.ovnNbdbLeader.query(), { x: 8, y: 0, w: 4, h: 4 }), + panels.stat.genericstatThresoldPanel('OVN SBDB leader', 'none', queries.ovnSbdbLeader.query(), { x: 12, y: 0, w: 4, h: 4 }), + panels.stat.genericstatThresoldOVNControllerPanel('OVN controller', 'none', queries.numOnvController.query(), { x: 16, y: 0, w: 4, h: 4 }), + panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-master CPU Usage', 'percent', queries.ovnKubeMasterCPU.query(), { x: 0, y: 4, w: 12, h: 10 }), + panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-master Memory Usage', 'bytes', queries.ovnKubeMasterMem.query(), { x: 12, y: 4, w: 12, h: 10 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 12, w: 12, h: 10 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 12, w: 12, h: 10 }), + ]), + g.panel.row.new('Latency Monitoring') + + g.panel.row.withCollapsed(true) + + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) + + g.panel.row.withPanels([ + panels.timeSeries.genericTimeSeriesLegendPanel('Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 0, w: 12, h: 10 }), + panels.timeSeries.genericTimeSeriesLegendPanel('CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 12, y: 0, w: 12, h: 10 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Pod creation Latency', 's', queries.podLatency.query(), { x: 0, y: 8, w: 24, h: 10 }), + 
panels.timeSeries.genericTimeSeriesLegendPanel('Sync Service Latency', 's', queries.synclatency.query(), { x: 0, y: 16, w: 24, h: 10 }), + panels.timeSeries.genericTimeSeriesLegendPanel('Duration for OVN to apply network configuration', 's', queries.ovnLatencyCalculate.query(), { x: 0, y: 24, w: 24, h: 10 }), + panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Node Ready Latency', 's', queries.ovnkubeNodeReadyLatency.query(), { x: 0, y: 32, w: 24, h: 10 }), + ]), + g.panel.row.new('WorkQueue Monitoring') + + g.panel.row.withCollapsed(true) + + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) + + g.panel.row.withPanels([ + panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue', 'short', queries.workQueue.query(), { x: 0, y: 0, w: 12, h: 10 }), + panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue Depth', 'short', queries.workQueueDepth.query(), { x: 12, y: 0, w: 12, h: 10 }), + panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue duration', 's', queries.workQueueLatency.query(), { x: 0, y: 8, w: 12, h: 10 }), + panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue - Unfinished', 's', queries.workQueueUnfinishedLatency.query(), { x: 12, y: 8, w: 12, h: 10 }), + ]), +]) diff --git a/templates/General/ovn-monitoring-v2.jsonnet b/templates/General/ovn-monitoring-v2.jsonnet deleted file mode 100644 index 081e169..0000000 --- a/templates/General/ovn-monitoring-v2.jsonnet +++ /dev/null @@ -1,59 +0,0 @@ -local panels = import '../../assets/ovn-monitoring/panels.libsonnet'; -local queries = import '../../assets/ovn-monitoring/queries.libsonnet'; -local variables = import '../../assets/ovn-monitoring/variables.libsonnet'; -local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; - -g.dashboard.new('OVN-Monitoring-dashboard') -+ g.dashboard.time.withFrom('now-1h') -+ g.dashboard.time.withTo('now') -+ g.dashboard.withTimezone('utc') -+ 
g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) -+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) -+ g.dashboard.withRefresh('') -+ g.dashboard.withEditable(false) -+ g.dashboard.graphTooltip.withSharedCrosshair() -+ g.dashboard.withVariables([ - variables.Datasource, - variables._master_node, - variables._worker_node, - variables.master_pod, - variables.kubenode_pod, -]) - - -+ g.dashboard.withPanels([ - g.panel.row.new('OVN Resource Monitoring') - + g.panel.row.withCollapsed(true) - + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) - + g.panel.row.withPanels([ - panels.stat.genericstatThresoldPanel('OVNKube Master', 'none', queries.ovnMasterLeader.query(), { x: 0, y: 0, w: 4, h: 4 }), - panels.stat.genericstatThresoldPanel('OVN Northd Status', 'none', queries.ovnNorthd.query(), { x: 4, y: 0, w: 4, h: 4 }), - panels.stat.genericstatThresoldPanel('OVN NBDB leader', 'none', queries.ovnNbdbLeader.query(), { x: 8, y: 0, w: 4, h: 4 }), - panels.stat.genericstatThresoldPanel('OVN SBDB leader', 'none', queries.ovnSbdbLeader.query(), { x: 12, y: 0, w: 4, h: 4 }), - panels.stat.genericstatThresoldOVNControllerPanel('OVN controller', 'none', queries.numOnvController.query(), { x: 16, y: 0, w: 4, h: 4 }), - panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-master CPU Usage', 'percent', queries.ovnKubeMasterCPU.query(), { x: 0, y: 4, w: 12, h: 10 }), - panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-master Memory Usage', 'bytes', queries.ovnKubeMasterMem.query(), { x: 12, y: 4, w: 12, h: 10 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 12, w: 12, h: 10 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 12, w: 12, h: 10 }), - ]), - 
g.panel.row.new('Latency Monitoring') - + g.panel.row.withCollapsed(true) - + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) - + g.panel.row.withPanels([ - panels.timeSeries.genericTimeSeriesLegendPanel('Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 0, w: 12, h: 10 }), - panels.timeSeries.genericTimeSeriesLegendPanel('CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 12, y: 0, w: 12, h: 10 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Pod creation Latency', 's', queries.podLatency.query(), { x: 0, y: 8, w: 24, h: 10 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Sync Service Latency', 's', queries.synclatency.query(), { x: 0, y: 16, w: 24, h: 10 }), - panels.timeSeries.genericTimeSeriesLegendPanel('Duration for OVN to apply network configuration', 's', queries.ovnLatencyCalculate.query(), { x: 0, y: 24, w: 24, h: 10 }), - panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Node Ready Latency', 's', queries.ovnkubeNodeReadyLatency.query(), { x: 0, y: 32, w: 24, h: 10 }), - ]), - g.panel.row.new('WorkQueue Monitoring') - + g.panel.row.withCollapsed(true) - + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) - + g.panel.row.withPanels([ - panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue', 'short', queries.workQueue.query(), { x: 0, y: 0, w: 12, h: 10 }), - panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue Depth', 'short', queries.workQueueDepth.query(), { x: 12, y: 0, w: 12, h: 10 }), - panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue duration', 's', queries.workQueueLatency.query(), { x: 0, y: 8, w: 12, h: 10 }), - panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue - Unfinished', 's', queries.workQueueUnfinishedLatency.query(), { x: 12, y: 8, w: 12, h: 10 }), - ]), -]) diff --git a/templates/General/pgbench-dashboard-v2.jsonnet b/templates/General/pgbench-dashboard-v2.jsonnet deleted file mode 100644 
index ef389db..0000000 --- a/templates/General/pgbench-dashboard-v2.jsonnet +++ /dev/null @@ -1,31 +0,0 @@ -local annotation = import '../../assets/pgbench-dashboard/annotation.libsonnet'; -local panels = import '../../assets/pgbench-dashboard/panels.libsonnet'; -local queries = import '../../assets/pgbench-dashboard/queries.libsonnet'; -local variables = import '../../assets/pgbench-dashboard/variables.libsonnet'; -local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; - -g.dashboard.new('Pgbench') -+ g.dashboard.time.withFrom('now/y') -+ g.dashboard.time.withTo('now') -+ g.dashboard.withTimezone('utc') -+ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) -+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) -+ g.dashboard.withRefresh('') -+ g.dashboard.withEditable(true) -+ g.dashboard.graphTooltip.withSharedCrosshair() -+ g.dashboard.withVariables([ - variables.Datasource1, - variables.Datasource2, - variables.uuid, - variables.user, -]) -+ g.dashboard.withAnnotations([ - annotation.run_start_timestamp, - annotation.sample_start_timestamp, -]) -+ g.dashboard.withPanels([ - panels.timeSeries.tps_report('TPS Report', 'ops', queries.tps_report.query(), { x: 0, y: 0, w: 12, h: 9 }), - panels.timeSeries.avg_tps('Overall Average TPS Per Run', 'ops', queries.avg_tps.query(), { x: 12, y: 0, w: 12, h: 9 }), - panels.heatmap.base('Latency Report', 'ms', queries.latency_report.query(), { x: 0, y: 9, w: 12, h: 9 }), - panels.table.base('Result Summary', queries.results.query(), { x: 12, y: 9, w: 12, h: 9 }), -]) diff --git a/templates/General/pgbench-dashboard.jsonnet b/templates/General/pgbench-dashboard.jsonnet index 1f39d0f..ef389db 100644 --- a/templates/General/pgbench-dashboard.jsonnet +++ b/templates/General/pgbench-dashboard.jsonnet @@ -1,300 +1,31 @@ -local grafana = import '../grafonnet-lib/grafonnet/grafana.libsonnet'; -local 
es = grafana.elasticsearch; - -local tps_report = grafana.graphPanel.new( - title='TPS Report', - datasource='$datasource1', - format='ops', - transparent=true, - legend_show=false, - linewidth=2 -) { - yaxes: [ - { - format: 'ops', - min: '0', - show: true, - }, - { - format: 'short', - show: 'false', - }, - ], -}.addTarget( - es.target( - query='(user = $user) AND (uuid = $uuid)', - timeField='timestamp', - metrics=[{ - field: 'tps', - id: '1', - meta: {}, - pipelineAgg: 'select metric', - pipelineVariables: [ - { - name: 'var1', - pipelineAgg: 'select metric', - }, - ], - settings: {}, - type: 'sum', - }], - bucketAggs=[{ - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: 0, - trimEdges: 0, - }, - type: 'date_histogram', - }], - ) -); - -local latency_report = grafana.graphPanel.new( - title='Latency Report', - datasource='$datasource1', - format='ms', - transparent=true, - legend_show=true, -) - { - type: 'heatmap', - yaxes: [], - yAxis: { - format: 'ms', - show: true, - }, -}.addTarget( - es.target( - query='(uuid.keyword=$uuid) AND (user.keyword=$user)', - timeField='timestamp', - metrics=[{ - field: 'latency_ms', - id: '1', - meta: {}, - settings: {}, - type: 'avg', - }], - bucketAggs=[{ - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: 0, - trimEdges: 0, - }, - type: 'date_histogram', - }], - ) -); - -local avg_tps = grafana.graphPanel.new( - title='Overall Average TPS Per Run', - datasource='$datasource2', - format='ops', - bars=true, - lines=false, - transparent=true, - legend_show=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_alignAsTable=true, - legend_values=true, - show_xaxis=false, - x_axis_mode='series', - x_axis_values='avg', -) { - yaxes: [ - { - format: 'ops', - min: '0', - show: true, - }, - { - format: 'short', - show: 'false', - }, - ], -}.addTarget( - es.target( - query='(uuid.keyword=$uuid) AND (user.keyword=$user)', - timeField='timestamp', - 
metrics=[{ - field: 'tps_incl_con_est', - id: '1', - meta: {}, - pipelineAgg: 'select metric', - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'description.keyword', - id: '6', - settings: { - min_doc_count: 1, - order: 'asc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '4', - settings: { - interval: 'auto', - min_doc_count: 0, - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local results = grafana.tablePanel.new( - title='Result Summary', - datasource='$datasource1', - transparent=true, - styles=[ - { - pattern: 'Average latency_ms', - alias: 'Avg latency', - align: 'auto', - type: 'number', - decimals: '2', - }, - { - pattern: 'Average tps', - alias: 'Avg TPS', - align: 'auto', - type: 'number', - decimals: '2', - }, - ], -).addTarget( - es.target( - query='(uuid.keyword=$uuid) AND (user.keyword=$user)', - timeField='timestamp', - bucketAggs=[ - { - field: 'user.keyword', - id: '1', - settings: { - min_doc_count: 1, - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - ], - - metrics=[ - { - field: 'latency_ms', - id: '4', - meta: {}, - settings: {}, - type: 'avg', - }, - { - field: 'tps', - id: '20', - meta: {}, - settings: {}, - type: 'avg', - }, - ], - ) -); - -grafana.dashboard.new( - 'Pgbench - Dashboard', - description='', - editable='true', - timezone='utc', - time_from='now/y', - time_to='now' -) - -.addTemplate( - grafana.template.datasource( - 'datasource1', - 'elasticsearch', - 'bull-pgbench', - label='pgbench-results datasource' - ) -) - -.addTemplate( - grafana.template.datasource( - 'datasource2', - 'elasticsearch', - 'bull-pgbench-summary', - label='pgbench-summary datasource' - ) -) - -.addTemplate( - grafana.template.new( - 'uuid', - '$datasource1', - '{"find": "terms", "field": "uuid.keyword"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - } -) - -.addTemplate( - grafana.template.new( - 'user', - 
'$datasource1', - '{"find": "terms", "field": "user.keyword"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - } -) - -.addAnnotation( - grafana.annotation.datasource( - 'Run Start Time', - '$datasource2', - iconColor='#5794F2' - ) { - enable: true, - type: 'tags', - timeField: 'run_start_timestamp', - textField: 'user', - tagsField: 'description', - } -) - - -.addAnnotation( - grafana.annotation.datasource( - 'Sample Start Time', - '$datasource2', - iconColor='#B877D9' - ) { - enable: false, - type: 'tags', - timeField: 'sample_start_timestamp', - textField: 'user', - tagsField: 'description', - } -) - -.addPanel(tps_report, gridPos={ x: 0, y: 0, w: 12, h: 9 }) -.addPanel(latency_report, gridPos={ x: 0, y: 9, w: 12, h: 9 }) -.addPanel(avg_tps, gridPos={ x: 12, y: 0, w: 12, h: 9 }) -.addPanel(results, gridPos={ x: 12, y: 9, w: 12, h: 9 }) +local annotation = import '../../assets/pgbench-dashboard/annotation.libsonnet'; +local panels = import '../../assets/pgbench-dashboard/panels.libsonnet'; +local queries = import '../../assets/pgbench-dashboard/queries.libsonnet'; +local variables = import '../../assets/pgbench-dashboard/variables.libsonnet'; +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +g.dashboard.new('Pgbench') ++ g.dashboard.time.withFrom('now/y') ++ g.dashboard.time.withTo('now') ++ g.dashboard.withTimezone('utc') ++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) ++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) ++ g.dashboard.withRefresh('') ++ g.dashboard.withEditable(true) ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ g.dashboard.withVariables([ + variables.Datasource1, + variables.Datasource2, + variables.uuid, + variables.user, +]) ++ g.dashboard.withAnnotations([ + annotation.run_start_timestamp, + annotation.sample_start_timestamp, +]) ++ g.dashboard.withPanels([ 
+ panels.timeSeries.tps_report('TPS Report', 'ops', queries.tps_report.query(), { x: 0, y: 0, w: 12, h: 9 }), + panels.timeSeries.avg_tps('Overall Average TPS Per Run', 'ops', queries.avg_tps.query(), { x: 12, y: 0, w: 12, h: 9 }), + panels.heatmap.base('Latency Report', 'ms', queries.latency_report.query(), { x: 0, y: 9, w: 12, h: 9 }), + panels.table.base('Result Summary', queries.results.query(), { x: 12, y: 9, w: 12, h: 9 }), +]) diff --git a/templates/General/uperf-perf.jsonnet b/templates/General/uperf-perf.jsonnet index d70b3ab..26aa6ff 100644 --- a/templates/General/uperf-perf.jsonnet +++ b/templates/General/uperf-perf.jsonnet @@ -1,370 +1,32 @@ -local grafana = import '../grafonnet-lib/grafonnet/grafana.libsonnet'; -local es = grafana.elasticsearch; - - -// Panels -local throughput = grafana.graphPanel.new( - title='UPerf Performance : Throughput per-second', - datasource='$datasource', - format='bps', - legend_max=true, - legend_avg=true, - legend_alignAsTable=true, - legend_values=true, - transparent=true, -) { - yaxes: [ - { - format: 'bps', - show: 'true', - }, - { - format: 'pps', - show: 'false', - }, - ], -}.addTarget( - es.target( - query='uuid: $uuid AND cluster_name: $cluster_name AND user: $user AND iteration: $iteration AND remote_ip: $server AND message_size: $message_size AND test_type: $test_type AND protocol: $protocol AND num_threads: $threads', - timeField='uperf_ts', - metrics=[{ - field: 'norm_byte', - id: '1', - inlineScript: '_value * 8', - meta: {}, - settings: { - script: { - inline: '_value * 8', - }, - }, - transparent: true, - type: 'sum', - }], - bucketAggs=[{ - field: 'uperf_ts', - id: '2', - settings: { - interval: 'auto', - min_doc_count: 0, - trimEdges: null, - }, - type: 'date_histogram', - }], - ) -); - -local operations = grafana.graphPanel.new( - title='UPerf Performance : Operations per-second', - datasource='$datasource', - format='pps', - legend_max=true, - legend_avg=true, - legend_alignAsTable=true, - 
legend_values=true, - transparent=true, -) { - yaxes: [ - { - format: 'pps', - show: 'true', - }, - { - format: 'pps', - show: 'false', - }, - ], -}.addTarget( - es.target( - query='uuid: $uuid AND user: $user AND iteration: $iteration AND remote_ip: $server AND message_size: $message_size AND test_type: $test_type AND protocol: $protocol AND num_threads: $threads', - timeField='uperf_ts', - metrics=[{ - field: 'norm_ops', - id: '1', - meta: {}, - settings: {}, - type: 'sum', - }], - bucketAggs=[{ - field: 'uperf_ts', - id: '2', - settings: { - interval: 'auto', - min_doc_count: 0, - trimEdges: null, - }, - transparent: true, - type: 'date_histogram', - }], - ) -); - - -local results = grafana.tablePanel.new( - title='UPerf Result Summary', - datasource='$datasource', - transparent=true, - styles=[ - { - pattern: 'message_size', - type: 'string', - unit: 'Bps', - }, - { - decimals: '2', - pattern: 'Average norm_byte', - type: 'number', - unit: 'bps', - }, - { - decimals: '0', - pattern: 'Average norm_ops', - type: 'number', - unit: 'none', - }, - { - decimals: '2', - pattern: 'Average norm_ltcy', - type: 'number', - unit: 'µs', - }, - { - alias: 'Sample count', - decimals: '2', - pattern: 'Count', - type: 'number', - unit: 'short', - }, - ], -).addTarget( - es.target( - query='uuid: $uuid AND user: $user AND iteration: $iteration AND remote_ip: $server AND message_size: $message_size AND test_type: $test_type AND protocol: $protocol AND NOT norm_ops:0', - timeField='uperf_ts', - bucketAggs=[ - { - fake: true, - field: 'test_type.keyword', - id: '3', - settings: { - min_doc_count: 1, - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - fake: true, - field: 'protocol.keyword', - id: '4', - settings: { - min_doc_count: 1, - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - fake: true, - field: 'num_threads', - id: '5', - settings: { - min_doc_count: 1, - order: 'desc', - orderBy: '_term', - size: '10', - 
}, - type: 'terms', - }, - { - field: 'message_size', - id: '2', - settings: { - min_doc_count: 1, - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - ], - metrics=[ - { - field: 'norm_byte', - id: '1', - inlineScript: '_value * 8', - meta: {}, - settings: { - script: { - inline: '_value * 8', - }, - }, - type: 'avg', - }, - { - field: 'norm_ops', - id: '6', - meta: {}, - settings: {}, - type: 'avg', - }, - { - field: 'norm_ltcy', - id: '7', - meta: {}, - settings: {}, - type: 'avg', - }, - { - field: 'select field', - id: '8', - type: 'count', - }, - ], - ) -); - - -//Dashboard + Templates - -grafana.dashboard.new( - 'Public - UPerf Results', - description='', - tags=['network', 'performance'], - timezone='utc', - time_from='now-1h', - editable='true', -) - -.addTemplate( - grafana.template.datasource( - 'datasource', - 'elasticsearch', - 'bull-uperf', - label='uperf-results datasource', - regex='/(.*uperf.*)/', - ) -) - -.addTemplate( - grafana.template.new( - 'uuid', - '$datasource', - '{"find": "terms", "field": "uuid.keyword"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - } -) - -.addTemplate( - grafana.template.new( - 'cluster_name', - '$datasource', - '{"find": "terms", "field": "cluster_name.keyword"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - } -) - -.addTemplate( - grafana.template.new( - 'user', - '$datasource', - '{"find": "terms", "field": "user.keyword"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - }, -) - -.addTemplate( - grafana.template.new( - 'iteration', - '$datasource', - '{"find": "terms", "field": "iteration"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - }, -) - - -.addTemplate( - grafana.template.new( - 'server', - '$datasource', - '{"find": "terms", "field": "remote_ip.keyword"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - }, -) - - -.addTemplate( - 
grafana.template.new( - 'test_type', - '$datasource', - '{"find": "terms", "field": "test_type.keyword"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - }, -) - -.addTemplate( - grafana.template.new( - 'protocol', - '$datasource', - '{"find": "terms", "field": "protocol.keyword"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - }, -) - -.addTemplate( - grafana.template.new( - 'message_size', - '$datasource', - '{"find": "terms", "field": "message_size"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - }, -) - -.addTemplate( - grafana.template.new( - 'threads', - '$datasource', - '{"find": "terms", "field": "num_threads"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - }, -) - -.addPanel(throughput, gridPos={ x: 0, y: 0, w: 12, h: 9 }) -.addPanel(operations, gridPos={ x: 12, y: 0, w: 12, h: 9 }) -.addPanel(results, gridPos={ x: 0, y: 20, w: 24, h: 18 }) +local panels = import '../../assets/uperf/panels.libsonnet'; +local queries = import '../../assets/uperf/queries.libsonnet'; +local variables = import '../../assets/uperf/variables.libsonnet'; +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +g.dashboard.new('Public - UPerf Results dashboard') ++ g.dashboard.withTags(['network', 'performance']) ++ g.dashboard.time.withFrom('now-1h') ++ g.dashboard.time.withTo('now') ++ g.dashboard.withTimezone('utc') ++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) ++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) ++ g.dashboard.withRefresh('') ++ g.dashboard.withEditable(false) ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ g.dashboard.withVariables([ + variables.Datasource, + variables.uuid, + variables.cluster_name, + variables.user, + variables.iteration, + variables.server, + variables.test_type, + 
variables.protocol, + variables.message_size, + variables.threads, +]) ++ g.dashboard.withPanels([ + panels.timeSeries.uperfPerformance('UPerf Performance : Throughput per-second', 'bps', queries.throughput.query(), { x: 0, y: 0, w: 12, h: 9 }), + panels.timeSeries.uperfPerformance('UPerf Performance : Operations per-second', 'pps', queries.operations.query(), { x: 12, y: 0, w: 12, h: 9 }), + panels.table.base('UPerf Result Summary', queries.results.query(), { x: 0, y: 20, w: 24, h: 18 }), +]) diff --git a/templates/General/uperf-v2.jsonnet b/templates/General/uperf-v2.jsonnet deleted file mode 100644 index 26aa6ff..0000000 --- a/templates/General/uperf-v2.jsonnet +++ /dev/null @@ -1,32 +0,0 @@ -local panels = import '../../assets/uperf/panels.libsonnet'; -local queries = import '../../assets/uperf/queries.libsonnet'; -local variables = import '../../assets/uperf/variables.libsonnet'; -local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; - -g.dashboard.new('Public - UPerf Results dashboard') -+ g.dashboard.withTags(['network', 'performance']) -+ g.dashboard.time.withFrom('now-1h') -+ g.dashboard.time.withTo('now') -+ g.dashboard.withTimezone('utc') -+ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) -+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) -+ g.dashboard.withRefresh('') -+ g.dashboard.withEditable(false) -+ g.dashboard.graphTooltip.withSharedCrosshair() -+ g.dashboard.withVariables([ - variables.Datasource, - variables.uuid, - variables.cluster_name, - variables.user, - variables.iteration, - variables.server, - variables.test_type, - variables.protocol, - variables.message_size, - variables.threads, -]) -+ g.dashboard.withPanels([ - panels.timeSeries.uperfPerformance('UPerf Performance : Throughput per-second', 'bps', queries.throughput.query(), { x: 0, y: 0, w: 12, h: 9 }), - 
panels.timeSeries.uperfPerformance('UPerf Performance : Operations per-second', 'pps', queries.operations.query(), { x: 12, y: 0, w: 12, h: 9 }), - panels.table.base('UPerf Result Summary', queries.results.query(), { x: 0, y: 20, w: 24, h: 18 }), -]) diff --git a/templates/General/vegeta-wrapper-v2.jsonnet b/templates/General/vegeta-wrapper-v2.jsonnet deleted file mode 100644 index b0c5385..0000000 --- a/templates/General/vegeta-wrapper-v2.jsonnet +++ /dev/null @@ -1,31 +0,0 @@ -local panels = import '../../assets/vegeta-wrapper/panels.libsonnet'; -local queries = import '../../assets/vegeta-wrapper/queries.libsonnet'; -local variables = import '../../assets/vegeta-wrapper/variables.libsonnet'; -local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; - -g.dashboard.new('Vegeta Results') -+ g.dashboard.withDescription(||| - Dashboard for Ingress Performance -|||) -+ g.dashboard.withTags('') -+ g.dashboard.time.withFrom('now-24h') -+ g.dashboard.time.withTo('now') -+ g.dashboard.withTimezone('utc') -+ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) -+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) -+ g.dashboard.withRefresh('') -+ g.dashboard.withEditable(false) -+ g.dashboard.graphTooltip.withSharedCrosshair() -+ g.dashboard.withVariables([ - variables.Datasource, - variables.uuid, - variables.hostname, - variables.targets, - variables.iteration, -]) -+ g.dashboard.withPanels([ - panels.timeSeries.legendDisplayModeTable('RPS (rate of sent requests per second)', 'reqps', queries.rps.query(), { x: 0, y: 0, w: 12, h: 9 }), - panels.timeSeries.legendDisplayModeTable('Throughput (rate of successful requests per second)', 'reqps', queries.throughput.query(), { x: 12, y: 0, w: 12, h: 9 }), - panels.timeSeries.legendDisplayModeTable('Request Latency (observed over given interval)', 'µs', queries.latency.query(), { x: 0, y: 12, w: 12, 
h: 9 }), - panels.table.base('Vegeta Result Summary', queries.results.query(), { x: 0, y: 24, w: 24, h: 9 }), -]) diff --git a/templates/General/vegeta-wrapper.jsonnet b/templates/General/vegeta-wrapper.jsonnet index eed3278..b0c5385 100644 --- a/templates/General/vegeta-wrapper.jsonnet +++ b/templates/General/vegeta-wrapper.jsonnet @@ -1,349 +1,31 @@ -local grafana = import '../grafonnet-lib/grafonnet/grafana.libsonnet'; -local es = grafana.elasticsearch; - -// Panels -local rps = grafana.graphPanel.new( - title='RPS (rate of sent requests per second)', - datasource='$datasource', - format='reqps', - legend_max=true, - legend_avg=true, - legend_alignAsTable=true, - legend_values=true, - transparent=true, - nullPointMode='connected', -) { - yaxes: [ - { - format: 'reqps', - show: 'true', - }, - { - format: 'pps', - show: 'false', - }, - ], - fill: 2, -}.addTarget( - es.target( - query='uuid: $uuid AND hostname: $hostname AND iteration: $iteration AND targets: "$targets"', - timeField='timestamp', - metrics=[{ - field: 'rps', - id: '1', - meta: {}, - settings: {}, - transparent: true, - type: 'avg', - }], - bucketAggs=[{ - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: 0, - trimEdges: null, - }, - type: 'date_histogram', - }], - ) -); - - -local throughput = grafana.graphPanel.new( - title='Throughput (rate of successful requests per second)', - datasource='$datasource', - format='reqps', - legend_max=true, - legend_avg=true, - legend_alignAsTable=true, - legend_values=true, - transparent=true, - nullPointMode='connected', -) { - yaxes: [ - { - format: 'reqps', - show: 'true', - }, - { - format: 'pps', - show: 'false', - }, - ], - fill: 2, -}.addTarget( - es.target( - query='uuid: $uuid AND hostname: $hostname AND iteration: $iteration AND targets: "$targets"', - timeField='timestamp', - metrics=[{ - field: 'throughput', - id: '1', - meta: {}, - settings: {}, - transparent: true, - type: 'avg', - }], - bucketAggs=[{ - field: 
'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: 0, - trimEdges: null, - }, - type: 'date_histogram', - }], - ) -); - -local latency = grafana.graphPanel.new( - title='Request Latency (observed over given interval)', - datasource='$datasource', - format='µs', - legend_max=true, - legend_avg=true, - legend_alignAsTable=true, - legend_values=true, - transparent=true, - nullPointMode='connected', -) { - yaxes: [ - { - format: 'µs', - show: 'true', - }, - { - format: 'pps', - show: 'false', - }, - ], - fill: 2, -}.addTarget( - es.target( - query='uuid: $uuid AND hostname: $hostname AND iteration: $iteration AND targets: "$targets"', - timeField='timestamp', - metrics=[{ - field: 'req_latency', - id: '1', - meta: {}, - settings: {}, - transparent: true, - type: 'avg', - }], - bucketAggs=[{ - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: 0, - trimEdges: null, - }, - type: 'date_histogram', - }], - ) -).addTarget( - es.target( - query='uuid: $uuid AND hostname: $hostname AND iteration: $iteration AND targets: "$targets"', - timeField='timestamp', - metrics=[{ - field: 'p99_latency', - id: '1', - meta: {}, - settings: {}, - transparent: true, - type: 'avg', - }], - bucketAggs=[{ - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: 0, - trimEdges: null, - }, - type: 'date_histogram', - }], - ) -); - -local results = grafana.tablePanel.new( - title='Vegeta Result Summary', - datasource='$datasource', - transparent=true, - styles=[ - { - decimals: '2', - pattern: 'Average rps', - type: 'number', - unit: 'reqps', - }, - { - decimals: '2', - pattern: 'Average throughput', - type: 'number', - unit: 'reqps', - }, - { - decimals: '2', - pattern: 'Average p99_latency', - type: 'number', - unit: 'µs', - }, - { - decimals: '2', - pattern: 'Average req_latency', - type: 'number', - unit: 'µs', - }, - { - decimals: '2', - pattern: 'Average bytes_in', - type: 'number', - unit: 'bps', - }, - { - 
decimals: '2', - pattern: 'Average bytes_out', - type: 'number', - unit: 'bps', - }, - ], -).addTarget( - es.target( - query='uuid: $uuid AND hostname: $hostname AND iteration: $iteration AND targets: "$targets"', - timeField='timestamp', - bucketAggs=[ - { - fake: true, - field: 'targets.keyword', - id: '1', - settings: { - min_doc_count: 1, - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'uuid.keyword', - id: '2', - settings: { - min_doc_count: 1, - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - ], - metrics=[ - { - field: 'rps', - id: '3', - type: 'avg', - }, - { - field: 'throughput', - id: '4', - type: 'avg', - }, - { - field: 'p99_latency', - id: '5', - type: 'avg', - }, - { - field: 'req_latency', - id: '6', - type: 'avg', - }, - { - field: 'bytes_in', - id: '7', - type: 'avg', - }, - { - field: 'bytes_out', - id: '8', - type: 'avg', - }, - ], - ) -); - -grafana.dashboard.new( - 'Vegeta Results Dashboard', - description='', - timezone='utc', - time_from='now-24h', - editable='true', -) - -.addTemplate( - grafana.template.datasource( - 'datasource', - 'elasticsearch', - 'ripsaw-vegeta-results', - label='vegeta-results datasource', - regex='/(.*vegeta.*)/', - ) -) - -.addTemplate( - grafana.template.new( - 'uuid', - '$datasource', - '{"find": "terms", "field": "uuid.keyword"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - } -) - -.addTemplate( - grafana.template.new( - 'hostname', - '$datasource', - '{"find": "terms", "field": "hostname.keyword"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - } -) - -.addTemplate( - grafana.template.new( - 'targets', - '$datasource', - '{"find": "terms", "field": "targets.keyword"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - } -) - -.addTemplate( - grafana.template.new( - 'iteration', - '$datasource', - '{"find": "terms", "field": "iteration"}', - refresh=2, - ) { - 
type: 'query', - multi: false, - includeAll: true, - } -) - -.addPanel(rps, gridPos={ x: 0, y: 0, w: 12, h: 9 }) -.addPanel(throughput, gridPos={ x: 12, y: 0, w: 12, h: 9 }) -.addPanel(latency, gridPos={ x: 0, y: 12, w: 12, h: 9 }) -.addPanel(results, gridPos={ x: 0, y: 24, w: 24, h: 9 }) +local panels = import '../../assets/vegeta-wrapper/panels.libsonnet'; +local queries = import '../../assets/vegeta-wrapper/queries.libsonnet'; +local variables = import '../../assets/vegeta-wrapper/variables.libsonnet'; +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +g.dashboard.new('Vegeta Results') ++ g.dashboard.withDescription(||| + Dashboard for Ingress Performance +|||) ++ g.dashboard.withTags('') ++ g.dashboard.time.withFrom('now-24h') ++ g.dashboard.time.withTo('now') ++ g.dashboard.withTimezone('utc') ++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) ++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) ++ g.dashboard.withRefresh('') ++ g.dashboard.withEditable(false) ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ g.dashboard.withVariables([ + variables.Datasource, + variables.uuid, + variables.hostname, + variables.targets, + variables.iteration, +]) ++ g.dashboard.withPanels([ + panels.timeSeries.legendDisplayModeTable('RPS (rate of sent requests per second)', 'reqps', queries.rps.query(), { x: 0, y: 0, w: 12, h: 9 }), + panels.timeSeries.legendDisplayModeTable('Throughput (rate of successful requests per second)', 'reqps', queries.throughput.query(), { x: 12, y: 0, w: 12, h: 9 }), + panels.timeSeries.legendDisplayModeTable('Request Latency (observed over given interval)', 'µs', queries.latency.query(), { x: 0, y: 12, w: 12, h: 9 }), + panels.table.base('Vegeta Result Summary', queries.results.query(), { x: 0, y: 24, w: 24, h: 9 }), +]) diff --git a/templates/General/ycsb-v2.jsonnet 
b/templates/General/ycsb-v2.jsonnet deleted file mode 100644 index 14eab24..0000000 --- a/templates/General/ycsb-v2.jsonnet +++ /dev/null @@ -1,29 +0,0 @@ -local panels = import '../../assets/ycsb/panels.libsonnet'; -local queries = import '../../assets/ycsb/queries.libsonnet'; -local variables = import '../../assets/ycsb/variables.libsonnet'; -local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; - -g.dashboard.new('YCSB') -+ g.dashboard.time.withFrom('now/y') -+ g.dashboard.time.withTo('now') -+ g.dashboard.withTimezone('utc') -+ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) -+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) -+ g.dashboard.withRefresh('') -+ g.dashboard.withEditable(false) -+ g.dashboard.graphTooltip.withSharedCrosshair() -+ g.dashboard.withVariables([ - variables.Datasource1, - variables.Datasource2, - variables.uuid, - variables.user, - variables.phase, - variables.operation, -]) -+ g.dashboard.withPanels([ - panels.timeSeries.throughputOvertimePhase('Throughput overtime - Phase = $phase : Operation = $operation', '$Datasource1', 'ops', queries.throughput_overtime.query(), { x: 0, y: 0, w: 12, h: 9 }), - panels.timeSeries.latency90percReportedFromYCSB('Phase = $phase :: Latency - 90%tile Reported from YCSB', '$Datasource1', 'µs', queries.phase_average_latency.query(), { x: 12, y: 0, w: 12, h: 9 }), - panels.timeSeries.LatancyofEachWorkloadPerYCSBOperation('95th% Latency of each workload per YCSB Operation', '$Datasource2', 'µs', queries.latency_95.query(), { x: 0, y: 9, w: 24, h: 6 }), - panels.timeSeries.overallThroughputPerYCSB('Overall Throughput per YCSB Workload', '$Datasource2', 'ops', queries.overall_workload_throughput.query(), { x: 0, y: 15, w: 16, h: 10 }), - panels.table.base('Phase = $phase :: $operation - Count', '$Datasource2', queries.aggregate_operation_sum.query(), { x: 16, y: 15, w: 
8, h: 10 }), -]) diff --git a/templates/General/ycsb.jsonnet b/templates/General/ycsb.jsonnet index e6fa8c6..14eab24 100644 --- a/templates/General/ycsb.jsonnet +++ b/templates/General/ycsb.jsonnet @@ -1,356 +1,29 @@ -local grafana = import '../grafonnet-lib/grafonnet/grafana.libsonnet'; -local es = grafana.elasticsearch; - -//Panel definitions - -local throughput_overtime = grafana.graphPanel.new( - title='Throughput overtime - Phase = $phase : Operation = $operation', - datasource='$datasource1', - format='ops', - linewidth=2 -) { - yaxes: [ - { - format: 'ops', - show: true, - }, - { - format: 'short', - show: false, - }, - ], - fill: 2, -}.addTarget( - es.target( - query='(uuid.keyword = $uuid) AND (phase.keyword = $phase) AND (user.keyword=$user) AND (action.keyword=$operation)', - timeField='timestamp', - metrics=[{ - field: 'overall_rate', - id: '1', - meta: {}, - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'action.keyword', - id: '4', - settings: { - min_doc_count: 1, - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '3', - settings: { - interval: 'auto', - min_doc_count: 0, - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local phase_average_latency = grafana.graphPanel.new( - title='Phase = $phase :: Latency - 90%tile Reported from YCSB', - datasource='$datasource1', - format='µs', - linewidth=2, - lines=false, - points=true, - nullPointMode='connected', - pointradius=1 -) { - yaxes: [ - { - format: 'µs', - show: true, - }, - { - format: 'short', - show: true, - }, - ], - fill: 2, -}.addTarget( - es.target( - query='(uuid.keyword = $uuid) AND (phase.keyword = $phase) AND (user.keyword=$user) AND (action.keyword=$operation)', - timeField='timestamp', - metrics=[{ - field: 'latency_90', - id: '1', - meta: {}, - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'action.keyword', - id: '3', - settings: { - min_doc_count: 1, - order: 'desc', - 
orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '2', - settings: { - interval: 'auto', - min_doc_count: 0, - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local latency_95 = grafana.graphPanel.new( - title='95th% Latency of each workload per YCSB Operation', - datasource='$datasource2', - format='µs', - legend_show=false, - linewidth=2, - bars=true, - lines=false, - x_axis_mode='series', - x_axis_values='total', -) { - yaxes: [ - { - format: 'µs', - show: true, - }, - { - format: 'short', - show: false, - }, - ], -}.addTarget( - es.target( - query='(uuid.keyword = $uuid) AND (phase.keyword = $phase) AND (user.keyword=$user)', - timeField='timestamp', - metrics=[{ - field: 'data.$operation.95thPercentileLatency(us)', - id: '1', - meta: {}, - settings: {}, - type: 'avg', - }], - bucketAggs=[ - { - field: 'workload_type.keyword', - id: '5', - settings: { - min_doc_count: 1, - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '3', - settings: { - interval: 'auto', - min_doc_count: 0, - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local overall_workload_throughput = grafana.graphPanel.new( - title='Overall Throughput per YCSB Workload', - datasource='$datasource2', - format='ops', - legend_rightSide=true, - legend_total=true, - legend_alignAsTable=true, - legend_values=true, - linewidth=2, - bars=true, - lines=false, - x_axis_mode='series', - x_axis_values='total', -) { - yaxes: [ - { - format: 'ops', - show: true, - }, - { - format: 'short', - show: false, - }, - ], -}.addTarget( - es.target( - query='(uuid.keyword = $uuid) AND (phase.keyword = $phase) AND (user.keyword=$user)', - timeField='timestamp', - metrics=[{ - field: 'data.OVERALL.Throughput(ops/sec)', - id: '1', - meta: {}, - settings: {}, - type: 'sum', - }], - bucketAggs=[ - { - field: 'workload_type.keyword', - id: '5', - settings: { - min_doc_count: 1, - order: 
'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - { - field: 'timestamp', - id: '3', - settings: { - interval: 'auto', - min_doc_count: 0, - trimEdges: 0, - }, - type: 'date_histogram', - }, - ], - ) -); - -local aggregate_operation_sum = grafana.tablePanel.new( - title='Phase = $phase :: $operation - Count', - datasource='$datasource2', -).addTarget( - es.target( - query='(uuid.keyword = $uuid) AND (phase.keyword = $phase) AND (user.keyword=$user)', - timeField='timestamp', - alias='$operation - Operations', - metrics=[{ - field: 'data.$operation.Operations', - id: '1', - meta: {}, - settings: {}, - type: 'sum', - }], - bucketAggs=[ - { - field: 'workload_type.keyword', - id: '3', - settings: { - min_doc_count: 1, - order: 'desc', - orderBy: '_term', - size: '10', - }, - type: 'terms', - }, - ], - ) -); - -//Dashboard & Templates - -grafana.dashboard.new( - 'YCSB - Dashboard', - description='', - editable='true', - time_from='now/y', - time_to='now', - timezone='utc', -) - -.addTemplate( - grafana.template.datasource( - 'datasource1', - 'elasticsearch', - 'Prod ES - ripsaw-ycsb-results', - label='ycsb-results datasource' - ) -) - -.addTemplate( - grafana.template.datasource( - 'datasource2', - 'elasticsearch', - 'Prod ES - ripsaw-ycsb-summary', - label='ycsb-summary datasource' - ) -) - -.addTemplate( - grafana.template.new( - 'uuid', - '$datasource2', - '{"find": "terms", "field": "uuid.keyword"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - } -) - -.addTemplate( - grafana.template.new( - 'user', - '$datasource2', - '{"find": "terms", "field": "user.keyword"}', - refresh=2, - ) { - type: 'query', - multi: false, - includeAll: true, - } -) - -.addTemplate( - grafana.template.new( - 'phase', - '$datasource2', - '{"find": "terms", "field": "phase.keyword"}', - refresh=2, - current='run' - ) { - type: 'query', - multi: false, - includeAll: true, - } -) - -.addTemplate( - grafana.template.new( - 'operation', - 
'$datasource2', - '{"find": "fields", "field": "data.*.Operations"}', - regex='/data.(.*).Operations/', - refresh=2, - current='READ' - ) { - type: 'query', - multi: false, - includeAll: true, - } -) - -.addPanel(throughput_overtime, gridPos={ x: 0, y: 0, w: 12, h: 9 }) -.addPanel(phase_average_latency, gridPos={ x: 12, y: 0, w: 12, h: 9 }) -.addPanel(latency_95, gridPos={ x: 0, y: 9, w: 24, h: 6 }) -.addPanel(overall_workload_throughput, gridPos={ x: 0, y: 15, w: 16, h: 10 }) -.addPanel(aggregate_operation_sum, gridPos={ x: 16, y: 15, w: 8, h: 10 }) +local panels = import '../../assets/ycsb/panels.libsonnet'; +local queries = import '../../assets/ycsb/queries.libsonnet'; +local variables = import '../../assets/ycsb/variables.libsonnet'; +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +g.dashboard.new('YCSB') ++ g.dashboard.time.withFrom('now/y') ++ g.dashboard.time.withTo('now') ++ g.dashboard.withTimezone('utc') ++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) ++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) ++ g.dashboard.withRefresh('') ++ g.dashboard.withEditable(false) ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ g.dashboard.withVariables([ + variables.Datasource1, + variables.Datasource2, + variables.uuid, + variables.user, + variables.phase, + variables.operation, +]) ++ g.dashboard.withPanels([ + panels.timeSeries.throughputOvertimePhase('Throughput overtime - Phase = $phase : Operation = $operation', '$Datasource1', 'ops', queries.throughput_overtime.query(), { x: 0, y: 0, w: 12, h: 9 }), + panels.timeSeries.latency90percReportedFromYCSB('Phase = $phase :: Latency - 90%tile Reported from YCSB', '$Datasource1', 'µs', queries.phase_average_latency.query(), { x: 12, y: 0, w: 12, h: 9 }), + panels.timeSeries.LatancyofEachWorkloadPerYCSBOperation('95th% Latency of each workload per YCSB Operation', 
'$Datasource2', 'µs', queries.latency_95.query(), { x: 0, y: 9, w: 24, h: 6 }), + panels.timeSeries.overallThroughputPerYCSB('Overall Throughput per YCSB Workload', '$Datasource2', 'ops', queries.overall_workload_throughput.query(), { x: 0, y: 15, w: 16, h: 10 }), + panels.table.base('Phase = $phase :: $operation - Count', '$Datasource2', queries.aggregate_operation_sum.query(), { x: 16, y: 15, w: 8, h: 10 }), +])