From c35f00fba981763b4de27d2d47db998088e9ce3d Mon Sep 17 00:00:00 2001 From: Docs Date: Fri, 5 Jan 2024 13:51:28 +0100 Subject: [PATCH] Add workload summary and compare data graphs on k8s-netperf --- assets/k8s-netperf/panels.libsonnet | 172 ++++++++++++++++++++++++- assets/k8s-netperf/queries.libsonnet | 54 +++++++- assets/k8s-netperf/variables.libsonnet | 18 ++- templates/CPT/k8s-netperf-v2.jsonnet | 15 +++ 4 files changed, 248 insertions(+), 11 deletions(-) diff --git a/assets/k8s-netperf/panels.libsonnet b/assets/k8s-netperf/panels.libsonnet index 0e10a3f..1d93ef3 100644 --- a/assets/k8s-netperf/panels.libsonnet +++ b/assets/k8s-netperf/panels.libsonnet @@ -105,7 +105,144 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn "displayName": "ea7b29d7-8991-4752-a0d4-e26446d34915 TCP_STREAM 4096 Mb/s AWS" } ]), - + workloadSummary(title, targets, gridPos): + self.base(title, targets, gridPos) + + table.queryOptions.withTransformations([ + { + "id": "organize", + "options": { + "excludeByName": { + "_id": true, + "_index": true, + "_type": true, + "clientCPU.idleCPU": true, + "clientCPU.ioCPU": true, + "clientCPU.irqCPU": true, + "clientCPU.niceCPU": true, + "clientCPU.softCPU": true, + "clientCPU.stealCPU": true, + "clientCPU.systemCPU": true, + "clientCPU.userCPU": true, + "clientNodeLabels.beta.kubernetes.io/arch": true, + "clientNodeLabels.beta.kubernetes.io/instance-type": true, + "clientNodeLabels.beta.kubernetes.io/os": true, + "clientNodeLabels.failure-domain.beta.kubernetes.io/region": true, + "clientNodeLabels.failure-domain.beta.kubernetes.io/zone": true, + "clientNodeLabels.hypershift.openshift.io/managed": true, + "clientNodeLabels.hypershift.openshift.io/nodePool": true, + "clientNodeLabels.kubernetes.io/arch": true, + "clientNodeLabels.kubernetes.io/hostname": true, + "clientNodeLabels.kubernetes.io/os": true, + "clientNodeLabels.node-role.kubernetes.io/worker": true, + 
"clientNodeLabels.node.kubernetes.io/instance-type": true, + "clientNodeLabels.node.openshift.io/os_id": true, + "clientNodeLabels.topology.ebs.csi.aws.com/zone": true, + "clientNodeLabels.topology.kubernetes.io/region": true, + "clientNodeLabels.topology.kubernetes.io/zone": true, + "clientPods": true, + "confidence": true, + "driver": true, + "highlight": true, + "hostNetwork": true, + "latency": true, + "local": true, + "ltcyMetric": true, + "messageSize": true, + "metadata.ipsec": true, + "metadata.k8sVersion": true, + "metadata.kernel": true, + "metadata.masterNodesCount": true, + "metadata.masterNodesType": true, + "metadata.metricName": true, + "metadata.mtu": true, + "metadata.ocpShortVersion": true, + "metadata.totalNodes": true, + "parallelism": true, + "profile": true, + "samples": true, + "serverCPU.idleCPU": true, + "serverCPU.ioCPU": true, + "serverCPU.irqCPU": true, + "serverCPU.niceCPU": true, + "serverCPU.softCPU": true, + "serverCPU.stealCPU": true, + "serverCPU.systemCPU": true, + "serverCPU.userCPU": true, + "serverNodeLabels.beta.kubernetes.io/arch": true, + "serverNodeLabels.beta.kubernetes.io/instance-type": true, + "serverNodeLabels.beta.kubernetes.io/os": true, + "serverNodeLabels.failure-domain.beta.kubernetes.io/region": true, + "serverNodeLabels.failure-domain.beta.kubernetes.io/zone": true, + "serverNodeLabels.hypershift.openshift.io/managed": true, + "serverNodeLabels.hypershift.openshift.io/nodePool": true, + "serverNodeLabels.kubernetes.io/arch": true, + "serverNodeLabels.kubernetes.io/hostname": true, + "serverNodeLabels.kubernetes.io/os": true, + "serverNodeLabels.node-role.kubernetes.io/worker": true, + "serverNodeLabels.node.kubernetes.io/instance-type": true, + "serverNodeLabels.node.openshift.io/os_id": true, + "serverNodeLabels.topology.ebs.csi.aws.com/zone": true, + "serverNodeLabels.topology.kubernetes.io/region": true, + "serverNodeLabels.topology.kubernetes.io/zone": true, + "serverPods": true, + "service": true, + "sort": 
true, + "tcpRetransmits": true, + "throughput": true, + "tputMetric": true, + "udpLossPercent": true + }, + "indexByName": { + "uuid": 0, + "timestamp": 1, + "metadata.platform": 2, + "metadata.ocpVersion": 3, + "metadata.clusterName": 4, + "metadata.sdnType": 5, + "metadata.infraNodesCount": 6, + "metadata.infraNodesType": 7, + "metadata.workerNodesCount": 8, + "metadata.workerNodesType": 9, + "acrossAZ": 10, + "metadata.region": 11 + }, + "renameByName": { + "acrossAZ": "Multi-AZ", + "metadata.clusterName": "Cluster Name", + "metadata.infraNodesCount": "Infras", + "metadata.infraNodesType": "Infra Type", + "metadata.ocpVersion": "Version", + "metadata.platform": "Platform", + "metadata.region": "Region", + "metadata.sdnType": "SDN", + "metadata.workerNodesCount": "Workers", + "metadata.workerNodesType": "Workers Type", + "timestamp": "Timestamp", + "uuid": "UUID" + } + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "UUID": {"aggregations": [], "operation": "groupby"}, + "Cluster Name": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, + "Infra Type": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, + "Infras": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, + "Platform": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, + "Region": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, + "SDN": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, + "Timestamp": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, + "Version": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, + "Workers": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, + "Workers Type": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, + "duration": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, + "Multi-AZ": {"aggregations": ["last"], "operation": "aggregate"} + } + } + } + ]), withLatencyOverrides(title, targets, gridPos): self.base(title, targets, gridPos) 
+ table.queryOptions.withTransformations([ @@ -168,7 +305,38 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn } ]), }, + barGauge: { + local barGauge = g.panel.barGauge, + local custom = barGauge.fields.defaults.custom, + local options = barGauge.options, + base(title, targets, gridPos): + barGauge.new(title) + + barGauge.queryOptions.withTargets(targets) + + barGauge.datasource.withType('elasticsearch') + + barGauge.datasource.withUid('$datasource') + + barGauge.options.reduceOptions.withValues(false) + + barGauge.options.reduceOptions.withCalcs(["lastNotNull"]) + + barGauge.options.reduceOptions.withFields("") + + barGauge.options.withOrientation("horizontal") + + barGauge.options.withDisplayMode("gradient") + + barGauge.options.withValueMode("color") + + barGauge.panelOptions.withRepeat("messageSize") + + barGauge.standardOptions.withMin(0) + + barGauge.standardOptions.color.withMode("palette-classic") + + barGauge.gridPos.withX(gridPos.x) + + barGauge.gridPos.withY(gridPos.y) + + barGauge.gridPos.withH(gridPos.h) + + barGauge.gridPos.withW(gridPos.w), + + withThroughput(title, targets,gridPos): + self.base(title, targets, gridPos) + + barGauge.standardOptions.withUnit("Mbits"), + + withLatency(title, targets,gridPos): + self.base(title, targets, gridPos) + + barGauge.standardOptions.withUnit("µs"), + }, timeSeries: { local timeSeries = g.panel.timeSeries, local custom = timeSeries.fieldConfig.defaults.custom, @@ -246,4 +414,4 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn } ]), }, -} \ No newline at end of file +} diff --git a/assets/k8s-netperf/queries.libsonnet b/assets/k8s-netperf/queries.libsonnet index 1ef4dee..fa8ce73 100644 --- a/assets/k8s-netperf/queries.libsonnet +++ b/assets/k8s-netperf/queries.libsonnet @@ -3,7 +3,7 @@ local elasticsearch = g.query.elasticsearch; { all: { - query(metric, aggregationMetric): + query(metric, aggregationMetric): 
elasticsearch.withAlias("{{metadata.ocpVersion.keyword}} hostNetwork={{hostNetwork}} procs={{parallelism}}") + elasticsearch.withBucketAggs([ elasticsearch.bucketAggs.Terms.withField("messageSize") @@ -79,7 +79,7 @@ local elasticsearch = g.query.elasticsearch; + elasticsearch.withTimeField('timestamp') }, parallelismAll: { - query(metric, aggregationMetric): + query(metric, aggregationMetric): elasticsearch.withAlias("") + elasticsearch.withBucketAggs([ elasticsearch.bucketAggs.Terms.withField("uuid.keyword") @@ -161,4 +161,52 @@ local elasticsearch = g.query.elasticsearch; + elasticsearch.withQuery('uuid: $uuid AND parallelism: $parallelism AND profile: ' + metric + ' AND messageSize: $messageSize AND driver.keyword: $driver AND metadata.platform: $platform AND hostNetwork: $hostNetwork AND service: $service') + elasticsearch.withTimeField('timestamp') }, -} \ No newline at end of file + summary: { + query(metric, aggregationMetric): + elasticsearch.withAlias("") + + elasticsearch.withBucketAggs([ + ]) + + elasticsearch.withMetrics([ + elasticsearch.metrics.MetricAggregationWithSettings.RawData.withHide(false) + + elasticsearch.metrics.MetricAggregationWithSettings.RawData.withId("1") + + elasticsearch.metrics.MetricAggregationWithSettings.RawData.withType("raw_data") + + elasticsearch.metrics.MetricAggregationWithSettings.RawData.settings.withSize("500") + ]) + + elasticsearch.withQuery('uuid: $uuid AND parallelism: $parallelism AND profile: ' + metric + ' AND messageSize: $messageSize AND driver.keyword: $driver AND metadata.platform: $platform AND hostNetwork: $hostNetwork AND service: $service') + + elasticsearch.withTimeField('timestamp') + }, + metricCompare: { + query(metric, aggregationMetric, hostNetwork, service): + elasticsearch.withAlias("{{$compare_by}} Procs: {{parallelism}}") + + elasticsearch.withBucketAggs([ + elasticsearch.bucketAggs.Terms.withField("parallelism") + + elasticsearch.bucketAggs.Terms.withId("1") + + 
elasticsearch.bucketAggs.Terms.withType('terms') + + elasticsearch.bucketAggs.Terms.settings.withOrder('asc') + + elasticsearch.bucketAggs.Terms.settings.withOrderBy('_term') + + elasticsearch.bucketAggs.Terms.settings.withMinDocCount('1') + + elasticsearch.bucketAggs.Terms.settings.withSize("0"), + elasticsearch.bucketAggs.Terms.withField("$compare_by") + + elasticsearch.bucketAggs.Terms.withId("2") + + elasticsearch.bucketAggs.Terms.withType('terms') + + elasticsearch.bucketAggs.Terms.settings.withOrder('desc') + + elasticsearch.bucketAggs.Terms.settings.withOrderBy('_term') + + elasticsearch.bucketAggs.Terms.settings.withMinDocCount('1') + + elasticsearch.bucketAggs.Terms.settings.withSize("10"), + elasticsearch.bucketAggs.DateHistogram.withField('timestamp') + + elasticsearch.bucketAggs.DateHistogram.withId("3") + + elasticsearch.bucketAggs.DateHistogram.withType('date_histogram') + + elasticsearch.bucketAggs.DateHistogram.settings.withInterval('auto') + + elasticsearch.bucketAggs.DateHistogram.settings.withMinDocCount("1") + + elasticsearch.bucketAggs.DateHistogram.settings.withTimeZone("utc") + + elasticsearch.bucketAggs.DateHistogram.settings.withTrimEdges(0), + ]) + + elasticsearch.withMetrics([ + elasticsearch.metrics.MetricAggregationWithSettings.Average.withField(aggregationMetric) + + elasticsearch.metrics.MetricAggregationWithSettings.Average.withId("1") + + elasticsearch.metrics.MetricAggregationWithSettings.Average.withType("avg") + ]) + + elasticsearch.withQuery('uuid: $uuid AND parallelism: $parallelism AND profile: ' + metric + ' AND messageSize: $messageSize AND driver.keyword: $driver AND hostNetwork: ' + hostNetwork + ' AND service: ' + service + ' AND acrossAZ: false' ) + + elasticsearch.withTimeField('timestamp') + } +} diff --git a/assets/k8s-netperf/variables.libsonnet b/assets/k8s-netperf/variables.libsonnet index 7f8713c..e5bdca3 100644 --- a/assets/k8s-netperf/variables.libsonnet +++ b/assets/k8s-netperf/variables.libsonnet @@ -14,7 
+14,7 @@ local var = g.dashboard.variable; var.query.new('platform', "{\"find\": \"terms\", \"field\": \"metadata.platform.keyword\"}") + var.query.withDatasourceFromVariable(self.datasource) + var.query.withRefresh(2) - + var.query.selectionOptions.withMulti(false) + + var.query.selectionOptions.withMulti(true) + var.query.selectionOptions.withIncludeAll(true) + var.query.generalOptions.withLabel('Platform'), @@ -33,7 +33,7 @@ local var = g.dashboard.variable; + var.query.selectionOptions.withMulti(true) + var.query.selectionOptions.withIncludeAll(true) + var.query.generalOptions.withLabel('uuid'), - + hostNetwork: var.custom.new('hostNetwork', ['true', 'false'],) + var.custom.selectionOptions.withMulti(true) @@ -45,7 +45,7 @@ local var = g.dashboard.variable; + var.custom.selectionOptions.withMulti(true) + var.custom.selectionOptions.withIncludeAll(true) + var.custom.generalOptions.withLabel('service'), - + streams: var.query.new('parallelism', "{\"find\": \"terms\", \"field\": \"parallelism\", \"query\":\"uuid: $uuid\"}") + var.query.withDatasourceFromVariable(self.datasource) @@ -53,7 +53,7 @@ local var = g.dashboard.variable; + var.query.selectionOptions.withMulti(true) + var.query.selectionOptions.withIncludeAll(true) + var.query.generalOptions.withLabel('streams'), - + throughput_profile: var.query.new('throughput_profile', "{\"find\": \"terms\", \"field\": \"profile.keyword\", \"query\":\"uuid:$uuid\"}") + var.query.withDatasourceFromVariable(self.datasource) @@ -71,7 +71,7 @@ local var = g.dashboard.variable; + var.query.selectionOptions.withMulti(true) + var.query.selectionOptions.withIncludeAll(true) + var.query.generalOptions.withLabel('Latency profile'), - + messageSize: var.query.new('messageSize', "{\"find\": \"terms\", \"field\": \"messageSize\",\"query\":\"uuid:$uuid\"}") + var.query.withDatasourceFromVariable(self.datasource) @@ -87,4 +87,10 @@ local var = g.dashboard.variable; + var.query.selectionOptions.withMulti(false) + 
var.query.selectionOptions.withIncludeAll(false) + var.query.generalOptions.withLabel('Driver'), -} \ No newline at end of file + + compare_by: + var.custom.new('compare_by', ['uuid.keyword', 'metadata.ocpVersion.keyword', 'metadata.clusterName.keyword', 'metadata.ocpShortVersion.keyword', 'metadata.platform.keyword'],) + + var.custom.selectionOptions.withMulti(false) + + var.custom.selectionOptions.withIncludeAll(false) + + var.custom.generalOptions.withLabel('Compare By'), +} diff --git a/templates/CPT/k8s-netperf-v2.jsonnet b/templates/CPT/k8s-netperf-v2.jsonnet index 34c1b5a..b4745cd 100644 --- a/templates/CPT/k8s-netperf-v2.jsonnet +++ b/templates/CPT/k8s-netperf-v2.jsonnet @@ -24,8 +24,11 @@ g.dashboard.new('k8s-netperf') variables.latency_profile, variables.messageSize, variables.driver, + variables.compare_by, ]) + g.dashboard.withPanels([ + panels.row.base('Workload Summary', '', { x: 0, y: 0, w: 24, h: 1 }), + panels.table.workloadSummary('', queries.summary.query('$throughput_profile', 'throughput'), { x: 0, y: 0, w: 24, h: 11 }), panels.row.base('$latency_profile', 'latency_profile', { x: 0, y: 0, w: 24, h: 1 }), panels.timeSeries.base('$latency_profile - $driver - $messageSize', queries.all.query('$latency_profile', 'latency'), { x: 0, y: 0, w: 24, h: 8 }), panels.row.base('$throughput_profile', 'throughput_profile', { x: 0, y: 9, w: 24, h: 1 }), @@ -33,4 +36,16 @@ g.dashboard.new('k8s-netperf') panels.row.base('Parallelism $parallelism', 'parallelism', { x: 0, y: 18, w: 24, h: 1 }), panels.table.base('Throughput - Parallelism: $parallelism', queries.parallelismAll.query('$throughput_profile', 'throughput'), { x: 0, y: 19, w: 24, h: 11 }), panels.table.withLatencyOverrides('Latency - Parallelism: $parallelism', queries.parallelismAll.query('$latency_profile', 'latency'), { x: 0, y: 19, w: 24, h: 11 }), + + panels.row.base('Node to Node', '', { x: 0, y: 20, w: 24, h: 1 }), + panels.barGauge.withThroughput('$throughput_profile - $driver - $messageSize', 
queries.metricCompare.query('TCP_STREAM', 'throughput', true, false), { x: 0, y: 21, w: 11, h: 11 }), + panels.barGauge.withLatency('Latency - $driver - $messageSize', queries.metricCompare.query('TCP_RR', 'latency', true, false), { x: 0, y: 21, w: 11, h: 11 }), + + panels.row.base('Pod to Pod', '', { x: 0, y: 22, w: 24, h: 1 }), + panels.barGauge.withThroughput('$throughput_profile - $driver - $messageSize', queries.metricCompare.query('TCP_STREAM', 'throughput', false, false), { x: 0, y: 23, w: 11, h: 11 }), + panels.barGauge.withLatency('Latency - $driver - $messageSize', queries.metricCompare.query('TCP_RR', 'latency', false, false), { x: 0, y: 23, w: 11, h: 11 }), + + panels.row.base('Pod to Pod via Service', '', { x: 0, y: 24, w: 24, h: 1 }), + panels.barGauge.withThroughput('$throughput_profile - $driver - $messageSize', queries.metricCompare.query('TCP_STREAM', 'throughput', false, true), { x: 0, y: 25, w: 11, h: 11 }), + panels.barGauge.withLatency('Latency - $driver - $messageSize', queries.metricCompare.query('TCP_RR', 'latency', false, true), { x: 0, y: 25, w: 11, h: 11 }), ])