cloud-bulldozer · rsevilla87 · Jan 19, 2024 · Jan 5, 2024
diff --git a/assets/k8s-netperf/panels.libsonnet b/assets/k8s-netperf/panels.libsonnet
@@ -105,7 +105,144 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
             "displayName": "ea7b29d7-8991-4752-a0d4-e26446d34915 TCP_STREAM 4096 Mb/s AWS"
           }
         ]),
-
+    workloadSummary(title, targets, gridPos):  // Table with one row per run UUID showing cluster metadata columns.
+      self.base(title, targets, gridPos)
+      + table.queryOptions.withTransformations([
+        {
+          "id": "organize",  // hide per-sample fields, then order and rename the remaining columns
+          "options": {
+            "excludeByName": {
+              "_id": true,
+              "_index": true,
+              "_type": true,
+              "clientCPU.idleCPU": true,
+              "clientCPU.ioCPU": true,
+              "clientCPU.irqCPU": true,
+              "clientCPU.niceCPU": true,
+              "clientCPU.softCPU": true,
+              "clientCPU.stealCPU": true,
+              "clientCPU.systemCPU": true,
+              "clientCPU.userCPU": true,
+              "clientNodeLabels.beta.kubernetes.io/arch": true,
+              "clientNodeLabels.beta.kubernetes.io/instance-type": true,
+              "clientNodeLabels.beta.kubernetes.io/os": true,
+              "clientNodeLabels.failure-domain.beta.kubernetes.io/region": true,
+              "clientNodeLabels.failure-domain.beta.kubernetes.io/zone": true,
+              "clientNodeLabels.hypershift.openshift.io/managed": true,
+              "clientNodeLabels.hypershift.openshift.io/nodePool": true,
+              "clientNodeLabels.kubernetes.io/arch": true,
+              "clientNodeLabels.kubernetes.io/hostname": true,
+              "clientNodeLabels.kubernetes.io/os": true,
+              "clientNodeLabels.node-role.kubernetes.io/worker": true,
+              "clientNodeLabels.node.kubernetes.io/instance-type": true,
+              "clientNodeLabels.node.openshift.io/os_id": true,
+              "clientNodeLabels.topology.ebs.csi.aws.com/zone": true,
+              "clientNodeLabels.topology.kubernetes.io/region": true,
+              "clientNodeLabels.topology.kubernetes.io/zone": true,
+              "clientPods": true,
+              "confidence": true,
+              "driver": true,
+              "highlight": true,
+              "hostNetwork": true,
+              "latency": true,
+              "local": true,
+              "ltcyMetric": true,
+              "messageSize": true,
+              "metadata.ipsec": true,
+              "metadata.k8sVersion": true,
+              "metadata.kernel": true,
+              "metadata.masterNodesCount": true,
+              "metadata.masterNodesType": true,
+              "metadata.metricName": true,
+              "metadata.mtu": true,
+              "metadata.ocpShortVersion": true,
+              "metadata.totalNodes": true,
+              "parallelism": true,
+              "profile": true,
+              "samples": true,
+              "serverCPU.idleCPU": true,
+              "serverCPU.ioCPU": true,
+              "serverCPU.irqCPU": true,
+              "serverCPU.niceCPU": true,
+              "serverCPU.softCPU": true,
+              "serverCPU.stealCPU": true,
+              "serverCPU.systemCPU": true,
+              "serverCPU.userCPU": true,
+              "serverNodeLabels.beta.kubernetes.io/arch": true,
+              "serverNodeLabels.beta.kubernetes.io/instance-type": true,
+              "serverNodeLabels.beta.kubernetes.io/os": true,
+              "serverNodeLabels.failure-domain.beta.kubernetes.io/region": true,
+              "serverNodeLabels.failure-domain.beta.kubernetes.io/zone": true,
+              "serverNodeLabels.hypershift.openshift.io/managed": true,
+              "serverNodeLabels.hypershift.openshift.io/nodePool": true,
+              "serverNodeLabels.kubernetes.io/arch": true,
+              "serverNodeLabels.kubernetes.io/hostname": true,
+              "serverNodeLabels.kubernetes.io/os": true,
+              "serverNodeLabels.node-role.kubernetes.io/worker": true,
+              "serverNodeLabels.node.kubernetes.io/instance-type": true,
+              "serverNodeLabels.node.openshift.io/os_id": true,
+              "serverNodeLabels.topology.ebs.csi.aws.com/zone": true,
+              "serverNodeLabels.topology.kubernetes.io/region": true,
+              "serverNodeLabels.topology.kubernetes.io/zone": true,
+              "serverPods": true,
+              "service": true,
+              "sort": true,
+              "tcpRetransmits": true,
+              "throughput": true,
+              "tputMetric": true,
+              "udpLossPercent": true
+            },
+            "indexByName": {
+              "uuid": 0,
+              "timestamp": 1,
+              "metadata.platform": 2,
+              "metadata.ocpVersion": 3,
+              "metadata.clusterName": 4,
+              "metadata.sdnType": 5,
+              "metadata.infraNodesCount": 6,
+              "metadata.infraNodesType": 7,
+              "metadata.workerNodesCount": 8,
+              "metadata.workerNodesType": 9,
+              "acrossAZ": 10,  // top-level field name, same key that renameByName maps to "Multi-AZ"
+              "metadata.region": 11
+            },
+            "renameByName": {
+              "acrossAZ": "Multi-AZ",
+              "metadata.clusterName": "Cluster Name",
+              "metadata.infraNodesCount": "Infras",
+              "metadata.infraNodesType": "Infra Type",
+              "metadata.ocpVersion": "Version",
+              "metadata.platform": "Platform",
+              "metadata.region": "Region",
+              "metadata.sdnType": "SDN",
+              "metadata.workerNodesCount": "Workers",
+              "metadata.workerNodesType": "Workers Type",
+              "timestamp": "Timestamp",
+              "uuid": "UUID"
+            }
+          }
+        },
+        {
+          "id": "groupBy",  // collapse all rows sharing a UUID into one, keeping the last value of each column
+          "options": {
+            "fields": {
+              "UUID":         {"aggregations": [], "operation": "groupby"},
+              "Cluster Name": {"aggregations": ["lastNotNull"], "operation": "aggregate"},
+              "Infra Type":   {"aggregations": ["lastNotNull"], "operation": "aggregate"},
+              "Infras":       {"aggregations": ["lastNotNull"], "operation": "aggregate"},
+              "Platform":     {"aggregations": ["lastNotNull"], "operation": "aggregate"},
+              "Region":       {"aggregations": ["lastNotNull"], "operation": "aggregate"},
+              "SDN":          {"aggregations": ["lastNotNull"], "operation": "aggregate"},
+              "Timestamp":    {"aggregations": ["lastNotNull"], "operation": "aggregate"},
+              "Version":      {"aggregations": ["lastNotNull"], "operation": "aggregate"},
+              "Workers":      {"aggregations": ["lastNotNull"], "operation": "aggregate"},
+              "Workers Type": {"aggregations": ["lastNotNull"], "operation": "aggregate"},
+              "duration":     {"aggregations": ["lastNotNull"], "operation": "aggregate"},
+              "Multi-AZ":     {"aggregations": ["last"], "operation": "aggregate"}
+            }
+          }
+        }
+      ]),
     withLatencyOverrides(title, targets, gridPos):
       self.base(title, targets, gridPos)
       + table.queryOptions.withTransformations([
@@ -168,7 +305,38 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
         }
       ]),
   },
+  barGauge: {
+    local barGauge = g.panel.barGauge,
+    local options = barGauge.options,
+    local reduceOptions = options.reduceOptions,
 
+    base(title, targets, gridPos):  // Horizontal gradient bar gauge on the $datasource Elasticsearch source, repeated per messageSize.
+      barGauge.new(title)
+      + barGauge.queryOptions.withTargets(targets)
+      + barGauge.datasource.withType('elasticsearch')
+      + barGauge.datasource.withUid('$datasource')
+      + reduceOptions.withValues(false)
+      + reduceOptions.withCalcs(["lastNotNull"])
+      + reduceOptions.withFields("")
+      + options.withOrientation("horizontal")
+      + options.withDisplayMode("gradient")
+      + options.withValueMode("color")
+      + barGauge.panelOptions.withRepeat("messageSize")
+      + barGauge.standardOptions.withMin(0)  // numeric lower bound (was the string "0")
+      + barGauge.standardOptions.color.withMode("palette-classic")
+      + barGauge.gridPos.withX(gridPos.x)
+      + barGauge.gridPos.withY(gridPos.y)
+      + barGauge.gridPos.withH(gridPos.h)
+      + barGauge.gridPos.withW(gridPos.w),
+
+    withThroughput(title, targets, gridPos):  // base gauge with values shown in the "Mbits" unit
+      self.base(title, targets, gridPos)
+      + barGauge.standardOptions.withUnit("Mbits"),
+
+    withLatency(title, targets, gridPos):  // base gauge with values shown in the "µs" unit
+      self.base(title, targets, gridPos)
+      + barGauge.standardOptions.withUnit("µs"),
+  },
   timeSeries: {
     local timeSeries = g.panel.timeSeries,
     local custom = timeSeries.fieldConfig.defaults.custom,
@@ -246,4 +414,4 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
       }
     ]),
   },
-}
+}
diff --git a/assets/k8s-netperf/queries.libsonnet b/assets/k8s-netperf/queries.libsonnet
@@ -3,7 +3,7 @@ local elasticsearch = g.query.elasticsearch;
 
 {
   all: {
-    query(metric, aggregationMetric): 
+    query(metric, aggregationMetric):
         elasticsearch.withAlias("{{metadata.ocpVersion.keyword}} hostNetwork={{hostNetwork}} procs={{parallelism}}")
         + elasticsearch.withBucketAggs([
           elasticsearch.bucketAggs.Terms.withField("messageSize")
@@ -79,7 +79,7 @@ local elasticsearch = g.query.elasticsearch;
         + elasticsearch.withTimeField('timestamp')
   },
   parallelismAll: {
-    query(metric, aggregationMetric): 
+    query(metric, aggregationMetric):
         elasticsearch.withAlias("")
         + elasticsearch.withBucketAggs([
           elasticsearch.bucketAggs.Terms.withField("uuid.keyword")
@@ -161,4 +161,52 @@ local elasticsearch = g.query.elasticsearch;
         + elasticsearch.withQuery('uuid: $uuid AND parallelism: $parallelism AND profile: ' + metric + ' AND messageSize: $messageSize AND driver.keyword: $driver AND metadata.platform: $platform AND hostNetwork: $hostNetwork AND service: $service')
         + elasticsearch.withTimeField('timestamp')
   },
-}
+  summary: {
+    query(metric, aggregationMetric):  // raw documents (up to 500) matching the dashboard filters; aggregationMetric is not used here
+      local rawData = elasticsearch.metrics.MetricAggregationWithSettings.RawData;
+      elasticsearch.withAlias("")
+      + elasticsearch.withBucketAggs([])
+      + elasticsearch.withMetrics([
+        rawData.withHide(false)
+        + rawData.withId("1")
+        + rawData.withType("raw_data")
+        + rawData.settings.withSize("500"),
+      ])
+      + elasticsearch.withQuery('uuid: $uuid AND parallelism: $parallelism AND profile: ' + metric + ' AND messageSize: $messageSize AND driver.keyword: $driver AND metadata.platform: $platform AND hostNetwork: $hostNetwork AND service: $service')
+      + elasticsearch.withTimeField('timestamp')
+  },
+  metricCompare: {
+    query(metric, aggregationMetric, hostNetwork, service):  // avg(aggregationMetric) bucketed by parallelism, $compare_by and time
+      elasticsearch.withAlias("{{$compare_by}} Procs: {{parallelism}}")
+      + elasticsearch.withBucketAggs([
+        elasticsearch.bucketAggs.Terms.withField("parallelism")
+        + elasticsearch.bucketAggs.Terms.withId("1")
+        + elasticsearch.bucketAggs.Terms.withType('terms')
+        + elasticsearch.bucketAggs.Terms.settings.withOrder('asc')
+        + elasticsearch.bucketAggs.Terms.settings.withOrderBy('_term')
+        + elasticsearch.bucketAggs.Terms.settings.withMinDocCount('1')
+        + elasticsearch.bucketAggs.Terms.settings.withSize("0"),
+        elasticsearch.bucketAggs.Terms.withField("$compare_by")  // field name comes from the compare_by dashboard variable
+        + elasticsearch.bucketAggs.Terms.withId("2")
+        + elasticsearch.bucketAggs.Terms.withType('terms')
+        + elasticsearch.bucketAggs.Terms.settings.withOrder('desc')
+        + elasticsearch.bucketAggs.Terms.settings.withOrderBy('_term')
+        + elasticsearch.bucketAggs.Terms.settings.withMinDocCount('1')
+        + elasticsearch.bucketAggs.Terms.settings.withSize("10"),
+        elasticsearch.bucketAggs.DateHistogram.withField('timestamp')
+        + elasticsearch.bucketAggs.DateHistogram.withId("3")
+        + elasticsearch.bucketAggs.DateHistogram.withType('date_histogram')
+        + elasticsearch.bucketAggs.DateHistogram.settings.withInterval('auto')
+        + elasticsearch.bucketAggs.DateHistogram.settings.withMinDocCount("1")
+        + elasticsearch.bucketAggs.DateHistogram.settings.withTimeZone("utc")
+        + elasticsearch.bucketAggs.DateHistogram.settings.withTrimEdges(0),
+      ])
+      + elasticsearch.withMetrics([
+        elasticsearch.metrics.MetricAggregationWithSettings.Average.withField(aggregationMetric)
+        + elasticsearch.metrics.MetricAggregationWithSettings.Average.withId("1")  // use the Average helpers, not RawData, for an avg metric
+        + elasticsearch.metrics.MetricAggregationWithSettings.Average.withType("avg")
+      ])
+      + elasticsearch.withQuery('uuid: $uuid AND parallelism: $parallelism AND profile: ' + metric + ' AND messageSize: $messageSize AND driver.keyword: $driver AND hostNetwork: ' + hostNetwork + ' AND service: ' + service + ' AND acrossAZ: false')
+      + elasticsearch.withTimeField('timestamp')
+  }
+}
diff --git a/assets/k8s-netperf/variables.libsonnet b/assets/k8s-netperf/variables.libsonnet
@@ -14,7 +14,7 @@ local var = g.dashboard.variable;
     var.query.new('platform', "{\"find\": \"terms\", \"field\": \"metadata.platform.keyword\"}")
     + var.query.withDatasourceFromVariable(self.datasource)
     + var.query.withRefresh(2)
-    + var.query.selectionOptions.withMulti(false)
+    + var.query.selectionOptions.withMulti(true)
     + var.query.selectionOptions.withIncludeAll(true)
     + var.query.generalOptions.withLabel('Platform'),
 
@@ -33,7 +33,7 @@ local var = g.dashboard.variable;
     + var.query.selectionOptions.withMulti(true)
     + var.query.selectionOptions.withIncludeAll(true)
     + var.query.generalOptions.withLabel('uuid'),
-  
+
   hostNetwork:
     var.custom.new('hostNetwork', ['true', 'false'],)
     + var.custom.selectionOptions.withMulti(true)
@@ -45,15 +45,15 @@ local var = g.dashboard.variable;
     + var.custom.selectionOptions.withMulti(true)
     + var.custom.selectionOptions.withIncludeAll(true)
     + var.custom.generalOptions.withLabel('service'),
-  
+
   streams:
     var.query.new('parallelism', "{\"find\": \"terms\", \"field\": \"parallelism\", \"query\":\"uuid: $uuid\"}")
     + var.query.withDatasourceFromVariable(self.datasource)
     + var.query.withRefresh(2)
     + var.query.selectionOptions.withMulti(true)
     + var.query.selectionOptions.withIncludeAll(true)
     + var.query.generalOptions.withLabel('streams'),
-  
+
   throughput_profile:
     var.query.new('throughput_profile', "{\"find\": \"terms\", \"field\": \"profile.keyword\", \"query\":\"uuid:$uuid\"}")
     + var.query.withDatasourceFromVariable(self.datasource)
@@ -71,7 +71,7 @@ local var = g.dashboard.variable;
     + var.query.selectionOptions.withMulti(true)
     + var.query.selectionOptions.withIncludeAll(true)
     + var.query.generalOptions.withLabel('Latency profile'),
-  
+
   messageSize:
     var.query.new('messageSize', "{\"find\": \"terms\", \"field\": \"messageSize\",\"query\":\"uuid:$uuid\"}")
     + var.query.withDatasourceFromVariable(self.datasource)
@@ -87,4 +87,10 @@ local var = g.dashboard.variable;
     + var.query.selectionOptions.withMulti(false)
     + var.query.selectionOptions.withIncludeAll(false)
     + var.query.generalOptions.withLabel('Driver'),
-}
+
+  compare_by:  // Custom variable listing the document fields a comparison can be grouped by.
+    var.custom.new('compare_by', ['uuid.keyword', 'metadata.ocpVersion.keyword', 'metadata.clusterName.keyword', 'metadata.ocpShortVersion.keyword', 'metadata.platform.keyword'],)
+    + var.custom.selectionOptions.withMulti(false)  // single selection: queries.metricCompare uses $compare_by as one terms-aggregation field
+    + var.custom.selectionOptions.withIncludeAll(false)
+    + var.custom.generalOptions.withLabel('Compare By'),
+}
diff --git a/templates/CPT/k8s-netperf-v2.jsonnet b/templates/CPT/k8s-netperf-v2.jsonnet
@@ -24,13 +24,28 @@ g.dashboard.new('k8s-netperf')
   variables.latency_profile,
   variables.messageSize,
   variables.driver,
+  variables.compare_by,
 ])
 + g.dashboard.withPanels([
+  panels.row.base('Workload Summary', '', { x: 0, y: 0, w: 24, h: 1 }),  // h: 1 like every other row in this list (was 0)
+  panels.table.workloadSummary('', queries.summary.query('$throughput_profile', 'throughput'), { x: 0, y: 0, w: 24, h: 11 }),
   panels.row.base('$latency_profile', 'latency_profile', { x: 0, y: 0, w: 24, h: 1 }),
   panels.timeSeries.base('$latency_profile - $driver - $messageSize', queries.all.query('$latency_profile', 'latency'), { x: 0, y: 0, w: 24, h: 8 }),
   panels.row.base('$throughput_profile', 'throughput_profile', { x: 0, y: 9, w: 24, h: 1 }),
   panels.timeSeries.withThroughputOverrides('$throughput_profile - $driver - $messageSize', queries.all.query('$throughput_profile', 'throughput'), { x: 0, y: 10, w: 24, h: 8 }),
   panels.row.base('Parallelism $parallelism', 'parallelism', { x: 0, y: 18, w: 24, h: 1 }),
   panels.table.base('Throughput - Parallelism: $parallelism', queries.parallelismAll.query('$throughput_profile', 'throughput'), { x: 0, y: 19, w: 24, h: 11 }),
   panels.table.withLatencyOverrides('Latency - Parallelism: $parallelism', queries.parallelismAll.query('$latency_profile', 'latency'), { x: 0, y: 19, w: 24, h: 11 }),
+
+  panels.row.base('Node to Node', '', { x: 0, y: 20, w: 24, h: 1 }),  // metricCompare(..., hostNetwork=true, service=false)
+  panels.barGauge.withThroughput('Throughput - $driver - $messageSize', queries.metricCompare.query('TCP_STREAM', 'throughput', true, false), { x: 0, y: 21, w: 11, h: 11 }),
+  panels.barGauge.withLatency('Latency - $driver - $messageSize', queries.metricCompare.query('TCP_RR', 'latency', true, false), { x: 0, y: 21, w: 11, h: 11 }),
+
+  panels.row.base('Pod to Pod', '', { x: 0, y: 22, w: 24, h: 1 }),  // metricCompare(..., hostNetwork=false, service=false)
+  panels.barGauge.withThroughput('Throughput - $driver - $messageSize', queries.metricCompare.query('TCP_STREAM', 'throughput', false, false), { x: 0, y: 23, w: 11, h: 11 }),
+  panels.barGauge.withLatency('Latency - $driver - $messageSize', queries.metricCompare.query('TCP_RR', 'latency', false, false), { x: 0, y: 23, w: 11, h: 11 }),
+
+  panels.row.base('Pod to Pod via Service', '', { x: 0, y: 24, w: 24, h: 1 }),  // metricCompare(..., hostNetwork=false, service=true)
+  panels.barGauge.withThroughput('Throughput - $driver - $messageSize', queries.metricCompare.query('TCP_STREAM', 'throughput', false, true), { x: 0, y: 25, w: 11, h: 11 }),
+  panels.barGauge.withLatency('Latency - $driver - $messageSize', queries.metricCompare.query('TCP_RR', 'latency', false, true), { x: 0, y: 25, w: 11, h: 11 }),
 ])