From 5f676d12091580c48f249d759751999ab8eb4ca3 Mon Sep 17 00:00:00 2001 From: Ben Strauss <81588812+straussb@users.noreply.github.com> Date: Wed, 3 Apr 2024 15:48:36 -0400 Subject: [PATCH] Add support for Elastic Fabric Adapter (EFA) metrics. (#1117) --- .../emf_and_kubernetes_with_gpu_config.yaml | 70 +++++++++++++++++++ .../otel/exporter/awsemf/kubernetes.go | 59 ++++++++++++++++ .../otel/exporter/awsemf/translator_test.go | 18 +++++ 3 files changed, 147 insertions(+) diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index 248ace6500..c69bd44647 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -538,6 +538,76 @@ exporters: - node_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected - node_neurondevice_hw_ecc_events_total_sram_ecc_corrected - node_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + - - ClusterName + - ContainerName + - EfaDevice + - FullPodName + - Namespace + - PodName + label_matchers: [] + metric_name_selectors: + - container_efa_rx_bytes + - container_efa_tx_bytes + - container_efa_rx_dropped + - container_efa_rdma_read_bytes + - container_efa_rdma_write_bytes + - container_efa_rdma_write_recv_bytes + - dimensions: + - - ClusterName + - - ClusterName + - Namespace + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - EfaDevice + - FullPodName + - Namespace + - PodName + label_matchers: [] + metric_name_selectors: + - pod_efa_rx_bytes + - pod_efa_tx_bytes + - pod_efa_rx_dropped + - pod_efa_rdma_read_bytes + - pod_efa_rdma_write_bytes + - pod_efa_rdma_write_recv_bytes + - dimensions: + - - ClusterName + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + - EfaDevice + - InstanceId + - InstanceType + - NodeName + label_matchers: [] + metric_name_selectors: + - node_efa_rx_bytes + - node_efa_tx_bytes + - node_efa_rx_dropped + - node_efa_rdma_read_bytes + - node_efa_rdma_write_bytes + - node_efa_rdma_write_recv_bytes metric_descriptors: - metric_name: apiserver_admission_controller_admission_duration_seconds overwrite: true diff --git a/translator/translate/otel/exporter/awsemf/kubernetes.go b/translator/translate/otel/exporter/awsemf/kubernetes.go index 7b73f38998..7547a4e563 100644 --- a/translator/translate/otel/exporter/awsemf/kubernetes.go +++ b/translator/translate/otel/exporter/awsemf/kubernetes.go @@ -52,6 +52,8 @@ func setKubernetesMetricDeclaration(conf *confmap.Conf, cfg *awsemfexporter.Conf // Setup Aws Neuron metrics kubernetesMetricDeclarations = append(kubernetesMetricDeclarations, getAwsNeuronMetricDeclarations(conf)...) + kubernetesMetricDeclarations = append(kubernetesMetricDeclarations, getEFAMetricDeclarations(conf)...) + cfg.MetricDeclarations = kubernetesMetricDeclarations cfg.MetricDescriptors = getControlPlaneMetricDescriptors(conf) @@ -615,3 +617,60 @@ func getAwsNeuronMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.Metric } return metricDeclarations } + +func getEFAMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclaration { + var metricDeclarations []*awsemfexporter.MetricDeclaration + if awscontainerinsight.EnhancedContainerInsightsEnabled(conf) && awscontainerinsight.AcceleratedComputeMetricsEnabled(conf) { + metricDeclarations = []*awsemfexporter.MetricDeclaration{ + { + Dimensions: [][]string{ + {"ClusterName"}, + {"ClusterName", "Namespace", "PodName", "ContainerName"}, + {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, + {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "EfaDevice"}}, + MetricNameSelectors: []string{ + "container_efa_rx_bytes", + "container_efa_tx_bytes", + "container_efa_rx_dropped", + "container_efa_rdma_read_bytes", + "container_efa_rdma_write_bytes", + "container_efa_rdma_write_recv_bytes", + }, + }, + { + Dimensions: [][]string{ + {"ClusterName"}, + {"ClusterName", "Namespace"}, + {"ClusterName", "Namespace", "Service"}, + {"ClusterName", "Namespace", "PodName"}, + {"ClusterName", "Namespace", "PodName", "FullPodName"}, + {"ClusterName", "Namespace", "PodName", "FullPodName", "EfaDevice"}, + }, + MetricNameSelectors: []string{ + "pod_efa_rx_bytes", + "pod_efa_tx_bytes", + "pod_efa_rx_dropped", + "pod_efa_rdma_read_bytes", + "pod_efa_rdma_write_bytes", + "pod_efa_rdma_write_recv_bytes", + }, + }, + { + Dimensions: [][]string{ + {"ClusterName"}, + {"ClusterName", "NodeName", "InstanceId"}, + {"ClusterName", "NodeName", "InstanceId", "InstanceType", "EfaDevice"}, + }, + MetricNameSelectors: []string{ + "node_efa_rx_bytes", + "node_efa_tx_bytes", + "node_efa_rx_dropped", + "node_efa_rdma_read_bytes", + "node_efa_rdma_write_bytes", + "node_efa_rdma_write_recv_bytes", + }, + }, + } + } + return metricDeclarations +} diff --git a/translator/translate/otel/exporter/awsemf/translator_test.go b/translator/translate/otel/exporter/awsemf/translator_test.go index 4c41f05ad9..faaa856aba 100644 --- a/translator/translate/otel/exporter/awsemf/translator_test.go +++ b/translator/translate/otel/exporter/awsemf/translator_test.go @@ -513,6 +513,24 @@ func TestTranslator(t *testing.T) { "node_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", }, }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "EfaDevice"}}, + MetricNameSelectors: []string{ + "container_efa_rx_bytes", "container_efa_tx_bytes", "container_efa_rx_dropped", "container_efa_rdma_read_bytes", "container_efa_rdma_write_bytes", "container_efa_rdma_write_recv_bytes", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "EfaDevice"}}, + MetricNameSelectors: []string{ + "pod_efa_rx_bytes", "pod_efa_tx_bytes", "pod_efa_rx_dropped", "pod_efa_rdma_read_bytes", "pod_efa_rdma_write_bytes", "pod_efa_rdma_write_recv_bytes", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId"}, {"ClusterName", "NodeName", "InstanceId", "InstanceType", "EfaDevice"}}, + MetricNameSelectors: []string{ + "node_efa_rx_bytes", "node_efa_tx_bytes", "node_efa_rx_dropped", "node_efa_rdma_read_bytes", "node_efa_rdma_write_bytes", "node_efa_rdma_write_recv_bytes", + }, + }, }, "metric_descriptors": []awsemfexporter.MetricDescriptor{ {