From c9fbf915216328caa32ec41c7811c5c1b8c2ba1f Mon Sep 17 00:00:00 2001 From: Aditya Purang <44022838+aditya-purang@users.noreply.github.com> Date: Wed, 3 Apr 2024 23:05:32 +0100 Subject: [PATCH] Update metrics for AWS Neuron (#1104) --- internal/containerinsightscommon/const.go | 2 +- .../internal/awsneuron_metric_modifier.go | 129 +++++---- .../awsneuron_metric_modifier_test.go | 81 +++--- .../emf_and_kubernetes_with_gpu_config.yaml | 259 ++++++++---------- .../otel/exporter/awsemf/kubernetes.go | 28 +- .../otel/exporter/awsemf/translator_test.go | 28 +- 6 files changed, 247 insertions(+), 280 deletions(-) diff --git a/internal/containerinsightscommon/const.go b/internal/containerinsightscommon/const.go index 49e615ad70..6a0a44706f 100644 --- a/internal/containerinsightscommon/const.go +++ b/internal/containerinsightscommon/const.go @@ -94,7 +94,7 @@ const ( NeuronCoreMemoryUtilizationSharedScratchpad = "neuroncore_memory_usage_model_shared_scratchpad" NeuronCoreMemoryUtilizationRuntimeMemory = "neuroncore_memory_usage_runtime_memory" NeuronCoreMemoryUtilizationTensors = "neuroncore_memory_usage_tensors" - NeuronDeviceHardwareEccEvents = "neurondevice_hw_ecc_events_total" + NeuronDeviceHardwareEccEvents = "neurondevice_hw_ecc_events" NeuronExecutionStatus = "neuron_execution_status" NeuronExecutionErrors = "neuron_execution_errors" NeuronRuntimeMemoryUsage = "neurondevice_runtime_memory_used_bytes" diff --git a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go index 8d46809608..a979282e20 100644 --- a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go +++ b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go @@ -14,39 +14,40 @@ import ( ) const ( - aggregatedMetricSuffix = "_total" - ErrorType = "error_type" - StatusType = "status_type" - EventType = "event_type" - logTypeSuffix = "AWSNeuron" - MemoryLocation = "memory_location" - - 
Core = "Core" - Device = "Device" - Percentile = "percentile" - PodName = "PodName" - Count = "Count" - Bytes = "Bytes" - Seconds = "Seconds" - Percent = "Percent" - NeuronCoreAttributeKey = "NeuronCore" - NeuronDeviceAttributeKey = "NeuronDevice" - RuntimeTag = "runtime_tag" - ClusterName = "ClusterName" - ContainerName = "ContainerName" - FullPodName = "FullPodName" - InstanceId = "InstanceId" - InstanceType = "InstanceType" - K8sPodName = "K8sPodName" - Namespace = "Namespace" - NeuronCore = "NeuronCore" - NeuronDevice = "NeuronDevice" - NodeName = "NodeName" - Service = "Service" - AvailabilityZone = "availability_zone" - Kubernetes = "kubernetes" - Region = "region" - SubnetId = "subnet_id" + ErrorType = "error_type" + StatusType = "status_type" + EventType = "event_type" + logTypeSuffix = "AWSNeuron" + MemoryLocation = "memory_location" + + Core = "Core" + Device = "Device" + Percentile = "percentile" + PodName = "PodName" + Count = "Count" + Bytes = "Bytes" + Seconds = "Seconds" + Percent = "Percent" + NeuronCoreAttributeKey = "NeuronCore" + NeuronDeviceAttributeKey = "NeuronDevice" + RuntimeTag = "runtime_tag" + ClusterName = "ClusterName" + ContainerName = "ContainerName" + FullPodName = "FullPodName" + InstanceId = "InstanceId" + InstanceType = "InstanceType" + K8sPodName = "K8sPodName" + Namespace = "Namespace" + NeuronCore = "NeuronCore" + NeuronDevice = "NeuronDevice" + NodeName = "NodeName" + Service = "Service" + AvailabilityZone = "availability_zone" + Kubernetes = "kubernetes" + Region = "region" + SubnetId = "subnet_id" + NeuronExecutionErrorsAggregatedMetric = containerinsightscommon.NeuronExecutionErrors + "_total" + NeuronDeviceHardwareEccEventsAggregatedMetric = containerinsightscommon.NeuronDeviceHardwareEccEvents + "_total" ) type AwsNeuronMetricModifier struct { @@ -60,6 +61,12 @@ type MetricModifications struct { Unit string } +type MetricDatapointAggregationKey struct { + runtimeTag string + aggregatedMetricName string + deviceId string 
+} + var ( metricModificationsMap = map[string]MetricModifications{ containerinsightscommon.NeuronExecutionErrors: {DuplicationTypes: []string{containerinsightscommon.TypeNode}, UniqueAttribute: ErrorType, LogTypeSuffix: "", Unit: Count}, @@ -79,6 +86,20 @@ var ( } attributeValuePrefixingMap = map[string]string{NeuronCoreAttributeKey: "core", NeuronDeviceAttributeKey: "device"} + uniquesDatapointsToAggregatedMetricMappings = map[string]map[string]string{ + containerinsightscommon.NeuronExecutionErrors: {"generic": NeuronExecutionErrorsAggregatedMetric, + "numerical": NeuronExecutionErrorsAggregatedMetric, + "transient": NeuronExecutionErrorsAggregatedMetric, + "model": NeuronExecutionErrorsAggregatedMetric, + "runtime": NeuronExecutionErrorsAggregatedMetric, + "hardware": NeuronExecutionErrorsAggregatedMetric}, + // execution_status metric will be added here incrementally + containerinsightscommon.NeuronDeviceHardwareEccEvents: {"mem_ecc_corrected": NeuronDeviceHardwareEccEventsAggregatedMetric, + "mem_ecc_uncorrected": NeuronDeviceHardwareEccEventsAggregatedMetric, + "sram_ecc_corrected": NeuronDeviceHardwareEccEventsAggregatedMetric, + "sram_ecc_uncorrected": NeuronDeviceHardwareEccEventsAggregatedMetric}, + } + MetricAttributesToKeep = map[string]struct{}{ ClusterName: {}, ContainerName: {}, @@ -171,6 +192,7 @@ func keepSpecificDatapointBasedOnAttribute(originalMetric pmetric.Metric, attrib // It also creates a new metric for each datapoint based on the unique target attribute. 
// example : // in: unique_target_attribute = error_type +// and error_type: A,B,C need to be aggregated in neuron_execution_errors_total metric then // // neuron_execution_errors { // datapoints : [ @@ -206,34 +228,43 @@ func (md *AwsNeuronMetricModifier) extractDatapointsAsMetricsAndAggregate(origin } originalMetricDatapoints := originalMetric.Sum().DataPoints() - aggregatedValuesPerRuntimeTag := map[string]float64{} + + aggregatedValuesPerRuntimeTag := map[MetricDatapointAggregationKey]float64{} + uniqueAttributeToAggregatedMetricMappings, needsAggregation := uniquesDatapointsToAggregatedMetricMappings[originalMetric.Name()] for i := 0; i < originalMetricDatapoints.Len(); i++ { originalDatapoint := originalMetricDatapoints.At(i) - runtimeTag, _ := originalDatapoint.Attributes().Get(RuntimeTag) - aggregatedValuesPerRuntimeTag[runtimeTag.AsString()] += originalDatapoint.DoubleValue() + deviceId, _ := originalDatapoint.Attributes().Get(NeuronDeviceAttributeKey) + uniqueAttributeValue, _ := originalDatapoint.Attributes().Get(uniqueAttribute) + + // only add to the aggregation map if the datapoint to aggregated metric mappings are defined for the original metric + if needsAggregation { + aggregatedMetricName := uniqueAttributeToAggregatedMetricMappings[uniqueAttributeValue.Str()] + aggregatedValuesPerRuntimeTag[MetricDatapointAggregationKey{runtimeTag: runtimeTag.Str(), aggregatedMetricName: aggregatedMetricName, deviceId: deviceId.Str()}] += originalDatapoint.DoubleValue() + } // Creating a new metric from the current datapoint and adding it to the new newMetricSlice - subtypeValue, _ := originalDatapoint.Attributes().Get(uniqueAttribute) - newNameMetric := setMetricMetadata(newMetricSlice.AppendEmpty(), originalMetric.Name()+"_"+subtypeValue.Str(), originalMetric.Unit()) + newNameMetric := setMetricMetadata(newMetricSlice.AppendEmpty(), originalMetric.Name()+"_"+uniqueAttributeValue.Str(), originalMetric.Unit()) 
originalDatapoint.CopyTo(newNameMetric.SetEmptySum().DataPoints().AppendEmpty()) // setting value of temporality to cumulative so that agent performs delta conversion on this metric newNameMetric.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) } - if originalMetric.Name() != containerinsightscommon.NeuronDeviceHardwareEccEvents { - // Creating body for the aggregated metric and add it to the new newMetricSlice for each runtime - for runtimeTag, value := range aggregatedValuesPerRuntimeTag { - // Aggregated metric for neuron device ecc events is not required - aggregatedMetric := setMetricMetadata(newMetricSlice.AppendEmpty(), originalMetric.Name()+aggregatedMetricSuffix, originalMetric.Unit()) + // Creating body for the aggregated metric and add it to the new newMetricSlice for each runtime + for aggregatedMetricMetadata, value := range aggregatedValuesPerRuntimeTag { + // One aggregated datapoint is emitted per (runtime tag, aggregated metric name, device id) key + aggregatedMetric := setMetricMetadata(newMetricSlice.AppendEmpty(), aggregatedMetricMetadata.aggregatedMetricName, originalMetric.Unit()) - originalMetricDatapoints.At(0).CopyTo(aggregatedMetric.SetEmptySum().DataPoints().AppendEmpty()) - aggregatedMetric.Sum().DataPoints().At(0).SetDoubleValue(value) - aggregatedMetric.Sum().DataPoints().At(0).Attributes().PutStr(RuntimeTag, runtimeTag) + originalMetricDatapoints.At(0).CopyTo(aggregatedMetric.SetEmptySum().DataPoints().AppendEmpty()) + aggregatedMetric.Sum().DataPoints().At(0).SetDoubleValue(value) + aggregatedMetric.Sum().DataPoints().At(0).Attributes().PutStr(RuntimeTag, aggregatedMetricMetadata.runtimeTag) - // setting value of temporality to cumulative so that agent performs delta conversion on this metric - aggregatedMetric.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) + if aggregatedMetricMetadata.deviceId != "" { + aggregatedMetric.Sum().DataPoints().At(0).Attributes().PutStr(NeuronDeviceAttributeKey,
aggregatedMetricMetadata.deviceId) } + + // setting value of temporality to cumulative so that agent performs delta conversion on this metric + aggregatedMetric.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) } return newMetricSlice @@ -271,7 +302,7 @@ func prefixCoreAndDeviceLabels(originalMetric pmetric.Metric) { dp := dps.At(i) for attributeKey, attributeValuePrefix := range attributeValuePrefixingMap { if value, exists := dp.Attributes().Get(attributeKey); exists { - dp.Attributes().PutStr(attributeKey, attributeValuePrefix+value.AsString()) + dp.Attributes().PutStr(attributeKey, attributeValuePrefix+value.Str()) } } } diff --git a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go index ba17806c8b..9942c877bf 100644 --- a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go +++ b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go @@ -33,7 +33,7 @@ const ( NeuronCoreMemoryUsageModelSharedScratchpad = "neuroncore_memory_usage_model_shared_scratchpad" NeuronDeviceRuntimeMemoryUsedBytes = "neurondevice_runtime_memory_used_bytes" NeuronExecutionLatency = "neuron_execution_latency" - NeuronDeviceHwEccEventsTotal = "neurondevice_hw_ecc_events_total" + NeuronDeviceHwEccEvents = "neurondevice_hw_ecc_events" NeuronDeviceIndex = "neuron_device_index" DummyPod = "DummyPod" Type = "Type" @@ -60,7 +60,7 @@ var metricNameToMetricLayout = map[string]MetricDefinition{ NeuronCoreMemoryUsageModelSharedScratchpad: {MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{1, 2, 3}, SpecialAttributes: [][]string{{NeuronCore, "0", NeuronDevice, "0", MemoryLocation, "None", PodName, DummyPod}, {NeuronCore, "1", NeuronDevice, "0", MemoryLocation, "None", PodName, DummyPod}, {NeuronCore, "2", NeuronDevice, "1", MemoryLocation, "None", PodName, DummyPod}}, Unit: Bytes}, NeuronDeviceRuntimeMemoryUsedBytes: 
{MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{1, 2}, SpecialAttributes: [][]string{{MemoryLocation, "host"}, {MemoryLocation, "neuron_device"}}, Unit: Bytes}, NeuronExecutionLatency: {MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{0, 0, 0, 0, 1, 0, 0}, SpecialAttributes: [][]string{{Percentile, "p0"}, {Percentile, "p1"}, {Percentile, "p100"}, {Percentile, "p25"}, {Percentile, "p50"}, {Percentile, "p75"}, {Percentile, "p99"}}, Unit: Seconds}, - NeuronDeviceHwEccEventsTotal: {MetricType: pmetric.MetricTypeSum, MetricValues: []float64{1, 2, 3, 4}, SpecialAttributes: [][]string{{NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "mem_ecc_corrected", PodName, DummyPod, RuntimeTag, "1"}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "mem_ecc_uncorrected", PodName, DummyPod, RuntimeTag, "1"}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "sram_ecc_corrected", PodName, DummyPod, RuntimeTag, "1"}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "sram_ecc_uncorrected", PodName, DummyPod, RuntimeTag, "1"}}, Unit: Count}, + NeuronDeviceHwEccEvents: {MetricType: pmetric.MetricTypeSum, MetricValues: []float64{1, 2, 3, 4}, SpecialAttributes: [][]string{{NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "mem_ecc_corrected", PodName, DummyPod, RuntimeTag, "1"}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "mem_ecc_uncorrected", PodName, DummyPod, RuntimeTag, "1"}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "sram_ecc_corrected", PodName, DummyPod, RuntimeTag, "1"}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "sram_ecc_uncorrected", PodName, DummyPod, RuntimeTag, "1"}}, Unit: Count}, } func setupMetricModifier() *AwsNeuronMetricModifier { @@ -117,7 +117,6 @@ func TestMetricModifierForExecutionStatusMetric(t *testing.T) { "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, 
[]float64{4}, pmetric.MetricTypeSum, Count), "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_total": createExpectedMetric("node_neuron_execution_status_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{21}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -170,23 +169,26 @@ func TestMetricModifierForNeuronDeviceRuntimeMemoryUsageMetric(t *testing.T) { func TestMetricModifierForNeuronDeviceEccEventMetric(t *testing.T) { metricModifier := setupMetricModifier() metricsList := pmetric.NewMetricSlice() - createActualMetricForKey(NeuronDeviceHwEccEventsTotal).CopyTo(metricsList.AppendEmpty()) + createActualMetricForKey(NeuronDeviceHwEccEvents).CopyTo(metricsList.AppendEmpty()) metricModifier.ModifyMetric(metricsList.At(0), metricsList) expectedMetrics := map[string]pmetric.Metric{ - NeuronDeviceHwEccEventsTotal: metricsList.At(0), - "node_neurondevice_hw_ecc_events_total_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_total_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - 
"node_neurondevice_hw_ecc_events_total_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_total_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_total_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_total_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_total_mem_ecc_corrected", false, 
[]map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_total_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + NeuronDeviceHwEccEvents: metricsList.At(0), + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, 
RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, 
RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -195,15 +197,16 @@ func TestMetricModifierForNeuronDeviceEccEventMetric(t *testing.T) { func TestMetricModifierForNeuronDeviceEccEventMetric_PodNameMissing(t *testing.T) { metricModifier := setupMetricModifier() metricsList := pmetric.NewMetricSlice() - removeAttributefromMetric(createActualMetricForKey(NeuronDeviceHwEccEventsTotal), 
PodName).CopyTo(metricsList.AppendEmpty()) + removeAttributefromMetric(createActualMetricForKey(NeuronDeviceHwEccEvents), PodName).CopyTo(metricsList.AppendEmpty()) metricModifier.ModifyMetric(metricsList.At(0), metricsList) expectedMetrics := map[string]pmetric.Metric{ - NeuronDeviceHwEccEventsTotal: metricsList.At(0), - "node_neurondevice_hw_ecc_events_total_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_total_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_total_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + NeuronDeviceHwEccEvents: metricsList.At(0), + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", Type: 
NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -230,7 +233,7 @@ func TestListWithMultipleMetrics(t *testing.T) { createActualMetricForKey(NeuronExecutionStatus).CopyTo(metricsList.AppendEmpty()) createActualMetricForKey(NeuronCoreMemoryUsageModelSharedScratchpad).CopyTo(metricsList.AppendEmpty()) createActualMetricForKey(NeuronDeviceRuntimeMemoryUsedBytes).CopyTo(metricsList.AppendEmpty()) - createActualMetricForKey(NeuronDeviceHwEccEventsTotal).CopyTo(metricsList.AppendEmpty()) + createActualMetricForKey(NeuronDeviceHwEccEvents).CopyTo(metricsList.AppendEmpty()) createActualMetricForKey(NonNeuronMetric).CopyTo(metricsList.AppendEmpty()) for i := 0; i < metricsList.Len(); i++ { @@ -243,7 +246,7 @@ func TestListWithMultipleMetrics(t *testing.T) { NeuronExecutionStatus: metricsList.At(2), NeuronCoreMemoryUsageModelSharedScratchpad: metricsList.At(3), NeuronDeviceRuntimeMemoryUsedBytes: metricsList.At(4), - NeuronDeviceHwEccEventsTotal: metricsList.At(5), + NeuronDeviceHwEccEvents: metricsList.At(5), NonNeuronMetric: metricsList.At(6), "node_neuron_execution_latency": 
createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{1}, pmetric.MetricTypeSum, Seconds), @@ -262,7 +265,6 @@ func TestListWithMultipleMetrics(t *testing.T) { "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_total": createExpectedMetric("node_neuron_execution_status_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{21}, pmetric.MetricTypeSum, Count), "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: 
PodAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), @@ -270,18 +272,21 @@ func TestListWithMultipleMetrics(t *testing.T) { "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{2}, pmetric.MetricTypeSum, Bytes), - "node_neurondevice_hw_ecc_events_total_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_total_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_total_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_total_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected": 
createExpectedMetric("pod_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_total_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_total_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_total_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", false, 
[]map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": 
createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, 
pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) } diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index c69bd44647..c1cf19d160 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -427,10 +427,7 @@ exporters: - NeuronDevice - PodName metric_name_selectors: - - container_neurondevice_hw_ecc_events_total_mem_ecc_corrected - - container_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected - - container_neurondevice_hw_ecc_events_total_sram_ecc_corrected - - container_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected + - container_neurondevice_hw_ecc_events_total - dimensions: - - ClusterName - - ClusterName @@ -479,10 +476,7 @@ exporters: - NeuronDevice - PodName metric_name_selectors: - - pod_neurondevice_hw_ecc_events_total_mem_ecc_corrected - - pod_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected - - pod_neurondevice_hw_ecc_events_total_sram_ecc_corrected - - pod_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected + - pod_neurondevice_hw_ecc_events_total - dimensions: - - ClusterName - - ClusterName @@ -509,19 +503,6 @@ exporters: - NodeName metric_name_selectors: - node_neuron_execution_errors_total 
- - node_neuron_execution_errors_generic - - node_neuron_execution_errors_numerical - - node_neuron_execution_errors_transient - - node_neuron_execution_errors_model - - node_neuron_execution_errors_runtime - - node_neuron_execution_errors_hardware - - node_neuron_execution_status_total - - node_neuron_execution_status_completed - - node_neuron_execution_status_timed_out - - node_neuron_execution_status_completed_with_err - - node_neuron_execution_status_completed_with_num_err - - node_neuron_execution_status_incorrect_input - - node_neuron_execution_status_failed_to_queue - node_neurondevice_runtime_memory_used_bytes - node_neuron_execution_latency - dimensions: @@ -534,10 +515,7 @@ exporters: - NeuronDevice - NodeName metric_name_selectors: - - node_neurondevice_hw_ecc_events_total_mem_ecc_corrected - - node_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected - - node_neurondevice_hw_ecc_events_total_sram_ecc_corrected - - node_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected + - node_neurondevice_hw_ecc_events_total - dimensions: - - ClusterName - - ClusterName @@ -555,7 +533,6 @@ exporters: - FullPodName - Namespace - PodName - label_matchers: [] metric_name_selectors: - container_efa_rx_bytes - container_efa_tx_bytes @@ -582,7 +559,6 @@ exporters: - FullPodName - Namespace - PodName - label_matchers: [] metric_name_selectors: - pod_efa_rx_bytes - pod_efa_tx_bytes @@ -600,7 +576,6 @@ exporters: - InstanceId - InstanceType - NodeName - label_matchers: [] metric_name_selectors: - node_efa_rx_bytes - node_efa_tx_bytes @@ -722,9 +697,9 @@ processors: submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_TOTAL + include: DCGM_FI_DEV_FB_USED_PERCENT match_type: "" - new_name: container_gpu_memory_total + new_name: container_gpu_memory_utilization operations: - action: add_label aggregation_type: "" @@ -735,7 +710,7 @@ processors: new_value: ContainerGPU - action: experimental_scale_value aggregation_type: "" - experimental_scale: 
1.048576e+06 + experimental_scale: 100 label: "" label_value: "" new_label: "" @@ -743,9 +718,9 @@ processors: submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_TOTAL + include: DCGM_FI_DEV_FB_USED_PERCENT match_type: "" - new_name: pod_gpu_memory_total + new_name: pod_gpu_memory_utilization operations: - action: add_label aggregation_type: "" @@ -756,7 +731,7 @@ processors: new_value: PodGPU - action: experimental_scale_value aggregation_type: "" - experimental_scale: 1.048576e+06 + experimental_scale: 100 label: "" label_value: "" new_label: "" @@ -764,9 +739,9 @@ processors: submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_TOTAL + include: DCGM_FI_DEV_FB_USED_PERCENT match_type: "" - new_name: node_gpu_memory_total + new_name: node_gpu_memory_utilization operations: - action: add_label aggregation_type: "" @@ -777,7 +752,7 @@ processors: new_value: NodeGPU - action: experimental_scale_value aggregation_type: "" - experimental_scale: 1.048576e+06 + experimental_scale: 100 label: "" label_value: "" new_label: "" @@ -785,9 +760,9 @@ processors: submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_GPU_TEMP + include: DCGM_FI_DEV_FB_USED match_type: "" - new_name: container_gpu_temperature + new_name: container_gpu_memory_used operations: - action: add_label aggregation_type: "" @@ -796,12 +771,19 @@ processors: label_value: "" new_label: Type new_value: ContainerGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_GPU_TEMP + include: DCGM_FI_DEV_FB_USED match_type: "" - new_name: pod_gpu_temperature + new_name: pod_gpu_memory_used operations: - action: add_label aggregation_type: "" @@ -810,12 +792,19 @@ processors: label_value: "" new_label: Type new_value: PodGPU + - action: 
experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_GPU_TEMP + include: DCGM_FI_DEV_FB_USED match_type: "" - new_name: node_gpu_temperature + new_name: node_gpu_memory_used operations: - action: add_label aggregation_type: "" @@ -824,12 +813,19 @@ processors: label_value: "" new_label: Type new_value: NodeGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_POWER_USAGE + include: DCGM_FI_DEV_FB_TOTAL match_type: "" - new_name: container_gpu_power_draw + new_name: container_gpu_memory_total operations: - action: add_label aggregation_type: "" @@ -838,12 +834,19 @@ processors: label_value: "" new_label: Type new_value: ContainerGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_POWER_USAGE + include: DCGM_FI_DEV_FB_TOTAL match_type: "" - new_name: pod_gpu_power_draw + new_name: pod_gpu_memory_total operations: - action: add_label aggregation_type: "" @@ -852,12 +855,19 @@ processors: label_value: "" new_label: Type new_value: PodGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_POWER_USAGE + include: DCGM_FI_DEV_FB_TOTAL match_type: "" - new_name: node_gpu_power_draw + new_name: node_gpu_memory_total operations: - action: add_label aggregation_type: "" @@ -866,12 +876,19 @@ processors: label_value: "" new_label: Type new_value: NodeGPU + - 
action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_GPU_UTIL + include: DCGM_FI_DEV_GPU_TEMP match_type: "" - new_name: container_gpu_utilization + new_name: container_gpu_temperature operations: - action: add_label aggregation_type: "" @@ -883,9 +900,9 @@ processors: submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_GPU_UTIL + include: DCGM_FI_DEV_GPU_TEMP match_type: "" - new_name: pod_gpu_utilization + new_name: pod_gpu_temperature operations: - action: add_label aggregation_type: "" @@ -897,9 +914,9 @@ processors: submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_GPU_UTIL + include: DCGM_FI_DEV_GPU_TEMP match_type: "" - new_name: node_gpu_utilization + new_name: node_gpu_temperature operations: - action: add_label aggregation_type: "" @@ -911,9 +928,9 @@ processors: submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_USED_PERCENT + include: DCGM_FI_DEV_POWER_USAGE match_type: "" - new_name: container_gpu_memory_utilization + new_name: container_gpu_power_draw operations: - action: add_label aggregation_type: "" @@ -922,19 +939,12 @@ processors: label_value: "" new_label: Type new_value: ContainerGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 100 - label: "" - label_value: "" - new_label: "" - new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_USED_PERCENT + include: DCGM_FI_DEV_POWER_USAGE match_type: "" - new_name: pod_gpu_memory_utilization + new_name: pod_gpu_power_draw operations: - action: add_label aggregation_type: "" @@ -943,19 +953,12 @@ processors: label_value: "" new_label: Type new_value: PodGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 100 - label: "" - 
label_value: "" - new_label: "" - new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_USED_PERCENT + include: DCGM_FI_DEV_POWER_USAGE match_type: "" - new_name: node_gpu_memory_utilization + new_name: node_gpu_power_draw operations: - action: add_label aggregation_type: "" @@ -964,19 +967,12 @@ processors: label_value: "" new_label: Type new_value: NodeGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 100 - label: "" - label_value: "" - new_label: "" - new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_USED + include: DCGM_FI_DEV_GPU_UTIL match_type: "" - new_name: container_gpu_memory_used + new_name: container_gpu_utilization operations: - action: add_label aggregation_type: "" @@ -985,19 +981,12 @@ processors: label_value: "" new_label: Type new_value: ContainerGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 1.048576e+06 - label: "" - label_value: "" - new_label: "" - new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_USED + include: DCGM_FI_DEV_GPU_UTIL match_type: "" - new_name: pod_gpu_memory_used + new_name: pod_gpu_utilization operations: - action: add_label aggregation_type: "" @@ -1006,19 +995,12 @@ processors: label_value: "" new_label: Type new_value: PodGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 1.048576e+06 - label: "" - label_value: "" - new_label: "" - new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_USED + include: DCGM_FI_DEV_GPU_UTIL match_type: "" - new_name: node_gpu_memory_used + new_name: node_gpu_utilization operations: - action: add_label aggregation_type: "" @@ -1027,19 +1009,12 @@ processors: label_value: "" new_label: Type new_value: NodeGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 1.048576e+06 - label: 
"" - label_value: "" - new_label: "" - new_value: "" submatch_case: "" - action: update aggregation_type: "" - include: neuroncore_memory_usage_runtime_memory + include: neuroncore_memory_usage_model_shared_scratchpad match_type: "" - new_name: neuroncore_memory_usage_runtime_memory + new_name: neuroncore_memory_usage_model_shared_scratchpad operations: [] submatch_case: "" - action: update @@ -1051,86 +1026,86 @@ processors: submatch_case: "" - action: update aggregation_type: "" - include: neuroncore_utilization_ratio + include: hardware_ecc_events_total match_type: "" - new_name: neuroncore_utilization - operations: - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 100 - label: "" - label_value: "" - new_label: "" - new_value: "" + new_name: neurondevice_hw_ecc_events + operations: [] submatch_case: "" - action: update aggregation_type: "" - include: instance_info + include: execution_latency_seconds match_type: "" - new_name: instance_info + new_name: neuron_execution_latency operations: [] submatch_case: "" - action: update aggregation_type: "" - include: neuron_hardware + include: execution_status_total match_type: "" - new_name: neuron_hardware + new_name: neuron_execution_status operations: [] submatch_case: "" - action: update aggregation_type: "" - include: hardware_ecc_events_total + include: neuron_runtime_memory_used_bytes match_type: "" - new_name: neurondevice_hw_ecc_events_total + new_name: neurondevice_runtime_memory_used_bytes operations: [] submatch_case: "" - action: update aggregation_type: "" - include: execution_latency_seconds + include: neuroncore_memory_usage_model_code match_type: "" - new_name: neuron_execution_latency + new_name: neuroncore_memory_usage_model_code operations: [] submatch_case: "" - action: update aggregation_type: "" - include: execution_errors_total + include: neuroncore_memory_usage_runtime_memory match_type: "" - new_name: neuron_execution_errors + new_name: 
neuroncore_memory_usage_runtime_memory operations: [] submatch_case: "" - action: update aggregation_type: "" - include: execution_status_total + include: neuroncore_utilization_ratio match_type: "" - new_name: neuron_execution_status - operations: [] + new_name: neuroncore_utilization + operations: + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 100 + label: "" + label_value: "" + new_label: "" + new_value: "" submatch_case: "" - action: update aggregation_type: "" - include: neuron_runtime_memory_used_bytes + include: instance_info match_type: "" - new_name: neurondevice_runtime_memory_used_bytes + new_name: instance_info operations: [] submatch_case: "" - action: update aggregation_type: "" - include: neuroncore_memory_usage_constants + include: neuron_hardware match_type: "" - new_name: neuroncore_memory_usage_constants + new_name: neuron_hardware operations: [] submatch_case: "" - action: update aggregation_type: "" - include: neuroncore_memory_usage_model_code + include: execution_errors_total match_type: "" - new_name: neuroncore_memory_usage_model_code + new_name: neuron_execution_errors operations: [] submatch_case: "" - action: update aggregation_type: "" - include: neuroncore_memory_usage_model_shared_scratchpad + include: neuroncore_memory_usage_constants match_type: "" - new_name: neuroncore_memory_usage_model_shared_scratchpad + new_name: neuroncore_memory_usage_constants operations: [] submatch_case: "" receivers: diff --git a/translator/translate/otel/exporter/awsemf/kubernetes.go b/translator/translate/otel/exporter/awsemf/kubernetes.go index 7547a4e563..bf9fbbe5cd 100644 --- a/translator/translate/otel/exporter/awsemf/kubernetes.go +++ b/translator/translate/otel/exporter/awsemf/kubernetes.go @@ -544,10 +544,7 @@ func getAwsNeuronMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.Metric { Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", 
"Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "NeuronDevice"}}, MetricNameSelectors: []string{ - "container_neurondevice_hw_ecc_events_total_mem_ecc_corrected", - "container_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected", - "container_neurondevice_hw_ecc_events_total_sram_ecc_corrected", - "container_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", + "container_neurondevice_hw_ecc_events_total", }, }, { @@ -565,10 +562,7 @@ func getAwsNeuronMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.Metric { Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "NeuronDevice"}}, MetricNameSelectors: []string{ - "pod_neurondevice_hw_ecc_events_total_mem_ecc_corrected", - "pod_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected", - "pod_neurondevice_hw_ecc_events_total_sram_ecc_corrected", - "pod_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", + "pod_neurondevice_hw_ecc_events_total", }, }, { @@ -587,19 +581,6 @@ func getAwsNeuronMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.Metric Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "InstanceId", "NodeName"}}, MetricNameSelectors: []string{ "node_neuron_execution_errors_total", - "node_neuron_execution_errors_generic", - "node_neuron_execution_errors_numerical", - "node_neuron_execution_errors_transient", - "node_neuron_execution_errors_model", - "node_neuron_execution_errors_runtime", - "node_neuron_execution_errors_hardware", - "node_neuron_execution_status_total", - "node_neuron_execution_status_completed", - "node_neuron_execution_status_timed_out", - "node_neuron_execution_status_completed_with_err", - "node_neuron_execution_status_completed_with_num_err", - 
"node_neuron_execution_status_incorrect_input", - "node_neuron_execution_status_failed_to_queue", "node_neurondevice_runtime_memory_used_bytes", "node_neuron_execution_latency", }, @@ -607,10 +588,7 @@ func getAwsNeuronMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.Metric { Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "InstanceId", "NodeName"}, {"ClusterName", "InstanceId", "NodeName", "NeuronDevice"}}, MetricNameSelectors: []string{ - "node_neurondevice_hw_ecc_events_total_mem_ecc_corrected", - "node_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected", - "node_neurondevice_hw_ecc_events_total_sram_ecc_corrected", - "node_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", + "node_neurondevice_hw_ecc_events_total", }, }, }...) diff --git a/translator/translate/otel/exporter/awsemf/translator_test.go b/translator/translate/otel/exporter/awsemf/translator_test.go index faaa856aba..0298dd0a65 100644 --- a/translator/translate/otel/exporter/awsemf/translator_test.go +++ b/translator/translate/otel/exporter/awsemf/translator_test.go @@ -444,10 +444,7 @@ func TestTranslator(t *testing.T) { { Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "NeuronDevice"}}, MetricNameSelectors: []string{ - "container_neurondevice_hw_ecc_events_total_mem_ecc_corrected", - "container_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected", - "container_neurondevice_hw_ecc_events_total_sram_ecc_corrected", - "container_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", + "container_neurondevice_hw_ecc_events_total", }, }, { @@ -465,10 +462,7 @@ func TestTranslator(t *testing.T) { { Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", 
"FullPodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "NeuronDevice"}}, MetricNameSelectors: []string{ - "pod_neurondevice_hw_ecc_events_total_mem_ecc_corrected", - "pod_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected", - "pod_neurondevice_hw_ecc_events_total_sram_ecc_corrected", - "pod_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", + "pod_neurondevice_hw_ecc_events_total", }, }, { @@ -487,19 +481,6 @@ func TestTranslator(t *testing.T) { Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "InstanceId", "NodeName"}}, MetricNameSelectors: []string{ "node_neuron_execution_errors_total", - "node_neuron_execution_errors_generic", - "node_neuron_execution_errors_numerical", - "node_neuron_execution_errors_transient", - "node_neuron_execution_errors_model", - "node_neuron_execution_errors_runtime", - "node_neuron_execution_errors_hardware", - "node_neuron_execution_status_total", - "node_neuron_execution_status_completed", - "node_neuron_execution_status_timed_out", - "node_neuron_execution_status_completed_with_err", - "node_neuron_execution_status_completed_with_num_err", - "node_neuron_execution_status_incorrect_input", - "node_neuron_execution_status_failed_to_queue", "node_neurondevice_runtime_memory_used_bytes", "node_neuron_execution_latency", }, @@ -507,10 +488,7 @@ func TestTranslator(t *testing.T) { { Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "InstanceId", "NodeName"}, {"ClusterName", "InstanceId", "NodeName", "NeuronDevice"}}, MetricNameSelectors: []string{ - "node_neurondevice_hw_ecc_events_total_mem_ecc_corrected", - "node_neurondevice_hw_ecc_events_total_mem_ecc_uncorrected", - "node_neurondevice_hw_ecc_events_total_sram_ecc_corrected", - "node_neurondevice_hw_ecc_events_total_sram_ecc_uncorrected", + "node_neurondevice_hw_ecc_events_total", }, }, {