From 55a3d8828d1104c86f5fec0c1c87949e30c36f8c Mon Sep 17 00:00:00 2001 From: Aditya Purang <44022838+aditya-purang@users.noreply.github.com> Date: Wed, 7 Aug 2024 19:41:38 +0100 Subject: [PATCH] Filter attributes neuron metrics (#1262) This change adds per-type emf attribute filtering to AWSNeuron EMF metrics --- internal/containerinsightscommon/k8sconst.go | 1 + .../internal/awsneuron_metric_checker.go | 28 ++ .../internal/awsneuron_metric_checker_test.go | 73 +++++ .../internal/awsneuron_metric_modifier.go | 47 --- .../awsneuron_metric_modifier_test.go | 156 ++++----- .../metricFilters/gpumetricfilters.go | 159 ++++++++++ plugins/processors/gpuattributes/processor.go | 137 ++++---- .../gpuattributes/processor_test.go | 300 +++++++++++++++++- 8 files changed, 684 insertions(+), 217 deletions(-) create mode 100644 plugins/processors/gpuattributes/internal/awsneuron_metric_checker.go create mode 100644 plugins/processors/gpuattributes/internal/awsneuron_metric_checker_test.go create mode 100644 plugins/processors/gpuattributes/internal/metricFilters/gpumetricfilters.go diff --git a/internal/containerinsightscommon/k8sconst.go b/internal/containerinsightscommon/k8sconst.go index c8423665fa..309c0ea9ef 100644 --- a/internal/containerinsightscommon/k8sconst.go +++ b/internal/containerinsightscommon/k8sconst.go @@ -20,6 +20,7 @@ const ( PodOwnersKey = "PodOwners" HostKey = "host" K8sKey = "kubernetes" + K8sLabelsKey = "labels" RunningPodCount = "number_of_running_pods" RunningContainerCount = "number_of_running_containers" diff --git a/plugins/processors/gpuattributes/internal/awsneuron_metric_checker.go b/plugins/processors/gpuattributes/internal/awsneuron_metric_checker.go new file mode 100644 index 0000000000..6718bd623e --- /dev/null +++ b/plugins/processors/gpuattributes/internal/awsneuron_metric_checker.go @@ -0,0 +1,28 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package internal + +import ( + "regexp" +) + +const ( + PROCESSED_NEURON_METRIC_PATTERN = "^(container|node|pod)_(neuroncore_|neurondevice_).*|^node_neuron_.*" +) + +type AwsNeuronMetricChecker struct { +} + +func NewAwsNeuronMetricChecker() *AwsNeuronMetricChecker { + return &AwsNeuronMetricChecker{} +} + +func (md *AwsNeuronMetricChecker) IsProcessedNeuronMetric(name string) bool { + matched, err := regexp.MatchString(PROCESSED_NEURON_METRIC_PATTERN, name) + if err != nil { + print(err) + return false + } + return matched +} diff --git a/plugins/processors/gpuattributes/internal/awsneuron_metric_checker_test.go b/plugins/processors/gpuattributes/internal/awsneuron_metric_checker_test.go new file mode 100644 index 0000000000..c9eb5aacf1 --- /dev/null +++ b/plugins/processors/gpuattributes/internal/awsneuron_metric_checker_test.go @@ -0,0 +1,73 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package internal + +import ( + "testing" +) + +func TestAwsNeuronMetricModifier_IsProcessedNeuronMetric(t *testing.T) { + tests := []struct { + name string + input string + expected bool + }{ + { + name: "container_neuroncore_prefix", + input: "container_neuroncore_metric", + expected: true, + }, + { + name: "pod_neuroncore_prefix", + input: "pod_neuroncore_metric", + expected: true, + }, + { + name: "node_neuroncore_prefix", + input: "node_neuroncore_metric", + expected: true, + }, + { + name: "container_neurondevice_prefix", + input: "container_neurondevice_metric", + expected: true, + }, + { + name: "pod_neurondevice_prefix", + input: "pod_neurondevice_metric", + expected: true, + }, + { + name: "node_neurondevice_prefix", + input: "node_neurondevice_metric", + expected: true, + }, + { + name: "node_neuron_prefix", + input: "node_neuron_metric", + expected: true, + }, + { + name: "container_neuron_prefix", + input: "container_neuron_metric", + expected: false, + }, + { + name: "other_prefix", + input: "other_metric", + expected: false, + }, + } + + md := NewAwsNeuronMetricChecker() + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + result := md.IsProcessedNeuronMetric(test.input) + if result != test.expected { + t.Errorf("IsProcessedNeuronMetric(%q) = %v, expected %v", test.input, result, test.expected) + } + }) + } +} diff --git a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go index a2a463e83a..311855cf34 100644 --- a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go +++ b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go @@ -6,7 +6,6 @@ package internal import ( "strings" - "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.uber.org/zap" @@ -99,26 +98,6 @@ var ( "sram_ecc_corrected": NeuronDeviceHardwareEccEventsAggregatedMetric, "sram_ecc_uncorrected": NeuronDeviceHardwareEccEventsAggregatedMetric}, } - - MetricAttributesToKeep = map[string]struct{}{ - ClusterName: {}, - ContainerName: {}, - FullPodName: {}, - InstanceId: {}, - InstanceType: {}, - K8sPodName: {}, - Namespace: {}, - NeuronDevice: {}, - NodeName: {}, - PodName: {}, - Service: {}, - AvailabilityZone: {}, - Kubernetes: {}, - Region: {}, - RuntimeTag: {}, - SubnetId: {}, - NeuronCore: {}, - } ) func NewMetricModifier(logger *zap.Logger) *AwsNeuronMetricModifier { @@ -156,7 +135,6 @@ func (md *AwsNeuronMetricModifier) ModifyMetric(originalMetric pmetric.Metric, m } modifiedMetricSlice := md.extractDatapointsAsMetricsAndAggregate(originalMetric) - filterLabels(modifiedMetricSlice, originalMetricName) md.duplicateMetrics(modifiedMetricSlice, originalMetricName, originalMetric.Sum().DataPoints(), metrics) } @@ -251,7 +229,6 @@ func (md *AwsNeuronMetricModifier) extractDatapointsAsMetricsAndAggregate(origin // Creating body for the aggregated metric and add it to the new newMetricSlice for each runtime for aggregatedMetricMetadata, value := range aggregatedValuesPerRuntimeTag { - // Aggregated metric for neuron device ecc events is not required aggregatedMetric := setMetricMetadata(newMetricSlice.AppendEmpty(), aggregatedMetricMetadata.aggregatedMetricName, originalMetric.Unit()) originalMetricDatapoints.At(0).CopyTo(aggregatedMetric.SetEmptySum().DataPoints().AppendEmpty()) @@ -269,30 +246,6 @@ func (md *AwsNeuronMetricModifier) extractDatapointsAsMetricsAndAggregate(origin return newMetricSlice } -// This method removes the attribute keys which are not required. The removal is necessary so that the metrics are grouped together -func filterLabels(slice pmetric.MetricSlice, originalMetricName string) { - _, exists := metricModificationsMap[originalMetricName] - if !exists { - return - } - - for i := 0; i < slice.Len(); i++ { - m := slice.At(i) - - dps := m.Sum().DataPoints() - for j := 0; j < dps.Len(); j++ { - attributes := dps.At(j).Attributes() - attributes.RemoveIf(func(label string, value pcommon.Value) bool { - _, exists := MetricAttributesToKeep[label] - if !exists { - return true - } - return false - }) - } - } -} - // This method prefixes NeuronCore and NeuronDevice values with `core` and `device` respectively // to make the attribute values more verbose func prefixCoreAndDeviceLabels(originalMetric pmetric.Metric) { diff --git a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go index b0140b831c..9bff85917a 100644 --- a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go +++ b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go @@ -75,7 +75,7 @@ func TestMetricModifierForExecutionLatencyMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionLatency: metricsList.At(0), - "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{1}, pmetric.MetricTypeSum, Seconds), + "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron, Percentile: "p50"}}, []float64{1}, pmetric.MetricTypeSum, Seconds), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -88,13 +88,13 @@ func TestMetricModifierForExecutionErrorMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionErrors: metricsList.At(0), - "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{21}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "generic"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "numerical"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "transient"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "model"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "runtime"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "hardware"}}, []float64{6}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "generic"}}, []float64{21}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -111,12 +111,12 @@ func TestMetricModifierForExecutionStatusMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionStatus: metricsList.At(0), - "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "completed"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "completed_with_err"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "completed_with_num_err"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "timed_out"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "incorrect_input"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "failed_to_queue"}}, []float64{6}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -130,9 +130,9 @@ func TestMetricModifierForNeuronCoreMemoryUsageMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronCoreMemoryUsageModelSharedScratchpad: metricsList.At(0), - "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: PodAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - "container_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("container_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: ContainerAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "container_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("container_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -146,7 +146,7 @@ func TestMetricModifierForNeuronCoreMemoryUsageMetric_PodNameMissing(t *testing. expectedMetrics := map[string]pmetric.Metric{ NeuronCoreMemoryUsageModelSharedScratchpad: metricsList.At(0), - "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -160,7 +160,7 @@ func TestMetricModifierForNeuronDeviceRuntimeMemoryUsageMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronDeviceRuntimeMemoryUsedBytes: metricsList.At(0), - "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{2}, pmetric.MetricTypeSum, Bytes), + "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron, MemoryLocation: "neuron_device"}}, []float64{2}, pmetric.MetricTypeSum, Bytes), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -174,21 +174,21 @@ func TestMetricModifierForNeuronDeviceEccEventMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronDeviceHwEccEvents: metricsList.At(0), - "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -202,11 +202,11 @@ func TestMetricModifierForNeuronDeviceEccEventMetric_PodNameMissing(t *testing.T expectedMetrics := map[string]pmetric.Metric{ NeuronDeviceHwEccEvents: metricsList.At(0), - "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -249,44 +249,44 @@ func TestListWithMultipleMetrics(t *testing.T) { NeuronDeviceHwEccEvents: metricsList.At(5), NonNeuronMetric: metricsList.At(6), - "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{1}, pmetric.MetricTypeSum, Seconds), - - "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{21}, pmetric.MetricTypeSum, Count), - - "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), - - "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: PodAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - "container_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("container_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: ContainerAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - - "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{2}, pmetric.MetricTypeSum, Bytes), - - "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron, Percentile: "p50"}}, []float64{1}, pmetric.MetricTypeSum, Seconds), + + "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "generic"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "numerical"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "transient"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "model"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "runtime"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "hardware"}}, []float64{6}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "generic"}}, []float64{21}, pmetric.MetricTypeSum, Count), + + "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "completed"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "completed_with_err"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "completed_with_num_err"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "timed_out"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "incorrect_input"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "failed_to_queue"}}, []float64{6}, pmetric.MetricTypeSum, Count), + + "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "container_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("container_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + + "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron, MemoryLocation: "neuron_device"}}, []float64{2}, pmetric.MetricTypeSum, Bytes), + + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) } @@ -303,7 +303,7 @@ func TestMetricWithStaleDatapoint(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionLatency: metricsList.At(0), - "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{1}, pmetric.MetricTypeSum, Seconds), + "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron, Percentile: "p50"}}, []float64{1}, pmetric.MetricTypeSum, Seconds), } assertModifiedMetric(t, metricsList, expectedMetrics) diff --git a/plugins/processors/gpuattributes/internal/metricFilters/gpumetricfilters.go b/plugins/processors/gpuattributes/internal/metricFilters/gpumetricfilters.go new file mode 100644 index 0000000000..e7868be616 --- /dev/null +++ b/plugins/processors/gpuattributes/internal/metricFilters/gpumetricfilters.go @@ -0,0 +1,159 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package metricFilters + +import ( + "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" + "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpuattributes/internal" +) + +// This class contains the attribute filters which are applied to the metric datapoints of GPU and Neuron metrics. +// If the datapoint contains metrics apart from the ones mentioned in the filter, then they'll be dropped. + +const ( + containerd = "containerd" + pod_id = "pod_id" + pod_name = "pod_name" + pod_owners = "pod_owners" + namespace = "namespace" + container_name = "container_name" +) + +var ContainerGpuLabelFilter = map[string]map[string]interface{}{ + containerinsightscommon.ClusterNameKey: nil, + containerinsightscommon.InstanceIdKey: nil, + containerinsightscommon.GpuDeviceKey: nil, + containerinsightscommon.MetricType: nil, + containerinsightscommon.NodeNameKey: nil, + containerinsightscommon.K8sNamespace: nil, + containerinsightscommon.FullPodNameKey: nil, + containerinsightscommon.PodNameKey: nil, + containerinsightscommon.TypeService: nil, + containerinsightscommon.GpuUniqueId: nil, + containerinsightscommon.ContainerNamekey: nil, + containerinsightscommon.InstanceTypeKey: nil, + containerinsightscommon.VersionKey: nil, + containerinsightscommon.SourcesKey: nil, + containerinsightscommon.Timestamp: nil, + containerinsightscommon.K8sKey: { + containerinsightscommon.HostKey: nil, + containerinsightscommon.K8sLabelsKey: nil, + pod_id: nil, + pod_name: nil, + pod_owners: nil, + namespace: nil, + container_name: nil, + containerd: nil, + }, +} +var PodGpuLabelFilter = map[string]map[string]interface{}{ + containerinsightscommon.ClusterNameKey: nil, + containerinsightscommon.InstanceIdKey: nil, + containerinsightscommon.GpuDeviceKey: nil, + containerinsightscommon.MetricType: nil, + containerinsightscommon.NodeNameKey: nil, + containerinsightscommon.K8sNamespace: nil, + containerinsightscommon.FullPodNameKey: nil, + containerinsightscommon.PodNameKey: nil, + containerinsightscommon.TypeService: nil, + containerinsightscommon.GpuUniqueId: nil, + containerinsightscommon.InstanceTypeKey: nil, + containerinsightscommon.VersionKey: nil, + containerinsightscommon.SourcesKey: nil, + containerinsightscommon.Timestamp: nil, + containerinsightscommon.K8sKey: { + containerinsightscommon.HostKey: nil, + containerinsightscommon.K8sLabelsKey: nil, + pod_id: nil, + pod_name: nil, + pod_owners: nil, + namespace: nil, + }, +} +var NodeGpuLabelFilter = map[string]map[string]interface{}{ + containerinsightscommon.ClusterNameKey: nil, + containerinsightscommon.InstanceIdKey: nil, + containerinsightscommon.GpuDeviceKey: nil, + containerinsightscommon.MetricType: nil, + containerinsightscommon.NodeNameKey: nil, + containerinsightscommon.InstanceTypeKey: nil, + containerinsightscommon.VersionKey: nil, + containerinsightscommon.SourcesKey: nil, + containerinsightscommon.Timestamp: nil, + containerinsightscommon.K8sKey: { + containerinsightscommon.HostKey: nil, + }, +} + +var PodNeuronLabelFilter = map[string]map[string]interface{}{ + containerinsightscommon.ClusterNameKey: nil, + containerinsightscommon.FullPodNameKey: nil, + containerinsightscommon.InstanceIdKey: nil, + containerinsightscommon.InstanceTypeKey: nil, + containerinsightscommon.K8sPodNameKey: nil, + containerinsightscommon.K8sNamespace: nil, + internal.NeuronDevice: nil, + containerinsightscommon.NodeNameKey: nil, + containerinsightscommon.PodNameKey: nil, + containerinsightscommon.TypeService: nil, + internal.AvailabilityZone: nil, + containerinsightscommon.K8sKey: { + containerinsightscommon.HostKey: nil, + pod_id: nil, + pod_owners: nil, + containerinsightscommon.K8sLabelsKey: nil, + }, + internal.Region: nil, + internal.RuntimeTag: nil, + internal.SubnetId: nil, + internal.NeuronCore: nil, + containerinsightscommon.MetricType: nil, +} + +var ContainerNeuronLabelFilter = map[string]map[string]interface{}{ + containerinsightscommon.ClusterNameKey: nil, + containerinsightscommon.ContainerNamekey: nil, + containerinsightscommon.FullPodNameKey: nil, + containerinsightscommon.InstanceIdKey: nil, + containerinsightscommon.InstanceTypeKey: nil, + containerinsightscommon.K8sPodNameKey: nil, + containerinsightscommon.K8sNamespace: nil, + internal.NeuronDevice: nil, + containerinsightscommon.NodeNameKey: nil, + containerinsightscommon.PodNameKey: nil, + containerinsightscommon.TypeService: nil, + internal.AvailabilityZone: nil, + containerinsightscommon.Kubernetes: { + containerinsightscommon.HostKey: nil, + "containerd": nil, + pod_id: nil, + pod_owners: nil, + containerinsightscommon.K8sLabelsKey: nil, + }, + internal.Region: nil, + internal.RuntimeTag: nil, + internal.SubnetId: nil, + internal.NeuronCore: nil, + containerinsightscommon.MetricType: nil, +} + +var NodeNeuronLabelFilter = map[string]map[string]interface{}{ + containerinsightscommon.ClusterNameKey: nil, + containerinsightscommon.InstanceIdKey: nil, + containerinsightscommon.InstanceTypeKey: nil, + containerinsightscommon.K8sNamespace: nil, + internal.NeuronDevice: nil, + containerinsightscommon.NodeNameKey: nil, + containerinsightscommon.TypeService: nil, + internal.AvailabilityZone: nil, + containerinsightscommon.Kubernetes: { + containerinsightscommon.HostKey: nil, + containerinsightscommon.K8sLabelsKey: nil, + }, + internal.Region: nil, + internal.RuntimeTag: nil, + internal.SubnetId: nil, + internal.NeuronCore: nil, + containerinsightscommon.MetricType: nil, +} diff --git a/plugins/processors/gpuattributes/processor.go b/plugins/processors/gpuattributes/processor.go index 94fb411a53..d37ff0298a 100644 --- a/plugins/processors/gpuattributes/processor.go +++ b/plugins/processors/gpuattributes/processor.go @@ -11,16 +11,18 @@ import ( "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.uber.org/zap" + "golang.org/x/exp/maps" "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpuattributes/internal" + "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpuattributes/internal/metricFilters" ) const ( - gpuMetricIdentifier = "_gpu_" - gpuContainerMetricPrefix = "container_" - gpuPodMetricPrefix = "pod_" - gpuNodeMetricPrefix = "node_" + gpuMetricIdentifier = "_gpu_" + containerMetricPrefix = "container_" + podMetricPrefix = "pod_" + nodeMetricPrefix = "node_" ) // schemas at each resource level @@ -42,77 +44,12 @@ const ( // - ClusterName // - ClusterName, InstanceIdKey, NodeName // - ClusterName, InstanceIdKey, NodeName, GpuDevice -var containerLabelFilter = map[string]map[string]interface{}{ - containerinsightscommon.ClusterNameKey: nil, - containerinsightscommon.InstanceIdKey: nil, - containerinsightscommon.GpuDeviceKey: nil, - containerinsightscommon.MetricType: nil, - containerinsightscommon.NodeNameKey: nil, - containerinsightscommon.K8sNamespace: nil, - containerinsightscommon.FullPodNameKey: nil, - containerinsightscommon.PodNameKey: nil, - containerinsightscommon.TypeService: nil, - containerinsightscommon.GpuUniqueId: nil, - containerinsightscommon.ContainerNamekey: nil, - containerinsightscommon.InstanceTypeKey: nil, - containerinsightscommon.VersionKey: nil, - containerinsightscommon.SourcesKey: nil, - containerinsightscommon.Timestamp: nil, - containerinsightscommon.K8sKey: { - containerinsightscommon.HostKey: nil, - "labels": nil, - "pod_id": nil, - "pod_name": nil, - "pod_owners": nil, - "namespace": nil, - "container_name": nil, - "containerd": nil, - }, -} -var podLabelFilter = map[string]map[string]interface{}{ - containerinsightscommon.ClusterNameKey: nil, - containerinsightscommon.InstanceIdKey: nil, - containerinsightscommon.GpuDeviceKey: nil, - containerinsightscommon.MetricType: nil, - containerinsightscommon.NodeNameKey: nil, - containerinsightscommon.K8sNamespace: nil, - containerinsightscommon.FullPodNameKey: nil, - containerinsightscommon.PodNameKey: nil, - containerinsightscommon.TypeService: nil, - containerinsightscommon.GpuUniqueId: nil, - containerinsightscommon.InstanceTypeKey: nil, - containerinsightscommon.VersionKey: nil, - containerinsightscommon.SourcesKey: nil, - containerinsightscommon.Timestamp: nil, - containerinsightscommon.K8sKey: { - containerinsightscommon.HostKey: nil, - "labels": nil, - "pod_id": nil, - "pod_name": nil, - "pod_owners": nil, - "namespace": nil, - }, -} -var nodeLabelFilter = map[string]map[string]interface{}{ - containerinsightscommon.ClusterNameKey: nil, - containerinsightscommon.InstanceIdKey: nil, - containerinsightscommon.GpuDeviceKey: nil, - containerinsightscommon.MetricType: nil, - containerinsightscommon.NodeNameKey: nil, - containerinsightscommon.InstanceTypeKey: nil, - containerinsightscommon.VersionKey: nil, - containerinsightscommon.SourcesKey: nil, - containerinsightscommon.Timestamp: nil, - containerinsightscommon.K8sKey: { - containerinsightscommon.HostKey: nil, - }, -} - type gpuAttributesProcessor struct { *Config logger *zap.Logger awsNeuronMetricModifier *internal.AwsNeuronMetricModifier awsNeuronMemoryMetricAggregator *internal.AwsNeuronMemoryMetricsAggregator + awsNeuronMetricChecker *internal.AwsNeuronMetricChecker } func newGpuAttributesProcessor(config *Config, logger *zap.Logger) *gpuAttributesProcessor { @@ -121,6 +58,7 @@ func newGpuAttributesProcessor(config *Config, logger *zap.Logger) *gpuAttribute logger: logger, awsNeuronMetricModifier: internal.NewMetricModifier(logger), awsNeuronMemoryMetricAggregator: internal.NewMemoryMemoryAggregator(), + awsNeuronMetricChecker: internal.NewAwsNeuronMetricChecker(), } return d } @@ -139,7 +77,6 @@ func (d *gpuAttributesProcessor) processMetrics(_ context.Context, md pmetric.Me metricsLength := metrics.Len() for k := 0; k < metricsLength; k++ { m := metrics.At(k) - d.processGPUMetricAttributes(m) d.awsNeuronMemoryMetricAggregator.AggregateMemoryMetric(m) // non neuron metric is returned as a singleton list d.awsNeuronMetricModifier.ModifyMetric(m, metrics) @@ -148,24 +85,54 @@ func (d *gpuAttributesProcessor) processMetrics(_ context.Context, md pmetric.Me aggregatedMemoryMetric := d.awsNeuronMemoryMetricAggregator.FlushAggregatedMemoryMetric() d.awsNeuronMetricModifier.ModifyMetric(aggregatedMemoryMetric, metrics) } + + //loop over all metrics and filter labels + for k := 0; k < metrics.Len(); k++ { + m := metrics.At(k) + d.processMetricAttributes(m) + } } + + dropResourceMetricAttributes(rs) } return md, nil } -func (d *gpuAttributesProcessor) processGPUMetricAttributes(m pmetric.Metric) { +func (d *gpuAttributesProcessor) processMetricAttributes(m pmetric.Metric) { // only decorate GPU metrics - if !strings.Contains(m.Name(), gpuMetricIdentifier) { + isGpuMetric := strings.Contains(m.Name(), gpuMetricIdentifier) + isNeuronMetric := d.awsNeuronMetricChecker.IsProcessedNeuronMetric(m.Name()) + if !isNeuronMetric && !isGpuMetric { return } labelFilter := map[string]map[string]interface{}{} - if strings.HasPrefix(m.Name(), gpuContainerMetricPrefix) { - labelFilter = containerLabelFilter - } else if strings.HasPrefix(m.Name(), gpuPodMetricPrefix) { - labelFilter = podLabelFilter - } else if strings.HasPrefix(m.Name(), gpuNodeMetricPrefix) { - labelFilter = nodeLabelFilter + if isGpuMetric { + if strings.HasPrefix(m.Name(), containerMetricPrefix) { + labelFilter = metricFilters.ContainerGpuLabelFilter + } else if strings.HasPrefix(m.Name(), podMetricPrefix) { + labelFilter = metricFilters.PodGpuLabelFilter + } else if strings.HasPrefix(m.Name(), nodeMetricPrefix) { + labelFilter = metricFilters.NodeGpuLabelFilter + } + } else if isNeuronMetric { + if strings.HasPrefix(m.Name(), containerMetricPrefix) { + labelFilter = metricFilters.ContainerNeuronLabelFilter + } else if strings.HasPrefix(m.Name(), podMetricPrefix) { + labelFilter = metricFilters.PodNeuronLabelFilter + } else if strings.HasPrefix(m.Name(), nodeMetricPrefix) { + labelFilter = metricFilters.NodeNeuronLabelFilter + } + + if strings.Contains(m.Name(), "_neurondevice_hw") { + if kubernetesMap, ok := labelFilter[internal.Kubernetes]; ok { + // cloning is done to avoid modifying the original label filters + labelFilter = maps.Clone(labelFilter) + kubernetesMap := maps.Clone(kubernetesMap) + delete(kubernetesMap, "labels") + labelFilter[internal.Kubernetes] = kubernetesMap + } + } } var dps pmetric.NumberDataPointSlice @@ -230,7 +197,7 @@ func (d *gpuAttributesProcessor) filterAttributes(attributes pcommon.Map, labels func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric.MetricSlice, resourceAttributes pcommon.Map) { metrics.RemoveIf(func(m pmetric.Metric) bool { isGpu := strings.Contains(m.Name(), gpuMetricIdentifier) - isContainerOrPod := strings.HasPrefix(m.Name(), gpuContainerMetricPrefix) || strings.HasPrefix(m.Name(), gpuPodMetricPrefix) + isContainerOrPod := strings.HasPrefix(m.Name(), containerMetricPrefix) || strings.HasPrefix(m.Name(), podMetricPrefix) if !isGpu || !isContainerOrPod { return false } @@ -253,3 +220,13 @@ func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric. return dps.Len() == 0 }) } + +func dropResourceMetricAttributes(resourceMetric pmetric.ResourceMetrics) { + serviceNameKey := "service.name" + attributes := resourceMetric.Resource().Attributes() + serviceName, exists := attributes.Get(serviceNameKey) + + if exists && (serviceName.Str() == "containerInsightsNeuronMonitorScraper" || serviceName.Str() == "containerInsightsDCGMExporterScraper") { + resourceMetric.Resource().Attributes().Clear() + } +} diff --git a/plugins/processors/gpuattributes/processor_test.go b/plugins/processors/gpuattributes/processor_test.go index a625945eda..30fdcfa646 100644 --- a/plugins/processors/gpuattributes/processor_test.go +++ b/plugins/processors/gpuattributes/processor_test.go @@ -13,7 +13,7 @@ import ( "go.uber.org/zap" ) -func TestProcessMetrics(t *testing.T) { +func TestProcessMetricsForGPUMetrics(t *testing.T) { logger, _ := zap.NewDevelopment() gp := newGpuAttributesProcessor(createDefaultConfig().(*Config), logger) ctx := context.Background() @@ -25,7 +25,7 @@ func TestProcessMetrics(t *testing.T) { want []map[string]string }{ "nonNode": { - metrics: generateMetrics("prefix", []map[string]string{ + metrics: generateGPUMetrics("prefix", []map[string]string{ { "ClusterName": "cluster", }, @@ -38,7 +38,7 @@ func TestProcessMetrics(t *testing.T) { }, }, "nodeDropSimple": { - metrics: generateMetrics("node", []map[string]string{ + metrics: generateGPUMetrics("node", []map[string]string{ { "ClusterName": "cluster", "Drop": "val", @@ -52,7 +52,7 @@ func TestProcessMetrics(t *testing.T) { }, }, "nodeDropJson": { - metrics: generateMetrics("node", []map[string]string{ + metrics: generateGPUMetrics("node", []map[string]string{ { "ClusterName": "cluster", "kubernetes": "{\"host\":\"test\"}", @@ -67,7 +67,7 @@ func TestProcessMetrics(t *testing.T) { }, }, "nodeDropMixed": { - metrics: generateMetrics("node", []map[string]string{ + metrics: generateGPUMetrics("node", []map[string]string{ { "ClusterName": "cluster", "Drop": "val", @@ -83,7 +83,7 @@ func TestProcessMetrics(t *testing.T) { }, }, "dropPodWithoutPodName": { - metrics: generateMetrics("pod", []map[string]string{ + metrics: generateGPUMetrics("pod", []map[string]string{ { "ClusterName": "cluster", "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", @@ -93,7 +93,7 @@ func TestProcessMetrics(t *testing.T) { want: []map[string]string{}, }, "keepPodWithPodName": { - metrics: generateMetrics("pod", []map[string]string{ + metrics: generateGPUMetrics("pod", []map[string]string{ { "ClusterName": "cluster", "PodName": "pod", @@ -110,7 +110,7 @@ func TestProcessMetrics(t *testing.T) { }, }, "dropContainerWithoutPodName": { - metrics: generateMetrics("container", []map[string]string{ + metrics: generateGPUMetrics("container", []map[string]string{ { "ClusterName": "cluster", "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", @@ -120,7 +120,7 @@ func TestProcessMetrics(t *testing.T) { want: []map[string]string{}, }, "keepContainerWithPodName": { - metrics: generateMetrics("container", []map[string]string{ + metrics: generateGPUMetrics("container", []map[string]string{ { "ClusterName": "cluster", "PodName": "pod", @@ -137,7 +137,7 @@ func TestProcessMetrics(t *testing.T) { }, }, "dropSingleDatapointWithoutPodName": { - metrics: generateMetrics("container", []map[string]string{ + metrics: generateGPUMetrics("container", []map[string]string{ { "ClusterName": "cluster", "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", @@ -158,7 +158,7 @@ func TestProcessMetrics(t *testing.T) { }, }, "keepAllDatapoints": { - metrics: generateMetrics("container", []map[string]string{ + metrics: generateGPUMetrics("container", []map[string]string{ { "ClusterName": "cluster", "PodName": "pod1", @@ -206,7 +206,267 @@ func TestProcessMetrics(t *testing.T) { } } -func generateMetrics(prefix string, dimensions []map[string]string) pmetric.Metrics { +func TestProcessMetricsForNeuronMetrics(t *testing.T) { + logger, _ := zap.NewDevelopment() + gp := newGpuAttributesProcessor(createDefaultConfig().(*Config), logger) + ctx := context.Background() + + testcases := map[string]struct { + resource string + metrics pmetric.Metrics + wantMetricCnt int + want []map[string]string + }{ + "neuronMetricsProcessedWithNoPodCorrelation": { + metrics: generateNeuronMetrics("neuron_execution_latency", []map[string]string{ + { + "ClusterName": "cluster", + "Drop": "val", + "percentile": "p50", + "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", + }, + }), + wantMetricCnt: 2, + want: []map[string]string{ + { + "ClusterName": "cluster", + "Drop": "val", + "percentile": "p50", + "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", + }, + { + "ClusterName": "cluster", + "Type": "NodeAWSNeuron", + "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", + }, + }, + }, + "neuronMetricsProcessedWithPodCorrelation": { + metrics: generateNeuronMetrics("neuroncore_memory_usage_constants", []map[string]string{ + { + "ClusterName": "cluster", + "Drop": "val", + "runtime_tag": "10", + "NeuronCore": "0", + "NeuronDevice": "0", + "PodName": "testPod", + "ContainerName": "testContainer", + "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", + }, + }), + wantMetricCnt: 7, + want: []map[string]string{ + { + "ClusterName": "cluster", + "Drop": "val", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "PodName": "testPod", + "ContainerName": "testContainer", + "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "ContainerAWSNeuronCore", + "PodName": "testPod", + "ContainerName": "testContainer", + "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "PodAWSNeuronCore", + "PodName": "testPod", + "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "NodeAWSNeuronCore", + "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "ContainerAWSNeuronCore", + "PodName": "testPod", + "ContainerName": "testContainer", + "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "PodAWSNeuronCore", + "PodName": "testPod", + "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "NodeAWSNeuronCore", + "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", + }, + }, + }, + "neuronMemoryMetricsAggregated": { + metrics: generateNeuronMetrics("neuroncore_memory_usage_constants", []map[string]string{ + { + "ClusterName": "cluster", + "Drop": "val", + "runtime_tag": "10", + "NeuronCore": "0", + "NeuronDevice": "0", + "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", + }, + }), + wantMetricCnt: 3, + want: []map[string]string{ + { + "ClusterName": "cluster", + "Drop": "val", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "NodeAWSNeuronCore", + "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "NodeAWSNeuronCore", + "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", + }, + }, + }, + "neuronDeviceHardwareMetrics_labelsAreDropped": { + metrics: generateNeuronMetrics("neurondevice_hw_ecc_events", []map[string]string{ + { + "ClusterName": "cluster", + "Drop": "val", + "runtime_tag": "10", + "NeuronCore": "0", + "NeuronDevice": "0", + "event_type": "mem_ecc_corrected", + "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", + "PodName": "testPod", + "ContainerName": "testContainer", + }, + }), + wantMetricCnt: 7, + want: []map[string]string{ + { + "ClusterName": "cluster", + "Drop": "val", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "event_type": "mem_ecc_corrected", + "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", + "PodName": "testPod", + "ContainerName": "testContainer", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "ContainerAWSNeuronDevice", + "kubernetes": "{\"host\":\"test\"}", + "PodName": "testPod", + "ContainerName": "testContainer", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "PodAWSNeuronDevice", + "kubernetes": "{\"host\":\"test\"}", + "PodName": "testPod", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "NodeAWSNeuronDevice", + "kubernetes": "{\"host\":\"test\"}", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "ContainerAWSNeuronDevice", + "kubernetes": "{\"host\":\"test\"}", + "PodName": "testPod", + "ContainerName": "testContainer", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "PodAWSNeuronDevice", + "kubernetes": "{\"host\":\"test\"}", + "PodName": "testPod", + }, + { + "ClusterName": "cluster", + "runtime_tag": "10", + "NeuronCore": "core0", + "NeuronDevice": "device0", + "Type": "NodeAWSNeuronDevice", + "kubernetes": "{\"host\":\"test\"}", + }, + }, + }, + } + + for tname, tc := range testcases { + fmt.Printf("running %s\n", tname) + ms, _ := gp.processMetrics(ctx, tc.metrics) + assert.Equal(t, tc.wantMetricCnt, ms.MetricCount()) + if tc.wantMetricCnt > 0 { + resourceMetricsAttributes := ms.ResourceMetrics().At(0).Resource().Attributes() + assert.Equal(t, 0, resourceMetricsAttributes.Len()) + for i, dim := range tc.want { + dpAttr := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(i).Sum().DataPoints().At(0).Attributes() + assert.Equal(t, len(dim), dpAttr.Len()) + for k, v := range dim { + got, ok := dpAttr.Get(k) + assert.True(t, ok) + assert.Equal(t, v, got.Str()) + } + } + } + } +} + +func generateGPUMetrics(prefix string, dimensions []map[string]string) pmetric.Metrics { md := pmetric.NewMetrics() ms := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty() ms.SetName(prefix + gpuMetricIdentifier) @@ -220,3 +480,19 @@ func generateMetrics(prefix string, dimensions []map[string]string) pmetric.Metr } return md } + +func generateNeuronMetrics(prefix string, dimensions []map[string]string) pmetric.Metrics { + md := pmetric.NewMetrics() + ms := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty() + md.ResourceMetrics().At(0).Resource().Attributes().PutStr("service.name", "containerInsightsNeuronMonitorScraper") + ms.SetName(prefix) + dps := ms.SetEmptyGauge().DataPoints() + for _, dim := range dimensions { + dp := dps.AppendEmpty() + dp.SetIntValue(10) + for k, v := range dim { + dp.Attributes().PutStr(k, v) + } + } + return md +}