diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 4a3ca6140e..56bcfd596d 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -926,8 +926,6 @@ jobs: -var="instance_type=${{ matrix.arrays.instanceType }}" \ -var="k8s_version=${{ matrix.arrays.k8s_version }}"; then terraform destroy -auto-approve - elif [ "${{ matrix.arrays.test_dir }}" == "./test/awsneuron" ]; then - echo "NOT DELETING CLUSTER, DELETE MANUALLY" else terraform destroy -auto-approve && exit 1 fi @@ -945,12 +943,7 @@ jobs: else cd terraform/eks/daemon fi - - if [ "${{ matrix.arrays.test_dir }}" == "./test/awsneuron" ]; then - echo "NOT DELETING CLUSTER, DELETE MANUALLY" - else - terraform destroy --auto-approve - fi + terraform destroy --auto-approve EKSPrometheusIntegrationTest: name: 'EKSPrometheusIntegrationTest' diff --git a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go index 6f475e17c9..b0140b831c 100644 --- a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go +++ b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go @@ -21,6 +21,7 @@ var staticAttributes = map[string]any{ NodeName: "dummyAttribute", AvailabilityZone: "dummyAttribute", Kubernetes: "dummyAttribute", + RuntimeTag: "dummyAttribute", SubnetId: "dummyAttribute", } var staticTimestamp = pcommon.NewTimestampFromTime(time.Date(2023, time.March, 12, 11, 0, 0, 0, time.UTC)) @@ -35,7 +36,6 @@ const ( NeuronDeviceHwEccEvents = "neurondevice_hw_ecc_events" NeuronDeviceIndex = "neuron_device_index" DummyPod = "DummyPod" - DummyRuntimeTag = "1" Type = "Type" NodeAWSNeuronDevice = "NodeAWSNeuronDevice" PodAWSNeuronDevice = "PodAWSNeuronDevice" @@ -55,12 +55,12 @@ type MetricDefinition struct { var metricNameToMetricLayout = map[string]MetricDefinition{ NonNeuronMetric: {MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{1}, SpecialAttributes: [][]string{}, Unit: Count}, - NeuronExecutionErrors: {MetricType: pmetric.MetricTypeSum, MetricValues: []float64{1, 2, 3, 4, 5, 6}, SpecialAttributes: [][]string{{ErrorType, "generic", RuntimeTag, DummyRuntimeTag}, {ErrorType, "numerical", RuntimeTag, DummyRuntimeTag}, {ErrorType, "transient", RuntimeTag, DummyRuntimeTag}, {ErrorType, "model", RuntimeTag, DummyRuntimeTag}, {ErrorType, "runtime", RuntimeTag, DummyRuntimeTag}, {ErrorType, "hardware", RuntimeTag, DummyRuntimeTag}}, Unit: Count}, - NeuronExecutionStatus: {MetricType: pmetric.MetricTypeSum, MetricValues: []float64{1, 2, 3, 4, 5, 6}, SpecialAttributes: [][]string{{StatusType, "completed", RuntimeTag, DummyRuntimeTag}, {StatusType, "completed_with_err", RuntimeTag, DummyRuntimeTag}, {StatusType, "completed_with_num_err", RuntimeTag, DummyRuntimeTag}, {StatusType, "timed_out", RuntimeTag, DummyRuntimeTag}, {StatusType, "incorrect_input", RuntimeTag, DummyRuntimeTag}, {StatusType, "failed_to_queue", RuntimeTag, DummyRuntimeTag}}, Unit: Count}, - NeuronCoreMemoryUsageModelSharedScratchpad: {MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{1, 2, 3}, SpecialAttributes: [][]string{{NeuronCore, "0", NeuronDevice, "0", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, DummyRuntimeTag}, {NeuronCore, "1", NeuronDevice, "0", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, DummyRuntimeTag}, {NeuronCore, "2", NeuronDevice, "1", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, DummyRuntimeTag}}, Unit: Bytes}, - NeuronDeviceRuntimeMemoryUsedBytes: {MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{1, 2}, SpecialAttributes: [][]string{{MemoryLocation, "host", RuntimeTag, DummyRuntimeTag}, {MemoryLocation, "neuron_device", RuntimeTag, DummyRuntimeTag}}, Unit: Bytes}, - NeuronExecutionLatency: {MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{0, 0, 0, 0, 1, 0, 0}, SpecialAttributes: [][]string{{Percentile, "p0", RuntimeTag, DummyRuntimeTag}, {Percentile, "p1", RuntimeTag, DummyRuntimeTag}, {Percentile, "p100", RuntimeTag, DummyRuntimeTag}, {Percentile, "p25", RuntimeTag, DummyRuntimeTag}, {Percentile, "p50", RuntimeTag, DummyRuntimeTag}, {Percentile, "p75", RuntimeTag, DummyRuntimeTag}, {Percentile, "p99", RuntimeTag, DummyRuntimeTag}}, Unit: Seconds}, - NeuronDeviceHwEccEvents: {MetricType: pmetric.MetricTypeSum, MetricValues: []float64{1, 2, 3, 4}, SpecialAttributes: [][]string{{NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "mem_ecc_corrected", PodName, DummyPod}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "mem_ecc_uncorrected", PodName, DummyPod}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "sram_ecc_corrected", PodName, DummyPod}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "sram_ecc_uncorrected", PodName, DummyPod}}, Unit: Count}, + NeuronExecutionErrors: {MetricType: pmetric.MetricTypeSum, MetricValues: []float64{1, 2, 3, 4, 5, 6}, SpecialAttributes: [][]string{{ErrorType, "generic", RuntimeTag, "1"}, {ErrorType, "numerical", RuntimeTag, "1"}, {ErrorType, "transient", RuntimeTag, "1"}, {ErrorType, "model", RuntimeTag, "1"}, {ErrorType, "runtime", RuntimeTag, "1"}, {ErrorType, "hardware", RuntimeTag, "1"}}, Unit: Count}, + NeuronExecutionStatus: {MetricType: pmetric.MetricTypeSum, MetricValues: []float64{1, 2, 3, 4, 5, 6}, SpecialAttributes: [][]string{{StatusType, "completed", RuntimeTag, "1"}, {StatusType, "completed_with_err", RuntimeTag, "1"}, {StatusType, "completed_with_num_err", RuntimeTag, "1"}, {StatusType, "timed_out", RuntimeTag, "1"}, {StatusType, "incorrect_input", RuntimeTag, "1"}, {StatusType, "failed_to_queue", RuntimeTag, "1"}}, Unit: Count}, + NeuronCoreMemoryUsageModelSharedScratchpad: {MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{1, 2, 3}, SpecialAttributes: [][]string{{NeuronCore, "0", NeuronDevice, "0", MemoryLocation, "None", PodName, DummyPod}, {NeuronCore, "1", NeuronDevice, "0", MemoryLocation, "None", PodName, DummyPod}, {NeuronCore, "2", NeuronDevice, "1", MemoryLocation, "None", PodName, DummyPod}}, Unit: Bytes}, + NeuronDeviceRuntimeMemoryUsedBytes: {MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{1, 2}, SpecialAttributes: [][]string{{MemoryLocation, "host"}, {MemoryLocation, "neuron_device"}}, Unit: Bytes}, + NeuronExecutionLatency: {MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{0, 0, 0, 0, 1, 0, 0}, SpecialAttributes: [][]string{{Percentile, "p0"}, {Percentile, "p1"}, {Percentile, "p100"}, {Percentile, "p25"}, {Percentile, "p50"}, {Percentile, "p75"}, {Percentile, "p99"}}, Unit: Seconds}, + NeuronDeviceHwEccEvents: {MetricType: pmetric.MetricTypeSum, MetricValues: []float64{1, 2, 3, 4}, SpecialAttributes: [][]string{{NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "mem_ecc_corrected", PodName, DummyPod, RuntimeTag, "1"}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "mem_ecc_uncorrected", PodName, DummyPod, RuntimeTag, "1"}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "sram_ecc_corrected", PodName, DummyPod, RuntimeTag, "1"}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "sram_ecc_uncorrected", PodName, DummyPod, RuntimeTag, "1"}}, Unit: Count}, } func setupMetricModifier() *AwsNeuronMetricModifier { @@ -75,7 +75,7 @@ func TestMetricModifierForExecutionLatencyMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionLatency: metricsList.At(0), - "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{1}, pmetric.MetricTypeSum, Seconds), + "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{1}, pmetric.MetricTypeSum, Seconds), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -88,13 +88,13 @@ func TestMetricModifierForExecutionErrorMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionErrors: metricsList.At(0), - "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{6}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{21}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{21}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -111,12 +111,12 @@ func TestMetricModifierForExecutionStatusMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionStatus: metricsList.At(0), - "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{6}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -130,9 +130,9 @@ func TestMetricModifierForNeuronCoreMemoryUsageMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronCoreMemoryUsageModelSharedScratchpad: metricsList.At(0), - "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core2", NeuronDevice: "device1", Type: PodAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - "container_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("container_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core1", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core2", NeuronDevice: "device1", Type: ContainerAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: PodAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "container_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("container_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: ContainerAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -146,7 +146,7 @@ func TestMetricModifierForNeuronCoreMemoryUsageMetric_PodNameMissing(t *testing. expectedMetrics := map[string]pmetric.Metric{ NeuronCoreMemoryUsageModelSharedScratchpad: metricsList.At(0), - "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, RuntimeTag: DummyRuntimeTag}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -160,7 +160,7 @@ func TestMetricModifierForNeuronDeviceRuntimeMemoryUsageMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronDeviceRuntimeMemoryUsedBytes: metricsList.At(0), - "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{2}, pmetric.MetricTypeSum, Bytes), + "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{2}, pmetric.MetricTypeSum, Bytes), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -174,21 +174,21 @@ func TestMetricModifierForNeuronDeviceEccEventMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronDeviceHwEccEvents: metricsList.At(0), - "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: ""}}, []float64{10}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice}}, []float64{1}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice}}, []float64{2}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice}}, []float64{3}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice}}, []float64{4}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: ""}}, []float64{10}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice}}, []float64{1}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice}}, []float64{2}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice}}, []float64{3}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice}}, []float64{4}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: ""}}, []float64{10}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -202,11 +202,11 @@ func TestMetricModifierForNeuronDeviceEccEventMetric_PodNameMissing(t *testing.T expectedMetrics := map[string]pmetric.Metric{ NeuronDeviceHwEccEvents: metricsList.At(0), - "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: ""}}, []float64{10}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -249,44 +249,44 @@ func TestListWithMultipleMetrics(t *testing.T) { NeuronDeviceHwEccEvents: metricsList.At(5), NonNeuronMetric: metricsList.At(6), - "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{1}, pmetric.MetricTypeSum, Seconds), - - "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{6}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{21}, pmetric.MetricTypeSum, Count), - - "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{6}, pmetric.MetricTypeSum, Count), - - "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core2", NeuronDevice: "device1", Type: PodAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - "container_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("container_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core1", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}, {NeuronCore: "core2", NeuronDevice: "device1", Type: ContainerAWSNeuronCore, PodName: DummyPod, RuntimeTag: DummyRuntimeTag}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - - "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{2}, pmetric.MetricTypeSum, Bytes), - - "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: ""}}, []float64{10}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice}}, []float64{1}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice}}, []float64{2}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice}}, []float64{3}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice}}, []float64{4}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: ""}}, []float64{10}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice}}, []float64{1}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice}}, []float64{2}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice}}, []float64{3}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice}}, []float64{4}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: ""}}, []float64{10}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{1}, pmetric.MetricTypeSum, Seconds), + + "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{21}, pmetric.MetricTypeSum, Count), + + "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), + + "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: PodAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "container_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("container_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: ContainerAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + + "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{2}, pmetric.MetricTypeSum, Bytes), + + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) } @@ -303,7 +303,7 @@ func TestMetricWithStaleDatapoint(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionLatency: metricsList.At(0), - "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: DummyRuntimeTag}}, []float64{1}, pmetric.MetricTypeSum, Seconds), + "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{1}, pmetric.MetricTypeSum, Seconds), } assertModifiedMetric(t, metricsList, expectedMetrics) diff --git a/plugins/processors/gpuattributes/processor.go b/plugins/processors/gpuattributes/processor.go index cc86d7a8ce..94fb411a53 100644 --- a/plugins/processors/gpuattributes/processor.go +++ b/plugins/processors/gpuattributes/processor.go @@ -6,7 +6,6 @@ package gpuattributes import ( "context" "encoding/json" - "fmt" "strings" "go.opentelemetry.io/collector/pdata/pcommon" @@ -128,7 +127,6 @@ func newGpuAttributesProcessor(config *Config, logger *zap.Logger) *gpuAttribute func (d *gpuAttributesProcessor) processMetrics(_ context.Context, md pmetric.Metrics) (pmetric.Metrics, error) { rms := md.ResourceMetrics() - d.logMd(md, "before gpu processor") for i := 0; i < rms.Len(); i++ { rs := rms.At(i) ilms := rs.ScopeMetrics() @@ -152,7 +150,6 @@ func (d *gpuAttributesProcessor) processMetrics(_ context.Context, md pmetric.Me } } } - d.logMd(md, "after gpu processor") return md, nil } @@ -256,57 +253,3 @@ func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric. return dps.Len() == 0 }) } - -func (d *gpuAttributesProcessor) logMd(md pmetric.Metrics, name string) { - var logMessage strings.Builder - - logMessage.WriteString(fmt.Sprintf("\"%s_METRICS_MD\" : {\n", name)) - rms := md.ResourceMetrics() - for i := 0; i < rms.Len(); i++ { - rs := rms.At(i) - ilms := rs.ScopeMetrics() - logMessage.WriteString(fmt.Sprintf("\t\"ResourceMetric_%d\": {\n", i)) - for j := 0; j < ilms.Len(); j++ { - ils := ilms.At(j) - metrics := ils.Metrics() - logMessage.WriteString(fmt.Sprintf("\t\t\"ScopeMetric_%d\": {\n", j)) - logMessage.WriteString(fmt.Sprintf("\t\t\"Metrics_%d\": [\n", j)) - - for k := 0; k < metrics.Len(); k++ { - m := metrics.At(k) - logMessage.WriteString(fmt.Sprintf("\t\t\t\"Metric_%d\": {\n", k)) - logMessage.WriteString(fmt.Sprintf("\t\t\t\t\"name\": \"%s\",\n", m.Name())) - logMessage.WriteString(fmt.Sprintf("\t\t\t\t\"type\": \"%s\",\n", m.Type())) - - var datapoints pmetric.NumberDataPointSlice - switch m.Type() { - case pmetric.MetricTypeGauge: - datapoints = m.Gauge().DataPoints() - case pmetric.MetricTypeSum: - datapoints = m.Sum().DataPoints() - default: - datapoints = pmetric.NewNumberDataPointSlice() - } - - logMessage.WriteString("\t\t\t\t\"datapoints\": [\n") - for yu := 0; yu < datapoints.Len(); yu++ { - logMessage.WriteString("\t\t\t\t\t{\n") - logMessage.WriteString(fmt.Sprintf("\t\t\t\t\t\t\"attributes\": \"%v\",\n", datapoints.At(yu).Attributes().AsRaw())) - logMessage.WriteString(fmt.Sprintf("\t\t\t\t\t\t\"value\": %v,\n", datapoints.At(yu).DoubleValue())) - logMessage.WriteString(fmt.Sprintf("\t\t\t\t\t\t\"timestamp\": %v,\n", datapoints.At(yu).Timestamp())) - logMessage.WriteString(fmt.Sprintf("\t\t\t\t\t\t\"flags\": %v,\n", datapoints.At(yu).Flags())) - logMessage.WriteString(fmt.Sprintf("\t\t\t\t\t\t\"value type\": %v,\n", datapoints.At(yu).ValueType())) - logMessage.WriteString("\t\t\t\t\t},\n") - } - logMessage.WriteString("\t\t\t\t],\n") - logMessage.WriteString("\t\t\t},\n") - } - logMessage.WriteString("\t\t],\n") - logMessage.WriteString("\t\t},\n") - } - logMessage.WriteString("\t},\n") - } - logMessage.WriteString("},\n") - - d.logger.Info(logMessage.String()) -}