Skip to content

Commit

Permalink
Add hardware ecc events to NeuronConfig (#210)
Browse files (browse the repository at this point in the history)
* Add hardware ecc events to NeuronConfig

* Add changes from GPU test

---------

Co-authored-by: Aditya Purang <[email protected]>
  • Loading branch information
sam6134 and aditya-purang authored May 8, 2024
1 parent cc2b921 commit e12c964
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func GetNeuronMetricRelabelConfigs(hostinfo prometheusscraper.HostInfoProvider)
return []*relabel.Config{
{
SourceLabels: model.LabelNames{"__name__"},
- Regex: relabel.MustNewRegexp("neuron.*|system_.*|execution_.*"),
+ Regex: relabel.MustNewRegexp("neuron.*|system_.*|execution_.*|hardware_ecc_.*"),
Action: relabel.Keep,
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ system_memory_total_bytes{availability_zone="us-east-1c",instance_id="i-09db9b55
# HELP neurondevice_hw_ecc_events_total_mem_ecc_corrected Neuron hardware errors
# TYPE neurondevice_hw_ecc_events_total_mem_ecc_corrected gauge
neurondevice_hw_ecc_events_total_mem_ecc_corrected{availability_zone="us-east-1c",instance_id="i-09db9b55e0095612f",instance_name="",instance_type="trn1n.32xlarge",neuron_device_index="5",region="us-east-1",runtime_tag="367",subnet_id="subnet-06a7754948e8a000f"} 3
hardware_ecc_events_total{availability_zone="us-east-1c",event_type="sram_ecc_uncorrected",instance_id="i-09db9b55e0095612f",instance_name="",instance_type="trn1n.32xlarge",neuron_device_index="7",region="us-east-1",subnet_id="subnet-06a7754948e8a000f"} 864.0
`

const dummyClusterName = "cluster-name"
Expand Down Expand Up @@ -98,6 +99,15 @@ func TestNewNeuronScraperEndToEnd(t *testing.T) {
},
}

expectedMetrics["hardware_ecc_events_total"] = prometheusscraper.ExpectedMetricStruct{
MetricValue: 864.0,
MetricLabels: []prometheusscraper.MetricLabel{
{LabelName: "InstanceId", LabelValue: "i-09db9b55e0095612f"},
{LabelName: "ClusterName", LabelValue: dummyClusterName},
{LabelName: "NodeName", LabelValue: dummyNodeName},
},
}

expectedMetrics["up"] = prometheusscraper.ExpectedMetricStruct{
MetricValue: 1.0,
MetricLabels: []prometheusscraper.MetricLabel{},
Expand Down

0 comments on commit e12c964

Please sign in to comment.