forked from fritchie/nvme_exporter
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.go
295 lines (284 loc) · 11.4 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
package main
// Export nvme smart-log metrics in prometheus format
import (
"flag"
"log"
"net/http"
"os/exec"
"os/user"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/tidwall/gjson"
)
// labels lists the variable label names shared by every metric descriptor
// created in newNvmeCollector. Collect fills the single "device" label with
// the device path string taken from the `nvme list` JSON output.
var labels = []string{"device"}
// nvmeCollector holds one Prometheus descriptor per nvme smart-log field that
// this exporter publishes. The descriptors are created once in
// newNvmeCollector, announced by Describe and used by Collect to build the
// per-device metrics.
type nvmeCollector struct {
	// Controller state and wear indicators; Collect exports these as gauges.
	nvmeCriticalWarning, nvmeTemperature                   *prometheus.Desc
	nvmeAvailSpare, nvmeSpareThresh, nvmePercentUsed       *prometheus.Desc
	nvmeEnduranceGrpCriticalWarningSummary                 *prometheus.Desc

	// Host I/O totals; Collect exports these and everything below as counters.
	nvmeDataUnitsRead, nvmeDataUnitsWritten     *prometheus.Desc
	nvmeHostReadCommands, nvmeHostWriteCommands *prometheus.Desc
	nvmeControllerBusyTime                      *prometheus.Desc

	// Power history.
	nvmePowerCycles, nvmePowerOnHours, nvmeUnsafeShutdowns *prometheus.Desc

	// Error totals.
	nvmeMediaErrors, nvmeNumErrLogEntries *prometheus.Desc

	// Time spent above the temperature thresholds and thermal-management
	// transition statistics.
	nvmeWarningTempTime, nvmeCriticalCompTime      *prometheus.Desc
	nvmeThmTemp1TransCount, nvmeThmTemp2TransCount *prometheus.Desc
	nvmeThmTemp1TotalTime, nvmeThmTemp2TotalTime   *prometheus.Desc
}
// newNvmeCollector builds the collector with one descriptor per exported
// nvme smart-log field. Every descriptor uses the package-level labels slice
// as its variable labels and carries no constant labels.
//
// nvme smart-log field descriptions can be found on page 180 of:
// https://nvmexpress.org/wp-content/uploads/NVM-Express-Base-Specification-2_0-2021.06.02-Ratified-5.pdf
func newNvmeCollector() prometheus.Collector {
	// desc removes the repeated label arguments from the 22 descriptors below.
	desc := func(name, help string) *prometheus.Desc {
		return prometheus.NewDesc(name, help, labels, nil)
	}

	return &nvmeCollector{
		nvmeCriticalWarning: desc(
			"nvme_critical_warning",
			"Critical warnings for the state of the controller",
		),
		nvmeTemperature: desc(
			"nvme_temperature",
			"Temperature in degrees fahrenheit",
		),
		nvmeAvailSpare: desc(
			"nvme_avail_spare",
			"Normalized percentage of remaining spare capacity available",
		),
		nvmeSpareThresh: desc(
			"nvme_spare_thresh",
			"Async event completion may occur when avail spare < threshold",
		),
		nvmePercentUsed: desc(
			"nvme_percent_used",
			"Vendor specific estimate of the percentage of life used",
		),
		nvmeEnduranceGrpCriticalWarningSummary: desc(
			"nvme_endurance_grp_critical_warning_summary",
			"Critical warnings for the state of endurance groups",
		),
		// The SMART log reports data units in thousands of 512 byte units;
		// the help text says so to keep byte calculations from being off by
		// a factor of 1000.
		nvmeDataUnitsRead: desc(
			"nvme_data_units_read",
			"Number of 512 byte data units host has read, reported in thousands (1 = 1000 units)",
		),
		nvmeDataUnitsWritten: desc(
			"nvme_data_units_written",
			"Number of 512 byte data units the host has written, reported in thousands (1 = 1000 units)",
		),
		nvmeHostReadCommands: desc(
			"nvme_host_read_commands",
			"Number of read commands completed",
		),
		nvmeHostWriteCommands: desc(
			"nvme_host_write_commands",
			"Number of write commands completed",
		),
		nvmeControllerBusyTime: desc(
			"nvme_controller_busy_time",
			"Amount of time in minutes controller busy with IO commands",
		),
		nvmePowerCycles: desc(
			"nvme_power_cycles",
			"Number of power cycles",
		),
		nvmePowerOnHours: desc(
			"nvme_power_on_hours",
			"Number of power on hours",
		),
		nvmeUnsafeShutdowns: desc(
			"nvme_unsafe_shutdowns",
			"Number of unsafe shutdowns",
		),
		nvmeMediaErrors: desc(
			"nvme_media_errors",
			"Number of unrecovered data integrity errors",
		),
		nvmeNumErrLogEntries: desc(
			"nvme_num_err_log_entries",
			"Lifetime number of error log entries",
		),
		nvmeWarningTempTime: desc(
			"nvme_warning_temp_time",
			"Amount of time in minutes temperature > warning threshold",
		),
		nvmeCriticalCompTime: desc(
			"nvme_critical_comp_time",
			"Amount of time in minutes temperature > critical threshold",
		),
		nvmeThmTemp1TransCount: desc(
			"nvme_thm_temp1_trans_count",
			"Number of times controller transitioned to lower power",
		),
		nvmeThmTemp2TransCount: desc(
			"nvme_thm_temp2_trans_count",
			"Number of times controller transitioned to lower power",
		),
		// The two *_trans_time names do not match the thm_temp*_total_time
		// smart-log fields they are filled from; the names are left as they
		// are so existing queries keep working.
		nvmeThmTemp1TotalTime: desc(
			"nvme_thm_temp1_trans_time",
			"Total number of seconds controller transitioned to lower power",
		),
		nvmeThmTemp2TotalTime: desc(
			"nvme_thm_temp2_trans_time",
			"Total number of seconds controller transitioned to lower power",
		),
	}
}
// Describe sends every descriptor held by the collector to ch, in the order
// the fields are declared on nvmeCollector.
func (c *nvmeCollector) Describe(ch chan<- *prometheus.Desc) {
	descriptors := []*prometheus.Desc{
		c.nvmeCriticalWarning,
		c.nvmeTemperature,
		c.nvmeAvailSpare,
		c.nvmeSpareThresh,
		c.nvmePercentUsed,
		c.nvmeEnduranceGrpCriticalWarningSummary,
		c.nvmeDataUnitsRead,
		c.nvmeDataUnitsWritten,
		c.nvmeHostReadCommands,
		c.nvmeHostWriteCommands,
		c.nvmeControllerBusyTime,
		c.nvmePowerCycles,
		c.nvmePowerOnHours,
		c.nvmeUnsafeShutdowns,
		c.nvmeMediaErrors,
		c.nvmeNumErrLogEntries,
		c.nvmeWarningTempTime,
		c.nvmeCriticalCompTime,
		c.nvmeThmTemp1TransCount,
		c.nvmeThmTemp2TransCount,
		c.nvmeThmTemp1TotalTime,
		c.nvmeThmTemp2TotalTime,
	}
	for _, d := range descriptors {
		ch <- d
	}
}
// Collect runs `nvme list -o json` to discover devices, then runs
// `nvme smart-log <device> -o json` for each device path and sends one const
// metric per smart-log field to ch, labelled with the device path.
//
// Failures are logged instead of terminating the process: if the device list
// cannot be obtained or parsed, nothing is sent for this call; if a single
// device's smart-log fails or is not valid JSON, only that device is skipped.
// A field that is absent from the smart-log JSON is exported as 0 (before the
// temperature conversion), because gjson's Float returns 0 for a missing
// value.
func (c *nvmeCollector) Collect(ch chan<- prometheus.Metric) {
	listOut, err := exec.Command("nvme", "list", "-o", "json").Output()
	if err != nil {
		log.Printf("Error running nvme command: %s\n", err)
		return
	}
	listJSON := string(listOut)
	if !gjson.Valid(listJSON) {
		log.Print("nvmeDeviceCmd json is not valid")
		return
	}

	// smartMetric ties a smart-log JSON field to its descriptor and value
	// type, so the field list and the emitted metrics cannot drift apart.
	type smartMetric struct {
		field     string
		desc      *prometheus.Desc
		valueType prometheus.ValueType
	}
	const temperatureField = "temperature"
	metrics := []smartMetric{
		{"critical_warning", c.nvmeCriticalWarning, prometheus.GaugeValue},
		{temperatureField, c.nvmeTemperature, prometheus.GaugeValue},
		{"avail_spare", c.nvmeAvailSpare, prometheus.GaugeValue},
		{"spare_thresh", c.nvmeSpareThresh, prometheus.GaugeValue},
		{"percent_used", c.nvmePercentUsed, prometheus.GaugeValue},
		{"endurance_grp_critical_warning_summary", c.nvmeEnduranceGrpCriticalWarningSummary, prometheus.GaugeValue},
		{"data_units_read", c.nvmeDataUnitsRead, prometheus.CounterValue},
		{"data_units_written", c.nvmeDataUnitsWritten, prometheus.CounterValue},
		{"host_read_commands", c.nvmeHostReadCommands, prometheus.CounterValue},
		{"host_write_commands", c.nvmeHostWriteCommands, prometheus.CounterValue},
		{"controller_busy_time", c.nvmeControllerBusyTime, prometheus.CounterValue},
		{"power_cycles", c.nvmePowerCycles, prometheus.CounterValue},
		{"power_on_hours", c.nvmePowerOnHours, prometheus.CounterValue},
		{"unsafe_shutdowns", c.nvmeUnsafeShutdowns, prometheus.CounterValue},
		{"media_errors", c.nvmeMediaErrors, prometheus.CounterValue},
		{"num_err_log_entries", c.nvmeNumErrLogEntries, prometheus.CounterValue},
		{"warning_temp_time", c.nvmeWarningTempTime, prometheus.CounterValue},
		{"critical_comp_time", c.nvmeCriticalCompTime, prometheus.CounterValue},
		{"thm_temp1_trans_count", c.nvmeThmTemp1TransCount, prometheus.CounterValue},
		{"thm_temp2_trans_count", c.nvmeThmTemp2TransCount, prometheus.CounterValue},
		{"thm_temp1_total_time", c.nvmeThmTemp1TotalTime, prometheus.CounterValue},
		{"thm_temp2_total_time", c.nvmeThmTemp2TotalTime, prometheus.CounterValue},
	}
	fields := make([]string, len(metrics))
	for i, m := range metrics {
		fields[i] = m.field
	}

	for _, devicePath := range gjson.Get(listJSON, "Devices.#.DevicePath").Array() {
		device := devicePath.String()

		smartOut, err := exec.Command("nvme", "smart-log", device, "-o", "json").Output()
		if err != nil {
			log.Printf("Error running nvme smart-log command for device %s: %s\n", device, err)
			continue
		}
		smartJSON := string(smartOut)
		if !gjson.Valid(smartJSON) {
			// No error value exists on this path, so only the device is logged.
			log.Printf("nvmeSmartLog json is not valid for device: %s\n", device)
			continue
		}

		// GetMany returns one result per requested path, in request order.
		values := gjson.GetMany(smartJSON, fields...)
		for i, m := range metrics {
			value := values[i].Float()
			if m.field == temperatureField {
				// convert kelvin to fahrenheit
				value = (value-273.15)*9/5 + 32
			}
			ch <- prometheus.MustNewConstMetric(m.desc, m.valueType, value, device)
		}
	}
}
// main parses the -port flag, verifies that the process runs as root and that
// the nvme executable is on PATH, registers the NVMe collector with the
// default Prometheus registry and serves it on /metrics. Any startup failure,
// and any error returned by the HTTP server, is fatal.
func main() {
	port := flag.String("port", "9998", "port to listen on")
	flag.Parse()

	// check user
	currentUser, err := user.Current()
	if err != nil {
		log.Fatalf("Error getting current user %s\n", err)
	}
	// Compare the numeric user ID rather than the account name: privilege
	// follows UID 0, and the name is not guaranteed to be "root".
	if currentUser.Uid != "0" {
		log.Fatalln("Error: you must be root to use nvme-cli")
	}

	// check for nvme-cli executable
	if _, err := exec.LookPath("nvme"); err != nil {
		log.Fatalf("Cannot find nvme command in path: %s\n", err)
	}

	prometheus.MustRegister(newNvmeCollector())
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":"+*port, nil))
}