#!/usr/bin/env python
import pathlib as pl, dataclasses as dc
import os, sys, signal, time, unicodedata, json, logging
# https://github.com/prometheus/client_python
import prometheus_client as pc
# https://github.com/etingof/pysnmp
from pysnmp import hlapi as snmp
import pysnmp.error as snmp_err_core
import pysnmp.proto.errind as snmp_err
class LogMessage:
    '''Lazily-rendered str.format message for logging.

    Holds the format string plus positional/keyword args and only
    performs the formatting when the log record is actually emitted.'''
    def __init__(self, fmt, a, k):
        self.fmt, self.a, self.k = fmt, a, k
    def __str__(self):
        if not (self.a or self.k): return self.fmt
        return self.fmt.format(*self.a, **self.k)
class LogStyleAdapter(logging.LoggerAdapter):
    '''LoggerAdapter allowing str.format-style log calls, e.g. log.debug("x={}", x).'''
    def __init__(self, logger, extra=None):
        super().__init__(logger, extra if extra is not None else dict())
    def log(self, level, msg, *args, **kws):
        if not self.isEnabledFor(level): return
        # exc_info must be forwarded to _log separately, not into the message
        kws_log = dict()
        if 'exc_info' in kws: kws_log['exc_info'] = kws.pop('exc_info')
        msg, kws = self.process(msg, kws)
        # Bypasses Logger.log so the message can be a lazily-formatted object
        self.logger._log(level, LogMessage(msg, args, kws), (), **kws_log)
# Shorthands: namespaced format-style logger, and compact exception formatting
def get_logger(name):
    return LogStyleAdapter(logging.getLogger(f'prms-snmp.{name}'))
def err_fmt(err):
    return f'[{err.__class__.__name__}] {err}'
def str_part(s, sep, default=None):
    '''Split s once on the separator char in sep; a "<" or ">" marker in sep
    picks which side gets the default when the separator is absent.
    Examples: str_part("user@host", "<@", "root"), str_part("host:port", ":>")'''
    c = sep.strip('<>')
    default_on_left = sep.strip(c) == '<'
    if c not in s:
        return (default, s) if default_on_left else (s, default)
    return s.split(c, 1) if default_on_left else s.rsplit(c, 1)

# Unicode-normalized, trimmed, case-insensitive form for name comparisons
def str_norm(v):
    return unicodedata.normalize('NFKC', v.strip()).casefold()
### SNMP bits
class SNMPError(Exception):
    '''Base class for all SNMP query failures raised by this exporter.'''

class SNMPNX(SNMPError):
    '''Queried OID does not exist on the device.'''

class SNMPTimeout(SNMPError):
    '''SNMP request timed out after the configured retries.'''

@dc.dataclass
class SNMPData:
    '''Result of one polling pass over the device.'''
    uptime: float      # device uptime, seconds
    query_time: float  # wall time spent querying, seconds
    iface_stats: dict  # iface name -> (bytes_in, bytes_out)
class SNMP:
    '''Thin SNMPv3 client for querying device uptime and per-interface
    byte counters (IF-MIB ifXEntry) over UDP with SHA-HMAC auth.

    iface_set - optional collection of interface names to restrict queries to
    (None = all interfaces, up to iface_n_max of them).'''

    def __init__( self,
            host, port, user, sha_auth_pw, iface_set=None, iface_n_max=10 ):
        self.snmp_engine = snmp.SnmpEngine()
        self.snmp_auth = snmp.UsmUserData(
            user, sha_auth_pw, authProtocol=snmp.usmHMACSHAAuthProtocol )
        self.snmp_ctx = snmp.ContextData()
        self.log = get_logger('snmp')
        # Different retries to make sure reboot doesn't happen in-between queries
        ep = host, port
        self.snmp_ep_init = snmp.UdpTransportTarget(ep, retries=5, timeout=30.0)
        self.snmp_ep_value = snmp.UdpTransportTarget(ep, retries=1, timeout=15.0)
        if iface_set and len(iface_set) > iface_n_max: raise ValueError(iface_n_max)
        self.iface_set = iface_set
        # NETSERVER-MIB::hrSystemUptime
        self.oid_uptime = snmp.ObjectType(snmp.ObjectIdentity('.1.3.6.1.2.1.25.1.1.0'))
        iface_ns = list(range(1, iface_n_max+1))
        # k=1/6/10 - IF-MIB::ifXEntry columns: ifName, ifHCInOctets, ifHCOutOctets
        self.oid_iface_names, self.oid_iface_ingress, self.oid_iface_egress = (
            list(snmp.ObjectType(snmp.ObjectIdentity( # IF-MIB::ifXEntry
                f'.1.3.6.1.2.1.31.1.1.1.{k}.{n}' )) for n in iface_ns) for k in [1, 6, 10] )

    def _query(self, oid, conv=None, init=False):
        '''Run a single SNMP GET for oid, returning its value (via conv, if set).
        Raises SNMPTimeout on timeout, SNMPNX for a missing OID,
        SNMPError for any other failure.'''
        snmp_ep = self.snmp_ep_init if init else self.snmp_ep_value
        cmd = snmp.getCmd( self.snmp_engine,
            self.snmp_auth, snmp_ep, self.snmp_ctx, oid )
        try: err_indication, err_st, err_idx, var_list = next(cmd)
        except snmp_err_core.PySnmpError as err:
            raise SNMPError('SNMP carrier/conn error') # msg is too long here
        if err_indication:
            if isinstance(err_indication, snmp_err.RequestTimedOut):
                raise SNMPTimeout(f'{snmp_ep.retries=} {snmp_ep.timeout=}')
            # BUG FIX: was err_fmt(err) - "err" is not defined in this scope
            # (the except-clause binding above is unbound here), raising NameError
            raise SNMPError(err_fmt(err_indication))
        if err_st:
            raise SNMPError( f'{err_st.prettyPrint()}'
                f' [{var_list[int(err_idx)-1] if err_idx else "?"}]' )
        if len(var_list) != 1: raise SNMPError(f'Unexpected result structure: {var_list}')
        (k, v), = var_list
        # print(k.prettyPrint(), type(v), v.prettyPrint())
        if isinstance(v, snmp.NoSuchObject): raise SNMPNX(oid)
        return conv(v) if conv else v

    def query_iface_data(self, iface_set=None):
        '''Query uptime and byte counters for interfaces in iface_set
        (default - self.iface_set; None - all detected interfaces).
        Returns SNMPData, or None (after a debug log) on any query failure.'''
        if iface_set is None: iface_set = self.iface_set
        ts0 = time.monotonic()
        # hrSystemUptime is in centiseconds
        try: uptime = self._query(self.oid_uptime, int, init=True) / 100
        except SNMPError as err: return self.log.debug('Uptime query error: {}', err)
        # BUG FIX: iterating iface_set directly raised TypeError when it is None
        # ("all interfaces" mode) - iterate an empty tuple in that case
        iface_stats = dict()
        iface_names = dict((str_norm(iface), iface) for iface in iface_set or ())
        for n, oid_iface in enumerate(self.oid_iface_names):
            if iface_set is not None and not iface_names: break # found all ifaces
            try: iface_name = self._query(oid_iface, str)
            except SNMPNX: break
            except SNMPError as err: return self.log.debug('Iface query error: {}', err)
            if iface_set is not None:
                # Map device-reported name back to the requested spelling
                iface_name = iface_names.pop(str_norm(iface_name), None)
                if not iface_name: continue
            try:
                bs_in = self._query(self.oid_iface_ingress[n], int)
                bs_out = self._query(self.oid_iface_egress[n], int)
            except SNMPNX: continue # iface is gone
            except SNMPError as err: return self.log.debug('Counter query error: {}', err)
            iface_stats[iface_name] = bs_in, bs_out
        query_time = time.monotonic() - ts0
        return SNMPData(uptime, query_time, iface_stats)
### Poller and prometheus collector loop
class SNMPPoller:
    '''Polls SNMP counters on an interval, exporting monotonically-increasing
    prometheus counters and persisting totals to a spool file so that device
    reboots and exporter restarts do not reset the exported values.'''

    def __init__(self, snmp, interval, metric_prefix, spool_file, spool_tmp):
        self.snmp, self.interval, self.prefix = snmp, interval, metric_prefix
        self.spool_file, self.spool_tmp = spool_file, spool_tmp
        self.log = get_logger('poller')

    def snmp_spool_parse(self):
        '''Return SNMPData restored from the spool file,
        or None if it is missing or unparseable.'''
        try:
            data = json.loads(self.spool_file.read_text())
            data = SNMPData(**data)
        except OSError as err: return
        except Exception as err:
            # BUG FIX: was bare "spool_file" (undefined name) in the log args,
            # which raised NameError since log args are evaluated eagerly
            self.log.error( 'Unrecognized spool-file'
                ' structure, discarding [{}]: {}', self.spool_file, err_fmt(err) )
        else:
            self.log.debug('Parsed spool data [{}]: {}', self.spool_file, data)
            return data

    def snmp_spool_dump(self, data):
        '''Atomically persist data (SNMPData or plain dict) as JSON
        via write-to-tmp + fsync + rename.'''
        if isinstance(data, SNMPData): data = dc.asdict(data)
        with self.spool_tmp.open('w') as tmp:
            tmp.write(json.dumps(data))
            tmp.flush()
            os.fdatasync(tmp) # os accepts file objects here via fileno()
        self.spool_tmp.rename(self.spool_file)
        self.log.debug('Updated spool file [{}]', self.spool_file)

    def run_query_loop(self):
        '''Main daemon loop - never returns.
        Traffic counter resets are avoided here, if possible, by restoring
        old value from spool-file and tracking all increments from there.'''
        c_query_count = pc.Counter(
            f'{self.prefix}snmp_query_count',
            'Total number of SNMP queries made', ['result'] )
        c_query_time = pc.Histogram(
            f'{self.prefix}snmp_query_time_seconds',
            'Total SNMP querying time, in seconds' )
        c_iface_traffic = pc.Counter(
            f'{self.prefix}iface_traffic_bytes',
            'Total traffic through network interface', ['iface', 'dir'] )
        data = self.snmp_spool_parse()
        if data:
            stats_uptime = data.uptime
            stats_last, stats_total = data.iface_stats, dict()
            for iface, (bs_in, bs_out) in data.iface_stats.items():
                self.log.debug('Init - iface stats [{}]: in={:,} out={:,}', iface, bs_in, bs_out)
                stats_total[iface] = [bs_in, bs_out]
                c_iface_traffic.labels(iface, 'in').inc(bs_in)
                c_iface_traffic.labels(iface, 'out').inc(bs_out)
        else:
            self.log.debug('Init - blank iface stats')
            stats_uptime, stats_last, stats_total = 0, dict(), dict()
        while True:
            self.log.debug('Running SNMP query...')
            data = self.snmp.query_iface_data()
            c_query_count.labels('total').inc()
            if not data:
                self.log.debug(' - query result: fail')
                c_query_count.labels('fail').inc()
            else:
                self.log.debug(' - query result: success')
                c_query_count.labels('success').inc()
                self.log.debug(' - query time: {:,.1f}', data.query_time)
                c_query_time.observe(data.query_time)
                if data.uptime < stats_uptime:
                    # Uptime going backwards = reboot, device counters reset to 0
                    self.log.debug(
                        ' - query uptime: {:,.0f} -> {:,.0f} [reboot detected]',
                        stats_uptime, data.uptime )
                    stats_last.clear()
                else: self.log.debug(' - query uptime: {:,.0f}', data.uptime)
                stats_uptime = data.uptime
                for iface, bs_vals in data.iface_stats.items():
                    bs_in_old, bs_out_old = stats_last.get(iface, (0, 0))
                    bs_in, bs_out = stats_last[iface] = bs_vals
                    # Increments clamped at 0 to survive counter wraparound/reset
                    inc_in, inc_out = max(0, bs_in - bs_in_old), max(0, bs_out - bs_out_old)
                    self.log.debug(
                        ' - iface traffic [{}]: in={:,} [+{:,}] out={:,} [+{:,}]',
                        iface, bs_in, inc_in, bs_out, inc_out )
                    for n, (k, inc) in enumerate([('in', inc_in), ('out', inc_out)]):
                        # BUG FIX: interfaces absent from the spool file (e.g. on
                        # first run) raised KeyError here - init the entry lazily
                        stats_total.setdefault(iface, [0, 0])[n] += inc
                        c_iface_traffic.labels(iface, k).inc(inc)
                data.iface_stats = stats_total
                self.snmp_spool_dump(data)
            self.log.debug('Query loop delay: {:,.1f}', self.interval)
            time.sleep(self.interval)
### CLI and HTTP server
def main(args=None):
    '''CLI entry point: parse options, then either run one query and print
    the results (-p/--print) or start the prometheus exporter daemon.'''
    import argparse, textwrap
    dd = lambda text: (textwrap.dedent(text).strip('\n') + '\n').replace('\t', ' ')
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Run prometheus exporter for SNMP network interface bandwidth totals.')
    parser.add_argument('snmp_addr_port', help='SNMP host[:port] to query.')
    parser.add_argument('snmp_auth_file',
        help='File with "user:password" for SNMPv3 SHA1-HMAC authentication.')
    parser.add_argument('spool_file', nargs='?',
        help=dd('''
            JSON file to store/track remote uptime and last SNMP values in.
            Must be in a writable directory to replace via create/rename.'''))
    parser.add_argument('-i', '--iface', action='append', metavar='iface',
        help=dd('''
            Specific interface name(s) to query/export
            (default - all of them). Can be used multiple times.'''))
    parser.add_argument('-m', '--metric-prefix', metavar='prefix', default='',
        help=dd('''
            Optional prefix for exported prometheus metrics.
            For example, with "mikrotik_" prefix, interface traffic counter
            value will be "mikrotik_iface_traffic_bytes", while snmp stats
            will look like "mikrotik_snmp_query_count" and such.'''))
    parser.add_argument('-p', '--print',
        action='store_true', help='Query and print data once instead of starting the daemon.')
    parser.add_argument('-b', '--bind',
        metavar='(host:)port', default='9101',
        help='[host:]port to bind data exporter interface to.')
    parser.add_argument('-t', '--snmp-poll-interval',
        type=float, metavar='seconds', default=13*60,
        help='Interval between polling counter values via SNMP, in seconds. Default: %(default)s')
    parser.add_argument('--debug', action='store_true', help='Verbose operation mode.')
    opts = parser.parse_args(sys.argv[1:] if args is None else args)

    logging.basicConfig(level=logging.DEBUG if opts.debug else logging.WARNING)
    log = get_logger('main')

    snmp_user, snmp_pw = pl.Path(opts.snmp_auth_file).read_text().splitlines()[0].split(':', 1)
    snmp_host, snmp_port = str_part(opts.snmp_addr_port, ':>')
    iface_list = opts.iface
    if iface_list: iface_list = list(iface.lower() for iface in iface_list)
    # BUG FIX: lowercased iface_list was built but opts.iface was passed instead.
    # Local renamed from "snmp" to stop shadowing the pysnmp hlapi module alias.
    snmp_client = SNMP( snmp_host,
        int(snmp_port) if snmp_port else 161, # pysnmp expects an integer port
        snmp_user, snmp_pw, iface_set=iface_list )

    if opts.print: # one-shot query mode
        data = snmp_client.query_iface_data()
        if not data: return print('SNMP query failed')
        print( 'Interface byte counters'
            f' (uptime={data.uptime:,.0f}s query-time={data.query_time:.1f}s):' )
        for iface, (bs_in, bs_out) in data.iface_stats.items():
            print(f' {iface}: in={bs_in:,} out={bs_out:,}')
        return

    # Exit cleanly on SIGINT/SIGTERM instead of dumping a traceback
    for sig in 'int term'.upper().split():
        signal.signal(getattr(signal, f'SIG{sig}'), lambda sig,frm: sys.exit(0))

    # Make sure spool paths are ok
    if not opts.spool_file:
        parser.error('Spool file argument is required in daemon mode')
    spool_tmp = pl.Path(opts.spool_file + '.tmp')
    spool_tmp.write_bytes(b'test') # check that spool dir is writable up-front
    spool_tmp.unlink()
    spool_file = pl.Path(opts.spool_file)
    # BUG FIX: was a bare "except:" with an unused assignment - discard the
    # spool file only when it exists but fails to parse (corrupt after restart)
    try: json.loads(spool_file.read_text())
    except FileNotFoundError: pass # no spool yet - fine
    except Exception: spool_file.unlink(missing_ok=True)

    host, port = str_part(opts.bind, '<:')
    pc.start_http_server(int(port), host or '')
    log.debug('Starting poller loop...')
    SNMPPoller( snmp_client,
        opts.snmp_poll_interval, opts.metric_prefix,
        spool_file, spool_tmp ).run_query_loop()
    log.debug('Finished')
if __name__ == '__main__': sys.exit(main())