diff --git a/README.md b/README.md index 67d0b0b..408dc97 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,14 @@ # Remote Port Health Manager (rphm) extension for Arista EOS ## Overview -The Remote Port Health Manager (rphm) extension monitors interface counters, then performs actions whenever such counters exceed given thresholds. The first action generates an SNMP trap. Other actions could include sending and email or taking corrective action on an erroring port. +The Remote Port Health Manager (rphm) extension monitors interface counters, +then performs actions whenever such selected counters exceed given thresholds. +The default actions are to log a syslog message and generate an SNMP trap. +Other actions could include sending and email or taking corrective action on an +erroring port. -This extension may be run directly on Arista EOS devices or on a separate monitoring station and can monitor 1 or many devices based on the config file. +This extension may be run directly on Arista EOS devices or on a separate +monitoring station and can monitor 1 or more devices based on the config file. ## License @@ -31,44 +36,59 @@ RPM package: ## Install -rphm can be installed directly on one or more EOS switches and run as a daemon, there, or centrally on a linux system to monitor multiple switches from one place. While running it centrally reduces the load put on a switch, it also requires that eAPI on the switch be allowed from that workstation. When run directly on a switch, eAPI may be restricted to localhost-only. Additionally, SNMP traps will be sourced from the switch’s IP address. +rphm can be installed directly on one or more EOS switches and run as a daemon, +there, or centrally on a linux system to monitor multiple switches from one +place. While running it centrally reduces the load put on a switch, it also +requires that eAPI on the switch be allowed from that workstation. When run +directly on a switch, eAPI may be restricted to localhost-only. Additionally, +SNMP traps will be sourced from the switch’s IP address when run on-device. ### On an EOS device ``` -EOS#copy scp://username@10.10.10.1/src/rphm/rpmbuild/rphm-1.0.0-1.rpm extension: +EOS#copy scp://username@10.10.10.1/src/rphm/rpmbuild/rphm-1.1.0-1.rpm extension: Password: -rphm-1.0.0-1.rpm 100% 17KB 17.0KB/s 00:00 +rphm-1.0.0-1.rpm 100% 17KB 17.0KB/s 00:00 Copy completed successfully. EOS#show extensions Name Version/Release Status RPMs ------------------------------------------ ------------------------- ------ ---- -rphm-1.0.0-1.rpm 1.0.0/1 A, NI 1 +rphm-1.1.0-1.rpm 1.1.0/1 A, NI 1 A: available | NA: not available | I: installed | NI: not installed | F: forced EOS#extension rphm-1.0.0-1.rpm EOS#show extensions Name Version/Release Status RPMs ------------------------------------------ ------------------------- ------ ---- -rphm-1.0.0-1.rpm 1.0.0/1 A, I 1 +rphm-1.1.0-1.rpm 1.1.0/1 A, I 1 A: available | NA: not available | I: installed | NI: not installed | F: forced vEOS-1# ``` ### On a linux system - rpm -Uvh rpmbuild/rphm-1.0.0-1.rpm + rpm -Uvh rpmbuild/rphm-1.1.0-1.rpm + + Optionally, save + https://raw.githubusercontent.com/arista-eosext/rphm/master/systemd/rphm.service + as `/etc/systemd/system/rphm.service`. This will enable service management + with `systemctl {start|status|stop} rphm.service` commands. ### Configuration -The configuration file contains SNMP settings, the poll timer, and information on which switch(es) and counter(s) to monitor. +The configuration file contains SNMP settings, the poll timer, and information +on which switch(es) and counter(s) to monitor. The default location for the config file is `/persist/sys/rphm.conf`. +For any counter set to be monitored, a threshold must be defined and greater +than zero. + ## Usage ``` usage: rphm [-h] [--config CONFIG] [--debug] -Poll switches with eAPI then send SNMP trap on changes in interface error counters. +Poll switches with eAPI then send SNMP trap on changes in interface error +counters. optional arguments: -h, --help show this help message and exit @@ -85,7 +105,8 @@ optional arguments: ## EOS Configuration -EOS eAPI must be configured on each monitored device. At a minimum, this requires: +EOS eAPI must be configured on each monitored device. At a minimum, this +requires: ``` EOS#configure terminal @@ -98,7 +119,7 @@ To execute rphm directly on the switch, add the following: ``` EOS(config)#daemon rphm -EOS(config-daemon-rphm)#command /usr/bin/rphm --test=snmp +EOS(config-daemon-rphm)#command /usr/bin/rphm EOS(config)#end ``` @@ -119,23 +140,26 @@ EOS#show processes | include rphm EOS#config termintal EOS(config)#no daemon rphm EOS(config)#end -EOS#no extension rphm-1.0.0-1.rpm +EOS#no extension rphm-1.1.0-1.rpm ``` ### Linux ``` -rpm -e rphm-1.0.0-1.rpm +rpm -e rphm-1.1.0-1.rpm ``` # Demo -1. Copy snmptrapd-start and snmptrapd.conf from the demo/ directory to the receiver host, then execute +1. Copy snmptrapd-start and snmptrapd.conf from the demo/ directory to the + receiver host, then execute ./snmptrapd-start -2. On the client, in rphm.conf, enter the traphost and select which snmp version you with to use. Then check your switch settings. Defaults to https://arista:arista@localhost/command-api +2. On the client, in rphm.conf, enter the traphost and select which snmp + version you with to use. Then check your switch settings. Defaults to + https://arista:arista@localhost/command-api 3. Verify your SNMP configuration with diff --git a/VERSION b/VERSION index 3eefcb9..9084fa2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.0.0 +1.1.0 diff --git a/rphm.spec b/rphm.spec index 80e80cf..5e6f3b3 100644 --- a/rphm.spec +++ b/rphm.spec @@ -31,7 +31,7 @@ rm -rf $RPM_BUILD_ROOT rm -rf $RPM_BUILD_ROOT %files -%defattr(-,root,eosadmin,-) +%defattr(-,root,root,-) %{python_sitelib}/rphm* %{_bindir}/rphm #/usr/rphm.conf diff --git a/rphm/app.py b/rphm/app.py index 245eb7e..929f8b2 100644 --- a/rphm/app.py +++ b/rphm/app.py @@ -1,6 +1,6 @@ # vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 # -# Copyright (c) 2014, Arista Networks, Inc. +# Copyright (c) 2016, Arista Networks, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -40,13 +40,14 @@ import os import sys import syslog -from pprint import pprint +from pprint import pprint, pformat from jsonrpclib import Server from jsonrpclib import ProtocolError from subprocess import call -SNMP_SETTINGS = None #pylint: disable=C0103 -DEBUG = False #pylint: disable=C0103 +SNMP_SETTINGS = None # pylint: disable=C0103 +DEBUG = False # pylint: disable=C0103 + class EapiException(Exception): """ An EapiException can be raised when there is a communication issue with @@ -57,6 +58,7 @@ class EapiException(Exception): pass + def log(msg, level='INFO', error=False): """Logging facility setup. @@ -84,6 +86,7 @@ def log(msg, level='INFO', error=False): priority = ''.join(["syslog.LOG_", level]) syslog.syslog(eval(priority), msg) + def parse_cmd_line(): """Parse the command line options and return an args dict. @@ -102,7 +105,7 @@ def parse_cmd_line(): parser.add_argument('--config', type=str, default='/persist/sys/rphm.conf', - help='Specifies the configuration file to load' + \ + help='Specifies the configuration file to load' + '(Default: /persist/sys/rphm.conf)' ) @@ -145,17 +148,13 @@ def remove_unneded_keys(keys, this_dict): """ log("Entering {0}.".format(sys._getframe().f_code.co_name), level='DEBUG') - #print "cleaning up keys" for section in this_dict: - #print " cleaning up section {0}".format(section) for key in keys: - #print " looking for {0}".format(key) key = key.lower() if key in this_dict[section]: - #print " Found a match... deleting" - #del this_dict[section][key] this_dict[section].pop(key, None) + def read_config(filename): """ Read the config file (Default: /persist/sys/rphm.conf) @@ -175,36 +174,34 @@ def read_config(filename): setting = {} defaults = { - 'hostname': 'localhost', 'protocol': 'https', - 'port': '443', 'username': 'arista', 'password': 'arista', 'url': '%(protocol)s://%(username)s:%(password)s@%(hostname)s:%(port)s'\ '/command-api', 'interfaceList': 'Management 1', 'counterList': 'alignmentErrors,fcsErrors,symbolErrors', - 'traphost' : 'localhost', - 'version' : '2c', - 'community' : 'public', - 'secLevel' : 'authPriv', - 'contextName' : '', - 'authProtocol' : 'SHA', - 'authPassword' : '', - 'privProtocol' : 'AES', - 'privPassword' : '%(authPassword)s', - 'inUcastPkts' : '1', - 'inDiscards' : '1', - 'alignmentErrors' : '1', - 'fcsErrors' : '1', - 'giantFrames' : '1', - 'runtFrames' : '1', - 'rxPause' : '1', - 'symbolErrors' : '1', - 'collisions' : '1', - 'deferredTransmissions' : '1', - 'lateCollisions' : '1', - 'txPause' : '1' + 'traphost': 'localhost', + 'version': '2c', + 'community': 'public', + 'secLevel': 'authPriv', + 'contextName': '', + 'authProtocol': 'SHA', + 'authPassword': '', + 'privProtocol': 'AES', + 'privPassword': '%(authPassword)s', + 'inUcastPkts': '1', + 'inDiscards': '1', + 'alignmentErrors': '1', + 'fcsErrors': '1', + 'giantFrames': '1', + 'runtFrames': '1', + 'rxPause': '1', + 'symbolErrors': '1', + 'collisions': '1', + 'deferredTransmissions': '1', + 'lateCollisions': '1', + 'txPause': '1' } switchkeys = ( 'hostname', @@ -229,6 +226,8 @@ def read_config(filename): 'txPause' ) + default_port = '443' + os.path.isfile(filename) if not os.access(filename, os.R_OK): log("Unable to read config file {0}".format(filename)) @@ -248,10 +247,13 @@ def read_config(filename): format(filename, err)) setting[section] = {} + if 'hostname' not in options: + config.set(section, 'hostname', 'localhost') + if 'port' not in options: + config.set(section, 'port', default_port) for option in options: setting[section][option] = config.get(section, option) - # Remove standard sections so all we have left are switch customizations config.remove_section(section) @@ -264,24 +266,39 @@ def read_config(filename): # Convert switchList from a string to config sections with defaults # then make sure a section exists for each one in the event that this # is the only place this switch is included in the config. - # Thsi has the added effect of picking up the defaults. + # This has the added effect of picking up the defaults. if 'switchlist' in setting['switches']: for line in setting['switches']['switchlist'].split("\n"): for device in line.split(","): device = device.strip('" \t') if device: - known_list = config.sections() - for section in config.sections(): - known_list.append(config.get(section, 'hostname')) - if device not in known_list: + switch_sections = config.sections() + if device not in switch_sections: config.add_section(device) config.set(device, 'hostname', device) + if len(config.sections()) == 0: + log("No switches defined. Adding default localhost section", + level='DEBUG') + config.add_section('localhost') + # Any section remaining, should being a switch definition setting['switches'] = [] for section in config.sections(): switch = {} config.set(section, 'name', section) + options = config.options(section) + + if 'hostname' not in options: + config.set(section, 'hostname', section) + + if 'port' not in options: + proto = config.get(section, 'protocol') + if proto == 'http': + default_port = '80' + elif proto == 'https': + default_port = '443' + config.set(section, 'port', default_port) for option in config.options(section): switch[option] = config.get(section, option) @@ -306,6 +323,7 @@ def read_config(filename): return setting + def get_interfaces(switch): """Get all of the interfaces on a switch. @@ -349,6 +367,7 @@ def get_interfaces(switch): return response[0][u'interfaceStatuses'] + def get_device_status(device): """Get interface uptime and model information @@ -367,10 +386,9 @@ def get_device_status(device): if device['name'] == "localhost": hostname = device['eapi_obj'].runCmds(1, ["show hostname"]) - print "Hostname:" - pprint(hostname) device['name'] = hostname[0]['hostname'] + def get_intf_counters(switch, interface="Management1"): """Get interface details for an interface @@ -449,7 +467,8 @@ def get_intf_counters(switch, interface="Management1"): (errno, msg) = err[0] # 1002: invalid command if errno == 1002: - log("Invalid EOS interface name ({0})".format(commands), error=True) + log("Invalid EOS interface name ({0})".format(commands), + error=True) else: conn_error = True log("ProtocolError while retrieving {0} ([{1}] {2})". @@ -498,6 +517,7 @@ def get_intf_counters(switch, interface="Management1"): else: return (interface, None) + def get_device_counters(device): """ Get counters from a device and return a dict of results @@ -520,6 +540,7 @@ def get_device_counters(device): return counters + def compare_counters(device, reference, current, test=None): """Compare the counters in 2 sets and return only differences @@ -580,13 +601,31 @@ def compare_counters(device, reference, current, test=None): format(interface)) continue + diffs[interface] = {} + + if 'linkStatusChanges' in device['counters']: + ref_counter = list(get_all(ref, 'linkStatusChanges')) + cur_counter = list(get_all(cur, 'linkStatusChanges')) + tmp = is_delta_significant(device, + 'linkStatusChanges', + ref_counter[0], + cur_counter[0], + cur_counter[0], + 'in') + if tmp is not None: + diffs[interface]['linkStatusChanges'] = tmp + diffs[interface]['linkStatusChanges']['current'] = current[interface][u'lineProtocolStatus'] + # Skip interfaces not in Up state. if current[interface][u'lineProtocolStatus'] != u'up': - log("Interface [{0}] is not up. Skipping...". + # Remove the interface entry if there were no linkStatusChanges + # found + if not diffs[interface].keys(): + diffs.pop(interface) + log("Interface [{0}] is not up. Skipping counters...". format(interface)) continue - diffs[interface] = {} total_in = list(get_all(ref, 'inUcastPkts'))[0] + \ list(get_all(ref, 'inMulticastPkts'))[0] + \ list(get_all(ref, 'inBroadcastPkts'))[0] @@ -596,15 +635,19 @@ def compare_counters(device, reference, current, test=None): list(get_all(ref, 'outBroadcastPkts'))[0] for counter in device['counters']: + if counter == 'linkStatusChanges': + # This counter checked separately + continue + ref_counter = list(get_all(ref, counter)) cur_counter = list(get_all(cur, counter)) if not ref_counter or not cur_counter: - log("Counter [{0}] missing from Reference or Current data" \ + log("Counter [{0}] missing from Reference or Current data" " set. Skipping.".format(counter), error=True) continue - #Check counter direction + # Check counter direction if counter in input_counters: direction = "in" total = total_in @@ -612,7 +655,6 @@ def compare_counters(device, reference, current, test=None): direction = "out" total = total_out else: - #direction = "NA" # linkStatusChanges don't fall in to either category, exactly. direction = "in" total = total_in @@ -638,6 +680,7 @@ def compare_counters(device, reference, current, test=None): return diffs + def get_all(data, key): """Return all values of a key in a multi-level dictionary @@ -661,6 +704,7 @@ def get_all(data, key): for obj in get_all(subset, key): yield obj + def is_delta_significant(device, counter, ref, cur, total=0, direction="in"): """Verify whether the difference in a counter value, if any, between two checks is equal to or greater than the set rate threshold for that counter. @@ -693,16 +737,25 @@ def is_delta_significant(device, counter, ref, cur, total=0, direction="in"): format(counter), error=True) return None + if threshold < 1: + log("Invalid threshold ({}) set for {} in the config file. Skipping.". + format(threshold, counter), error=True) + return None + log("Comparing {0}[{1}]: REF [{2}], CUR [{3}]. " "expecting < {4}".format(device['name'], counter, ref, cur, device[counter.lower()]), level='DEBUG') if delta >= threshold: - return {'threshold': threshold, 'found': delta, 'total': total, 'direction': direction} + return {'threshold': threshold, + 'found': delta, + 'total': total, + 'direction': direction} else: return None + def do_actions(device, changes, interval): """ Perform actions for each delta noted in "changes" @@ -732,21 +785,36 @@ def do_actions(device, changes, interval): for interface in changes: for counter in changes[interface]: - trap_content = "Device {0} {1}, interface {2}: {3}"\ - " increasing at > {4} per {5} seconds. Found {6}/{7} packets {8}".\ - format(hostname, - device['modelName'], - interface, - counter, - changes[interface][counter]['threshold'], - interval, - changes[interface][counter]['found'], - changes[interface][counter]['total'], - changes[interface][counter]['direction']) + if counter == 'linkStatusChanges': + trap_content = "Device {0} {1}, interface {2}: {3}"\ + " increasing at least {4} in {5} seconds. Found {6}/{7} changes."\ + " Currently {8}".\ + format(hostname, + device['modelName'], + interface, + counter, + changes[interface][counter]['threshold'], + interval, + changes[interface][counter]['found'], + changes[interface][counter]['total'], + changes[interface][counter]['current']) + else: + trap_content = "Device {0} {1}, interface {2}: {3} increasing"\ + " at least {4} per {5} seconds. Found {6}/{7} packets {8}".\ + format(hostname, + device['modelName'], + interface, + counter, + changes[interface][counter]['threshold'], + interval, + changes[interface][counter]['found'], + changes[interface][counter]['total'], + changes[interface][counter]['direction']) # system uptime send_trap(trap_content, uptime=int(device['bootupTimestamp'])) + def send_trap(message, uptime='', test=False): """ Send an Arista enterprise-specific SNMP trap containing message. @@ -769,7 +837,6 @@ def send_trap(message, uptime='', test=False): # # Build the arguments to snmptrap - #trap_args = ['snmptrap'] trap_args = ['snmptrap'] trap_args.append('-v') trap_args.append(SNMP_SETTINGS['version']) @@ -803,7 +870,7 @@ def send_trap(message, uptime='', test=False): format(SNMP_SETTINGS['version'])) trap_args.append(SNMP_SETTINGS['traphost']) - #.iso.org.dod.internet.private. .arista + # .iso.org.dod.internet.private. .arista # enterprises.30065 enterprise_oid = '.1.3.6.1.4.1.30065' # enterpriseSpecific = 6 @@ -817,7 +884,8 @@ def send_trap(message, uptime='', test=False): if test == "trap": message = "Device TEST-Device, interface Management 1:" \ - " FAKEalignmentErrors increasing at > 1 per 5 seconds. Found 7/2401 packets in" + " FAKEalignmentErrors increasing at > 1 per 5 seconds." \ + " Found 7/2401 packets in" log("Sending SNMPTRAP to {0} with arguments: {1}". format(SNMP_SETTINGS['traphost'], trap_args), level='DEBUG') @@ -829,6 +897,7 @@ def send_trap(message, uptime='', test=False): call(trap_args) + def main(): ''' main execution routine for devops command. Parse the command line options, build the RESTful request, send it to stdlib, @@ -861,7 +930,8 @@ def main(): elif args['test'] == 'get': print "Connecting to eAPI at {0}".format(config['switches'][0]['url']) switch = Server(config['switches'][0]['url']) - (interface, counters) = get_intf_counters(switch, interface="Management1") + (interface, counters) = get_intf_counters(switch, + interface="Management1") print "\nTest=get: received the following counters from Management 1:" pprint(counters) print "Test=get: --------------------------\n" @@ -910,4 +980,3 @@ def main(): log("---sleeping for {0} seconds.".format(config['counters']['poll']), level='DEBUG') time.sleep(int(config['counters']['poll'])) - diff --git a/systemd/rphm.service b/systemd/rphm.service new file mode 100644 index 0000000..57426d7 --- /dev/null +++ b/systemd/rphm.service @@ -0,0 +1,16 @@ +# systemd script for RPHM. +# +# This file should be installed when running rphm on a central server which +# manages system services with systemd. +# +# Install as /etc/systemd/system/rphm.service +# +[Unit] +Description=Remote Port Health Monitor for Arista EOS devices +After=syslog.target network.target + +[Service] +ExecStart=/usr/bin/rphm + +[Install] +WantedBy=multi-user.target