Skip to content

Commit

Permalink
Add NRPE monitor for OVN state
Browse files Browse the repository at this point in the history
Adds a Nagios plugin to check the OVN socket for database and/or
controller state.

Adds an NRPE check to confirm that OVN controller state is OK for OVN
chassis units.
  • Loading branch information
Xav Paice committed Oct 22, 2020
1 parent e987e12 commit f323cbc
Show file tree
Hide file tree
Showing 7 changed files with 402 additions and 1 deletion.
165 changes: 165 additions & 0 deletions files/check_ovn_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""Nagios plugin for OVN status."""

import argparse
import os
import subprocess

from nagios_plugin3 import CriticalError, UnknownError, try_check


class NRPEBase:
    """NRPE check helper for OVN database and controller state.

    The instance keeps the parsed command-line arguments and offers one
    method per check.  A check method returns ``True`` when the queried
    component is healthy and raises ``CriticalError`` or ``UnknownError``
    (from ``nagios_plugin3``) otherwise.
    """

    def __init__(self, args):
        """Store the parsed arguments.

        :param args: Parsed arguments; ``args.db`` names the database
                     that ``get_db_status`` queries.
        """
        self.args = args
        self.db = args.db

    @property
    def cmds(self):
        """Build the appctl command line for every available check.

        The tool and socket directory are chosen by which run directory
        exists: ``/var/run/ovn`` selects ``ovn-appctl``, otherwise
        ``/var/run/openvswitch`` selects ``ovs-appctl``.

        :returns: dict mapping ``"nb"`` and ``"sb"`` to argv lists, plus
                  ``"controller"`` when the ovn-controller pid file
                  exists in the socket directory.
        :raises UnknownError: if neither run directory exists.
        """
        # Check for version based on socket location
        socket_paths = {"ovs": "/var/run/openvswitch", "ovn": "/var/run/ovn"}
        if os.path.exists(socket_paths["ovn"]):
            appctl_cmd = "/usr/bin/ovn-appctl"
            socket_path = socket_paths["ovn"]
        elif os.path.exists(socket_paths["ovs"]):
            appctl_cmd = "/usr/bin/ovs-appctl"
            socket_path = socket_paths["ovs"]
        else:
            raise UnknownError(
                "UNKNOWN: Path for OVN socket does not exist"
            )

        commands = {
            "nb": [
                "sudo",
                appctl_cmd,
                "-t",
                "{}/ovnnb_db.ctl".format(socket_path),
                "cluster/status",
                "OVN_Northbound",
            ],
            "sb": [
                "sudo",
                appctl_cmd,
                "-t",
                "{}/ovnsb_db.ctl".format(socket_path),
                "cluster/status",
                "OVN_Southbound",
            ],
        }

        controller_pidfile = "{}/ovn-controller.pid".format(socket_path)
        if os.path.exists(controller_pidfile):
            # the socket path contains the pid
            # TODO check what happens on Train
            with open(controller_pidfile, "r") as pidfile:
                pid = pidfile.read().rstrip()
            commands["controller"] = [
                "sudo",
                appctl_cmd,
                "-t",
                "{}/ovn-controller.{}.ctl".format(socket_path, pid),
                "connection-status",
            ]

        return commands

    def get_db_status(self):
        """Query the requested database for its cluster state.

        :returns: ``True`` when the ``Status`` field of the cluster
                  status output is ``cluster member``.
        :raises UnknownError: if no command exists for ``self.db`` or
                              the output carries no ``Status`` field.
        :raises CriticalError: if the command fails or the reported
                               status is anything else.
        """
        db_cmd = self.cmds.get(self.db)
        if db_cmd is None:
            # Report a readable message instead of leaking a KeyError.
            raise UnknownError(
                "UNKNOWN: no status command for db {}".format(self.db)
            )

        status = self._parse_status_output(self._run_command(db_cmd))
        cluster_state = status.get("Status")
        if cluster_state is None:
            raise UnknownError(
                "UNKNOWN: no cluster status reported for {} db".format(
                    self.db
                )
            )

        if cluster_state != "cluster member":
            raise CriticalError(
                "CRITICAL: cluster status for {} db is {}".format(
                    self.db, cluster_state
                )
            )
        # TODO, check for growth in key "Term"
        # TODO, review 'Entries not yet committed'

        return True

    def _run_command(self, cmd):
        """Run a command and return its output.

        :param cmd: argv list handed to ``subprocess.check_output``.
        :returns: the command's stdout decoded as UTF-8.
        :raises CriticalError: if the command exits non-zero or the
                               executable cannot be found.
        """
        try:
            output = subprocess.check_output(cmd).decode("UTF-8")
        except (subprocess.CalledProcessError, FileNotFoundError) as error:
            msg = "CRITICAL: {} failed: {}".format(" ".join(cmd), error)
            # Chain the cause so a traceback still shows the real failure.
            raise CriticalError(msg) from error

        return output

    def _parse_status_output(self, status_output):
        """Parse output from a database status query.

        Each line containing a colon is split at the first colon; the
        text before it becomes the key (kept verbatim) and the stripped
        remainder the value.  Lines without a colon are ignored.

        :param status_output: raw text of the status command.
        :returns: dict of key/value strings.
        """
        status = {}
        for line in status_output.split("\n"):
            key, colon, value = line.partition(":")
            if colon:
                status[key] = value.strip()

        return status

    def get_controller_status(self):
        """Query the status of the ovn-controller socket.

        :returns: ``True`` when the controller reports ``connected``.
        :raises UnknownError: if no controller command is available,
                              i.e. the ovn-controller pid file is missing.
        :raises CriticalError: if the command fails or reports any other
                               connection status.
        """
        controller_cmd = self.cmds.get("controller")
        if controller_cmd is None:
            # "controller" is only present when the pid file exists;
            # report that clearly instead of leaking a KeyError.
            raise UnknownError(
                "UNKNOWN: ovn-controller pid file not found"
            )

        status_output = self._run_command(controller_cmd).rstrip()

        if status_output != "connected":
            raise CriticalError(
                "CRITICAL: OVN controller status is {}".format(status_output)
            )

        return True


def collect_args(argv=None):
    """Parse the plugin's command-line arguments.

    :param argv: Optional list of argument strings.  When ``None``
                 argparse falls back to ``sys.argv[1:]``.
    :returns: Namespace with ``db`` ("nb", "sb", or ``None`` when the
              option is absent) and ``controller`` (bool).
    """
    parser = argparse.ArgumentParser(
        description="NRPE check for OVN database state"
    )
    parser.add_argument(
        "--db",
        # No default is set on purpose: an absent --db means "do not
        # check a database", so the help text must not claim otherwise.
        help="Which database to check, Northbound (nb) or Southbound (sb). "
        "No database is checked unless this option is given.",
        choices=["nb", "sb"],
        type=str,
    )
    parser.add_argument(
        "--controller",
        help="Check the ovn-controller status",
        action='store_true',
    )

    return parser.parse_args(argv)


def main():
    """Run the requested OVN checks and print an OK line when they pass.

    ``--controller`` runs the controller check and ``--db`` the database
    check; both may be given.  Each check is run through ``try_check``,
    which is relied upon to report a failed check and terminate the
    process.  When neither option is given nothing can be vouched for,
    so the plugin reports UNKNOWN instead of a misleading OK.
    """
    args = collect_args()

    if not (args.controller or args.db):
        print("UNKNOWN: no check requested, use --db and/or --controller")
        # 3 is the UNKNOWN status in the Nagios plugin return-code scheme.
        raise SystemExit(3)

    nrpe_check = NRPEBase(args)

    if args.controller:
        try_check(nrpe_check.get_controller_status)

    if args.db:
        try_check(nrpe_check.get_db_status)

    # If we got here, everything is good
    print("OK: OVN process reports it is healthy.")


# Only run the check when executed as a script, not when imported.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions files/ovn-central-ovn-sudoers
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
nagios ALL=(root) NOPASSWD: /usr/bin/ovn-appctl
1 change: 1 addition & 0 deletions files/ovn-central-ovs-sudoers
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
nagios ALL=(root) NOPASSWD: /usr/bin/ovs-appctl
43 changes: 43 additions & 0 deletions lib/charms/ovn_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import collections
import ipaddress
import os
import shutil
import subprocess

import charms.reactive as reactive
Expand All @@ -29,6 +30,14 @@


CERT_RELATION = 'certificates'
# Target directory, mode and ownership used by render_nrpe() when it
# installs the sudoers file that lets the Nagios plugin execute queries.
SUDOERS_DIR = "/etc/sudoers.d"
# NOTE(review): the value carries 0o100000 on top of the 0o440 permission
# bits (looks like the regular-file type bit) -- confirm this is intended
# for os.chmod().
SUDOERS_MODE = 0o100440
SUDOERS_UID = 0
SUDOERS_GID = 0
# Target directory, mode and ownership used by render_nrpe() when it
# installs the Nagios plugin script.
NRPE_PLUGINS_DIR = "/usr/local/lib/nagios/plugins"
NRPE_PLUGINS_MODE = 0o100755
NRPE_PLUGINS_UID = 0
NRPE_PLUGINS_GID = 0


class OVNConfigurationAdapter(
Expand Down Expand Up @@ -135,6 +144,9 @@ def __init__(self, **kwargs):
self.restart_map = {
'/etc/openvswitch/system-id.conf': [],
}
self._files_dir = os.path.join(ch_core.hookenv.charm_dir(), 'files')
self._sudoer_file = 'ovn-central-ovn-sudoers'
self._nrpe_script = 'check_ovn_status.py'

if self.options.enable_dpdk:
self.packages.extend(['openvswitch-switch-dpdk'])
Expand Down Expand Up @@ -624,8 +636,38 @@ def render_nrpe(self):
charm_nrpe = nrpe.NRPE(hostname=hostname, primary=primary)
nrpe.add_init_service_checks(
charm_nrpe, self.nrpe_check_services, current_unit)

# Install a sudoers file so the plugin can execute queries
self._install_file(os.path.join(self._files_dir, self._sudoer_file),
SUDOERS_DIR,
SUDOERS_MODE,
SUDOERS_UID,
SUDOERS_GID)
# Install Nagios plugins
self._install_file(os.path.join(self._files_dir, self._nrpe_script),
NRPE_PLUGINS_DIR,
NRPE_PLUGINS_MODE,
NRPE_PLUGINS_UID,
NRPE_PLUGINS_GID)

charm_nrpe.add_check(
'ovn_controller_state',
'OVN chassis controller status',
'check_ovn_status.py --controller',
)

charm_nrpe.write()

def _install_file(self, src, target, mode, uid, gid):
    """Copy a file into place and set its mode and ownership.

    :param src: path of the file to copy.
    :param target: destination passed to ``shutil.copy``.
    :param mode: mode handed to ``os.chmod`` for the copied file.
    :param uid: owner uid handed to ``os.chown``.
    :param gid: owner gid handed to ``os.chown``.
    """
    installed_path = shutil.copy(src, target)
    os.chmod(installed_path, mode)
    os.chown(installed_path, uid=uid, gid=gid)

    # Record where the file ended up, at debug level.
    message = "File installed at {}".format(installed_path)
    ch_core.hookenv.log(message, ch_core.hookenv.DEBUG)


class BaseTrainOVNChassisCharm(BaseOVNChassisCharm):
"""Train incarnation of the OVN Chassis base charm class."""
Expand All @@ -652,6 +694,7 @@ def __init__(self, **kwargs):
'/etc/neutron/'
'networking_ovn_metadata_agent.ini': [metadata_agent],
})
self._sudoer_file = 'ovn-central-ovs-sudoers'


class BaseUssuriOVNChassisCharm(BaseOVNChassisCharm):
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ commands = stestr run {posargs}
[testenv:pep8]
basepython = python3
deps = -r{toxinidir}/test-requirements.txt
commands = flake8 {posargs} actions lib unit_tests
commands = flake8 {posargs} actions lib unit_tests files

[testenv:cover]
# Technique based heavily upon
Expand Down
Loading

0 comments on commit f323cbc

Please sign in to comment.