Skip to content

Commit

Permalink
Retain state (#66)
Browse files Browse the repository at this point in the history
* persist state on graceful shutdown

* unit test

* dependencies already specified in pyproject.toml

* changelog entry

* bump version

* add no cover

* Update cos_alerter/alerter.py

Co-authored-by: PietroPasotti <[email protected]>

* more appropriate variable name

* bump version in changelog

---------

Co-authored-by: PietroPasotti <[email protected]>
  • Loading branch information
dstathis and PietroPasotti authored Feb 26, 2024
1 parent 26c2617 commit 927d69b
Show file tree
Hide file tree
Showing 9 changed files with 97 additions and 12 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.7.0] - 2024-02-26

- Client state is now retained on a graceful shutdown (#66).

## [0.6.0] - 2023-11-30

- Added badges to README.md (#62).
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ COS Alerter is intended to be used together with alertmanager and prometheus:
- Liveness of COS Alerter itself from a metric endpoint it exposes and prometheus scrapes
## Configuring Alertmanager

In order to integrate with COS Alerter you need to a heartbeat rule to Prometheus and Add a route to the Alertmanager config
In order to integrate with COS Alerter you need to add a heartbeat rule to Prometheus and add a route to the Alertmanager config.

If you are using the Canonical Observability Stack, the alert rule is already created for you. If not, you can use a rule similar to the following:
```yaml
Expand Down
52 changes: 51 additions & 1 deletion cos_alerter/alerter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""Main logic for COS Alerter."""

import datetime
import json
import logging
import os
import sys
Expand All @@ -15,6 +16,7 @@

import apprise
import durationpy
import xdg_base_dirs
from ruamel.yaml import YAML
from ruamel.yaml.constructor import DuplicateKeyError

Expand Down Expand Up @@ -69,6 +71,13 @@ def reload(self):
self.data["notify"]["repeat_interval"]
).total_seconds()

# Static variables. We define them here so it is easy to expose them later as config
# values if needed.
base_dir = xdg_base_dirs.xdg_state_home() / "cos_alerter"
if not base_dir.exists():
base_dir.mkdir(parents=True)
self.data["clients_file"] = base_dir / "clients.state"


def deep_update(base: dict, new: typing.Optional[dict]):
"""Deep dict update.
Expand Down Expand Up @@ -151,6 +160,42 @@ def initialize():
"notify_time": None,
}

# Recover any state that was dumped on last exit.
if config["clients_file"].exists():
with config["clients_file"].open() as f:
existing_clients = json.load(f)
config["clients_file"].unlink()
for client in existing_clients:
if client in state["clients"]:
state["clients"][client]["alert_time"] = existing_clients[client]["alert_time"]
state["clients"][client]["notify_time"] = existing_clients[client][
"notify_time"
]

# This is difficult to cover in unit tests because it acquires all of the locks and never
# releases them. Once integration tests are in place, remove the "no cover" pragma from this
# method.
@staticmethod
def dump_and_pause():  # pragma: no cover
    """Write every client's timestamps to the clients file and hold all client locks.

    Each client lock is acquired and deliberately never released, which effectively pauses
    the program while the state is written out.
    """
    logger.info("Starting safe shutdown.")
    clients = state["clients"]
    for name in clients:
        clients[name]["lock"].acquire()
    # Only the timestamps are kept, because locks are not json serializable.
    snapshot = {}
    for name in clients:
        entry = clients[name]
        snapshot[name] = {
            "alert_time": entry["alert_time"],
            "notify_time": entry["notify_time"],
        }
    with config["clients_file"].open("w") as out:
        json.dump(snapshot, out)

@staticmethod
def clients():
"""Return a list of clientids."""
Expand All @@ -170,7 +215,12 @@ def is_down(self) -> bool:
"""Determine if Alertmanager should be considered down based on the last alert."""
if self.data["alert_time"] is None:
return False
return time.monotonic() - self.data["alert_time"] > config["watch"]["down_interval"]
# We need to take the max of the alert and the start time, so that we only count time when
# cos-alerter was running.
return (
time.monotonic() - max(self.data["alert_time"], self.start_time)
> config["watch"]["down_interval"]
)

def _recently_notified(self) -> bool:
"""Determine if a notification has been previously sent within the repeat interval."""
Expand Down
8 changes: 8 additions & 0 deletions cos_alerter/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@ def sigint(_, __): # pragma: no cover
sys.exit()


def sigterm(_, __):  # pragma: no cover
    """Handle SIGTERM: dump the client state, then exit.

    Both handler arguments are ignored.
    """
    logger.info("Shutting down.")
    # Persist the client state before the process goes away.
    AlerterState.dump_and_pause()
    sys.exit()


def sigusr1(_, __): # pragma: no cover
"""Signal handler for SIGUSR1 which sends a test notification."""
logger.info("Received SIGUSR1.")
Expand Down Expand Up @@ -82,6 +89,7 @@ def main(run_for: Optional[int] = None, argv: List[str] = sys.argv):
# Observe signal handlers
try: # pragma: no cover
signal.signal(signal.SIGINT, sigint)
signal.signal(signal.SIGTERM, sigterm)
signal.signal(signal.SIGUSR1, sigusr1)
logger.debug("Signal handlers set.")
except ValueError as e:
Expand Down
9 changes: 6 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "cos-alerter"
version = "0.6.0"
version = "0.7.0"
authors = [
{ name="Dylan Stephano-Shachter", email="[email protected]" }
]
Expand All @@ -24,9 +24,10 @@ dependencies = [
"flask~=2.2",
"prometheus_flask_exporter~=0.22",
"pyyaml~=6.0",
"ruamel.yaml~=0.18.0",
"timeago~=1.0",
"waitress~=2.1",
"ruamel.yaml~=0.18.0"
"xdg-base-dirs~=6.0.1",
]

[project.urls]
Expand All @@ -45,14 +46,16 @@ line-length = 99
[tool.ruff]
line-length = 99
extend-exclude = ["__pycache__", "*.egg_info"]

[tool.ruff.lint]
select = ["E", "W", "F", "C", "N", "R", "D", "I001"]
# Ignore E501 because using black creates errors with this
# Ignore D107 Missing docstring in __init__
ignore = ["E501", "D107"]
# D100, D101, D102, D103: Ignore missing docstrings in tests
per-file-ignores = {"tests/*" = ["D100","D101","D102","D103"]}

[tool.ruff.pydocstyle]
[tool.ruff.lint.pydocstyle]
convention = "google"

[tool.pyright]
Expand Down
4 changes: 2 additions & 2 deletions rockcraft.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: cos-alerter
summary: A liveness checker for self-monitoring.
description: Receive regular pings from the cos stack and alert when they stop.
version: "0.6.0" # NOTE: Make sure this matches `cos-alerter` below
version: "0.7.0" # NOTE: Make sure this matches `cos-alerter` below
base: ubuntu:22.04
license: Apache-2.0
platforms:
Expand All @@ -11,7 +11,7 @@ parts:
plugin: python
source: .
python-packages:
- cos-alerter==0.6.0 # NOTE: Make sure this matches `version` above
- cos-alerter==0.7.0 # NOTE: Make sure this matches `version` above
stage-packages:
- python3-venv
services:
Expand Down
2 changes: 1 addition & 1 deletion snap/snapcraft.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: cos-alerter
version: '0.6.0'
version: '0.7.0'
summary: A watchdog alerting on alertmanager notification failures.
license: Apache-2.0
contact: [email protected]
Expand Down
24 changes: 24 additions & 0 deletions tests/test_alerter.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,30 @@ def test_is_down_with_wait_for_first_connection(monotonic_mock, fake_fs):
assert state.is_down() is True # 5.5 minutes since reset_alert_timeout() was called.


@freezegun.freeze_time("2023-01-01")
@unittest.mock.patch("time.monotonic")
def test_is_down_from_graceful_shutdown(monotonic_mock, fake_fs):
    """Check is_down() for a client whose state was restored from a dumped clients file."""
    with open("/etc/cos-alerter.yaml") as f:
        conf = yaml.safe_load(f)
    conf["watch"]["wait_for_first_connection"] = True
    with open("/etc/cos-alerter.yaml", "w") as f:
        yaml.dump(conf, f)
    config.reload()
    # Simulate the state file left behind by a previous graceful shutdown.
    fake_fs.create_file(config["clients_file"])
    with config["clients_file"].open("w") as f:
        f.write('{"clientid1": {"alert_time": 500, "notify_time": null}}')
    # The mocked monotonic clock is already past the dumped alert_time at initialization.
    monotonic_mock.return_value = 1000
    AlerterState.initialize()
    state = AlerterState(clientid="clientid1")
    with state:
        assert state.is_down() is False
        monotonic_mock.return_value = 2330
        assert state.is_down() is True


@freezegun.freeze_time("2023-01-01")
@unittest.mock.patch("time.monotonic")
def test_is_down(monotonic_mock, fake_fs):
Expand Down
4 changes: 0 additions & 4 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,7 @@ deps =
freezegun
pyfakefs
pytest
pyyaml
werkzeug
ruamel.yaml
commands =
coverage run --source {[vars]src_path} -m pytest -m "not slow" -v --log-cli-level=INFO {[vars]tst_path}

Expand All @@ -59,8 +57,6 @@ deps =
freezegun
pyfakefs
pytest
pyyaml
ruamel.yaml
commands =
coverage run -a --source {[vars]src_path} -m pytest -m slow -v --log-cli-level=INFO {[vars]tst_path}

Expand Down

0 comments on commit 927d69b

Please sign in to comment.