diff --git a/.gitignore b/.gitignore index b6308c9..8a475b0 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ venv *.egg-info *.snap **/__pycache__ +*.rock diff --git a/CHANGELOG.md b/CHANGELOG.md index 288e880..ad0b024 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.9.0] - 2024-05-30 + +- Added PagerDuty native support (#76). + + ## [0.8.0] - 2024-03-07 - Fixes container silently running by exiting with non-zero status when configuration file is missing. (#70). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 465d981..331faf0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -28,6 +28,7 @@ cp config-defaults.yaml cos-alerter.yaml docker run -p 8080:8080 --rm --mount type=bind,source="$(pwd)"/cos-alerter.yaml,target=/etc/cos-alerter.yaml,readonly -it cos-alerter:0.2.0 ``` + ## Run Tests * `pip install tox` diff --git a/cos_alerter/alerter.py b/cos_alerter/alerter.py index 661a97e..b380be5 100644 --- a/cos_alerter/alerter.py +++ b/cos_alerter/alerter.py @@ -13,10 +13,12 @@ import time import typing from pathlib import Path +from typing import Dict, List, Optional import apprise import durationpy import xdg_base_dirs +from pdpyras import EventsAPISession from ruamel.yaml import YAML from ruamel.yaml.constructor import DuplicateKeyError @@ -211,6 +213,9 @@ def clients(): def reset_alert_timeout(self): """Set the "last alert time" to right now.""" + # In case an instance was down, resolve the PagerDuty incident before resetting the last alert time + if self.is_down(): + self.resolve_existing_alerts() logger.debug("Resetting alert timeout for %s.", self.clientid) self.data["alert_time"] = time.monotonic() @@ -274,10 +279,26 @@ def notify(self): # Sending notifications can be a long operation so handle that in a separate thread. # This avoids interfering with the execution of the main loop. notify_thread = threading.Thread( - target=send_notifications, kwargs={"title": title, "body": body} + target=send_all_notifications, + kwargs={ + "title": title, + "body": body, + "destinations": split_destinations(config["notify"]["destinations"]), + "incident_type": "trigger", + "dedup_key": f"{self.clientid}-{self.last_alert_datetime()}", + }, ) notify_thread.start() + def resolve_existing_alerts(self): + """Resolves the current alerts.""" + categorized_destinations = split_destinations(config["notify"]["destinations"]) + handle_pagerduty_incidents( + incident_type="resolve", + dedup_key=f"{self.clientid}-{self.last_alert_datetime()}", + destinations=categorized_destinations["pagerduty"], + ) + def now_datetime(): """Return the current datetime using the monotonic clock.""" @@ -290,20 +311,75 @@ def up_time(): return time.monotonic() - state["start_time"] -def send_notifications(title: str, body: str): +def split_destinations(destinations: List[str]) -> Dict[str, List[str]]: + """Split destinations into categorized lists.""" + categorized_destinations = {"standard": [], "pagerduty": []} + + for source in destinations: + if source.startswith("pagerduty"): + categorized_destinations["pagerduty"].append(source) + else: + categorized_destinations["standard"].append(source) + + return categorized_destinations + + +def send_all_notifications( + title: str, body: str, destinations: Dict[str, List[str]], incident_type: str, dedup_key: str +): """Send a notification to all receivers.""" + send_standard_notifications(title=title, body=body, destinations=destinations["standard"]) + handle_pagerduty_incidents( + incident_type=incident_type, + dedup_key=dedup_key, + destinations=destinations["pagerduty"], + incident_summary=body, + ) + + +def send_standard_notifications(title: str, body: str, destinations: list): + """Send a notification to all standard receivers.""" # TODO: Since this is run in its own thread, we have to make sure we properly # log failures here. + + # Send notifications to non-PagerDuty destinations sender = apprise.Apprise() - for source in config["notify"]["destinations"]: + for source in destinations: sender.add(source) sender.notify(title=title, body=body) +def handle_pagerduty_incidents( + incident_type: str, + dedup_key: str, + destinations: list, + incident_summary: Optional[str] = None, +): + """Handles PagerDuty incidents by triggering or resolving incidents based on the specified incident type. + + Args: + incident_type (str): The type of incident action to perform. Should be either 'trigger' or 'resolve'. + dedup_key (str): The deduplication key to uniquely identify the incident. + destinations (list): List of destinations to handle PagerDuty incidents for. + incident_summary (str, optional): A summary of the incident, used only when triggering an incident. Defaults to None. + """ + for source in destinations: + integration_key = source.split("//")[1].split("@")[0] + session = EventsAPISession(integration_key) + + if incident_type == "trigger": + session.trigger(source="cos-alerter", summary=incident_summary, dedup_key=dedup_key) + elif incident_type == "resolve": + session.resolve(dedup_key) + + def send_test_notification(): """Signal handler which sends a test email to all configured receivers.""" logger.info("Sending test notifications.") - send_notifications( + send_all_notifications( title="COS-Alerter test email.", body="This is a test email automatically generated by COS-alerter.", + destinations=split_destinations(config["notify"]["destinations"]), + incident_type="trigger", + dedup_key="test-dedup-key", ) diff --git a/pyproject.toml b/pyproject.toml index 996831d..105afcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "cos-alerter" -version = "0.8.0" +version = "0.9.0" authors = [ { name="Dylan Stephano-Shachter", email="dylan.stephano-shachter@canonical.com" } ] @@ -28,6 +28,7 @@ dependencies = [ "timeago~=1.0", "waitress~=2.1", "xdg-base-dirs~=6.0.1", + "pdpyras~=5.2.0" ] [project.urls] diff --git a/rockcraft.yaml b/rockcraft.yaml index 274f491..9c036a9 100644 --- a/rockcraft.yaml +++ b/rockcraft.yaml @@ -1,7 +1,7 @@ name: cos-alerter summary: A liveness checker for self-monitoring. description: Receive regular pings from the cos stack and alert when they stop. -version: "0.8.0" +version: "0.9.0" base: ubuntu@22.04 license: Apache-2.0 platforms: diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml index b8570a6..a7b34ad 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -1,5 +1,5 @@ name: cos-alerter -version: '0.8.0' +version: '0.9.0' summary: A watchdog alerting on alertmanager notification failures. license: Apache-2.0 contact: simon.aronsson@canonical.com diff --git a/tests/helpers.py b/tests/helpers.py index 8ae1d33..7584e9b 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -4,6 +4,7 @@ DESTINATIONS = [ "mailtos://user:pass@domain/?to=example-0@example.com,example-1@example.com", "slack://xoxb-1234-1234-4ddbc191d40ee098cbaae6f3523ada2d/#general", + "pagerduty://integration-key@api-key", ] CONFIG = { diff --git a/tests/test_alerter.py b/tests/test_alerter.py index 8cb29e7..0e4e236 100644 --- a/tests/test_alerter.py +++ b/tests/test_alerter.py @@ -10,13 +10,24 @@ import freezegun import yaml from helpers import DESTINATIONS +from pdpyras import EventsAPISession -from cos_alerter.alerter import AlerterState, config, send_test_notification, up_time +from cos_alerter.alerter import ( + AlerterState, + config, + send_test_notification, + split_destinations, + up_time, +) -def assert_notifications(notify_mock, add_mock, title, body): - add_mock.assert_has_calls([unittest.mock.call(x) for x in DESTINATIONS]) +def assert_notifications(notify_mock, add_mock, pd_mock, title, body, dedup_key): + categorized_destinations = split_destinations(DESTINATIONS) + add_mock.assert_has_calls( + [unittest.mock.call(x) for x in categorized_destinations["standard"]] + ) notify_mock.assert_called_with(title=title, body=body) + pd_mock.assert_called_with(source="cos-alerter", summary=body, dedup_key=dedup_key) def test_config_gets_item(fake_fs): @@ -142,7 +153,8 @@ def test_is_down_from_initialize(monotonic_mock, fake_fs): @freezegun.freeze_time("2023-01-01") @unittest.mock.patch("time.monotonic") -def test_is_down_with_reset_alert_timeout(monotonic_mock, fake_fs): +@unittest.mock.patch.object(EventsAPISession, "resolve") +def test_is_down_with_reset_alert_timeout(pd_mock, monotonic_mock, fake_fs): monotonic_mock.return_value = 1000 AlerterState.initialize() state = AlerterState(clientid="clientid1") @@ -153,6 +165,7 @@ def test_is_down_with_reset_alert_timeout(monotonic_mock, fake_fs): assert state.is_down() is False monotonic_mock.return_value = 2330 # Five and a half minutes have passed assert state.is_down() is True + pd_mock.assert_called_with(f"{state.clientid}-None") @freezegun.freeze_time("2023-01-01") @@ -201,21 +214,6 @@ def test_is_down_from_graceful_shutdown(monotonic_mock, fake_fs): assert state.is_down() is True -@freezegun.freeze_time("2023-01-01") -@unittest.mock.patch("time.monotonic") -def test_is_down(monotonic_mock, fake_fs): - monotonic_mock.return_value = 1000 - AlerterState.initialize() - state = AlerterState(clientid="clientid1") - with state: - monotonic_mock.return_value = 2000 - state.reset_alert_timeout() - monotonic_mock.return_value = 2180 # Three minutes have passed - assert state.is_down() is False - monotonic_mock.return_value = 2330 # Five and a half minutes have passed - assert state.is_down() is True - - @freezegun.freeze_time("2023-01-01") @unittest.mock.patch("time.monotonic") def test_recently_notified(monotonic_mock, fake_fs): @@ -234,19 +232,22 @@ def test_recently_notified(monotonic_mock, fake_fs): @unittest.mock.patch("time.monotonic") @unittest.mock.patch.object(apprise.Apprise, "add") @unittest.mock.patch.object(apprise.Apprise, "notify") -def test_notify(notify_mock, add_mock, monotonic_mock, fake_fs): +@unittest.mock.patch.object(EventsAPISession, "trigger") +def test_notify(pd_mock, notify_mock, add_mock, monotonic_mock, fake_fs): monotonic_mock.return_value = 1000 AlerterState.initialize() state = AlerterState(clientid="clientid1") - + dedup_key = f"{state.clientid}-{state.last_alert_datetime()}" with state: state.notify() for thread in threading.enumerate(): if thread != threading.current_thread(): thread.join() + assert_notifications( - notify_mock, - add_mock, + notify_mock=notify_mock, + add_mock=add_mock, + pd_mock=pd_mock, title="**Alertmanager is Down!**", body=textwrap.dedent( """ @@ -254,10 +255,12 @@ def test_notify(notify_mock, add_mock, monotonic_mock, fake_fs): It has not alerted COS-Alerter ever. """ ), + dedup_key=dedup_key, ) # Make sure if we try again, nothing is sent notify_mock.reset_mock() + pd_mock.reset_mock() with state: state.notify() @@ -265,15 +268,19 @@ def test_notify(notify_mock, add_mock, monotonic_mock, fake_fs): if thread != threading.current_thread(): thread.join() notify_mock.assert_not_called() + pd_mock.assert_not_called() @unittest.mock.patch.object(apprise.Apprise, "add") @unittest.mock.patch.object(apprise.Apprise, "notify") -def test_send_test_notification(notify_mock, add_mock, fake_fs): +@unittest.mock.patch.object(EventsAPISession, "trigger") +def test_send_test_notification(pd_mock, notify_mock, add_mock, fake_fs): send_test_notification() assert_notifications( - notify_mock, - add_mock, + notify_mock=notify_mock, + add_mock=add_mock, + pd_mock=pd_mock, title="COS-Alerter test email.", body="This is a test email automatically generated by COS-alerter.", + dedup_key="test-dedup-key", )