Skip to content

Commit

Permalink
Add an exponential backoff to harvester keep-alive notifications
Browse files Browse the repository at this point in the history
At first, notifications will arrive at roughly the same rate as before,
but the delay between them gradually increases over time. Keep-alive is still
tested with the same frequency, but failures that occur between notifications are
only logged, with the next notification threshold included in the log message.

Fixes martomi#317
  • Loading branch information
jinnatar committed Jan 23, 2022
1 parent b4342fe commit e6b4d63
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 10 deletions.
14 changes: 14 additions & 0 deletions src/notifier/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from dataclasses import dataclass
from typing import List
from enum import Enum
from datetime import datetime


class EventPriority(Enum):
Expand Down Expand Up @@ -51,6 +52,7 @@ class Event:
priority: EventPriority
service: EventService
message: str
iteration: int = 0


class Notifier(ABC):
Expand Down Expand Up @@ -96,3 +98,15 @@ def get_title_for_event(self, event):
def send_events_to_user(self, events: List[Event]) -> bool:
    """Send the given events to the user.

    Implementation specific to the integration. This base version does
    nothing and implicitly returns ``None``.

    NOTE(review): presumably each integration overrides this and returns
    ``True`` when delivery succeeded -- confirm against the concrete
    notifier classes.
    """
    pass

def exponential_backoff(incident_time, interval: float, iteration: int = 0, rate: float = 1.5) -> datetime:
    """Calculate the timestamp of a notification threshold.

    Given an initial incident time and a normal, reasonable notification
    interval in seconds, compute the threshold for the given iteration as
    ``incident_time + interval * rate ** iteration``. The iteration is
    expected to increase every time a notification is sent, so with a
    ``rate`` greater than 1 each threshold lies further from the start of
    the incident than the previous one.

    Args:
        incident_time: Start of the incident, either a ``datetime`` (or a
            subclass of it) or a POSIX timestamp in seconds.
        interval: Base notification interval in seconds.
        iteration: Number of notifications already sent for the incident.
        rate: Growth factor applied to the interval once per iteration.

    Returns:
        The notification threshold as a ``datetime`` built with
        ``datetime.fromtimestamp`` (naive, local time).
    """
    # isinstance rather than an exact type comparison, so that datetime
    # subclasses are converted as well instead of failing on the addition.
    if isinstance(incident_time, datetime):
        incident_time = incident_time.timestamp()
    threshold = incident_time + interval * pow(rate, iteration)
    return datetime.fromtimestamp(threshold)
46 changes: 36 additions & 10 deletions src/notifier/keep_alive_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from typing import List

# project
from . import EventService, Event, EventType, EventPriority
from . import EventService, Event, EventType, EventPriority, exponential_backoff


class KeepAliveMonitor:
Expand All @@ -29,6 +29,8 @@ def __init__(self, config: dict = None, thresholds: dict = None):

self._last_keep_alive = {EventService.HARVESTER: datetime.now()}
self._last_keep_alive_threshold_seconds = thresholds or {EventService.HARVESTER: 300}
self._keep_alive_iteration = {EventService.HARVESTER: 0}
self._keep_alive_incident_time = {EventService.HARVESTER: None}

self._ping_url = None
if config and config["enable_remote_ping"]:
Expand Down Expand Up @@ -77,18 +79,42 @@ def check_last_keep_alive(self):
logging.debug(f"Keep-alive check for {service.name}: Last activity {seconds_since_last} seconds ago.")
if seconds_since_last > self._last_keep_alive_threshold_seconds[service]:
message = (
f"Your harvester appears to be offline! "
"Your harvester appears to be offline! "
f"No events for the past {seconds_since_last} seconds."
)
logging.warning(message)
events.append(
Event(
type=EventType.USER,
priority=EventPriority.HIGH,
service=EventService.HARVESTER,
message=message,
new_incident = False
if self._keep_alive_iteration[service] == 0:
# The incident starts
new_incident = True
self._keep_alive_incident_time[service] = datetime.now()
# Get the current notify threshold timestamp based on initial event time & iteration count.
notification_threshold = exponential_backoff(
incident_time=self._keep_alive_incident_time[service],
interval=self._last_keep_alive_threshold_seconds[service],
iteration=self._keep_alive_iteration[service]
)
if new_incident or datetime.now() >= notification_threshold:
# We only increase iteration for notifications sent, not for every time the check fails.
self._keep_alive_iteration[service] += 1
events.append(
Event(
type=EventType.USER,
priority=EventPriority.HIGH,
service=EventService.HARVESTER,
message=message,
iteration=self._keep_alive_iteration,
)
)
)
else:
message += f" The next notification won't be sent before {notification_threshold}"
# Message is logged regardless of threshold
logging.warning(message)
else:
# All is fine, reset iteration
if self._keep_alive_iteration[service] > 0:
logging.info(f'incident for {service} is over')
self._keep_alive_iteration[service] = 0
self._keep_alive_incident_time[service] = None
if len(events):
if self._notify_manager:
self._notify_manager.process_events(events)
Expand Down

0 comments on commit e6b4d63

Please sign in to comment.