diff --git a/integrations/opsgenie/README.md b/integrations/opsgenie/README.md new file mode 100644 index 00000000..088c4aad --- /dev/null +++ b/integrations/opsgenie/README.md @@ -0,0 +1,216 @@ +Set up OpsGenie with an OpsGenie Edge Connector integration. +================== + + +While the OpsGenie plugin provided by Alerta can send alerts to OpsGenie it does not allow OpsGenie to update Alerta. +Fortunately, OpsGenie has an edge connector we can install and configure to use some code to do this for us. + + +Set up OpsGenie Edge Connector (oec) +------------------ + + Log in to OpsGenie + In Settings under the "Integration List" search for Edge. + Select OEC from the results and click "Add". This will generate an API key for the integration. Name it whatever you'd like. + + The currently supported actions from OEC to Alerta are mapped as follows: + + - from OEC "alert is acknowledged" -> Alerta will ack the alert + - from OEC "alert is closed" -> Alerta will close the alert + - from OEC "alert is unacknowledged" -> Alerta will unack the alert + - from OEC "a note is added" -> Alerta will add a note to the alert + - from OEC "A user executes assign ownership" -> Alerta will assign the alert + - from OEC "A user takes ownership" -> Alerta will assign the alert + + + Click "Add new action" and add whichever actions you desire from OEC. + Copy the API Key and set it aside somewhere for later. This is the API key that OEC will need to communicate with OpsGenie + Click "Save Integration" when you have added all the actions you want to be sent to Alerta from OEC + + +![Configuring OpsGenie Edge Connector for Alerta](./images/2.png) + +Set up an API user and key for Alerta. This is the key that OEC needs to auth into Alerta with. + + Set up a user and api key. This integration is currently set to use the api as a single user. In our setup we chose to use a local Alerta user 'opsgenie' and assigned an API key. 
+ +[Alerta api key docs]( https://docs.alerta.io/en/latest/webui/apikeys.html#webui-api-keys) + + Set the Alerta API key you will use aside for configuration later. + +![Configuring OpsGenie Edge Connector API key for Alerta](./images/3.png) + + +As mentioned all actions will be shown to be executed by the user you chose to add the API key to. Notes will include the user name from OpsGenie. This could be addressed in Alerta in the future if a permission was added to be able to impersonate an Alerta user and assigned to the api key. Passing another field to the API that would associate the action with an existing Alerta user. + + +Install and configure OpsGenie Edge Connector on a host in your network. Alerta has been tested with OEC version 1.1.3 +------------------ + + Some links to OpsGenie OEC documentation: + +[Installation docs for OEC provided by Atlassian](https://support.atlassian.com/opsgenie/docs/opsgenie-edge-connector-installation-packs/) + +[Basic OEC configuration information is provided by Atlassian](https://support.atlassian.com/opsgenie/docs/configure-opsgenie-edge-connector/) + + By default OEC installs into /home/opsgenie. Ensure that the following directories are created and owned by opsgenie:opsgenie + + /home/opsgenie + /home/opsgenie/oec + /home/opsgenie/oec/conf + /home/opsgenie/oec/output + /home/opsgenie/oec/scripts + + + ensure python3 is installed in the opsgenie user's PATH + +Install and edit the OEC config for Alerta +------------------ + + Edit and install the config.json into /home/opsgenie/oec/conf/config.json + + add the API key you generated in OpsGenie for apiKey, the double-quotes are necessary as this is a string. This is the key that lets OEC communicate with OpsGenie. + + "apiKey": "your_alerta_oec_api_key_goes_here", + + + add the alertaApiKey and alertaApiUrl for Alerta to the globalArgs portion of the config. This is the key that lets OEC communicate with Alerta at the alertaApiUrl. 
+ + "globalArgs": ["--alertaApiUrl", "{{ alerta_api_url_goes_here }}", + "--alertaApiKey", "{{ alerta_opsgenie_api_key_goes_here}}" ], + + + change any paths for stderr and stdout if you don't want any logging or want it somewhere else + install the newly edited config.json file to /home/opsgenie/oec/conf/config.json + +Install the script that will be run by OEC to interact with Alerta +------------------ + + install oecAlertaExecutor.py script to: /home/opsgenie/oec/scripts + + ensure that the perms are + + owner: opsgenie + group: opsgenie + mode: 0755 + + + +Remove some things that OEC installs by default +------------------ + + The following seemed to cause issues on our install. Removing them resolved our issues + + /home/opsgenie/oec/scripts/http.py + /home/opsgenie/oec/scripts/actionExecutor.py + /home/opsgenie/oec/scripts/__pycache__ + + + Restart OEC on your system. + + If Alerta is configured to send alerts to OpsGenie then OEC should get updates and be able to update alerts in Alerta from any of the OpsGenie interfaces (web/phone etc..) + +Troubleshooting +------------------ + If alerts are not firing it could be due to the alert source not being set. This requires an update to the OpsGenie plugin that hasn't been accepted yet. + Including a line in the plugin to set the source from the config or a reasonable default should address this. 
+ + ```OPSGENIE_ALERT_SOURCE = os.environ.get('OPSGENIE_ALERT_SOURCE') or app.config.get('OPSGENIE_ALERT_SOURCE', 'Alerta')``` + + and later in the plugin include that in your payload + + ``` + payload = { + "alias": alert.id, + "message": "{}: {} -> {}".format(alert.severity, alert.text, alert.value), + "entity": "{}-{}".format("-".join(alert.service), alert.environment), + "responders": teams, + "tags": tags, + "source": "{}".format(OPSGENIE_ALERT_SOURCE), + "details": details + } + ``` + + This is useful for OpsGenie Edge Connector to not update ALL Edge connector integrations if you have more than one running in your env. This will send updates to Alerta when the + source was Alerta. So if JIRA is also integrated through OEC it won't be trying to send any updates to Alerta etc. + +![Limiting which integrations can update Alerta in OpsGenie](./images/alert-filter.png) + + + + + +Here is the full config we use in prod (templatized) + + + { + "apiKey": "{{ alerta_oec_api_key }}", + "baseUrl": "https://api.opsgenie.com", + "logLevel": "WARN", + "globalArgs": ["--alertaApiUrl", "{{ alerta_api_url }}", + "--alertaApiKey", "{{ alerta_stg_opsgenie_api_key}}" ], + "globalFlags": {}, + "actionMappings": { + "Acknowledge": { + "filepath": "/home/opsgenie/oec/scripts/oecAlertaExecutor.py", + "sourceType": "local", + "env": [], + "stderr": "/var/log/opsgenie/oecAlertaExecutor-errors.log", + "stdout": "/var/log/opsgenie/oecAlertaExecutor.log" + }, + "AddNote": { + "filepath": "/home/opsgenie/oec/scripts/oecAlertaExecutor.py", + "sourceType": "local", + "env": [], + "stderr": "/var/log/opsgenie/oecAlertaExecutor-errors.log", + "stdout": "/var/log/opsgenie/oecAlertaExecutor.log" + }, + "Close": { + "filepath": "/home/opsgenie/oec/scripts/oecAlertaExecutor.py", + "sourceType": "local", + "env": [], + "stderr": "/var/log/opsgenie/oecAlertaExecutor-errors.log", + "stdout": "/var/log/opsgenie/oecAlertaExecutor.log" + }, + "AssignOwnership": { + "filepath": 
"/home/opsgenie/oec/scripts/oecAlertaExecutor.py", + "sourceType": "local", + "env": [], + "stderr": "/var/log/opsgenie/oecAlertaExecutor-errors.log", + "stdout": "/var/log/opsgenie/oecAlertaExecutor.log" + }, + "Snooze": { + "filepath": "/home/opsgenie/oec/scripts/oecAlertaExecutor.py", + "sourceType": "local", + "env": [], + "stderr": "/var/log/opsgenie/oecAlertaExecutor-errors.log", + "stdout": "/var/log/opsgenie/oecAlertaExecutor.log" + }, + "TakeOwnership": { + "filepath": "/home/opsgenie/oec/scripts/oecAlertaExecutor.py", + "sourceType": "local", + "env": [], + "stderr": "/var/log/opsgenie/oecAlertaExecutor-errors.log", + "stdout": "/var/log/opsgenie/oecAlertaExecutor.log" + }, + "UnAcknowledge": { + "filepath": "/home/opsgenie/oec/scripts/oecAlertaExecutor.py", + "sourceType": "local", + "env": [], + "stderr": "/var/log/opsgenie/oecAlertaExecutor-errors.log", + "stdout": "/var/log/opsgenie/oecAlertaExecutor.log" + } + }, + "pollerConf": { + "pollingWaitIntervalInMillis": 100, + "visibilityTimeoutInSec": 30, + "maxNumberOfMessages": 10 + }, + "poolConf": { + "maxNumberOfWorker": 12, + "minNumberOfWorker": 4, + "monitoringPeriodInMillis": 15000, + "keepAliveTimeInMillis": 6000, + "queueSize": 0 + } + } diff --git a/integrations/opsgenie/images/2.png b/integrations/opsgenie/images/2.png new file mode 100644 index 00000000..e8ef2c21 Binary files /dev/null and b/integrations/opsgenie/images/2.png differ diff --git a/integrations/opsgenie/images/3.png b/integrations/opsgenie/images/3.png new file mode 100644 index 00000000..2e74f112 Binary files /dev/null and b/integrations/opsgenie/images/3.png differ diff --git a/integrations/opsgenie/images/alert-filter.png b/integrations/opsgenie/images/alert-filter.png new file mode 100644 index 00000000..9461f98d Binary files /dev/null and b/integrations/opsgenie/images/alert-filter.png differ diff --git a/integrations/opsgenie/oecAlertaExecutor.py b/integrations/opsgenie/oecAlertaExecutor.py new file mode 100644 index 
00000000..32c6fd22
--- /dev/null
+++ b/integrations/opsgenie/oecAlertaExecutor.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+"""Relay OpsGenie Edge Connector (OEC) actions to Alerta.
+
+OEC runs this script for each queued OpsGenie action. The script checks that
+the alert still exists in OpsGenie and, when the alert's source is 'Alerta',
+replays the action against the Alerta API, addressing the alert by its alias.
+"""
+import argparse
+import json
+import logging
+import sys
+import requests
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-payload', '--queuePayload', help='Payload from queue', required=True)
+parser.add_argument('-apiKey', '--apiKey', help='The apiKey of the integration', required=True)
+parser.add_argument('-opsgenieUrl', '--opsgenieUrl', help='The url', required=True)
+parser.add_argument('-logLevel', '--logLevel', help='Log level', required=True)
+parser.add_argument('-alertaApiUrl', '--alertaApiUrl', help='The url to do alerta api operations', required=True)
+parser.add_argument('-alertaApiKey', '--alertaApiKey', help='The api key to do alerta api operations', required=True)
+args = vars(parser.parse_args())
+
+logging.basicConfig(stream=sys.stdout, level=args['logLevel'])
+LOG_PREFIX = 'oec_action'
+
+
+def do_alerta_things(alerta_api_target, alerta_headers, payload):
+    """PUT ``payload`` as JSON to an Alerta API endpoint.
+
+    Returns the HTTP status code, or None when the request itself failed
+    (the failure is logged rather than raised).
+    """
+    try:
+        r = requests.put(alerta_api_target, json=payload, headers=alerta_headers, timeout=2)
+    except requests.exceptions.RequestException as e:
+        logging.error("{} - Error updating {}. Error: {}".format(LOG_PREFIX, alerta_api_target, e))
+        # no response exists on this path; falling through would hit an unbound 'r'
+        return None
+
+    logging.info('{} - Call to {} return status code: {}'.format(LOG_PREFIX, alerta_api_target, r.status_code))
+    return r.status_code
+
+
+def get_alert_status(alerta_api_target, alerta_headers):
+    """Fetch an alert from Alerta and return its 'status' field.
+
+    Returns None when the request fails, the body is not JSON, or the
+    response carries no alert status.
+    """
+    try:
+        r = requests.get(alerta_api_target, headers=alerta_headers, timeout=2)
+    except requests.exceptions.RequestException as e:
+        logging.error("{} - Error getting {} : {}".format(LOG_PREFIX, alerta_api_target, e))
+        return None
+
+    try:
+        contents = json.loads(r.content)
+    except ValueError as e:
+        logging.error("{} - Error getting {} : {}".format(LOG_PREFIX, alerta_api_target, e))
+        return None
+
+    alert = contents.get('alert') if isinstance(contents, dict) else None
+    if not isinstance(alert, dict):
+        return None
+    return alert.get('status', None)
+
+
+def main():
+    """Translate the queued OpsGenie action into the matching Alerta API calls."""
+    # the helpers log with the module-level prefix, so rebind it instead of shadowing it
+    global LOG_PREFIX
+
+    queue_message = json.loads(args['queuePayload'])
+
+    action = queue_message["action"]
+    LOG_PREFIX = "[ {} ]".format(action)
+
+    # .get() keeps the "no alert id" warning below reachable for payloads without one
+    alert = queue_message.get("alert") or {}
+    alert_id = alert.get("alertId")
+    origin = alert.get("source")
+    username = alert.get("username")
+    logging.debug("{} - Username is: {}, Origin is: {}".format(LOG_PREFIX, username, origin))
+    alerta_url = args['alertaApiUrl']
+    alerta_headers = {'Content-type': 'application/json', 'Authorization': 'Key {}'.format(args['alertaApiKey'])}
+
+    logging.info("{} - Using Alerta URL : {}".format(LOG_PREFIX, alerta_url))
+    logging.debug("{} - Message: {}".format(LOG_PREFIX, queue_message))
+    timeout = 300  # default timeout for connections to opsgenie api
+    action_timeout = 7200  # default alerta action timeout
+
+    logging.info("{} - Will execute {} for alertId {}".format(LOG_PREFIX, action, alert_id))
+
+    action_map = {"Acknowledge": "ack",
+                  "AddNote": "note",
+                  "AssignOwnership": "assign",
+                  "TakeOwnership": "assign",
+                  "UnAcknowledge": "unack",
+                  "Close": "close",
+                  "Snooze": "shelve"}
+
+    if not alert_id:
+        logging.warning("{} - Alert id was not sent in the payload. Ignoring.".format(LOG_PREFIX))
+        return
+
+    alert_api_url = "{}/v2/alerts/{}".format(args['opsgenieUrl'], alert_id)
+    headers = {
+        "Content-Type": "application/json",
+        "Accept-Language": "application/json",
+        "Authorization": "GenieKey {}".format(args['apiKey'])
+    }
+    alert_response = requests.get(alert_api_url, headers=headers, timeout=timeout)
+    if not (alert_response.status_code < 300 and alert_response.json().get('data')):
+        logging.warning("{} - Alert with id [ {} ] does not exist in Opsgenie. It is probably deleted.".format(LOG_PREFIX, alert_id))
+        return
+
+    # only replay actions we can map, and only for alerts that originated in Alerta
+    if action not in action_map or origin != 'Alerta':
+        return
+
+    alias = alert["alias"]
+    logging.info("{} - {} {} from {}".format(LOG_PREFIX, action, alias, username))
+    alerta_action = action_map[action]
+
+    # set default target and payload
+    alerta_api_target = "{}/{}/action".format(alerta_url, alias)
+    payload = {"action": alerta_action, "text": "{}d by {}.".format(action, username), "timeout": action_timeout}
+
+    # payload will change according to action and then fall through to the
+    # default api call unless the alerta_api_target is set to None on its way down
+    if action == 'Snooze':
+        snooze_end = alert["snoozeEndDate"]
+        payload["text"] = "Shelved until: {} by {}".format(snooze_end, username)
+
+    elif action == 'AddNote':
+        # payload and target for notes is different than actions
+        alerta_api_target = "{}/{}/note".format(alerta_url, alias)
+
+        # since we have one api key assigned to a default 'opsgenie' user
+        # include the username with the note so we know who wrote it
+        payload = {"note": "{} Added by {}".format(alert["note"], username)}
+
+    elif action == 'AssignOwnership':
+        owner = alert["owner"]
+        payload["text"] = "Assigned to {} by {}".format(owner, username)
+
+    elif action == 'TakeOwnership':
+        payload["text"] = "{} took ownership".format(username)
+
+    elif action == 'Acknowledge':
+        # opsgenie does not send an action when an alert comes out of snooze, so
+        # check the alert in Alerta and unshelve it first if it is still 'shelved'
+        alert_url = "{}/{}".format(alerta_url, alias)
+        status = get_alert_status(alert_url, alerta_headers)
+        if status == 'shelved':
+            # unshelve via the default action target
+            unshelve_payload = {"action": "unshelve", "text": "Unshelved by {}.".format(username), "timeout": action_timeout}
+            do_alerta_things(alerta_api_target, alerta_headers, unshelve_payload)
+            # update the api target to None unshelving will put it back to Ack
+            alerta_api_target = None
+
+        # update the acked-by attribute
+        ack_by_payload = {"attributes": {"acked-by": username}}
+        ack_by_target = "{}/{}/attributes".format(alerta_url, alias)
+        do_alerta_things(ack_by_target, alerta_headers, ack_by_payload)
+
+    if alerta_api_target:
+        # as long as none of the above set the
+        # alerta_api_target to None we should do the original action
+        do_alerta_things(alerta_api_target, alerta_headers, payload)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/plugins/opsgenie/alerta_opsgenie.py b/plugins/opsgenie/alerta_opsgenie.py index fe572b81..0b18eaab 100644 --- a/plugins/opsgenie/alerta_opsgenie.py +++ b/plugins/opsgenie/alerta_opsgenie.py @@ -22,6 +22,9 @@ DASHBOARD_URL = os.environ.get('DASHBOARD_URL') or app.config.get('DASHBOARD_URL', '') LOG.info('Initialized: %s key, %s matchers' % (OPSGENIE_SERVICE_KEY, SERVICE_KEY_MATCHERS)) +# when using with OpsGenie Edge connector setting a known source is useful +OPSGENIE_ALERT_SOURCE = os.environ.get('OPSGENIE_ALERT_SOURCE') or app.config.get('OPSGENIE_ALERT_SOURCE', 'Alerta') + class TriggerEvent(PluginBase): def opsgenie_service_key(self, resource): @@ -105,6 +108,7 @@ def post_receive(self, alert): "entity": alert.environment, "responders" : self.get_opsgenie_teams(), "tags": [alert.environment, alert.resource, alert.service[0], alert.event], + "source": "{}".format(OPSGENIE_ALERT_SOURCE), "details": details } diff --git a/plugins/prometheus/README.md b/plugins/prometheus/README.md index 16684c34..42c72c04 100644 --- a/plugins/prometheus/README.md +++ b/plugins/prometheus/README.md @@ -47,6 +47,18 @@ ALERTMANAGER_API_URL = 'http://localhost:9093' ALERTMANAGER_SILENCE_FROM_ACK = True ``` + +Prometheus docs specify that prometheus should send all alerts to all alertmanagers. If you have configured your +ALERTMANAGER_API_URL to be a load balanced endpoint that mirrors requests to a set of alertmanagers then the following setting +will create/remove silences if alertmanager has set the externalUrl, the following will configure alerta to use that for silences + instead of the Alertmanager API URL. + +Alertmanager syncs silences across all alertmanagers so only sendng it to one AM is appropriate. Using a load-balanced API that mirrors +requests will create one unique silenceId per alertmanager instance and sync them across all alertmanagers, which is not necessary. 
+```python +ALERTMANAGER_USE_EXTERNALURL_FOR_SILENCES = True +``` + **Robust Perception Demo Example** ```python @@ -55,6 +67,7 @@ ALERTMANAGER_API_URL = 'http://demo.robustperception.io:9093' # default=http:// ALERTMANAGER_SILENCE_DAYS = 2 # default=1 ``` + Authentication -------------- diff --git a/plugins/prometheus/alerta_prometheus.py b/plugins/prometheus/alerta_prometheus.py index c468b0ef..77703daf 100644 --- a/plugins/prometheus/alerta_prometheus.py +++ b/plugins/prometheus/alerta_prometheus.py @@ -1,8 +1,8 @@ - import datetime import logging import os import requests +import json from typing import Any try: @@ -22,6 +22,8 @@ ALERTMANAGER_PASSWORD = os.environ.get('ALERTMANAGER_PASSWORD') or app.config.get('ALERTMANAGER_PASSWORD', None) ALERTMANAGER_SILENCE_DAYS = os.environ.get('ALERTMANAGER_SILENCE_DAYS') or app.config.get('ALERTMANAGER_SILENCE_DAYS', 1) ALERTMANAGER_SILENCE_FROM_ACK = os.environ.get('ALERTMANAGER_SILENCE_FROM_ACK') or app.config.get('ALERTMANAGER_SILENCE_FROM_ACK', False) +ALERTMANAGER_USE_EXTERNALURL_FOR_SILENCES = os.environ.get('ALERTMANAGER_USE_EXTERNALURL_FOR_SILENCES') or app.config.get('ALERTMANAGER_USE_EXTERNALURL_FOR_SILENCES',False) + class AlertmanagerSilence(PluginBase): @@ -38,6 +40,30 @@ def post_receive(self, alert): return def status_change(self, alert, status, text): + ''' + If a silence exists for an open or closed alert we probably want to remove it + ''' + if status in ('open', 'closed'): + + silenceId = alert.attributes.get('silenceId', None) + if silenceId: + LOG.debug('Alertmanager: Remove silence for alertname=%s instance=%s', alert.event, alert.resource) + base_url = ALERTMANAGER_API_URL or alert.attributes.get('externalUrl', DEFAULT_ALERTMANAGER_API_URL) + url = base_url + '/api/v1/silence/%s' % silenceId + try: + r = requests.delete(url, auth=self.auth, timeout=2) + except Exception as e: + raise RuntimeError("Alertmanager: ERROR - %s" % e) + LOG.debug('Alertmanager: %s - %s', r.status_code, r.text) + + try: + 
alert.attributes['silenceId'] = None + except Exception as e: + raise RuntimeError("Alertmanager: ERROR - %s" % e) + LOG.debug('Alertmanager: Removed silenceId %s from attributes', silenceId) + if status == 'closed': + LOG.warning("Status is now closed") + return alert def take_action(self, alert: Alert, action: str, text: str, **kwargs) -> Any: @@ -48,9 +74,26 @@ def take_action(self, alert: Alert, action: str, text: str, **kwargs) -> Any: if alert.event_type != 'prometheusAlert': return alert - if action == 'ack': - if ALERTMANAGER_SILENCE_FROM_ACK: + base_url = ALERTMANAGER_API_URL or alert.attributes.get('externalUrl', DEFAULT_ALERTMANAGER_API_URL) + if action == 'close': + LOG.warning("Got a close action so trying to close this in alertmanager too") + url = base_url + '/api/v1/alerts' + raw_data_string = alert.raw_data + raw_data = json.loads(raw_data_string) + # set the endsAt to now so alertmanager will consider it expired or whatever + raw_data["endsAt"] = (datetime.datetime.utcnow() - datetime.timedelta(minutes=5)).replace(microsecond=0).isoformat() + ".000Z" + LOG.debug("Raw data type: {}, Raw data contents: {}".format(type(raw_data),raw_data)) + data = [ raw_data ] + try: + r = requests.post(url, json=data, auth=self.auth, timeout=2) + except Exception as e: + raise RuntimeError("Alertmanager: ERROR - %s" % e) + LOG.debug('Alertmanager response was: %s - %s', r.status_code, r.text) + + elif action == 'ack' and ALERTMANAGER_SILENCE_FROM_ACK: + + if not ALERTMANAGER_SILENCE_DAYS: silence_seconds = kwargs.get('timeout', alert.timeout) else: try: @@ -82,11 +125,15 @@ def take_action(self, alert: Alert, action: str, text: str, **kwargs) -> Any: "comment": text if text != '' else "silenced by alerta" } - base_url = ALERTMANAGER_API_URL or alert.attributes.get('externalUrl', DEFAULT_ALERTMANAGER_API_URL) + # if alertmanager is clustered behind a load balancer that mirrors requests we should prefer to create one silence + # rather than many + if 
ALERTMANAGER_USE_EXTERNALURL_FOR_SILENCES: + base_url = alert.attributes.get('externalUrl', DEFAULT_ALERTMANAGER_API_URL) or ALERTMANAGER_API_URL + else: + base_url = ALERTMANAGER_API_URL or alert.attributes.get('externalUrl', DEFAULT_ALERTMANAGER_API_URL) + url = base_url + '/api/v1/silences' - LOG.debug('Alertmanager: URL=%s', url) - LOG.debug('Alertmanager: data=%s', data) try: r = requests.post(url, json=data, auth=self.auth, timeout=2) @@ -98,10 +145,10 @@ def take_action(self, alert: Alert, action: str, text: str, **kwargs) -> Any: try: data = r.json().get('data', []) if data: - silenceId = data['silenceId'] - alert.attributes['silenceId'] = silenceId + silenceId = data['silenceId'] + alert.attributes['silenceId'] = silenceId else: - silenceId = alert.attributes.get('silenceId', "unknown") + silenceId = alert.attributes.get('silenceId', "unknown") text = text + ' (silenced in Alertmanager)' except Exception as e: raise RuntimeError("Alertmanager: ERROR - %s" % e)