From 67f1c1472d9a341f93cf2fb3fc85f98bf7f402f5 Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Fri, 19 Jul 2024 18:05:07 -0400 Subject: [PATCH 1/5] Compress rules, jobs for cos-proxy --- .../prometheus_k8s/v0/prometheus_scrape.py | 62 ++++++++++++++----- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index e3d35c6f..74ee5517 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -321,12 +321,14 @@ def _on_scrape_targets_changed(self, event): of Metrics provider charms hold eponymous information. """ # noqa: W505 - +import base64 +import binascii import copy import hashlib import ipaddress import json import logging +import lzma import os import platform import re @@ -400,6 +402,22 @@ def _on_scrape_targets_changed(self, event): DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" +def _encode_content(content: Union[str, bytes]) -> str: + if isinstance(content, str): + content = bytes(content, "utf-8") + + return base64.b64encode(lzma.compress(content)).decode("utf-8") + + +def _decode_content(encoded_content: str) -> str: + try: + # Assuming content is encoded, try decoding it. + return lzma.decompress(base64.b64decode(encoded_content.encode("utf-8"))).decode() + except (binascii.Error, lzma.LZMAError): + # Failed to base64-decode or decompress, so probably not encoded. Return as is. + return encoded_content + + class PrometheusConfig: """A namespace for utility functions for manipulating the prometheus config dict.""" @@ -1004,7 +1022,9 @@ def alerts(self) -> dict: if not relation.units or not relation.app: continue - alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}")) + alert_rules = json.loads( + _decode_content(relation.data[relation.app].get("alert_rules", "{}")) + ) if not alert_rules: continue @@ -1158,7 +1178,9 @@ def _static_scrape_config(self, relation) -> list: if not relation.units: return [] - scrape_configs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]")) + scrape_configs = json.loads( + _decode_content(relation.data[relation.app].get("scrape_jobs", "[]")) + ) if not scrape_configs: return [] @@ -1857,8 +1879,10 @@ def _set_prometheus_data(self, event): group = {"name": self.group_name(appname), "rules": rules} groups.append(group) - event.relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - event.relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) + event.relation.data[self._charm.app]["scrape_jobs"] = _encode_content(json.dumps(jobs)) + event.relation.data[self._charm.app]["alert_rules"] = _encode_content( + json.dumps({"groups": groups}) + ) def _on_prometheus_targets_changed(self, event): """Update scrape jobs in response to scrape target changes. @@ -1894,11 +1918,13 @@ def set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: updated_job = self._static_scrape_job(targets, app_name, **kwargs) for relation in self.model.relations[self._prometheus_relation]: - jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) + jobs = json.loads( + _decode_content(relation.data[self._charm.app].get("scrape_jobs", "[]")) + ) # list of scrape jobs that have not changed jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]] jobs.append(updated_job) - relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) + relation.data[self._charm.app]["scrape_jobs"] = _encode_content(json.dumps(jobs)) if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore self._stored.jobs = jobs @@ -1927,7 +1953,9 @@ def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""): return for relation in self.model.relations[self._prometheus_relation]: - jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) + jobs = json.loads( + _decode_content(relation.data[self._charm.app].get("scrape_jobs", "[]")) + ) if not jobs: continue @@ -1950,7 +1978,7 @@ def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""): changed_job["static_configs"] = configs_kept # type: ignore jobs.append(changed_job) - relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) + relation.data[self._charm.app]["scrape_jobs"] = _encode_content(json.dumps(jobs)) if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore self._stored.jobs = jobs @@ -2119,7 +2147,9 @@ def set_alert_rule_data(self, name: str, unit_rules: dict, label_rules: bool = T updated_group = {"name": self.group_name(name), "rules": rules} for relation in self.model.relations[self._prometheus_relation]: - alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) + alert_rules = json.loads( + _decode_content(relation.data[self._charm.app].get("alert_rules", "{}")) + ) groups = alert_rules.get("groups", []) # list of alert rule groups that have not changed for group in groups: @@ -2129,7 +2159,9 @@ def set_alert_rule_data(self, name: str, unit_rules: dict, label_rules: bool = T if updated_group["name"] not in [g["name"] for g in groups]: groups.append(updated_group) - relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) + relation.data[self._charm.app]["alert_rules"] = _encode_content( + json.dumps({"groups": groups}) + ) if not _type_convert_stored(self._stored.alert_rules) == groups: # pyright: ignore self._stored.alert_rules = groups @@ -2150,7 +2182,9 @@ def remove_alert_rules(self, group_name: str, unit_name: str) -> None: return for relation in self.model.relations[self._prometheus_relation]: - alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) + alert_rules = json.loads( + _decode_content(relation.data[self._charm.app].get("alert_rules", "{}")) + ) if not alert_rules: continue @@ -2177,7 +2211,7 @@ def remove_alert_rules(self, group_name: str, unit_name: str) -> None: changed_group["rules"] = rules_kept # type: ignore groups.append(changed_group) - relation.data[self._charm.app]["alert_rules"] = ( + relation.data[self._charm.app]["alert_rules"] = _encode_content( json.dumps({"groups": groups}) if groups else "{}" ) @@ -2204,7 +2238,7 @@ def _get_alert_rules(self, relation) -> dict: """ rules = {} for unit in relation.units: - unit_rules = yaml.safe_load(relation.data[unit].get("groups", "")) + unit_rules = yaml.safe_load(_decode_content(relation.data[unit].get("groups", ""))) if unit_rules: rules.update({unit.name: unit_rules}) From e89e7d2d8c08a1bf436cbe18b1bfaf0ee6a6ca1d Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Fri, 19 Jul 2024 18:11:45 -0400 Subject: [PATCH 2/5] Bump LIBAPI --- .../prometheus_k8s/v0/prometheus_scrape.py | 62 +- .../prometheus_k8s/v1/prometheus_scrape.py | 2412 +++++++++++++++++ src/charm.py | 2 +- ...ator.py => test_endpoint_aggregator_v0.py} | 0 tests/unit/test_endpoint_aggregator_v1.py | 752 +++++ ...nsumer.py => test_endpoint_consumer_v0.py} | 0 tests/unit/test_endpoint_consumer_v1.py | 743 +++++ 7 files changed, 3922 insertions(+), 49 deletions(-) create mode 100644 lib/charms/prometheus_k8s/v1/prometheus_scrape.py rename tests/unit/{test_endpoint_aggregator.py => test_endpoint_aggregator_v0.py} (100%) create mode 100644 tests/unit/test_endpoint_aggregator_v1.py rename tests/unit/{test_endpoint_consumer.py => test_endpoint_consumer_v0.py} (100%) create mode 100644 tests/unit/test_endpoint_consumer_v1.py diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index 74ee5517..e3d35c6f 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -321,14 +321,12 @@ def _on_scrape_targets_changed(self, event): of Metrics provider charms hold eponymous information. """ # noqa: W505 -import base64 -import binascii + import copy import hashlib import ipaddress import json import logging -import lzma import os import platform import re @@ -402,22 +400,6 @@ def _on_scrape_targets_changed(self, event): DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" -def _encode_content(content: Union[str, bytes]) -> str: - if isinstance(content, str): - content = bytes(content, "utf-8") - - return base64.b64encode(lzma.compress(content)).decode("utf-8") - - -def _decode_content(encoded_content: str) -> str: - try: - # Assuming content is encoded, try decoding it. - return lzma.decompress(base64.b64decode(encoded_content.encode("utf-8"))).decode() - except (binascii.Error, lzma.LZMAError): - # Failed to base64-decode or decompress, so probably not encoded. Return as is. - return encoded_content - - class PrometheusConfig: """A namespace for utility functions for manipulating the prometheus config dict.""" @@ -1022,9 +1004,7 @@ def alerts(self) -> dict: if not relation.units or not relation.app: continue - alert_rules = json.loads( - _decode_content(relation.data[relation.app].get("alert_rules", "{}")) - ) + alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}")) if not alert_rules: continue @@ -1178,9 +1158,7 @@ def _static_scrape_config(self, relation) -> list: if not relation.units: return [] - scrape_configs = json.loads( - _decode_content(relation.data[relation.app].get("scrape_jobs", "[]")) - ) + scrape_configs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]")) if not scrape_configs: return [] @@ -1879,10 +1857,8 @@ def _set_prometheus_data(self, event): group = {"name": self.group_name(appname), "rules": rules} groups.append(group) - event.relation.data[self._charm.app]["scrape_jobs"] = _encode_content(json.dumps(jobs)) - event.relation.data[self._charm.app]["alert_rules"] = _encode_content( - json.dumps({"groups": groups}) - ) + event.relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) + event.relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) def _on_prometheus_targets_changed(self, event): """Update scrape jobs in response to scrape target changes. @@ -1918,13 +1894,11 @@ def set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: updated_job = self._static_scrape_job(targets, app_name, **kwargs) for relation in self.model.relations[self._prometheus_relation]: - jobs = json.loads( - _decode_content(relation.data[self._charm.app].get("scrape_jobs", "[]")) - ) + jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) # list of scrape jobs that have not changed jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]] jobs.append(updated_job) - relation.data[self._charm.app]["scrape_jobs"] = _encode_content(json.dumps(jobs)) + relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore self._stored.jobs = jobs @@ -1953,9 +1927,7 @@ def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""): return for relation in self.model.relations[self._prometheus_relation]: - jobs = json.loads( - _decode_content(relation.data[self._charm.app].get("scrape_jobs", "[]")) - ) + jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) if not jobs: continue @@ -1978,7 +1950,7 @@ def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""): changed_job["static_configs"] = configs_kept # type: ignore jobs.append(changed_job) - relation.data[self._charm.app]["scrape_jobs"] = _encode_content(json.dumps(jobs)) + relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore self._stored.jobs = jobs @@ -2147,9 +2119,7 @@ def set_alert_rule_data(self, name: str, unit_rules: dict, label_rules: bool = T updated_group = {"name": self.group_name(name), "rules": rules} for relation in self.model.relations[self._prometheus_relation]: - alert_rules = json.loads( - _decode_content(relation.data[self._charm.app].get("alert_rules", "{}")) - ) + alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) groups = alert_rules.get("groups", []) # list of alert rule groups that have not changed for group in groups: @@ -2159,9 +2129,7 @@ def set_alert_rule_data(self, name: str, unit_rules: dict, label_rules: bool = T if updated_group["name"] not in [g["name"] for g in groups]: groups.append(updated_group) - relation.data[self._charm.app]["alert_rules"] = _encode_content( - json.dumps({"groups": groups}) - ) + relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) if not _type_convert_stored(self._stored.alert_rules) == groups: # pyright: ignore self._stored.alert_rules = groups @@ -2182,9 +2150,7 @@ def remove_alert_rules(self, group_name: str, unit_name: str) -> None: return for relation in self.model.relations[self._prometheus_relation]: - alert_rules = json.loads( - _decode_content(relation.data[self._charm.app].get("alert_rules", "{}")) - ) + alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) if not alert_rules: continue @@ -2211,7 +2177,7 @@ def remove_alert_rules(self, group_name: str, unit_name: str) -> None: changed_group["rules"] = rules_kept # type: ignore groups.append(changed_group) - relation.data[self._charm.app]["alert_rules"] = _encode_content( + relation.data[self._charm.app]["alert_rules"] = ( json.dumps({"groups": groups}) if groups else "{}" ) @@ -2238,7 +2204,7 @@ def _get_alert_rules(self, relation) -> dict: """ rules = {} for unit in relation.units: - unit_rules = yaml.safe_load(_decode_content(relation.data[unit].get("groups", ""))) + unit_rules = yaml.safe_load(relation.data[unit].get("groups", "")) if unit_rules: rules.update({unit.name: unit_rules}) diff --git a/lib/charms/prometheus_k8s/v1/prometheus_scrape.py b/lib/charms/prometheus_k8s/v1/prometheus_scrape.py new file mode 100644 index 00000000..f72e6284 --- /dev/null +++ b/lib/charms/prometheus_k8s/v1/prometheus_scrape.py @@ -0,0 +1,2412 @@ +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. +"""Prometheus Scrape Library. + +## Overview + +This document explains how to integrate with the Prometheus charm +for the purpose of providing a metrics endpoint to Prometheus. It +also explains how alternative implementations of the Prometheus charms +may maintain the same interface and be backward compatible with all +currently integrated charms. Finally this document is the +authoritative reference on the structure of relation data that is +shared between Prometheus charms and any other charm that intends to +provide a scrape target for Prometheus. + +## Source code + +Source code can be found on GitHub at: + https://github.com/canonical/prometheus-k8s-operator/tree/main/lib/charms/prometheus_k8s + +## Provider Library Usage + +This Prometheus charm interacts with its scrape targets using its +charm library. Charms seeking to expose metric endpoints for the +Prometheus charm, must do so using the `MetricsEndpointProvider` +object from this charm library. For the simplest use cases, using the +`MetricsEndpointProvider` object only requires instantiating it, +typically in the constructor of your charm (the one which exposes a +metrics endpoint). The `MetricsEndpointProvider` constructor requires +the name of the relation over which a scrape target (metrics endpoint) +is exposed to the Prometheus charm. This relation must use the +`prometheus_scrape` interface. By default address of the metrics +endpoint is set to the unit IP address, by each unit of the +`MetricsEndpointProvider` charm. These units set their address in +response to the `PebbleReady` event of each container in the unit, +since container restarts of Kubernetes charms can result in change of +IP addresses. The default name for the metrics endpoint relation is +`metrics-endpoint`. It is strongly recommended to use the same +relation name for consistency across charms and doing so obviates the +need for an additional constructor argument. The +`MetricsEndpointProvider` object may be instantiated as follows + + from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider + + def __init__(self, *args): + super().__init__(*args) + ... + self.metrics_endpoint = MetricsEndpointProvider(self) + ... + +Note that the first argument (`self`) to `MetricsEndpointProvider` is +always a reference to the parent (scrape target) charm. + +An instantiated `MetricsEndpointProvider` object will ensure that each +unit of its parent charm, is a scrape target for the +`MetricsEndpointConsumer` (Prometheus) charm. By default +`MetricsEndpointProvider` assumes each unit of the consumer charm +exports its metrics at a path given by `/metrics` on port 80. These +defaults may be changed by providing the `MetricsEndpointProvider` +constructor an optional argument (`jobs`) that represents a +Prometheus scrape job specification using Python standard data +structures. This job specification is a subset of Prometheus' own +[scrape +configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config) +format but represented using Python data structures. More than one job +may be provided using the `jobs` argument. Hence `jobs` accepts a list +of dictionaries where each dictionary represents one `` +object as described in the Prometheus documentation. The currently +supported configuration subset is: `job_name`, `metrics_path`, +`static_configs` + +Suppose it is required to change the port on which scraped metrics are +exposed to 8000. This may be done by providing the following data +structure as the value of `jobs`. + +``` +[ + { + "static_configs": [ + { + "targets": ["*:8000"] + } + ] + } +] +``` + +The wildcard ("*") host specification implies that the scrape targets +will automatically be set to the host addresses advertised by each +unit of the consumer charm. + +It is also possible to change the metrics path and scrape multiple +ports, for example + +``` +[ + { + "metrics_path": "/my-metrics-path", + "static_configs": [ + { + "targets": ["*:8000", "*:8081"], + } + ] + } +] +``` + +More complex scrape configurations are possible. For example + +``` +[ + { + "static_configs": [ + { + "targets": ["10.1.32.215:7000", "*:8000"], + "labels": { + "some_key": "some-value" + } + } + ] + } +] +``` + +This example scrapes the target "10.1.32.215" at port 7000 in addition +to scraping each unit at port 8000. There is however one difference +between wildcard targets (specified using "*") and fully qualified +targets (such as "10.1.32.215"). The Prometheus charm automatically +associates labels with metrics generated by each target. These labels +localise the source of metrics within the Juju topology by specifying +its "model name", "model UUID", "application name" and "unit +name". However unit name is associated only with wildcard targets but +not with fully qualified targets. + +Multiple jobs with different metrics paths and labels are allowed, but +each job must be given a unique name: + +``` +[ + { + "job_name": "my-first-job", + "metrics_path": "one-path", + "static_configs": [ + { + "targets": ["*:7000"], + "labels": { + "some_key": "some-value" + } + } + ] + }, + { + "job_name": "my-second-job", + "metrics_path": "another-path", + "static_configs": [ + { + "targets": ["*:8000"], + "labels": { + "some_other_key": "some-other-value" + } + } + ] + } +] +``` + +**Important:** `job_name` should be a fixed string (e.g. hardcoded literal). +For instance, if you include variable elements, like your `unit.name`, it may break +the continuity of the metrics time series gathered by Prometheus when the leader unit +changes (e.g. on upgrade or rescale). + +Additionally, it is also technically possible, but **strongly discouraged**, to +configure the following scrape-related settings, which behave as described by the +[Prometheus documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config): + +- `static_configs` +- `scrape_interval` +- `scrape_timeout` +- `proxy_url` +- `relabel_configs` +- `metric_relabel_configs` +- `sample_limit` +- `label_limit` +- `label_name_length_limit` +- `label_value_length_limit` + +The settings above are supported by the `prometheus_scrape` library only for the sake of +specialized facilities like the [Prometheus Scrape Config](https://charmhub.io/prometheus-scrape-config-k8s) +charm. Virtually no charms should use these settings, and charmers definitely **should not** +expose them to the Juju administrator via configuration options. + +## Consumer Library Usage + +The `MetricsEndpointConsumer` object may be used by Prometheus +charms to manage relations with their scrape targets. For this +purposes a Prometheus charm needs to do two things + +1. Instantiate the `MetricsEndpointConsumer` object by providing it a +reference to the parent (Prometheus) charm and optionally the name of +the relation that the Prometheus charm uses to interact with scrape +targets. This relation must confirm to the `prometheus_scrape` +interface and it is strongly recommended that this relation be named +`metrics-endpoint` which is its default value. + +For example a Prometheus charm may instantiate the +`MetricsEndpointConsumer` in its constructor as follows + + from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer + + def __init__(self, *args): + super().__init__(*args) + ... + self.metrics_consumer = MetricsEndpointConsumer(self) + ... + +2. A Prometheus charm also needs to respond to the +`TargetsChangedEvent` event of the `MetricsEndpointConsumer` by adding itself as +an observer for these events, as in + + self.framework.observe( + self.metrics_consumer.on.targets_changed, + self._on_scrape_targets_changed, + ) + +In responding to the `TargetsChangedEvent` event the Prometheus +charm must update the Prometheus configuration so that any new scrape +targets are added and/or old ones removed from the list of scraped +endpoints. For this purpose the `MetricsEndpointConsumer` object +exposes a `jobs()` method that returns a list of scrape jobs. Each +element of this list is the Prometheus scrape configuration for that +job. In order to update the Prometheus configuration, the Prometheus +charm needs to replace the current list of jobs with the list provided +by `jobs()` as follows + + def _on_scrape_targets_changed(self, event): + ... + scrape_jobs = self.metrics_consumer.jobs() + for job in scrape_jobs: + prometheus_scrape_config.append(job) + ... + +## Alerting Rules + +This charm library also supports gathering alerting rules from all +related `MetricsEndpointProvider` charms and enabling corresponding alerts within the +Prometheus charm. Alert rules are automatically gathered by `MetricsEndpointProvider` +charms when using this library, from a directory conventionally named +`prometheus_alert_rules`. This directory must reside at the top level +in the `src` folder of the consumer charm. Each file in this directory +is assumed to be in one of two formats: +- the official prometheus alert rule format, conforming to the +[Prometheus docs](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) +- a single rule format, which is a simplified subset of the official format, +comprising a single alert rule per file, using the same YAML fields. + +The file name must have one of the following extensions: +- `.rule` +- `.rules` +- `.yml` +- `.yaml` + +An example of the contents of such a file in the custom single rule +format is shown below. + +``` +alert: HighRequestLatency +expr: job:request_latency_seconds:mean5m{my_key=my_value} > 0.5 +for: 10m +labels: + severity: Medium + type: HighLatency +annotations: + summary: High request latency for {{ $labels.instance }}. +``` + +The `MetricsEndpointProvider` will read all available alert rules and +also inject "filtering labels" into the alert expressions. The +filtering labels ensure that alert rules are localised to the metrics +provider charm's Juju topology (application, model and its UUID). Such +a topology filter is essential to ensure that alert rules submitted by +one provider charm generates alerts only for that same charm. When +alert rules are embedded in a charm, and the charm is deployed as a +Juju application, the alert rules from that application have their +expressions automatically updated to filter for metrics coming from +the units of that application alone. This remove risk of spurious +evaluation, e.g., when you have multiple deployments of the same charm +monitored by the same Prometheus. + +Not all alerts one may want to specify can be embedded in a +charm. Some alert rules will be specific to a user's use case. This is +the case, for example, of alert rules that are based on business +constraints, like expecting a certain amount of requests to a specific +API every five minutes. Such alert rules can be specified via the +[COS Config Charm](https://charmhub.io/cos-configuration-k8s), +which allows importing alert rules and other settings like dashboards +from a Git repository. + +Gathering alert rules and generating rule files within the Prometheus +charm is easily done using the `alerts()` method of +`MetricsEndpointConsumer`. Alerts generated by Prometheus will +automatically include Juju topology labels in the alerts. These labels +indicate the source of the alert. The following labels are +automatically included with each alert + +- `juju_model` +- `juju_model_uuid` +- `juju_application` + +## Relation Data + +The Prometheus charm uses both application and unit relation data to +obtain information regarding its scrape jobs, alert rules and scrape +targets. This relation data is in JSON format and it closely resembles +the YAML structure of Prometheus [scrape configuration] +(https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config). + +Units of Metrics provider charms advertise their names and addresses +over unit relation data using the `prometheus_scrape_unit_name` and +`prometheus_scrape_unit_address` keys. While the `scrape_metadata`, +`scrape_jobs` and `alert_rules` keys in application relation data +of Metrics provider charms hold eponymous information. + +""" # noqa: W505 +import base64 +import binascii +import copy +import hashlib +import ipaddress +import json +import logging +import lzma +import os +import platform +import re +import socket +import subprocess +import tempfile +from collections import defaultdict +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from urllib.parse import urlparse + +import yaml +from cosl import JujuTopology +from cosl.rules import AlertRules +from ops.charm import CharmBase, RelationRole +from ops.framework import ( + BoundEvent, + EventBase, + EventSource, + Object, + ObjectEvents, + StoredDict, + StoredList, + StoredState, +) +from ops.model import Relation + +# The unique Charmhub library identifier, never change it +LIBID = "bc84295fef5f4049878f07b131968ee2" + +# Increment this major API version when introducing breaking changes +LIBAPI = 1 + +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 0 + +PYDEPS = ["cosl"] + +logger = logging.getLogger(__name__) + + +ALLOWED_KEYS = { + "job_name", + "metrics_path", + "static_configs", + "scrape_interval", + "scrape_timeout", + "proxy_url", + "relabel_configs", + "metric_relabel_configs", + "sample_limit", + "label_limit", + "label_name_length_limit", + "label_value_length_limit", + "scheme", + "basic_auth", + "tls_config", + "authorization", + "params", +} +DEFAULT_JOB = { + "metrics_path": "/metrics", + "static_configs": [{"targets": ["*:80"]}], +} + + +DEFAULT_RELATION_NAME = "metrics-endpoint" +RELATION_INTERFACE_NAME = "prometheus_scrape" + +DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" + + +def _encode_content(content: Union[str, bytes]) -> str: + if isinstance(content, str): + content = bytes(content, "utf-8") + + return base64.b64encode(lzma.compress(content)).decode("utf-8") + + +def _decode_content(encoded_content: str) -> str: + try: + # Assuming content is encoded, try decoding it. + return lzma.decompress(base64.b64decode(encoded_content.encode("utf-8"))).decode() + except (binascii.Error, lzma.LZMAError): + # Failed to base64-decode or decompress, so probably not encoded. Return as is. + return encoded_content + + +class PrometheusConfig: + """A namespace for utility functions for manipulating the prometheus config dict.""" + + # relabel instance labels so that instance identifiers are globally unique + # stable over unit recreation + topology_relabel_config = { + "source_labels": ["juju_model", "juju_model_uuid", "juju_application"], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + + topology_relabel_config_wildcard = { + "source_labels": ["juju_model", "juju_model_uuid", "juju_application", "juju_unit"], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + + @staticmethod + def sanitize_scrape_config(job: dict) -> dict: + """Restrict permissible scrape configuration options. + + If job is empty then a default job is returned. The + default job is + + ``` + { + "metrics_path": "/metrics", + "static_configs": [{"targets": ["*:80"]}], + } + ``` + + Args: + job: a dict containing a single Prometheus job + specification. + + Returns: + a dictionary containing a sanitized job specification. + """ + sanitized_job = DEFAULT_JOB.copy() + sanitized_job.update({key: value for key, value in job.items() if key in ALLOWED_KEYS}) + return sanitized_job + + @staticmethod + def sanitize_scrape_configs(scrape_configs: List[dict]) -> List[dict]: + """A vectorized version of `sanitize_scrape_config`.""" + return [PrometheusConfig.sanitize_scrape_config(job) for job in scrape_configs] + + @staticmethod + def prefix_job_names(scrape_configs: List[dict], prefix: str) -> List[dict]: + """Adds the given prefix to all the job names in the given scrape_configs list.""" + modified_scrape_configs = [] + for scrape_config in scrape_configs: + job_name = scrape_config.get("job_name") + modified = scrape_config.copy() + modified["job_name"] = prefix + "_" + job_name if job_name else prefix + modified_scrape_configs.append(modified) + + return modified_scrape_configs + + @staticmethod + def expand_wildcard_targets_into_individual_jobs( + scrape_jobs: List[dict], + hosts: Dict[str, Tuple[str, str]], + topology: Optional[JujuTopology] = None, + ) -> List[dict]: + """Extract wildcard hosts from the given scrape_configs list into separate jobs. + + Args: + scrape_jobs: list of scrape jobs. + hosts: a dictionary mapping host names to host address for + all units of the relation for which this job configuration + must be constructed. + topology: optional arg for adding topology labels to scrape targets. + """ + # hosts = self._relation_hosts(relation) + + modified_scrape_jobs = [] + for job in scrape_jobs: + static_configs = job.get("static_configs") + if not static_configs: + continue + + # When a single unit specified more than one wildcard target, then they are expanded + # into a static_config per target + non_wildcard_static_configs = [] + + for static_config in static_configs: + targets = static_config.get("targets") + if not targets: + continue + + # All non-wildcard targets remain in the same static_config + non_wildcard_targets = [] + + # All wildcard targets are extracted to a job per unit. If multiple wildcard + # targets are specified, they remain in the same static_config (per unit). + wildcard_targets = [] + + for target in targets: + match = re.compile(r"\*(?:(:\d+))?").match(target) + if match: + # This is a wildcard target. + # Need to expand into separate jobs and remove it from this job here + wildcard_targets.append(target) + else: + # This is not a wildcard target. Copy it over into its own static_config. + non_wildcard_targets.append(target) + + # All non-wildcard targets remain in the same static_config + if non_wildcard_targets: + non_wildcard_static_config = static_config.copy() + non_wildcard_static_config["targets"] = non_wildcard_targets + + if topology: + # When non-wildcard targets (aka fully qualified hostnames) are specified, + # there is no reliable way to determine the name (Juju topology unit name) + # for such a target. Therefore labeling with Juju topology, excluding the + # unit name. + non_wildcard_static_config["labels"] = { + **topology.label_matcher_dict, + **non_wildcard_static_config.get("labels", {}), + } + + non_wildcard_static_configs.append(non_wildcard_static_config) + + # Extract wildcard targets into individual jobs + if wildcard_targets: + for unit_name, (unit_hostname, unit_path) in hosts.items(): + modified_job = job.copy() + modified_job["static_configs"] = [static_config.copy()] + modified_static_config = modified_job["static_configs"][0] + modified_static_config["targets"] = [ + target.replace("*", unit_hostname) for target in wildcard_targets + ] + + unit_num = unit_name.split("/")[-1] + job_name = modified_job.get("job_name", "unnamed-job") + "-" + unit_num + modified_job["job_name"] = job_name + modified_job["metrics_path"] = unit_path + ( + job.get("metrics_path") or "/metrics" + ) + + if topology: + # Add topology labels + modified_static_config["labels"] = { + **topology.label_matcher_dict, + **{"juju_unit": unit_name}, + **modified_static_config.get("labels", {}), + } + + # Instance relabeling for topology should be last in order. + modified_job["relabel_configs"] = modified_job.get( + "relabel_configs", [] + ) + [PrometheusConfig.topology_relabel_config_wildcard] + + modified_scrape_jobs.append(modified_job) + + if non_wildcard_static_configs: + modified_job = job.copy() + modified_job["static_configs"] = non_wildcard_static_configs + modified_job["metrics_path"] = modified_job.get("metrics_path") or "/metrics" + + if topology: + # Instance relabeling for topology should be last in order. + modified_job["relabel_configs"] = modified_job.get("relabel_configs", []) + [ + PrometheusConfig.topology_relabel_config + ] + + modified_scrape_jobs.append(modified_job) + + return modified_scrape_jobs + + @staticmethod + def render_alertmanager_static_configs(alertmanagers: List[str]): + """Render the alertmanager static_configs section from a list of URLs. + + Each target must be in the hostname:port format, and prefixes are specified in a separate + key. Therefore, with ingress in place, would need to extract the path into the + `path_prefix` key, which is higher up in the config hierarchy. + + https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config + + Args: + alertmanagers: List of alertmanager URLs. + + Returns: + A dict representation for the static_configs section. + """ + # Make sure it's a valid url so urlparse could parse it. + scheme = re.compile(r"^https?://") + sanitized = [am if scheme.search(am) else "http://" + am for am in alertmanagers] + + # Create a mapping from paths to netlocs + # Group alertmanager targets into a dictionary of lists: + # {path: [netloc1, netloc2]} + paths = defaultdict(list) # type: Dict[Tuple[str, str], List[str]] + for parsed in map(urlparse, sanitized): + path = parsed.path or "/" + paths[(parsed.scheme, path)].append(parsed.netloc) + + return { + "alertmanagers": [ + { + # For https we still do not render a `tls_config` section because + # certs are expected to be made available by the charm via the + # `update-ca-certificates` mechanism. + "scheme": scheme, + "path_prefix": path_prefix, + "static_configs": [{"targets": netlocs}], + } + for (scheme, path_prefix), netlocs in paths.items() + ] + } + + +class RelationNotFoundError(Exception): + """Raised if there is no relation with the given name is found.""" + + def __init__(self, relation_name: str): + self.relation_name = relation_name + self.message = "No relation named '{}' found".format(relation_name) + + super().__init__(self.message) + + +class RelationInterfaceMismatchError(Exception): + """Raised if the relation with the given name has a different interface.""" + + def __init__( + self, + relation_name: str, + expected_relation_interface: str, + actual_relation_interface: str, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_interface + self.actual_relation_interface = actual_relation_interface + self.message = ( + "The '{}' relation has '{}' as interface rather than the expected '{}'".format( + relation_name, actual_relation_interface, expected_relation_interface + ) + ) + + super().__init__(self.message) + + +class RelationRoleMismatchError(Exception): + """Raised if the relation with the given name has a different role.""" + + def __init__( + self, + relation_name: str, + expected_relation_role: RelationRole, + actual_relation_role: RelationRole, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_role + self.actual_relation_role = actual_relation_role + self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( + relation_name, repr(actual_relation_role), repr(expected_relation_role) + ) + + super().__init__(self.message) + + +class InvalidAlertRuleEvent(EventBase): + """Event emitted when alert rule files are not parsable. + + Enables us to set a clear status on the provider. + """ + + def __init__(self, handle, errors: str = "", valid: bool = False): + super().__init__(handle) + self.errors = errors + self.valid = valid + + def snapshot(self) -> Dict: + """Save alert rule information.""" + return { + "valid": self.valid, + "errors": self.errors, + } + + def restore(self, snapshot): + """Restore alert rule information.""" + self.valid = snapshot["valid"] + self.errors = snapshot["errors"] + + +class InvalidScrapeJobEvent(EventBase): + """Event emitted when alert rule files are not valid.""" + + def __init__(self, handle, errors: str = ""): + super().__init__(handle) + self.errors = errors + + def snapshot(self) -> Dict: + """Save error information.""" + return {"errors": self.errors} + + def restore(self, snapshot): + """Restore error information.""" + self.errors = snapshot["errors"] + + +class MetricsEndpointProviderEvents(ObjectEvents): + """Events raised by :class:`InvalidAlertRuleEvent`s.""" + + alert_rule_status_changed = EventSource(InvalidAlertRuleEvent) + invalid_scrape_job = EventSource(InvalidScrapeJobEvent) + + +def _type_convert_stored(obj): + """Convert Stored* to their appropriate types, recursively.""" + if isinstance(obj, StoredList): + return list(map(_type_convert_stored, obj)) + if isinstance(obj, StoredDict): + rdict = {} # type: Dict[Any, Any] + for k in obj.keys(): + rdict[k] = _type_convert_stored(obj[k]) + return rdict + return obj + + +def _validate_relation_by_interface_and_direction( + charm: CharmBase, + relation_name: str, + expected_relation_interface: str, + expected_relation_role: RelationRole, +): + """Verifies that a relation has the necessary characteristics. + + Verifies that the `relation_name` provided: (1) exists in metadata.yaml, + (2) declares as interface the interface name passed as `relation_interface` + and (3) has the right "direction", i.e., it is a relation that `charm` + provides or requires. + + Args: + charm: a `CharmBase` object to scan for the matching relation. + relation_name: the name of the relation to be verified. + expected_relation_interface: the interface name to be matched by the + relation named `relation_name`. + expected_relation_role: whether the `relation_name` must be either + provided or required by `charm`. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the same relation interface + as specified via the `expected_relation_interface` argument. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the same role as specified + via the `expected_relation_role` argument. + """ + if relation_name not in charm.meta.relations: + raise RelationNotFoundError(relation_name) + + relation = charm.meta.relations[relation_name] + + actual_relation_interface = relation.interface_name + if actual_relation_interface != expected_relation_interface: + raise RelationInterfaceMismatchError( + relation_name, expected_relation_interface, actual_relation_interface or "None" + ) + + if expected_relation_role == RelationRole.provides: + if relation_name not in charm.meta.provides: + raise RelationRoleMismatchError( + relation_name, RelationRole.provides, RelationRole.requires + ) + elif expected_relation_role == RelationRole.requires: + if relation_name not in charm.meta.requires: + raise RelationRoleMismatchError( + relation_name, RelationRole.requires, RelationRole.provides + ) + else: + raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role)) + + +class InvalidAlertRulePathError(Exception): + """Raised if the alert rules folder cannot be found or is otherwise invalid.""" + + def __init__( + self, + alert_rules_absolute_path: Path, + message: str, + ): + self.alert_rules_absolute_path = alert_rules_absolute_path + self.message = message + + super().__init__(self.message) + + +def _is_official_alert_rule_format(rules_dict: dict) -> bool: + """Are alert rules in the upstream format as supported by Prometheus. + + Alert rules in dictionary format are in "official" form if they + contain a "groups" key, since this implies they contain a list of + alert rule groups. + + Args: + rules_dict: a set of alert rules in Python dictionary format + + Returns: + True if alert rules are in official Prometheus file format. + """ + return "groups" in rules_dict + + +def _is_single_alert_rule_format(rules_dict: dict) -> bool: + """Are alert rules in single rule format. + + The Prometheus charm library supports reading of alert rules in a + custom format that consists of a single alert rule per file. This + does not conform to the official Prometheus alert rule file format + which requires that each alert rules file consists of a list of + alert rule groups and each group consists of a list of alert + rules. + + Alert rules in dictionary form are considered to be in single rule + format if in the least it contains two keys corresponding to the + alert rule name and alert expression. + + Returns: + True if alert rule is in single rule file format. + """ + # one alert rule per file + return set(rules_dict) >= {"alert", "expr"} + + +class TargetsChangedEvent(EventBase): + """Event emitted when Prometheus scrape targets change.""" + + def __init__(self, handle, relation_id): + super().__init__(handle) + self.relation_id = relation_id + + def snapshot(self): + """Save scrape target relation information.""" + return {"relation_id": self.relation_id} + + def restore(self, snapshot): + """Restore scrape target relation information.""" + self.relation_id = snapshot["relation_id"] + + +class MonitoringEvents(ObjectEvents): + """Event descriptor for events raised by `MetricsEndpointConsumer`.""" + + targets_changed = EventSource(TargetsChangedEvent) + + +class MetricsEndpointConsumer(Object): + """A Prometheus based Monitoring service.""" + + on = MonitoringEvents() # pyright: ignore + + def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): + """A Prometheus based Monitoring service. + + Args: + charm: a `CharmBase` instance that manages this + instance of the Prometheus service. + relation_name: an optional string name of the relation between `charm` + and the Prometheus charmed service. The default is "metrics-endpoint". + It is strongly advised not to change the default, so that people + deploying your charm will have a consistent experience with all + other charms that consume metrics endpoints. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the `prometheus_scrape` relation + interface. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the `RelationRole.requires` + role. + """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires + ) + + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + self._tool = CosTool(self._charm) + events = self._charm.on[relation_name] + self.framework.observe(events.relation_changed, self._on_metrics_provider_relation_changed) + self.framework.observe( + events.relation_departed, self._on_metrics_provider_relation_departed + ) + + def _on_metrics_provider_relation_changed(self, event): + """Handle changes with related metrics providers. + + Anytime there are changes in relations between Prometheus + and metrics provider charms the Prometheus charm is informed, + through a `TargetsChangedEvent` event. The Prometheus charm can + then choose to update its scrape configuration. + + Args: + event: a `CharmEvent` in response to which the Prometheus + charm must update its scrape configuration. + """ + rel_id = event.relation.id + + self.on.targets_changed.emit(relation_id=rel_id) + + def _on_metrics_provider_relation_departed(self, event): + """Update job config when a metrics provider departs. + + When a metrics provider departs the Prometheus charm is informed + through a `TargetsChangedEvent` event so that it can update its + scrape configuration to ensure that the departed metrics provider + is removed from the list of scrape jobs and + + Args: + event: a `CharmEvent` that indicates a metrics provider + unit has departed. + """ + rel_id = event.relation.id + self.on.targets_changed.emit(relation_id=rel_id) + + def jobs(self) -> list: + """Fetch the list of scrape jobs. + + Returns: + A list consisting of all the static scrape configurations + for each related `MetricsEndpointProvider` that has specified + its scrape targets. + """ + scrape_jobs = [] + + for relation in self._charm.model.relations[self._relation_name]: + static_scrape_jobs = self._static_scrape_config(relation) + if static_scrape_jobs: + # Duplicate job names will cause validate_scrape_jobs to fail. + # Therefore we need to dedupe here and after all jobs are collected. + static_scrape_jobs = _dedupe_job_names(static_scrape_jobs) + try: + self._tool.validate_scrape_jobs(static_scrape_jobs) + except subprocess.CalledProcessError as e: + if self._charm.unit.is_leader(): + data = json.loads(relation.data[self._charm.app].get("event", "{}")) + data["scrape_job_errors"] = str(e) + relation.data[self._charm.app]["event"] = json.dumps(data) + else: + scrape_jobs.extend(static_scrape_jobs) + + scrape_jobs = _dedupe_job_names(scrape_jobs) + + return scrape_jobs + + @property + def alerts(self) -> dict: + """Fetch alerts for all relations. + + A Prometheus alert rules file consists of a list of "groups". Each + group consists of a list of alerts (`rules`) that are sequentially + executed. This method returns all the alert rules provided by each + related metrics provider charm. These rules may be used to generate a + separate alert rules file for each relation since the returned list + of alert groups are indexed by that relations Juju topology identifier. + The Juju topology identifier string includes substrings that identify + alert rule related metadata such as the Juju model, model UUID and the + application name from where the alert rule originates. Since this + topology identifier is globally unique, it may be used for instance as + the name for the file into which the list of alert rule groups are + written. For each relation, the structure of data returned is a dictionary + representation of a standard prometheus rules file: + + {"groups": [{"name": ...}, ...]} + + per official prometheus documentation + https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ + + The value of the `groups` key is such that it may be used to generate + a Prometheus alert rules file directly using `yaml.dump` but the + `groups` key itself must be included as this is required by Prometheus. + + For example the list of alert rule groups returned by this method may + be written into files consumed by Prometheus as follows + + ``` + for topology_identifier, alert_rule_groups in self.metrics_consumer.alerts().items(): + filename = "juju_" + topology_identifier + ".rules" + path = os.path.join(PROMETHEUS_RULES_DIR, filename) + rules = yaml.safe_dump(alert_rule_groups) + container.push(path, rules, make_dirs=True) + ``` + + Returns: + A dictionary mapping the Juju topology identifier of the source charm to + its list of alert rule groups. + """ + alerts = {} # type: Dict[str, dict] # mapping b/w juju identifiers and alert rule files + for relation in self._charm.model.relations[self._relation_name]: + if not relation.units or not relation.app: + continue + + alert_rules = json.loads( + _decode_content(relation.data[relation.app].get("alert_rules", "{}")) + ) + if not alert_rules: + continue + + alert_rules = self._inject_alert_expr_labels(alert_rules) + + identifier, topology = self._get_identifier_by_alert_rules(alert_rules) + if not topology: + try: + scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) + identifier = JujuTopology.from_dict(scrape_metadata).identifier + + except KeyError as e: + logger.debug( + "Relation %s has no 'scrape_metadata': %s", + relation.id, + e, + ) + + if not identifier: + logger.error( + "Alert rules were found but no usable group or identifier was present." + ) + continue + + # We need to append the relation info to the identifier. This is to allow for cases for there are two + # relations which eventually scrape the same application. Issue #551. + identifier = f"{identifier}_{relation.name}_{relation.id}" + + alerts[identifier] = alert_rules + + _, errmsg = self._tool.validate_alert_rules(alert_rules) + if errmsg: + if alerts[identifier]: + del alerts[identifier] + if self._charm.unit.is_leader(): + data = json.loads(relation.data[self._charm.app].get("event", "{}")) + data["errors"] = errmsg + relation.data[self._charm.app]["event"] = json.dumps(data) + continue + + return alerts + + def _get_identifier_by_alert_rules( + self, rules: dict + ) -> Tuple[Union[str, None], Union[JujuTopology, None]]: + """Determine an appropriate dict key for alert rules. + + The key is used as the filename when writing alerts to disk, so the structure + and uniqueness is important. + + Args: + rules: a dict of alert rules + Returns: + A tuple containing an identifier, if found, and a JujuTopology, if it could + be constructed. + """ + if "groups" not in rules: + logger.debug("No alert groups were found in relation data") + return None, None + + # Construct an ID based on what's in the alert rules if they have labels + for group in rules["groups"]: + try: + labels = group["rules"][0]["labels"] + topology = JujuTopology( + # Don't try to safely get required constructor fields. There's already + # a handler for KeyErrors + model_uuid=labels["juju_model_uuid"], + model=labels["juju_model"], + application=labels["juju_application"], + unit=labels.get("juju_unit", ""), + charm_name=labels.get("juju_charm", ""), + ) + return topology.identifier, topology + except KeyError: + logger.debug("Alert rules were found but no usable labels were present") + continue + + logger.warning( + "No labeled alert rules were found, and no 'scrape_metadata' " + "was available. Using the alert group name as filename." + ) + try: + for group in rules["groups"]: + return group["name"], None + except KeyError: + logger.debug("No group name was found to use as identifier") + + return None, None + + def _inject_alert_expr_labels(self, rules: Dict[str, Any]) -> Dict[str, Any]: + """Iterate through alert rules and inject topology into expressions. + + Args: + rules: a dict of alert rules + """ + if "groups" not in rules: + return rules + + modified_groups = [] + for group in rules["groups"]: + # Copy off rules, so we don't modify an object we're iterating over + rules_copy = group["rules"] + for idx, rule in enumerate(rules_copy): + labels = rule.get("labels") + + if labels: + try: + topology = JujuTopology( + # Don't try to safely get required constructor fields. There's already + # a handler for KeyErrors + model_uuid=labels["juju_model_uuid"], + model=labels["juju_model"], + application=labels["juju_application"], + unit=labels.get("juju_unit", ""), + charm_name=labels.get("juju_charm", ""), + ) + + # Inject topology and put it back in the list + rule["expr"] = self._tool.inject_label_matchers( + re.sub(r"%%juju_topology%%,?", "", rule["expr"]), + topology.alert_expression_dict, + ) + except KeyError: + # Some required JujuTopology key is missing. Just move on. + pass + + group["rules"][idx] = rule + + modified_groups.append(group) + + rules["groups"] = modified_groups + return rules + + def _static_scrape_config(self, relation) -> list: + """Generate the static scrape configuration for a single relation. + + If the relation data includes `scrape_metadata` then the value + of this key is used to annotate the scrape jobs with Juju + Topology labels before returning them. + + Args: + relation: an `ops.model.Relation` object whose static + scrape configuration is required. + + Returns: + A list (possibly empty) of scrape jobs. Each job is a + valid Prometheus scrape configuration for that job, + represented as a Python dictionary. + """ + if not relation.units: + return [] + + scrape_configs = json.loads( + _decode_content(relation.data[relation.app].get("scrape_jobs", "[]")) + ) + + if not scrape_configs: + return [] + + scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}")) + + if not scrape_metadata: + return scrape_configs + + topology = JujuTopology.from_dict(scrape_metadata) + + job_name_prefix = "juju_{}_prometheus_scrape".format(topology.identifier) + scrape_configs = PrometheusConfig.prefix_job_names(scrape_configs, job_name_prefix) + scrape_configs = PrometheusConfig.sanitize_scrape_configs(scrape_configs) + + hosts = self._relation_hosts(relation) + + scrape_configs = PrometheusConfig.expand_wildcard_targets_into_individual_jobs( + scrape_configs, hosts, topology + ) + + # For https scrape targets we still do not render a `tls_config` section because certs + # are expected to be made available by the charm via the `update-ca-certificates` mechanism. + return scrape_configs + + def _relation_hosts(self, relation: Relation) -> Dict[str, Tuple[str, str]]: + """Returns a mapping from unit names to (address, path) tuples, for the given relation.""" + hosts = {} + for unit in relation.units: + # TODO deprecate and remove unit.name + unit_name = relation.data[unit].get("prometheus_scrape_unit_name") or unit.name + # TODO deprecate and remove "prometheus_scrape_host" + unit_address = relation.data[unit].get( + "prometheus_scrape_unit_address" + ) or relation.data[unit].get("prometheus_scrape_host") + unit_path = relation.data[unit].get("prometheus_scrape_unit_path", "") + if unit_name and unit_address: + hosts.update({unit_name: (unit_address, unit_path)}) + + return hosts + + def _target_parts(self, target) -> list: + """Extract host and port from a wildcard target. + + Args: + target: a string specifying a scrape target. A + scrape target is expected to have the format + "host:port". The host part may be a wildcard + "*" and the port part can be missing (along + with ":") in which case port is set to 80. + + Returns: + a list with target host and port as in [host, port] + """ + if ":" in target: + parts = target.split(":") + else: + parts = [target, "80"] + + return parts + + +def _dedupe_job_names(jobs: List[dict]): + """Deduplicate a list of dicts by appending a hash to the value of the 'job_name' key. + + Additionally, fully de-duplicate any identical jobs. + + Args: + jobs: A list of prometheus scrape jobs + """ + jobs_copy = copy.deepcopy(jobs) + + # Convert to a dict with job names as keys + # I think this line is O(n^2) but it should be okay given the list sizes + jobs_dict = { + job["job_name"]: list(filter(lambda x: x["job_name"] == job["job_name"], jobs_copy)) + for job in jobs_copy + } + + # If multiple jobs have the same name, convert the name to "name_" + for key in jobs_dict: + if len(jobs_dict[key]) > 1: + for job in jobs_dict[key]: + job_json = json.dumps(job) + hashed = hashlib.sha256(job_json.encode()).hexdigest() + job["job_name"] = "{}_{}".format(job["job_name"], hashed) + new_jobs = [] + for key in jobs_dict: + new_jobs.extend(list(jobs_dict[key])) + + # Deduplicate jobs which are equal + # Again this in O(n^2) but it should be okay + deduped_jobs = [] + seen = [] + for job in new_jobs: + job_json = json.dumps(job) + hashed = hashlib.sha256(job_json.encode()).hexdigest() + if hashed in seen: + continue + seen.append(hashed) + deduped_jobs.append(job) + + return deduped_jobs + + +def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str: + """Resolve the provided path items against the directory of the main file. + + Look up the directory of the `main.py` file being executed. This is normally + going to be the charm.py file of the charm including this library. Then, resolve + the provided path elements and, if the result path exists and is a directory, + return its absolute path; otherwise, raise en exception. + + Raises: + InvalidAlertRulePathError, if the path does not exist or is not a directory. + """ + charm_dir = Path(str(charm.charm_dir)) + if not charm_dir.exists() or not charm_dir.is_dir(): + # Operator Framework does not currently expose a robust + # way to determine the top level charm source directory + # that is consistent across deployed charms and unit tests + # Hence for unit tests the current working directory is used + # TODO: updated this logic when the following ticket is resolved + # https://github.com/canonical/operator/issues/643 + charm_dir = Path(os.getcwd()) + + alerts_dir_path = charm_dir.absolute().joinpath(*path_elements) + + if not alerts_dir_path.exists(): + raise InvalidAlertRulePathError(alerts_dir_path, "directory does not exist") + if not alerts_dir_path.is_dir(): + raise InvalidAlertRulePathError(alerts_dir_path, "is not a directory") + + return str(alerts_dir_path) + + +class MetricsEndpointProvider(Object): + """A metrics endpoint for Prometheus.""" + + on = MetricsEndpointProviderEvents() # pyright: ignore + + def __init__( + self, + charm, + relation_name: str = DEFAULT_RELATION_NAME, + jobs=None, + alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, + refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None, + external_url: str = "", + lookaside_jobs_callable: Optional[Callable] = None, + ): + """Construct a metrics provider for a Prometheus charm. + + If your charm exposes a Prometheus metrics endpoint, the + `MetricsEndpointProvider` object enables your charm to easily + communicate how to reach that metrics endpoint. + + By default, a charm instantiating this object has the metrics + endpoints of each of its units scraped by the related Prometheus + charms. The scraped metrics are automatically tagged by the + Prometheus charms with Juju topology data via the + `juju_model_name`, `juju_model_uuid`, `juju_application_name` + and `juju_unit` labels. To support such tagging `MetricsEndpointProvider` + automatically forwards scrape metadata to a `MetricsEndpointConsumer` + (Prometheus charm). + + Scrape targets provided by `MetricsEndpointProvider` can be + customized when instantiating this object. For example in the + case of a charm exposing the metrics endpoint for each of its + units on port 8080 and the `/metrics` path, the + `MetricsEndpointProvider` can be instantiated as follows: + + self.metrics_endpoint_provider = MetricsEndpointProvider( + self, + jobs=[{ + "static_configs": [{"targets": ["*:8080"]}], + }]) + + The notation `*:` means "scrape each unit of this charm on port + ``. + + In case the metrics endpoints are not on the standard `/metrics` path, + a custom path can be specified as follows: + + self.metrics_endpoint_provider = MetricsEndpointProvider( + self, + jobs=[{ + "metrics_path": "/my/strange/metrics/path", + "static_configs": [{"targets": ["*:8080"]}], + }]) + + Note how the `jobs` argument is a list: this allows you to expose multiple + combinations of paths "metrics_path" and "static_configs" in case your charm + exposes multiple endpoints, which could happen, for example, when you have + multiple workload containers, with applications in each needing to be scraped. + The structure of the objects in the `jobs` list is one-to-one with the + `scrape_config` configuration item of Prometheus' own configuration (see + https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config + ), but with only a subset of the fields allowed. The permitted fields are + listed in `ALLOWED_KEYS` object in this charm library module. + + It is also possible to specify alert rules. By default, this library will look + into the `/prometheus_alert_rules`, which in a standard charm + layouts resolves to `src/prometheus_alert_rules`. Each alert rule goes into a + separate `*.rule` file. If the syntax of a rule is invalid, + the `MetricsEndpointProvider` logs an error and does not load the particular + rule. + + To avoid false positives and negatives in the evaluation of alert rules, + all ingested alert rule expressions are automatically qualified using Juju + Topology filters. This ensures that alert rules provided by your charm, trigger + alerts based only on data scrapped from your charm. For example an alert rule + such as the following + + alert: UnitUnavailable + expr: up < 1 + for: 0m + + will be automatically transformed into something along the lines of the following + + alert: UnitUnavailable + expr: up{juju_model=, juju_model_uuid=, juju_application=} < 1 + for: 0m + + An attempt will be made to validate alert rules prior to loading them into Prometheus. + If they are invalid, an event will be emitted from this object which charms can respond + to in order to set a meaningful status for administrators. + + This can be observed via `consumer.on.alert_rule_status_changed` which contains: + - The error(s) encountered when validating as `errors` + - A `valid` attribute, which can be used to reset the state of charms if alert rules + are updated via another mechanism (e.g. `cos-config`) and refreshed. + + Args: + charm: a `CharmBase` object that manages this + `MetricsEndpointProvider` object. Typically, this is + `self` in the instantiating class. + relation_name: an optional string name of the relation between `charm` + and the Prometheus charmed service. The default is "metrics-endpoint". + It is strongly advised not to change the default, so that people + deploying your charm will have a consistent experience with all + other charms that provide metrics endpoints. + jobs: an optional list of dictionaries where each + dictionary represents the Prometheus scrape + configuration for a single job. When not provided, a + default scrape configuration is provided for the + `/metrics` endpoint polling all units of the charm on port `80` + using the `MetricsEndpointProvider` object. + alert_rules_path: an optional path for the location of alert rules + files. Defaults to "./prometheus_alert_rules", + resolved relative to the directory hosting the charm entry file. + The alert rules are automatically updated on charm upgrade. + refresh_event: an optional bound event or list of bound events which + will be observed to re-set scrape job data (IP address and others) + external_url: an optional argument that represents an external url that + can be generated by an Ingress or a Proxy. + lookaside_jobs_callable: an optional `Callable` which should be invoked + when the job configuration is built as a secondary mapping. The callable + should return a `List[Dict]` which is syntactically identical to the + `jobs` parameter, but can be updated out of step initialization of + this library without disrupting the 'global' job spec. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the `prometheus_scrape` relation + interface. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the `RelationRole.provides` + role. + """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides + ) + + try: + alert_rules_path = _resolve_dir_against_charm_path(charm, alert_rules_path) + except InvalidAlertRulePathError as e: + logger.debug( + "Invalid Prometheus alert rules folder at %s: %s", + e.alert_rules_absolute_path, + e.message, + ) + + super().__init__(charm, relation_name) + self.topology = JujuTopology.from_charm(charm) + + self._charm = charm + self._alert_rules_path = alert_rules_path + self._relation_name = relation_name + # sanitize job configurations to the supported subset of parameters + jobs = [] if jobs is None else jobs + self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs) + + if external_url: + external_url = ( + external_url if urlparse(external_url).scheme else ("http://" + external_url) + ) + self.external_url = external_url + self._lookaside_jobs = lookaside_jobs_callable + + events = self._charm.on[self._relation_name] + self.framework.observe(events.relation_changed, self._on_relation_changed) + + if not refresh_event: + # FIXME remove once podspec charms are verified. + # `self.set_scrape_job_spec()` is called every re-init so this should not be needed. + if len(self._charm.meta.containers) == 1: + if "kubernetes" in self._charm.meta.series: + # This is a podspec charm + refresh_event = [self._charm.on.update_status] + else: + # This is a sidecar/pebble charm + container = list(self._charm.meta.containers.values())[0] + refresh_event = [self._charm.on[container.name.replace("-", "_")].pebble_ready] + else: + logger.warning( + "%d containers are present in metadata.yaml and " + "refresh_event was not specified. Defaulting to update_status. " + "Metrics IP may not be set in a timely fashion.", + len(self._charm.meta.containers), + ) + refresh_event = [self._charm.on.update_status] + + else: + if not isinstance(refresh_event, list): + refresh_event = [refresh_event] + + self.framework.observe(events.relation_joined, self.set_scrape_job_spec) + for ev in refresh_event: + self.framework.observe(ev, self.set_scrape_job_spec) + + def _on_relation_changed(self, event): + """Check for alert rule messages in the relation data before moving on.""" + if self._charm.unit.is_leader(): + ev = json.loads(event.relation.data[event.app].get("event", "{}")) + + if ev: + valid = bool(ev.get("valid", True)) + errors = ev.get("errors", "") + + if valid and not errors: + self.on.alert_rule_status_changed.emit(valid=valid) + else: + self.on.alert_rule_status_changed.emit(valid=valid, errors=errors) + + scrape_errors = ev.get("scrape_job_errors", None) + if scrape_errors: + self.on.invalid_scrape_job.emit(errors=scrape_errors) + + def update_scrape_job_spec(self, jobs): + """Update scrape job specification.""" + self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs) + self.set_scrape_job_spec() + + def set_scrape_job_spec(self, _=None): + """Ensure scrape target information is made available to prometheus. + + When a metrics provider charm is related to a prometheus charm, the + metrics provider sets specification and metadata related to its own + scrape configuration. This information is set using Juju application + data. In addition, each of the consumer units also sets its own + host address in Juju unit relation data. + """ + self._set_unit_ip() + + if not self._charm.unit.is_leader(): + return + + alert_rules = AlertRules(query_type="promql", topology=self.topology) + alert_rules.add_path(self._alert_rules_path, recursive=True) + alert_rules_as_dict = alert_rules.as_dict() + + for relation in self._charm.model.relations[self._relation_name]: + relation.data[self._charm.app]["scrape_metadata"] = json.dumps(self._scrape_metadata) + relation.data[self._charm.app]["scrape_jobs"] = json.dumps(self._scrape_jobs) + + # Update relation data with the string representation of the rule file. + # Juju topology is already included in the "scrape_metadata" field above. + # The consumer side of the relation uses this information to name the rules file + # that is written to the filesystem. + relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) + + def _set_unit_ip(self, _=None): + """Set unit host address. + + Each time a metrics provider charm container is restarted it updates its own + host address in the unit relation data for the prometheus charm. + + The only argument specified is an event, and it ignored. This is for expediency + to be able to use this method as an event handler, although no access to the + event is actually needed. + """ + for relation in self._charm.model.relations[self._relation_name]: + unit_ip = str(self._charm.model.get_binding(relation).network.bind_address) + + # TODO store entire url in relation data, instead of only select url parts. + + if self.external_url: + parsed = urlparse(self.external_url) + unit_address = parsed.hostname + path = parsed.path + elif self._is_valid_unit_address(unit_ip): + unit_address = unit_ip + path = "" + else: + unit_address = socket.getfqdn() + path = "" + + relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = unit_address + relation.data[self._charm.unit]["prometheus_scrape_unit_path"] = path + relation.data[self._charm.unit]["prometheus_scrape_unit_name"] = str( + self._charm.model.unit.name + ) + + def _is_valid_unit_address(self, address: str) -> bool: + """Validate a unit address. + + At present only IP address validation is supported, but + this may be extended to DNS addresses also, as needed. + + Args: + address: a string representing a unit address + """ + try: + _ = ipaddress.ip_address(address) + except ValueError: + return False + + return True + + @property + def _scrape_jobs(self) -> list: + """Fetch list of scrape jobs. + + Returns: + A list of dictionaries, where each dictionary specifies a + single scrape job for Prometheus. + """ + jobs = self._jobs or [] + if callable(self._lookaside_jobs): + jobs.extend(PrometheusConfig.sanitize_scrape_configs(self._lookaside_jobs())) + return jobs or [DEFAULT_JOB] + + @property + def _scrape_metadata(self) -> dict: + """Generate scrape metadata. + + Returns: + Scrape configuration metadata for this metrics provider charm. + """ + return self.topology.as_dict() + + +class PrometheusRulesProvider(Object): + """Forward rules to Prometheus. + + This object may be used to forward rules to Prometheus. At present it only supports + forwarding alert rules. This is unlike :class:`MetricsEndpointProvider`, which + is used for forwarding both scrape targets and associated alert rules. This object + is typically used when there is a desire to forward rules that apply globally (across + all deployed charms and units) rather than to a single charm. All rule files are + forwarded using the same 'prometheus_scrape' interface that is also used by + `MetricsEndpointProvider`. + + Args: + charm: A charm instance that `provides` a relation with the `prometheus_scrape` interface. + relation_name: Name of the relation in `metadata.yaml` that + has the `prometheus_scrape` interface. + dir_path: Root directory for the collection of rule files. + recursive: Whether to scan for rule files recursively. + """ + + def __init__( + self, + charm: CharmBase, + relation_name: str = DEFAULT_RELATION_NAME, + dir_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, + recursive=True, + ): + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + self._recursive = recursive + + try: + dir_path = _resolve_dir_against_charm_path(charm, dir_path) + except InvalidAlertRulePathError as e: + logger.debug( + "Invalid Prometheus alert rules folder at %s: %s", + e.alert_rules_absolute_path, + e.message, + ) + self.dir_path = dir_path + + events = self._charm.on[self._relation_name] + event_sources = [ + events.relation_joined, + events.relation_changed, + self._charm.on.leader_elected, + self._charm.on.upgrade_charm, + ] + + for event_source in event_sources: + self.framework.observe(event_source, self._update_relation_data) + + def _reinitialize_alert_rules(self): + """Reloads alert rules and updates all relations.""" + self._update_relation_data(None) + + def _update_relation_data(self, _): + """Update application relation data with alert rules for all relations.""" + if not self._charm.unit.is_leader(): + return + + alert_rules = AlertRules(query_type="promql") + alert_rules.add_path(self.dir_path, recursive=self._recursive) + alert_rules_as_dict = alert_rules.as_dict() + + logger.info("Updating relation data with rule files from disk") + for relation in self._charm.model.relations[self._relation_name]: + relation.data[self._charm.app]["alert_rules"] = json.dumps( + alert_rules_as_dict, + sort_keys=True, # sort, to prevent unnecessary relation_changed events + ) + + +class MetricsEndpointAggregator(Object): + """Aggregate metrics from multiple scrape targets. + + `MetricsEndpointAggregator` collects scrape target information from one + or more related charms and forwards this to a `MetricsEndpointConsumer` + charm, which may be in a different Juju model. However, it is + essential that `MetricsEndpointAggregator` itself resides in the same + model as its scrape targets, as this is currently the only way to + ensure in Juju that the `MetricsEndpointAggregator` will be able to + determine the model name and uuid of the scrape targets. + + `MetricsEndpointAggregator` should be used in place of + `MetricsEndpointProvider` in the following two use cases: + + 1. Integrating one or more scrape targets that do not support the + `prometheus_scrape` interface. + + 2. Integrating one or more scrape targets through cross model + relations. Although the [Scrape Config Operator](https://charmhub.io/cos-configuration-k8s) + may also be used for the purpose of supporting cross model + relations. + + Using `MetricsEndpointAggregator` to build a Prometheus charm client + only requires instantiating it. Instantiating + `MetricsEndpointAggregator` is similar to `MetricsEndpointProvider` except + that it requires specifying the names of three relations: the + relation with scrape targets, the relation for alert rules, and + that with the Prometheus charms. For example + + ```python + self._aggregator = MetricsEndpointAggregator( + self, + { + "prometheus": "monitoring", + "scrape_target": "prometheus-target", + "alert_rules": "prometheus-rules" + } + ) + ``` + + `MetricsEndpointAggregator` assumes that each unit of a scrape target + sets in its unit-level relation data two entries with keys + "hostname" and "port". If it is required to integrate with charms + that do not honor these assumptions, it is always possible to + derive from `MetricsEndpointAggregator` overriding the `_get_targets()` + method, which is responsible for aggregating the unit name, host + address ("hostname") and port of the scrape target. + `MetricsEndpointAggregator` also assumes that each unit of a + scrape target sets in its unit-level relation data a key named + "groups". The value of this key is expected to be the string + representation of list of Prometheus Alert rules in YAML format. + An example of a single such alert rule is + + ```yaml + - alert: HighRequestLatency + expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5 + for: 10m + labels: + severity: page + annotations: + summary: High request latency + ``` + + Once again if it is required to integrate with charms that do not + honour these assumptions about alert rules then an object derived + from `MetricsEndpointAggregator` may be used by overriding the + `_get_alert_rules()` method. + + `MetricsEndpointAggregator` ensures that Prometheus scrape job + specifications and alert rules are annotated with Juju topology + information, just like `MetricsEndpointProvider` and + `MetricsEndpointConsumer` do. + + By default, `MetricsEndpointAggregator` ensures that Prometheus + "instance" labels refer to Juju topology. This ensures that + instance labels are stable over unit recreation. While it is not + advisable to change this option, if required it can be done by + setting the "relabel_instance" keyword argument to `False` when + constructing an aggregator object. + """ + + _stored = StoredState() + + def __init__( + self, + charm, + relation_names: Optional[dict] = None, + relabel_instance=True, + resolve_addresses=False, + ): + """Construct a `MetricsEndpointAggregator`. + + Args: + charm: a `CharmBase` object that manages this + `MetricsEndpointAggregator` object. Typically, this is + `self` in the instantiating class. + relation_names: a dictionary with three keys. The value + of the "scrape_target" and "alert_rules" keys are + the relation names over which scrape job and alert rule + information is gathered by this `MetricsEndpointAggregator`. + And the value of the "prometheus" key is the name of + the relation with a `MetricsEndpointConsumer` such as + the Prometheus charm. + relabel_instance: A boolean flag indicating if Prometheus + scrape job "instance" labels must refer to Juju Topology. + resolve_addresses: A boolean flag indiccating if the aggregator + should attempt to perform DNS lookups of targets and append + a `dns_name` label + """ + self._charm = charm + + relation_names = relation_names or {} + + self._prometheus_relation = relation_names.get( + "prometheus", "downstream-prometheus-scrape" + ) + self._target_relation = relation_names.get("scrape_target", "prometheus-target") + self._alert_rules_relation = relation_names.get("alert_rules", "prometheus-rules") + + super().__init__(charm, self._prometheus_relation) + self._stored.set_default(jobs=[], alert_rules=[]) + + self._relabel_instance = relabel_instance + self._resolve_addresses = resolve_addresses + + # manage Prometheus charm relation events + prometheus_events = self._charm.on[self._prometheus_relation] + self.framework.observe(prometheus_events.relation_joined, self._set_prometheus_data) + + # manage list of Prometheus scrape jobs from related scrape targets + target_events = self._charm.on[self._target_relation] + self.framework.observe(target_events.relation_changed, self._on_prometheus_targets_changed) + self.framework.observe( + target_events.relation_departed, self._on_prometheus_targets_departed + ) + + # manage alert rules for Prometheus from related scrape targets + alert_rule_events = self._charm.on[self._alert_rules_relation] + self.framework.observe(alert_rule_events.relation_changed, self._on_alert_rules_changed) + self.framework.observe(alert_rule_events.relation_departed, self._on_alert_rules_departed) + + def _set_prometheus_data(self, event): + """Ensure every new Prometheus instances is updated. + + Any time a new Prometheus unit joins the relation with + `MetricsEndpointAggregator`, that Prometheus unit is provided + with the complete set of existing scrape jobs and alert rules. + """ + if not self._charm.unit.is_leader(): + return + + jobs = [] + _type_convert_stored( + self._stored.jobs # pyright: ignore + ) # list of scrape jobs, one per relation + for relation in self.model.relations[self._target_relation]: + targets = self._get_targets(relation) + if targets and relation.app: + jobs.append(self._static_scrape_job(targets, relation.app.name)) + + groups = [] + _type_convert_stored( + self._stored.alert_rules # pyright: ignore + ) # list of alert rule groups + for relation in self.model.relations[self._alert_rules_relation]: + unit_rules = self._get_alert_rules(relation) + if unit_rules and relation.app: + appname = relation.app.name + rules = self._label_alert_rules(unit_rules, appname) + group = {"name": self.group_name(appname), "rules": rules} + groups.append(group) + + event.relation.data[self._charm.app]["scrape_jobs"] = _encode_content(json.dumps(jobs)) + event.relation.data[self._charm.app]["alert_rules"] = _encode_content( + json.dumps({"groups": groups}) + ) + + def _on_prometheus_targets_changed(self, event): + """Update scrape jobs in response to scrape target changes. + + When there is any change in relation data with any scrape + target, the Prometheus scrape job, for that specific target is + updated. + """ + targets = self._get_targets(event.relation) + if not targets: + return + + # new scrape job for the relation that has changed + self.set_target_job_data(targets, event.relation.app.name) + + def set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: + """Update scrape jobs in response to scrape target changes. + + When there is any change in relation data with any scrape + target, the Prometheus scrape job, for that specific target is + updated. Additionally, if this method is called manually, do the + same. + + Args: + targets: a `dict` containing target information + app_name: a `str` identifying the application + kwargs: a `dict` of the extra arguments passed to the function + """ + if not self._charm.unit.is_leader(): + return + + # new scrape job for the relation that has changed + updated_job = self._static_scrape_job(targets, app_name, **kwargs) + + for relation in self.model.relations[self._prometheus_relation]: + jobs = json.loads( + _decode_content(relation.data[self._charm.app].get("scrape_jobs", "[]")) + ) + # list of scrape jobs that have not changed + jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]] + jobs.append(updated_job) + relation.data[self._charm.app]["scrape_jobs"] = _encode_content(json.dumps(jobs)) + + if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore + self._stored.jobs = jobs + + def _on_prometheus_targets_departed(self, event): + """Remove scrape jobs when a target departs. + + Any time a scrape target departs, any Prometheus scrape job + associated with that specific scrape target is removed. + """ + job_name = self._job_name(event.relation.app.name) + unit_name = event.unit.name + self.remove_prometheus_jobs(job_name, unit_name) + + def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""): + """Given a job name and unit name, remove scrape jobs associated. + + The `unit_name` parameter is used for automatic, relation data bag-based + generation, where the unit name in labels can be used to ensure that jobs with + similar names (which are generated via the app name when scanning relation data + bags) are not accidentally removed, as their unit name labels will differ. + For NRPE, the job name is calculated from an ID sent via the NRPE relation, and is + sufficient to uniquely identify the target. + """ + if not self._charm.unit.is_leader(): + return + + for relation in self.model.relations[self._prometheus_relation]: + jobs = json.loads( + _decode_content(relation.data[self._charm.app].get("scrape_jobs", "[]")) + ) + if not jobs: + continue + + changed_job = [j for j in jobs if j.get("job_name") == job_name] + if not changed_job: + continue + changed_job = changed_job[0] + + # list of scrape jobs that have not changed + jobs = [job for job in jobs if job.get("job_name") != job_name] + + # list of scrape jobs for units of the same application that still exist + configs_kept = [ + config + for config in changed_job["static_configs"] # type: ignore + if config.get("labels", {}).get("juju_unit") != unit_name + ] + + if configs_kept: + changed_job["static_configs"] = configs_kept # type: ignore + jobs.append(changed_job) + + relation.data[self._charm.app]["scrape_jobs"] = _encode_content(json.dumps(jobs)) + + if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore + self._stored.jobs = jobs + + def _job_name(self, appname) -> str: + """Construct a scrape job name. + + Each relation has its own unique scrape job name. All units in + the relation are scraped as part of the same scrape job. + + Args: + appname: string name of a related application. + + Returns: + a string Prometheus scrape job name for the application. + """ + return "juju_{}_{}_{}_prometheus_scrape".format( + self.model.name, self.model.uuid[:7], appname + ) + + def _get_targets(self, relation) -> dict: + """Fetch scrape targets for a relation. + + Scrape target information is returned for each unit in the + relation. This information contains the unit name, network + hostname (or address) for that unit, and port on which a + metrics endpoint is exposed in that unit. + + Args: + relation: an `ops.model.Relation` object for which scrape + targets are required. + + Returns: + a dictionary whose keys are names of the units in the + relation. There values associated with each key is itself + a dictionary of the form + ``` + {"hostname": hostname, "port": port} + ``` + """ + targets = {} + for unit in relation.units: + port = relation.data[unit].get("port", 80) + hostname = relation.data[unit].get("hostname") + if hostname: + targets.update({unit.name: {"hostname": hostname, "port": port}}) + + return targets + + def _static_scrape_job(self, targets, application_name, **kwargs) -> dict: + """Construct a static scrape job for an application. + + Args: + targets: a dictionary providing hostname and port for all + scrape target. The keys of this dictionary are unit + names. Values corresponding to these keys are + themselves a dictionary with keys "hostname" and + "port". + application_name: a string name of the application for + which this static scrape job is being constructed. + kwargs: a `dict` of the extra arguments passed to the function + + Returns: + A dictionary corresponding to a Prometheus static scrape + job configuration for one application. The returned + dictionary may be transformed into YAML and appended to + the list of any existing list of Prometheus static configs. + """ + juju_model = self.model.name + juju_model_uuid = self.model.uuid + + job = { + "job_name": self._job_name(application_name), + "static_configs": [ + { + "targets": ["{}:{}".format(target["hostname"], target["port"])], + "labels": { + "juju_model": juju_model, + "juju_model_uuid": juju_model_uuid, + "juju_application": application_name, + "juju_unit": unit_name, + "host": target["hostname"], + # Expanding this will merge the dicts and replace the + # topology labels if any were present/found + **self._static_config_extra_labels(target), + }, + } + for unit_name, target in targets.items() + ], + "relabel_configs": self._relabel_configs + kwargs.get("relabel_configs", []), + } + job.update(kwargs.get("updates", {})) + + return job + + def _static_config_extra_labels(self, target: Dict[str, str]) -> Dict[str, str]: + """Build a list of extra static config parameters, if specified.""" + extra_info = {} + + if self._resolve_addresses: + try: + dns_name = socket.gethostbyaddr(target["hostname"])[0] + except OSError: + logger.debug("Could not perform DNS lookup for %s", target["hostname"]) + dns_name = target["hostname"] + extra_info["dns_name"] = dns_name + + return extra_info + + @property + def _relabel_configs(self) -> list: + """Create Juju topology relabeling configuration. + + Using Juju topology for instance labels ensures that these + labels are stable across unit recreation. + + Returns: + a list of Prometheus relabeling configurations. Each item in + this list is one relabel configuration. + """ + return ( + [ + { + "source_labels": [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_unit", + ], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + ] + if self._relabel_instance + else [] + ) + + def _on_alert_rules_changed(self, event): + """Update alert rules in response to scrape target changes. + + When there is any change in alert rule relation data for any + scrape target, the list of alert rules for that specific + target is updated. + """ + unit_rules = self._get_alert_rules(event.relation) + if not unit_rules: + return + + app_name = event.relation.app.name + self.set_alert_rule_data(app_name, unit_rules) + + def set_alert_rule_data(self, name: str, unit_rules: dict, label_rules: bool = True) -> None: + """Update alert rule data. + + The unit rules should be a dict, which is has additional Juju topology labels added. For + rules generated by the NRPE exporter, they are pre-labeled so lookups can be performed. + """ + if not self._charm.unit.is_leader(): + return + + if label_rules: + rules = self._label_alert_rules(unit_rules, name) + else: + rules = [unit_rules] + updated_group = {"name": self.group_name(name), "rules": rules} + + for relation in self.model.relations[self._prometheus_relation]: + alert_rules = json.loads( + _decode_content(relation.data[self._charm.app].get("alert_rules", "{}")) + ) + groups = alert_rules.get("groups", []) + # list of alert rule groups that have not changed + for group in groups: + if group["name"] == updated_group["name"]: + group["rules"] = [r for r in group["rules"] if r not in updated_group["rules"]] + group["rules"].extend(updated_group["rules"]) + + if updated_group["name"] not in [g["name"] for g in groups]: + groups.append(updated_group) + relation.data[self._charm.app]["alert_rules"] = _encode_content( + json.dumps({"groups": groups}) + ) + + if not _type_convert_stored(self._stored.alert_rules) == groups: # pyright: ignore + self._stored.alert_rules = groups + + def _on_alert_rules_departed(self, event): + """Remove alert rules for departed targets. + + Any time a scrape target departs any alert rules associated + with that specific scrape target is removed. + """ + group_name = self.group_name(event.relation.app.name) + unit_name = event.unit.name + self.remove_alert_rules(group_name, unit_name) + + def remove_alert_rules(self, group_name: str, unit_name: str) -> None: + """Remove an alert rule group from relation data.""" + if not self._charm.unit.is_leader(): + return + + for relation in self.model.relations[self._prometheus_relation]: + alert_rules = json.loads( + _decode_content(relation.data[self._charm.app].get("alert_rules", "{}")) + ) + if not alert_rules: + continue + + groups = alert_rules.get("groups", []) + if not groups: + continue + + changed_group = [group for group in groups if group["name"] == group_name] + if not changed_group: + continue + changed_group = changed_group[0] + + # list of alert rule groups that have not changed + groups = [group for group in groups if group["name"] != group_name] + + # list of alert rules not associated with departing unit + rules_kept = [ + rule + for rule in changed_group.get("rules") # type: ignore + if rule.get("labels").get("juju_unit") != unit_name + ] + + if rules_kept: + changed_group["rules"] = rules_kept # type: ignore + groups.append(changed_group) + + relation.data[self._charm.app]["alert_rules"] = _encode_content( + json.dumps({"groups": groups}) if groups else "{}" + ) + + if not _type_convert_stored(self._stored.alert_rules) == groups: # pyright: ignore + self._stored.alert_rules = groups + + def _get_alert_rules(self, relation) -> dict: + """Fetch alert rules for a relation. + + Each unit of the related scrape target may have its own + associated alert rules. Alert rules for all units are returned + indexed by unit name. + + Args: + relation: an `ops.model.Relation` object for which alert + rules are required. + + Returns: + a dictionary whose keys are names of the units in the + relation. There values associated with each key is a list + of alert rules. Each rule is in dictionary format. The + structure "rule dictionary" corresponds to single + Prometheus alert rule. + """ + rules = {} + for unit in relation.units: + unit_rules = yaml.safe_load(_decode_content(relation.data[unit].get("groups", ""))) + if unit_rules: + rules.update({unit.name: unit_rules}) + + return rules + + def group_name(self, unit_name: str) -> str: + """Construct name for an alert rule group. + + Each unit in a relation may define its own alert rules. All + rules, for all units in a relation are grouped together and + given a single alert rule group name. + + Args: + unit_name: string name of a related application. + + Returns: + a string Prometheus alert rules group name for the unit. + """ + unit_name = re.sub(r"/", "_", unit_name) + return "juju_{}_{}_{}_alert_rules".format(self.model.name, self.model.uuid[:7], unit_name) + + def _label_alert_rules(self, unit_rules, app_name: str) -> list: + """Apply juju topology labels to alert rules. + + Args: + unit_rules: a list of alert rules, where each rule is in + dictionary format. + app_name: a string name of the application to which the + alert rules belong. + + Returns: + a list of alert rules with Juju topology labels. + """ + labeled_rules = [] + for unit_name, rules in unit_rules.items(): + for rule in rules: + # the new JujuTopology removed this, so build it up by hand + matchers = { + "juju_{}".format(k): v + for k, v in JujuTopology(self.model.name, self.model.uuid, app_name, unit_name) + .as_dict(excluded_keys=["charm_name"]) + .items() + } + rule["labels"].update(matchers.items()) + labeled_rules.append(rule) + + return labeled_rules + + +class CosTool: + """Uses cos-tool to inject label matchers into alert rule expressions and validate rules.""" + + _path = None + _disabled = False + + def __init__(self, charm): + self._charm = charm + + @property + def path(self): + """Lazy lookup of the path of cos-tool.""" + if self._disabled: + return None + if not self._path: + self._path = self._get_tool_path() + if not self._path: + logger.debug("Skipping injection of juju topology as label matchers") + self._disabled = True + return self._path + + def apply_label_matchers(self, rules) -> dict: + """Will apply label matchers to the expression of all alerts in all supplied groups.""" + if not self.path: + return rules + for group in rules["groups"]: + rules_in_group = group.get("rules", []) + for rule in rules_in_group: + topology = {} + # if the user for some reason has provided juju_unit, we'll need to honor it + # in most cases, however, this will be empty + for label in [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_charm", + "juju_unit", + ]: + if label in rule["labels"]: + topology[label] = rule["labels"][label] + + rule["expr"] = self.inject_label_matchers(rule["expr"], topology) + return rules + + def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]: + """Will validate correctness of alert rules, returning a boolean and any errors.""" + if not self.path: + logger.debug("`cos-tool` unavailable. Not validating alert correctness.") + return True, "" + + with tempfile.TemporaryDirectory() as tmpdir: + rule_path = Path(tmpdir + "/validate_rule.yaml") + rule_path.write_text(yaml.dump(rules)) + + args = [str(self.path), "validate", str(rule_path)] + # noinspection PyBroadException + try: + self._exec(args) + return True, "" + except subprocess.CalledProcessError as e: + logger.debug("Validating the rules failed: %s", e.output) + return False, ", ".join( + [ + line + for line in e.output.decode("utf8").splitlines() + if "error validating" in line + ] + ) + + def validate_scrape_jobs(self, jobs: list) -> bool: + """Validate scrape jobs using cos-tool.""" + if not self.path: + logger.debug("`cos-tool` unavailable. Not validating scrape jobs.") + return True + conf = {"scrape_configs": jobs} + with tempfile.NamedTemporaryFile() as tmpfile: + with open(tmpfile.name, "w") as f: + f.write(yaml.safe_dump(conf)) + try: + self._exec([str(self.path), "validate-config", tmpfile.name]) + except subprocess.CalledProcessError as e: + logger.error("Validating scrape jobs failed: {}".format(e.output)) + raise + return True + + def inject_label_matchers(self, expression, topology) -> str: + """Add label matchers to an expression.""" + if not topology: + return expression + if not self.path: + logger.debug("`cos-tool` unavailable. Leaving expression unchanged: %s", expression) + return expression + args = [str(self.path), "transform"] + args.extend( + ["--label-matcher={}={}".format(key, value) for key, value in topology.items()] + ) + + args.extend(["{}".format(expression)]) + # noinspection PyBroadException + try: + return self._exec(args) + except subprocess.CalledProcessError as e: + logger.debug('Applying the expression failed: "%s", falling back to the original', e) + return expression + + def _get_tool_path(self) -> Optional[Path]: + arch = platform.machine() + arch = "amd64" if arch == "x86_64" else arch + res = "cos-tool-{}".format(arch) + try: + path = Path(res).resolve() + path.chmod(0o777) + return path + except NotImplementedError: + logger.debug("System lacks support for chmod") + except FileNotFoundError: + logger.debug('Could not locate cos-tool at: "{}"'.format(res)) + return None + + def _exec(self, cmd) -> str: + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return result.stdout.decode("utf-8").strip() diff --git a/src/charm.py b/src/charm.py index ccee97eb..60032e6d 100755 --- a/src/charm.py +++ b/src/charm.py @@ -24,7 +24,7 @@ adjust_resource_requirements, ) from charms.observability_libs.v1.cert_handler import CertHandler -from charms.prometheus_k8s.v0.prometheus_scrape import ( +from charms.prometheus_k8s.v1.prometheus_scrape import ( MetricsEndpointConsumer, MetricsEndpointProvider, PrometheusConfig, diff --git a/tests/unit/test_endpoint_aggregator.py b/tests/unit/test_endpoint_aggregator_v0.py similarity index 100% rename from tests/unit/test_endpoint_aggregator.py rename to tests/unit/test_endpoint_aggregator_v0.py diff --git a/tests/unit/test_endpoint_aggregator_v1.py b/tests/unit/test_endpoint_aggregator_v1.py new file mode 100644 index 00000000..4fb221ff --- /dev/null +++ b/tests/unit/test_endpoint_aggregator_v1.py @@ -0,0 +1,752 @@ +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. + +import json +import unittest +from unittest.mock import patch + +from charms.prometheus_k8s.v1.prometheus_scrape import MetricsEndpointAggregator +from ops.charm import CharmBase +from ops.testing import Harness + +PROMETHEUS_RELATION = "metrics-endpoint" +SCRAPE_TARGET_RELATION = "prometheus-target" +ALERT_RULES_RELATION = "prometheus-rules" +AGGREGATOR_META = f""" +name: aggregator-tester +containers: + aggregator-tester: +provides: + {PROMETHEUS_RELATION}: + interface: prometheus_scrape +requires: + {SCRAPE_TARGET_RELATION}: + interface: http + {ALERT_RULES_RELATION}: + interface: prometheus-rules +""" + +RELABEL_INSTANCE_CONFIG = { + "source_labels": [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_unit", + ], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", +} + +ALERT_RULE_1 = """- alert: CPU_Usage + expr: cpu_usage_idle{is_container!=\"True\", group=\"promoagents-juju\"} < 10 + for: 5m + labels: + override_group_by: host + severity: page + cloud: juju + annotations: + description: | + Host {{ $labels.host }} has had < 10% idle cpu for the last 5m + summary: Host {{ $labels.host }} CPU free is less than 10% +""" +ALERT_RULE_2 = """- alert: DiskFull + expr: disk_free{is_container!=\"True\", fstype!~\".*tmpfs|squashfs|overlay\"} <1024 + for: 5m + labels: + override_group_by: host + severity: page + annotations: + description: | + Host {{ $labels.host}} {{ $labels.path }} is full + summary: Host {{ $labels.host }} {{ $labels.path}} is full +""" + + +class EndpointAggregatorCharm(CharmBase): + def __init__(self, *args, **kwargs): + super().__init__(*args) + + relation_names = { + "prometheus": PROMETHEUS_RELATION, + "scrape_target": SCRAPE_TARGET_RELATION, + "alert_rules": ALERT_RULES_RELATION, + } + self._aggregator = MetricsEndpointAggregator( + self, + relation_names, + ) + + +class EndpointResolvingAggregatorCharm(CharmBase): + def __init__(self, *args, **kwargs): + super().__init__(*args) + + relation_names = { + "prometheus": PROMETHEUS_RELATION, + "scrape_target": SCRAPE_TARGET_RELATION, + "alert_rules": ALERT_RULES_RELATION, + } + self._aggregator = MetricsEndpointAggregator(self, relation_names, resolve_addresses=True) + + +class TestEndpointAggregator(unittest.TestCase): + def setUp(self): + self.harness = Harness(EndpointAggregatorCharm, meta=AGGREGATOR_META) + self.harness.set_model_info(name="testmodel", uuid="12de4fae-06cc-4ceb-9089-567be09fec78") + self.addCleanup(self.harness.cleanup) + self.harness.set_leader(True) + self.harness.begin_with_initial_hooks() + + def test_adding_prometheus_then_target_forwards_a_labeled_scrape_job(self): + prometheus_rel_id = self.harness.add_relation(PROMETHEUS_RELATION, "prometheus") + self.harness.add_relation_unit(prometheus_rel_id, "prometheus/0") + + target_rel_id = self.harness.add_relation(SCRAPE_TARGET_RELATION, "target-app") + self.harness.add_relation_unit(target_rel_id, "target-app/0") + + hostname = "scrape_target_0" + port = "1234" + self.harness.update_relation_data( + target_rel_id, + "target-app/0", + { + "hostname": f"{hostname}", + "port": f"{port}", + }, + ) + + prometheus_rel_data = self.harness.get_relation_data( + prometheus_rel_id, self.harness.model.app.name + ) + scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + expected_jobs = [ + { + "job_name": "juju_testmodel_12de4fa_target-app_prometheus_scrape", + "static_configs": [ + { + "targets": ["scrape_target_0:1234"], + "labels": { + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "target-app", + "juju_unit": "target-app/0", + "host": "scrape_target_0", + }, + } + ], + "relabel_configs": [RELABEL_INSTANCE_CONFIG], + } + ] + self.assertListEqual(scrape_jobs, expected_jobs) + + def test_adding_prometheus_then_target_forwards_a_labeled_alert_rule(self): + prometheus_rel_id = self.harness.add_relation(PROMETHEUS_RELATION, "prometheus") + self.harness.add_relation_unit(prometheus_rel_id, "prometheus/0") + + alert_rules_rel_id = self.harness.add_relation(ALERT_RULES_RELATION, "rules-app") + self.harness.add_relation_unit(alert_rules_rel_id, "rules-app/0") + self.harness.update_relation_data( + alert_rules_rel_id, "rules-app/0", {"groups": ALERT_RULE_1} + ) + + prometheus_rel_data = self.harness.get_relation_data( + prometheus_rel_id, self.harness.model.app.name + ) + + alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + groups = alert_rules.get("groups", []) + self.assertEqual(len(groups), 1) + group = groups[0] + + expected_group = { + "name": "juju_testmodel_12de4fa_rules-app_alert_rules", + "rules": [ + { + "alert": "CPU_Usage", + "expr": 'cpu_usage_idle{is_container!="True", group="promoagents-juju"} < 10', + "for": "5m", + "labels": { + "override_group_by": "host", + "severity": "page", + "cloud": "juju", + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "rules-app", + "juju_unit": "rules-app/0", + }, + "annotations": { + "description": "Host {{ $labels.host }} has had < 10% idle cpu for the last 5m\n", + "summary": "Host {{ $labels.host }} CPU free is less than 10%", + }, + } + ], + } + self.maxDiff = None + self.assertDictEqual(group, expected_group) + + def test_adding_target_then_prometheus_forwards_a_labeled_scrape_job(self): + target_rel_id = self.harness.add_relation(SCRAPE_TARGET_RELATION, "target-app") + self.harness.add_relation_unit(target_rel_id, "target-app/0") + + hostname = "scrape_target_0" + port = "1234" + self.harness.update_relation_data( + target_rel_id, + "target-app/0", + { + "hostname": f"{hostname}", + "port": f"{port}", + }, + ) + + prometheus_rel_id = self.harness.add_relation(PROMETHEUS_RELATION, "prometheus") + self.harness.add_relation_unit(prometheus_rel_id, "prometheus/0") + + prometheus_rel_data = self.harness.get_relation_data( + prometheus_rel_id, self.harness.model.app.name + ) + scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + expected_jobs = [ + { + "job_name": "juju_testmodel_12de4fa_target-app_prometheus_scrape", + "static_configs": [ + { + "targets": ["scrape_target_0:1234"], + "labels": { + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "target-app", + "juju_unit": "target-app/0", + "host": "scrape_target_0", + }, + } + ], + "relabel_configs": [RELABEL_INSTANCE_CONFIG], + } + ] + self.assertListEqual(scrape_jobs, expected_jobs) + + def test_adding_target_then_prometheus_forwards_a_labeled_alert_rule(self): + prometheus_rel_id = self.harness.add_relation(PROMETHEUS_RELATION, "prometheus") + self.harness.add_relation_unit(prometheus_rel_id, "prometheus/0") + + alert_rules_rel_id = self.harness.add_relation(ALERT_RULES_RELATION, "rules-app") + self.harness.add_relation_unit(alert_rules_rel_id, "rules-app/0") + self.harness.update_relation_data( + alert_rules_rel_id, "rules-app/0", {"groups": ALERT_RULE_1} + ) + + prometheus_rel_data = self.harness.get_relation_data( + prometheus_rel_id, self.harness.model.app.name + ) + + alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + groups = alert_rules.get("groups", []) + self.assertEqual(len(groups), 1) + group = groups[0] + + expected_group = { + "name": "juju_testmodel_12de4fa_rules-app_alert_rules", + "rules": [ + { + "alert": "CPU_Usage", + "expr": 'cpu_usage_idle{is_container!="True", group="promoagents-juju"} < 10', + "for": "5m", + "labels": { + "override_group_by": "host", + "severity": "page", + "cloud": "juju", + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "rules-app", + "juju_unit": "rules-app/0", + }, + "annotations": { + "description": "Host {{ $labels.host }} has had < 10% idle cpu for the last 5m\n", + "summary": "Host {{ $labels.host }} CPU free is less than 10%", + }, + } + ], + } + self.assertDictEqual(group, expected_group) + + def test_scrape_jobs_from_multiple_target_applications_are_forwarded(self): + prometheus_rel_id = self.harness.add_relation(PROMETHEUS_RELATION, "prometheus") + self.harness.add_relation_unit(prometheus_rel_id, "prometheus/0") + + target_rel_id_1 = self.harness.add_relation(SCRAPE_TARGET_RELATION, "target-app-1") + self.harness.add_relation_unit(target_rel_id_1, "target-app-1/0") + self.harness.update_relation_data( + target_rel_id_1, + "target-app-1/0", + { + "hostname": "scrape_target_0", + "port": "1234", + }, + ) + + target_rel_id_2 = self.harness.add_relation(SCRAPE_TARGET_RELATION, "target-app-2") + self.harness.add_relation_unit(target_rel_id_2, "target-app-2/0") + self.harness.update_relation_data( + target_rel_id_2, + "target-app-2/0", + { + "hostname": "scrape_target_1", + "port": "5678", + }, + ) + + prometheus_rel_data = self.harness.get_relation_data( + prometheus_rel_id, self.harness.model.app.name + ) + scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + self.assertEqual(len(scrape_jobs), 2) + + expected_jobs = [ + { + "job_name": "juju_testmodel_12de4fa_target-app-1_prometheus_scrape", + "static_configs": [ + { + "targets": ["scrape_target_0:1234"], + "labels": { + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "target-app-1", + "juju_unit": "target-app-1/0", + "host": "scrape_target_0", + }, + } + ], + "relabel_configs": [RELABEL_INSTANCE_CONFIG], + }, + { + "job_name": "juju_testmodel_12de4fa_target-app-2_prometheus_scrape", + "static_configs": [ + { + "targets": ["scrape_target_1:5678"], + "labels": { + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "target-app-2", + "juju_unit": "target-app-2/0", + "host": "scrape_target_1", + }, + } + ], + "relabel_configs": [RELABEL_INSTANCE_CONFIG], + }, + ] + + self.assertListEqual(scrape_jobs, expected_jobs) + + def test_alert_rules_from_multiple_target_applications_are_forwarded(self): + prometheus_rel_id = self.harness.add_relation(PROMETHEUS_RELATION, "prometheus") + self.harness.add_relation_unit(prometheus_rel_id, "prometheus/0") + + alert_rules_rel_id_1 = self.harness.add_relation(ALERT_RULES_RELATION, "rules-app-1") + self.harness.add_relation_unit(alert_rules_rel_id_1, "rules-app-1/0") + self.harness.update_relation_data( + alert_rules_rel_id_1, + "rules-app-1/0", + {"groups": ALERT_RULE_1}, + ) + + alert_rules_rel_id_2 = self.harness.add_relation(ALERT_RULES_RELATION, "rules-app-2") + self.harness.add_relation_unit(alert_rules_rel_id_2, "rules-app-2/0") + self.harness.update_relation_data( + alert_rules_rel_id_2, + "rules-app-2/0", + {"groups": ALERT_RULE_2}, + ) + + prometheus_rel_data = self.harness.get_relation_data( + prometheus_rel_id, self.harness.model.app.name + ) + + alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + groups = alert_rules.get("groups", []) + self.assertEqual(len(groups), 2) + expected_groups = [ + { + "name": "juju_testmodel_12de4fa_rules-app-1_alert_rules", + "rules": [ + { + "alert": "CPU_Usage", + "expr": 'cpu_usage_idle{is_container!="True", group="promoagents-juju"} < 10', + "for": "5m", + "labels": { + "override_group_by": "host", + "severity": "page", + "cloud": "juju", + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "rules-app-1", + "juju_unit": "rules-app-1/0", + }, + "annotations": { + "description": "Host {{ $labels.host }} has had < 10% idle cpu for the last 5m\n", + "summary": "Host {{ $labels.host }} CPU free is less than 10%", + }, + } + ], + }, + { + "name": "juju_testmodel_12de4fa_rules-app-2_alert_rules", + "rules": [ + { + "alert": "DiskFull", + "expr": 'disk_free{is_container!="True", fstype!~".*tmpfs|squashfs|overlay"} <1024', + "for": "5m", + "labels": { + "override_group_by": "host", + "severity": "page", + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "rules-app-2", + "juju_unit": "rules-app-2/0", + }, + "annotations": { + "description": "Host {{ $labels.host}} {{ $labels.path }} is full\nsummary: Host {{ $labels.host }} {{ $labels.path}} is full\n" + }, + } + ], + }, + ] + self.assertListEqual(groups, expected_groups) + + def test_scrape_job_removal_differentiates_between_applications(self): + prometheus_rel_id = self.harness.add_relation(PROMETHEUS_RELATION, "prometheus") + self.harness.add_relation_unit(prometheus_rel_id, "prometheus/0") + + target_rel_id_1 = self.harness.add_relation("prometheus-target", "target-app-1") + self.harness.add_relation_unit(target_rel_id_1, "target-app-1/0") + self.harness.update_relation_data( + target_rel_id_1, + "target-app-1/0", + { + "hostname": "scrape_target_0", + "port": "1234", + }, + ) + + target_rel_id_2 = self.harness.add_relation("prometheus-target", "target-app-2") + self.harness.add_relation_unit(target_rel_id_2, "target-app-2/0") + self.harness.update_relation_data( + target_rel_id_2, + "target-app-2/0", + { + "hostname": "scrape_target_1", + "port": "5678", + }, + ) + + prometheus_rel_data = self.harness.get_relation_data( + prometheus_rel_id, self.harness.model.app.name + ) + scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + self.assertEqual(len(scrape_jobs), 2) + + self.harness.remove_relation_unit(target_rel_id_2, "target-app-2/0") + scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + self.assertEqual(len(scrape_jobs), 1) + + expected_jobs = [ + { + "job_name": "juju_testmodel_12de4fa_target-app-1_prometheus_scrape", + "static_configs": [ + { + "targets": ["scrape_target_0:1234"], + "labels": { + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "target-app-1", + "juju_unit": "target-app-1/0", + "host": "scrape_target_0", + }, + } + ], + "relabel_configs": [RELABEL_INSTANCE_CONFIG], + } + ] + self.assertListEqual(scrape_jobs, expected_jobs) + + def test_alert_rules_removal_differentiates_between_applications(self): + prometheus_rel_id = self.harness.add_relation(PROMETHEUS_RELATION, "prometheus") + self.harness.add_relation_unit(prometheus_rel_id, "prometheus/0") + + alert_rules_rel_id_1 = self.harness.add_relation("prometheus-rules", "rules-app-1") + self.harness.add_relation_unit(alert_rules_rel_id_1, "rules-app-1/0") + self.harness.update_relation_data( + alert_rules_rel_id_1, + "rules-app-1/0", + {"groups": ALERT_RULE_1}, + ) + + alert_rules_rel_id_2 = self.harness.add_relation("prometheus-rules", "rules-app-2") + self.harness.add_relation_unit(alert_rules_rel_id_2, "rules-app-2/0") + self.harness.update_relation_data( + alert_rules_rel_id_2, + "rules-app-2/0", + {"groups": ALERT_RULE_2}, + ) + + prometheus_rel_data = self.harness.get_relation_data( + prometheus_rel_id, self.harness.model.app.name + ) + + alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + groups = alert_rules.get("groups", []) + self.assertEqual(len(groups), 2) + + self.harness.remove_relation_unit(alert_rules_rel_id_2, "rules-app-2/0") + alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + groups = alert_rules.get("groups", []) + self.assertEqual(len(groups), 1) + + expected_groups = [ + { + "name": "juju_testmodel_12de4fa_rules-app-1_alert_rules", + "rules": [ + { + "alert": "CPU_Usage", + "expr": 'cpu_usage_idle{is_container!="True", group="promoagents-juju"} < 10', + "for": "5m", + "labels": { + "override_group_by": "host", + "severity": "page", + "cloud": "juju", + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "rules-app-1", + "juju_unit": "rules-app-1/0", + }, + "annotations": { + "description": "Host {{ $labels.host }} has had < 10% idle cpu for the last 5m\n", + "summary": "Host {{ $labels.host }} CPU free is less than 10%", + }, + } + ], + }, + ] + + self.assertListEqual(groups, expected_groups) + + def test_removing_scrape_jobs_differentiates_between_units(self): + prometheus_rel_id = self.harness.add_relation(PROMETHEUS_RELATION, "prometheus") + self.harness.add_relation_unit(prometheus_rel_id, "prometheus/0") + + target_rel_id = self.harness.add_relation("prometheus-target", "target-app") + self.harness.add_relation_unit(target_rel_id, "target-app/0") + self.harness.update_relation_data( + target_rel_id, + "target-app/0", + { + "hostname": "scrape_target_0", + "port": "1234", + }, + ) + + self.harness.add_relation_unit(target_rel_id, "target-app/1") + self.harness.update_relation_data( + target_rel_id, + "target-app/1", + { + "hostname": "scrape_target_1", + "port": "5678", + }, + ) + + prometheus_rel_data = self.harness.get_relation_data( + prometheus_rel_id, self.harness.model.app.name + ) + scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + + self.assertEqual(len(scrape_jobs), 1) + self.assertEqual(len(scrape_jobs[0].get("static_configs")), 2) + + self.harness.remove_relation_unit(target_rel_id, "target-app/1") + scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + + self.assertEqual(len(scrape_jobs), 1) + self.assertEqual(len(scrape_jobs[0].get("static_configs")), 1) + + expected_jobs = [ + { + "job_name": "juju_testmodel_12de4fa_target-app_prometheus_scrape", + "static_configs": [ + { + "targets": ["scrape_target_0:1234"], + "labels": { + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "target-app", + "juju_unit": "target-app/0", + "host": "scrape_target_0", + }, + } + ], + "relabel_configs": [RELABEL_INSTANCE_CONFIG], + } + ] + self.assertListEqual(scrape_jobs, expected_jobs) + + def test_removing_alert_rules_differentiates_between_units(self): + prometheus_rel_id = self.harness.add_relation(PROMETHEUS_RELATION, "prometheus") + self.harness.add_relation_unit(prometheus_rel_id, "prometheus/0") + + alert_rules_rel_id = self.harness.add_relation("prometheus-rules", "rules-app") + self.harness.add_relation_unit(alert_rules_rel_id, "rules-app/0") + self.harness.update_relation_data( + alert_rules_rel_id, + "rules-app/0", + {"groups": ALERT_RULE_1}, + ) + + self.harness.add_relation_unit(alert_rules_rel_id, "rules-app/1") + self.harness.update_relation_data( + alert_rules_rel_id, + "rules-app/1", + {"groups": ALERT_RULE_2}, + ) + + prometheus_rel_data = self.harness.get_relation_data( + prometheus_rel_id, self.harness.model.app.name + ) + + alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + groups = alert_rules.get("groups", []) + self.assertEqual(len(groups), 1) + + self.harness.remove_relation_unit(alert_rules_rel_id, "rules-app/1") + + alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + groups = alert_rules.get("groups", []) + self.assertEqual(len(groups), 1) + + expected_groups = [ + { + "name": "juju_testmodel_12de4fa_rules-app_alert_rules", + "rules": [ + { + "alert": "CPU_Usage", + "expr": 'cpu_usage_idle{is_container!="True", group="promoagents-juju"} < 10', + "for": "5m", + "labels": { + "override_group_by": "host", + "severity": "page", + "cloud": "juju", + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "rules-app", + "juju_unit": "rules-app/0", + }, + "annotations": { + "description": "Host {{ $labels.host }} has had < 10% idle cpu for the last 5m\n", + "summary": "Host {{ $labels.host }} CPU free is less than 10%", + }, + } + ], + }, + ] + self.assertListEqual(groups, expected_groups) + + +class TestEndpointAggregatorWithRelabeling(unittest.TestCase): + def setUp(self): + self.harness = Harness(EndpointResolvingAggregatorCharm, meta=AGGREGATOR_META) + self.harness.set_model_info(name="testmodel", uuid="12de4fae-06cc-4ceb-9089-567be09fec78") + self.addCleanup(self.harness.cleanup) + self.harness.set_leader(True) + self.harness.begin_with_initial_hooks() + + @patch("socket.gethostbyaddr", new=lambda *args: ("target.harness.example.org",)) + def test_adding_prometheus_then_target_with_good_dns_adds_label(self): + prometheus_rel_id = self.harness.add_relation(PROMETHEUS_RELATION, "prometheus") + self.harness.add_relation_unit(prometheus_rel_id, "prometheus/0") + + target_rel_id = self.harness.add_relation(SCRAPE_TARGET_RELATION, "target-app") + self.harness.add_relation_unit(target_rel_id, "target-app/0") + + hostname = "scrape_target_0" + port = "1234" + self.harness.update_relation_data( + target_rel_id, + "target-app/0", + { + "hostname": f"{hostname}", + "port": f"{port}", + }, + ) + + prometheus_rel_data = self.harness.get_relation_data( + prometheus_rel_id, self.harness.model.app.name + ) + scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + expected_jobs = [ + { + "job_name": "juju_testmodel_12de4fa_target-app_prometheus_scrape", + "static_configs": [ + { + "targets": ["scrape_target_0:1234"], + "labels": { + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "target-app", + "juju_unit": "target-app/0", + "host": "scrape_target_0", + "dns_name": "target.harness.example.org", + }, + } + ], + "relabel_configs": [RELABEL_INSTANCE_CONFIG], + } + ] + self.maxDiff = None + self.assertListEqual(scrape_jobs, expected_jobs) + + @patch("socket.gethostbyaddr") + def test_adding_prometheus_then_target_with_bad_dns_keeps_input(self, mock_lookup): + mock_lookup.side_effect = OSError() + prometheus_rel_id = self.harness.add_relation(PROMETHEUS_RELATION, "prometheus") + self.harness.add_relation_unit(prometheus_rel_id, "prometheus/0") + + target_rel_id = self.harness.add_relation(SCRAPE_TARGET_RELATION, "target-app") + self.harness.add_relation_unit(target_rel_id, "target-app/0") + + hostname = "scrape_target_0" + port = "1234" + self.harness.update_relation_data( + target_rel_id, + "target-app/0", + { + "hostname": f"{hostname}", + "port": f"{port}", + }, + ) + + prometheus_rel_data = self.harness.get_relation_data( + prometheus_rel_id, self.harness.model.app.name + ) + scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + expected_jobs = [ + { + "job_name": "juju_testmodel_12de4fa_target-app_prometheus_scrape", + "static_configs": [ + { + "targets": ["scrape_target_0:1234"], + "labels": { + "juju_model": "testmodel", + "juju_model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "juju_application": "target-app", + "juju_unit": "target-app/0", + "host": "scrape_target_0", + "dns_name": "scrape_target_0", + }, + } + ], + "relabel_configs": [RELABEL_INSTANCE_CONFIG], + } + ] + self.maxDiff = None + self.assertListEqual(scrape_jobs, expected_jobs) diff --git a/tests/unit/test_endpoint_consumer.py b/tests/unit/test_endpoint_consumer_v0.py similarity index 100% rename from tests/unit/test_endpoint_consumer.py rename to tests/unit/test_endpoint_consumer_v0.py diff --git a/tests/unit/test_endpoint_consumer_v1.py b/tests/unit/test_endpoint_consumer_v1.py new file mode 100644 index 00000000..5b0b23cd --- /dev/null +++ b/tests/unit/test_endpoint_consumer_v1.py @@ -0,0 +1,743 @@ +# Copyright 2020 Canonical Ltd. +# See LICENSE file for licensing details. + +import json +import logging +import unittest +import uuid +from string import Template + +from charms.prometheus_k8s.v1.prometheus_scrape import ( + ALLOWED_KEYS, + MetricsEndpointConsumer, +) +from ops.charm import CharmBase +from ops.framework import StoredState +from ops.testing import Harness + +from tests.unit.helpers import PROJECT_DIR + +logger = logging.getLogger(__name__) + +RELATION_NAME = "metrics-endpoint" +DEFAULT_JOBS = [{"metrics_path": "/metrics"}] +BAD_JOBS = [ + { + "metrics_path": "/metrics", + "static_configs": [ + { + "targets": ["*:80"], + "labels": { + "juju_model": "bad_model", + "juju_application": "bad_application", + "juju_model_uuid": "bad_uuid", + "juju_unit": "bad_unit", + "juju_charm": "bad_charm", + }, + } + ], + } +] + +SCRAPE_METADATA = { + "model": "consumer-model", + "model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "application": "consumer", + "charm_name": "test-charm", +} +FULL_TARGET = "10.1.238.1:6000" +SCRAPE_JOBS = [ + { + "global": {"scrape_interval": "1h"}, + "rule_files": ["/some/file"], + "file_sd_configs": [{"files": "*some-files*"}], + "job_name": "my-first-job", + "metrics_path": "/one-path", + "static_configs": [ + {"targets": [FULL_TARGET, "*:7000"], "labels": {"some_key": "some-value"}} + ], + }, + { + "job_name": "my-second-job", + "static_configs": [ + {"targets": ["*:8000"], "labels": {"some_other_key": "some-other-value"}} + ], + }, +] + + +ALERT_RULES = { + "groups": [ + { + "name": "None_a5edc336-b02e-4fad-b847-c530500c1c86_consumer-tester_alerts", + "rules": [ + { + "alert": "CPUOverUse", + "expr": "process_cpu_seconds_total > 0.12", + "for": "0m", + "labels": { + "severity": "Low", + "juju_model": "None", + "juju_model_uuid": "a5edc336-b02e-4fad-b847-c530500c1c86", + "juju_application": "consumer-tester", + }, + "annotations": { + "summary": "Instance {{ $labels.instance }} CPU over use", + "description": "{{ $labels.instance }} of job " + "{{ $labels.job }} has used too much CPU.", + }, + }, + { + "alert": "PrometheusTargetMissing", + "expr": "up == 0", + "for": "0m", + "labels": { + "severity": "critical", + "juju_model": "None", + "juju_model_uuid": "a5edc336-b02e-4fad-b847-c530500c1c86", + "juju_application": "consumer-tester", + }, + "annotations": { + "summary": "Prometheus target missing (instance {{ $labels.instance }})", + "description": "A Prometheus target has disappeared." + "An exporter might be crashed.\n" + "VALUE = {{ $value }}\n LABELS = {{ $labels }}", + }, + }, + ], + } + ] +} +UNLABELED_ALERT_RULES = { + "groups": [ + { + "name": "unlabeled_external_cpu_alerts", + "rules": [ + { + "alert": "CPUOverUse", + "expr": 'process_cpu_seconds_total{juju_model="None"}', + "for": "0m", + "labels": { + "severity": "Low", + }, + "annotations": { + "summary": "Instance {{ $labels.instance }} CPU over use", + "description": "{{ $labels.instance }} of job " + "{{ $labels.job }} has used too much CPU.", + }, + }, + ], + }, + ] +} +OTHER_SCRAPE_JOBS = [ + { + "metrics_path": "/other-path", + "static_configs": [{"targets": ["*:9000"], "labels": {"other_key": "other-value"}}], + } +] +OTHER_SCRAPE_METADATA = { + "model": "consumer-model", + "model_uuid": "12de4fae-06cc-4ceb-9089-567be09fec78", + "application": "other-consumer", + "charm_name": "other-charm", +} + + +class EndpointConsumerCharm(CharmBase): + _stored = StoredState() + + def __init__(self, *args, **kwargs): + super().__init__(*args) + self._stored.set_default(num_events=0) + self.prometheus_consumer = MetricsEndpointConsumer(self, RELATION_NAME) + self.framework.observe(self.prometheus_consumer.on.targets_changed, self.record_events) + + def record_events(self, event): + self._stored.num_events += 1 + + @property + def version(self): + return "1.0.0" + + +class TestEndpointConsumer(unittest.TestCase): + def setUp(self): + metadata_file = open(PROJECT_DIR / "metadata.yaml") + self.harness = Harness(EndpointConsumerCharm, meta=metadata_file) + + self.addCleanup(self.harness.cleanup) + self.harness.begin() + + def setup_charm_relations(self, multi=False): + """Create relations used by test cases. + + Args: + multi: a boolean indicating if multiple relations must be + created. + """ + rel_ids = [] + self.assertEqual(self.harness.charm._stored.num_events, 0) + rel_id = self.harness.add_relation(RELATION_NAME, "consumer") + rel_ids.append(rel_id) + self.harness.update_relation_data( + rel_id, + "consumer", + { + "scrape_metadata": json.dumps(SCRAPE_METADATA), + "scrape_jobs": json.dumps(SCRAPE_JOBS), + }, + ) + self.harness.add_relation_unit(rel_id, "consumer/0") + self.harness.update_relation_data( + rel_id, + "consumer/0", + { + "prometheus_scrape_unit_address": "1.1.1.1", + "prometheus_scrape_unit_name": "consumer/0", + }, + ) + self.assertEqual(self.harness.charm._stored.num_events, 2) + + if multi: + rel_id = self.harness.add_relation(RELATION_NAME, "other-consumer") + rel_ids.append(rel_id) + self.harness.update_relation_data( + rel_id, + "other-consumer", + { + "scrape_metadata": json.dumps(OTHER_SCRAPE_METADATA), + "scrape_jobs": json.dumps(OTHER_SCRAPE_JOBS), + }, + ) + self.harness.add_relation_unit(rel_id, "other-consumer/0") + self.harness.update_relation_data( + rel_id, + "other-consumer/0", + { + "prometheus_scrape_unit_address": "2.2.2.2", + "prometheus_scrape_unit_name": "other-consumer/0", + }, + ) + self.assertEqual(self.harness.charm._stored.num_events, 4) + + return rel_ids + + def validate_jobs(self, jobs): + """Valdiate that a list of jobs has the expected fields. + + Existence for unit labels is not checked since these do not + exist for all jobs. + + Args: + jobs: list of jobs where each job is a dictionary. + + Raises: + assertion failures if any job is not as expected. + """ + for job in jobs: + self.assertIn("job_name", job) + self.assertIn("static_configs", job) + static_configs = job["static_configs"] + for static_config in static_configs: + self.assertIn("targets", static_config) + self.assertIn("labels", static_config) + labels = static_config["labels"] + self.assertIn("juju_model", labels) + self.assertIn("juju_model_uuid", labels) + self.assertIn("juju_application", labels) + self.assertIn("juju_charm", labels) + + relabel_configs = job["relabel_configs"] + self.assertEqual(len(relabel_configs), 1) + + relabel_config = relabel_configs[0] + self.assertGreaterEqual( + set(relabel_config.get("source_labels")), + {"juju_model", "juju_model_uuid", "juju_application"}, + ) + + def test_consumer_notifies_on_new_scrape_relation(self): + self.assertEqual(self.harness.charm._stored.num_events, 0) + + rel_id = self.harness.add_relation(RELATION_NAME, "consumer") + self.harness.update_relation_data( + rel_id, "consumer", {"scrape_metadata": json.dumps(SCRAPE_METADATA)} + ) + self.assertEqual(self.harness.charm._stored.num_events, 1) + + def test_consumer_notifies_on_new_scrape_target(self): + self.assertEqual(self.harness.charm._stored.num_events, 0) + rel_id = self.harness.add_relation(RELATION_NAME, "consumer") + self.harness.add_relation_unit(rel_id, "consumer/0") + self.harness.update_relation_data( + rel_id, "consumer/0", {"prometheus_scrape_host": "1.1.1.1"} + ) + self.assertEqual(self.harness.charm._stored.num_events, 1) + + def test_consumer_returns_all_static_scrape_labeled_jobs(self): + self.setup_charm_relations() + + jobs = self.harness.charm.prometheus_consumer.jobs() + self.assertEqual(len(jobs), 3) # two wildcards and one fully-qualified + self.validate_jobs(jobs) + + def test_consumer_does_not_unit_label_fully_qualified_targets(self): + self.setup_charm_relations() + + jobs = self.harness.charm.prometheus_consumer.jobs() + self.assertEqual(len(jobs), 3) # two wildcards and one fully-qualified + for job in jobs: + for static_config in job["static_configs"]: + if FULL_TARGET in static_config.get("targets"): + self.assertNotIn("juju_unit", static_config.get("labels")) + + def test_consumer_does_attach_unit_labels_to_wildcard_hosts(self): + self.setup_charm_relations() + + jobs = self.harness.charm.prometheus_consumer.jobs() + self.assertEqual(len(jobs), 3) # two wildcards and one fully-qualified + for job in jobs: + for static_config in job["static_configs"]: + if FULL_TARGET not in static_config.get("targets"): + self.assertIn("juju_unit", static_config.get("labels")) + + def test_consumer_allows_custom_metrics_paths(self): + rel_ids = self.setup_charm_relations() + self.assertEqual(len(rel_ids), 1) + rel_id = rel_ids[0] + + jobs = self.harness.charm.prometheus_consumer.jobs() + for job in jobs: + if job.get("metrics_path"): + name_suffix = job_name_suffix(job["job_name"], juju_job_labels(job), rel_id) + path = named_job_attribute(name_suffix, "metrics_path", "/metrics") + self.assertEqual(job["metrics_path"], path) + + def test_consumer_sanitizes_jobs(self): + self.setup_charm_relations() + + jobs = self.harness.charm.prometheus_consumer.jobs() + for job in jobs: + job_keys = set(job.keys()) + self.assertTrue(job_keys.issubset(ALLOWED_KEYS)) + + def test_consumer_returns_jobs_for_all_relations(self): + self.setup_charm_relations(multi=True) + + jobs = self.harness.charm.prometheus_consumer.jobs() + self.assertEqual(len(jobs), 4) # three wildcards and one fully-qualified + + def test_consumer_scrapes_each_port_for_wildcard_hosts(self): + rel_ids = self.setup_charm_relations() + self.assertEqual(len(rel_ids), 1) + rel_id = rel_ids[0] + + jobs = self.harness.charm.prometheus_consumer.jobs() + self.assertEqual(len(jobs), 3) # two wildcards and one fully-qualified + ports = wildcard_target_ports(SCRAPE_JOBS) + targets = wildcard_targets(jobs, ports) + consumers = self.harness.charm.model.get_relation(RELATION_NAME, rel_id) + self.assertEqual(len(targets), len(ports) * len(consumers.units)) + + def test_consumer_handles_default_scrape_job(self): + self.assertEqual(self.harness.charm._stored.num_events, 0) + + rel_id = self.harness.add_relation(RELATION_NAME, "consumer") + self.harness.update_relation_data( + rel_id, + "consumer", + { + "scrape_metadata": json.dumps(SCRAPE_METADATA), + "scrape_jobs": json.dumps(DEFAULT_JOBS), + }, + ) + self.assertEqual(self.harness.charm._stored.num_events, 1) + self.harness.add_relation_unit(rel_id, "consumer/0") + self.harness.update_relation_data( + rel_id, + "consumer/0", + { + "prometheus_scrape_unit_address": "1.1.1.1", + "prometheus_scrape_unit_name": "provider/0", + }, + ) + self.assertEqual(self.harness.charm._stored.num_events, 2) + + jobs = self.harness.charm.prometheus_consumer.jobs() + self.validate_jobs(jobs) + + def test_consumer_accepts_targets_without_a_port_set(self): + self.assertEqual(self.harness.charm._stored.num_events, 0) + + rel_id = self.harness.add_relation(RELATION_NAME, "consumer") + jobs = DEFAULT_JOBS.copy() + jobs[0]["static_configs"] = [ + { + "targets": ["*"], + } + ] + self.harness.update_relation_data( + rel_id, + "consumer", + { + "scrape_metadata": json.dumps(SCRAPE_METADATA), + "scrape_jobs": json.dumps(jobs), + }, + ) + self.assertEqual(self.harness.charm._stored.num_events, 1) + self.harness.add_relation_unit(rel_id, "consumer/0") + self.harness.update_relation_data( + rel_id, + "consumer/0", + { + "prometheus_scrape_unit_address": "1.1.1.1", + "prometheus_scrape_unit_name": "provider/0", + }, + ) + self.assertEqual(self.harness.charm._stored.num_events, 2) + + jobs = self.harness.charm.prometheus_consumer.jobs() + self.validate_jobs(jobs) + + def test_consumer_returns_alerts_rules_file(self): + self.assertEqual(self.harness.charm._stored.num_events, 0) + + rel_id = self.harness.add_relation(RELATION_NAME, "consumer") + self.harness.update_relation_data( + rel_id, + "consumer", + { + "scrape_metadata": json.dumps(SCRAPE_METADATA), + "alert_rules": json.dumps(ALERT_RULES), + }, + ) + self.harness.add_relation_unit(rel_id, "consumer/0") + self.assertEqual(self.harness.charm._stored.num_events, 1) + + rules_file = self.harness.charm.prometheus_consumer.alerts + alerts = list(rules_file.values())[0] + + alert_names = [x["alert"] for x in alerts["groups"][0]["rules"]] + + self.assertEqual(alert_names, ["CPUOverUse", "PrometheusTargetMissing"]) + + def test_consumer_logs_an_error_on_missing_alerting_data(self): + self.assertEqual(self.harness.charm._stored.num_events, 0) + + bad_metadata = {"bad": "metadata"} + bad_rules = {"bad": "rule"} + + rel_id = self.harness.add_relation(RELATION_NAME, "consumer") + self.harness.update_relation_data( + rel_id, + "consumer", + { + "scrape_metadata": json.dumps(bad_metadata), + "alert_rules": json.dumps(bad_rules), + }, + ) + self.harness.add_relation_unit(rel_id, "consumer/0") + self.assertEqual(self.harness.charm._stored.num_events, 1) + with self.assertLogs(level="WARNING") as logger: + _ = self.harness.charm.prometheus_consumer.alerts + messages = logger.output + self.assertEqual(len(messages), 1) + self.assertIn( + "Alert rules were found but no usable group or identifier was present", messages[0] + ) + + def test_consumer_accepts_rules_with_no_identifier(self): + self.assertEqual(self.harness.charm._stored.num_events, 0) + + rel_id = self.harness.add_relation(RELATION_NAME, "consumer") + self.harness.update_relation_data( + rel_id, + "consumer", + { + "alert_rules": json.dumps(UNLABELED_ALERT_RULES), + }, + ) + self.harness.add_relation_unit(rel_id, "consumer/0") + self.assertEqual(self.harness.charm._stored.num_events, 1) + with self.assertLogs(level="DEBUG") as logger: + _ = self.harness.charm.prometheus_consumer.alerts + messages = logger.output + + searched_message = ( + "No labeled alert rules were found, and no 'scrape_metadata' " + "was available. Using the alert group name as filename." + ) + any_matches = any(searched_message in log_message for log_message in messages) + self.assertTrue(any_matches) + alerts = self.harness.charm.prometheus_consumer.alerts + identifier = f"unlabeled_external_cpu_alerts_{RELATION_NAME}_{rel_id}" + self.assertIn(identifier, alerts.keys()) + self.assertEqual(UNLABELED_ALERT_RULES, alerts[identifier]) + + def test_bad_scrape_job(self): + self.harness.set_leader(True) + bad_scrape_jobs = json.dumps( + [ + { + "metrics_path": "/metrics", + "static_configs": [{"targets": ["*:3100"]}], + "sample_limit": {"not_a_key": "not_a_value"}, + } + ] + ) + app_data = {"scrape_jobs": bad_scrape_jobs, "scrape_metadata": json.dumps(SCRAPE_METADATA)} + + rel_id = self.harness.add_relation(RELATION_NAME, "consumer") + self.harness.add_relation_unit(rel_id, "consumer/0") + self.harness.update_relation_data(rel_id, "consumer", app_data) + self.harness.update_relation_data( + rel_id, + "consumer/0", + { + "prometheus_scrape_unit_address": "1.1.1.1", + "prometheus_scrape_unit_name": "provider/0", + }, + ) + jobs = self.harness.charm.prometheus_consumer.jobs() + self.assertTrue(len(jobs) == 0) + app_data = json.loads( + self.harness.get_relation_data(rel_id, self.harness.charm.app.name).get("event", "{}") + ) + self.assertIn("scrape_job_errors", app_data) + + +def juju_job_labels(job, num=0): + """Fetch job labels. + + Args: + job: a list of static scrape jobs + num: index of static config for which labels must be extracted. + + Returns: + a dictionary of job labels for the first static job. + """ + static_config = job["static_configs"][num] + return static_config["labels"] + + +def job_name_suffix(job_name, labels, rel_id): + """Construct provider set job name. + + Args: + job_name: Consumer generated job name string. + labels: dictionary of juju static job labels + rel_id: id of relation for this job. + + Returns: + string name of job as set by provider (if any) + """ + name_prefix = "juju_{}_{}_{}_prometheus_scrape_".format( + labels["juju_model"], + labels["juju_model_uuid"][:7], + labels["juju_application"], + ) + return job_name[len(name_prefix) :] + + +def named_job_attribute(job_name, attribute, default=None, jobs=SCRAPE_JOBS): + """Fetch and attribute of a named job_name. + + Args: + job_name: string name suffix of job_name. + attribute: string name of attribute. + default: optional default value to be returned if attribute is not found. + jobs: optional list of jobs to search for job_name. + + Returns: + value of requested attribute if found or default. + """ + for job in jobs: + if job["job_name"] in job_name: + return job.get(attribute, default) + return None + + +def wildcard_target_ports(jobs): + """Fetch list of wildcard target ports from a job list. + + Args: + jobs: list of jobs to search for wildcard target ports. + + Returns: + possibly empty list of wildcard target ports. + """ + ports = [] + for job in jobs: + for static_config in job["static_configs"]: + for target in static_config["targets"]: + if target.startswith("*"): + ports.append(target.split(":")[-1]) + return ports + + +def wildcard_targets(jobs, wildcard_ports): + """Fetch list of wildcard targets. + + Args: + jobs: list of jobs to be searched. + wildcard_ports: ports of wildcard targets. + + Returns: + possibly empty list of wildcard targets. + """ + targets = [] + for job in jobs: + for static_config in job["static_configs"]: + for target in static_config["targets"]: + for port in wildcard_ports: + if target.endswith(port): + targets.append(target) + return targets + + +class TestWildcardTargetsWithMutliunitProvider(unittest.TestCase): + """Test how scrape jobs of multiunit providers with wildcard-hosts get rendered to disk.""" + + def set_relation_data(self, metrics_path: str = None, external_url_path: Template = None): + job = { + "job_name": "job", + "static_configs": [{"targets": ["*:1234"]}], + } + if metrics_path: + job.update({"metrics_path": metrics_path}) + + self.harness.update_relation_data( + self.rel_id, + "remote-app", + { + "scrape_metadata": json.dumps( + { + "model": self.__class__.__name__, + "model_uuid": str(uuid.uuid4()), + "application": "remote-app", + "charm_name": "some-provider", + } + ), + "scrape_jobs": json.dumps([job]), + }, + ) + + external_url_path = external_url_path or Template("") + self.harness.update_relation_data( + self.rel_id, + "remote-app/0", + { + "prometheus_scrape_unit_address": "10.10.10.10", + "prometheus_scrape_unit_path": external_url_path.substitute(unit=0), + "prometheus_scrape_unit_name": "remote-app/0", + }, + ) + + self.harness.update_relation_data( + self.rel_id, + "remote-app/1", + { + "prometheus_scrape_unit_address": "11.11.11.11", + "prometheus_scrape_unit_path": external_url_path.substitute(unit=1), + "prometheus_scrape_unit_name": "remote-app/1", + }, + ) + + def setUp(self): + metadata_file = open("metadata.yaml") + self.harness = Harness(EndpointConsumerCharm, meta=metadata_file) + + self.rel_id = self.harness.add_relation(RELATION_NAME, "remote-app") + self.harness.add_relation_unit(self.rel_id, "remote-app/0") + self.harness.add_relation_unit(self.rel_id, "remote-app/1") + + self.addCleanup(self.harness.cleanup) + self.harness.begin() + + def test_bare_job(self): + # WHEN the the provider forwards a nice and simple scrape job + self.set_relation_data(metrics_path=None, external_url_path=None) + + # THEN the consumer side sees two jobs + # AND one static_config per job + # AND one target per static_config + jobs = self.harness.charm.prometheus_consumer.jobs() + self.assertEqual(len(jobs), 2) # two jobs, one job per unit + self.assertEqual(len(jobs[0]["static_configs"]), 1) # one static_config per unit + self.assertEqual(len(jobs[1]["static_configs"]), 1) + + # one target per static_config + self.assertEqual(len(targets_unit_0 := jobs[0]["static_configs"][0]["targets"]), 1) + self.assertEqual(len(targets_unit_1 := jobs[1]["static_configs"][0]["targets"]), 1) + + # AND the consumer side expands it to unit addresses + unit_netlocs = sorted([targets_unit_0[0], targets_unit_1[0]]) + self.assertEqual(unit_netlocs, ["10.10.10.10:1234", "11.11.11.11:1234"]) + + # AND adds the default metrics_path key + self.assertEqual(jobs[0]["metrics_path"], "/metrics") + + def test_job_with_metrics_path(self): + # WHEN the provider specifies a metrics_path + self.set_relation_data(metrics_path="/custom_path", external_url_path=None) + + # THEN the consumer uses it + jobs = self.harness.charm.prometheus_consumer.jobs() + self.assertEqual(jobs[0]["metrics_path"], "/custom_path") + + def test_job_with_same_path_prefix(self): + # WHEN the provider sets unit addresses with the same path in both units + self.set_relation_data(metrics_path=None, external_url_path=Template("/foo")) + + # THEN the consumer side sees two jobs + # AND one static_configs per job + # AN one target per static_config + jobs = self.harness.charm.prometheus_consumer.jobs() + self.assertEqual(len(jobs), 2) # two jobs + self.assertEqual(len(jobs[0]["static_configs"]), 1) # one static_config per unit + self.assertEqual(len(jobs[1]["static_configs"]), 1) + + # one target per static_config + self.assertEqual(len(targets_unit_0 := jobs[0]["static_configs"][0]["targets"]), 1) + self.assertEqual(len(targets_unit_1 := jobs[1]["static_configs"][0]["targets"]), 1) + + # AND the consumer side expands it to unit addresses + unit_netlocs = sorted([targets_unit_0[0], targets_unit_1[0]]) + self.assertEqual(unit_netlocs, ["10.10.10.10:1234", "11.11.11.11:1234"]) + + # AND prefixes the default metrics_path with the unit's path + self.assertEqual(jobs[0]["metrics_path"], "/foo/metrics") + self.assertEqual(jobs[1]["metrics_path"], "/foo/metrics") + + def test_job_with_different_path_prefix(self): + # WHEN the provider sets unit addresses with a path + self.set_relation_data(metrics_path=None, external_url_path=Template("/model-app-$unit")) + + # THEN the consumer side sees one job per unit + jobs = self.harness.charm.prometheus_consumer.jobs() + self.assertEqual(len(jobs), 2) # one job per unit, so 2 in total + self.assertEqual(len(jobs[0]["static_configs"]), 1) + + # one target per static_config + self.assertEqual(len(targets_unit_0 := jobs[0]["static_configs"][0]["targets"]), 1) + self.assertEqual(len(targets_unit_1 := jobs[1]["static_configs"][0]["targets"]), 1) + + # AND the consumer side expands it to unit addresses + unit_netlocs = sorted([targets_unit_0[0], targets_unit_1[0]]) + self.assertEqual(unit_netlocs, ["10.10.10.10:1234", "11.11.11.11:1234"]) + + # AND prefixes the default metrics_path with the unit's path + paths = sorted([jobs[0]["metrics_path"], jobs[1]["metrics_path"]]) + self.assertEqual(paths, ["/model-app-0/metrics", "/model-app-1/metrics"]) + + def test_job_with_port_and_path_prefix(self): + # WHEN the provider sets unit addresses with a path and also specifies metrics_path + self.set_relation_data( + metrics_path="/custom-path", external_url_path=Template("/model-app-$unit") + ) + + # THEN the provider's metrics_path appended to the unit's path instead of the default + jobs = self.harness.charm.prometheus_consumer.jobs() + paths = sorted([jobs[0]["metrics_path"], jobs[1]["metrics_path"]]) + self.assertEqual(paths, ["/model-app-0/custom-path", "/model-app-1/custom-path"]) From 32ca00d2fff31f15cbee61c778d7bf5c082cea61 Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Fri, 19 Jul 2024 18:28:51 -0400 Subject: [PATCH 3/5] Lint --- src/charm.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/charm.py b/src/charm.py index 60032e6d..a2267f0d 100755 --- a/src/charm.py +++ b/src/charm.py @@ -24,17 +24,17 @@ adjust_resource_requirements, ) from charms.observability_libs.v1.cert_handler import CertHandler -from charms.prometheus_k8s.v1.prometheus_scrape import ( - MetricsEndpointConsumer, - MetricsEndpointProvider, - PrometheusConfig, -) from charms.prometheus_k8s.v1.prometheus_remote_write import ( DEFAULT_RELATION_NAME as DEFAULT_REMOTE_WRITE_RELATION_NAME, ) from charms.prometheus_k8s.v1.prometheus_remote_write import ( PrometheusRemoteWriteProvider, ) +from charms.prometheus_k8s.v1.prometheus_scrape import ( + MetricsEndpointConsumer, + MetricsEndpointProvider, + PrometheusConfig, +) from charms.tempo_k8s.v1.charm_tracing import trace_charm from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer from charms.traefik_k8s.v1.ingress_per_unit import ( From 5ef36b1a1d1c2aa7372b4c5db01505d557bfb818 Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Wed, 24 Jul 2024 19:29:30 -0400 Subject: [PATCH 4/5] Fix utest --- tests/unit/test_endpoint_aggregator_v1.py | 34 +++++++++++------------ 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/unit/test_endpoint_aggregator_v1.py b/tests/unit/test_endpoint_aggregator_v1.py index 4fb221ff..3529a8e6 100644 --- a/tests/unit/test_endpoint_aggregator_v1.py +++ b/tests/unit/test_endpoint_aggregator_v1.py @@ -5,7 +5,7 @@ import unittest from unittest.mock import patch -from charms.prometheus_k8s.v1.prometheus_scrape import MetricsEndpointAggregator +from charms.prometheus_k8s.v1.prometheus_scrape import MetricsEndpointAggregator, _decode_content from ops.charm import CharmBase from ops.testing import Harness @@ -119,7 +119,7 @@ def test_adding_prometheus_then_target_forwards_a_labeled_scrape_job(self): prometheus_rel_data = self.harness.get_relation_data( prometheus_rel_id, self.harness.model.app.name ) - scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + scrape_jobs = json.loads(_decode_content(prometheus_rel_data.get("scrape_jobs", "[]"))) expected_jobs = [ { "job_name": "juju_testmodel_12de4fa_target-app_prometheus_scrape", @@ -154,7 +154,7 @@ def test_adding_prometheus_then_target_forwards_a_labeled_alert_rule(self): prometheus_rel_id, self.harness.model.app.name ) - alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + alert_rules = json.loads(_decode_content(prometheus_rel_data.get("alert_rules", "{}"))) groups = alert_rules.get("groups", []) self.assertEqual(len(groups), 1) group = groups[0] @@ -206,7 +206,7 @@ def test_adding_target_then_prometheus_forwards_a_labeled_scrape_job(self): prometheus_rel_data = self.harness.get_relation_data( prometheus_rel_id, self.harness.model.app.name ) - scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + scrape_jobs = json.loads(_decode_content(prometheus_rel_data.get("scrape_jobs", "[]"))) expected_jobs = [ { "job_name": "juju_testmodel_12de4fa_target-app_prometheus_scrape", @@ -241,7 +241,7 @@ def test_adding_target_then_prometheus_forwards_a_labeled_alert_rule(self): prometheus_rel_id, self.harness.model.app.name ) - alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + alert_rules = json.loads(_decode_content(prometheus_rel_data.get("alert_rules", "{}"))) groups = alert_rules.get("groups", []) self.assertEqual(len(groups), 1) group = groups[0] @@ -300,7 +300,7 @@ def test_scrape_jobs_from_multiple_target_applications_are_forwarded(self): prometheus_rel_data = self.harness.get_relation_data( prometheus_rel_id, self.harness.model.app.name ) - scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + scrape_jobs = json.loads(_decode_content(prometheus_rel_data.get("scrape_jobs", "[]"))) self.assertEqual(len(scrape_jobs), 2) expected_jobs = [ @@ -364,7 +364,7 @@ def test_alert_rules_from_multiple_target_applications_are_forwarded(self): prometheus_rel_id, self.harness.model.app.name ) - alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + alert_rules = json.loads(_decode_content(prometheus_rel_data.get("alert_rules", "{}"))) groups = alert_rules.get("groups", []) self.assertEqual(len(groups), 2) expected_groups = [ @@ -444,11 +444,11 @@ def test_scrape_job_removal_differentiates_between_applications(self): prometheus_rel_data = self.harness.get_relation_data( prometheus_rel_id, self.harness.model.app.name ) - scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + scrape_jobs = json.loads(_decode_content(prometheus_rel_data.get("scrape_jobs", "[]"))) self.assertEqual(len(scrape_jobs), 2) self.harness.remove_relation_unit(target_rel_id_2, "target-app-2/0") - scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + scrape_jobs = json.loads(_decode_content(prometheus_rel_data.get("scrape_jobs", "[]"))) self.assertEqual(len(scrape_jobs), 1) expected_jobs = [ @@ -495,12 +495,12 @@ def test_alert_rules_removal_differentiates_between_applications(self): prometheus_rel_id, self.harness.model.app.name ) - alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + alert_rules = json.loads(_decode_content(prometheus_rel_data.get("alert_rules", "{}"))) groups = alert_rules.get("groups", []) self.assertEqual(len(groups), 2) self.harness.remove_relation_unit(alert_rules_rel_id_2, "rules-app-2/0") - alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + alert_rules = json.loads(_decode_content(prometheus_rel_data.get("alert_rules", "{}"))) groups = alert_rules.get("groups", []) self.assertEqual(len(groups), 1) @@ -560,13 +560,13 @@ def test_removing_scrape_jobs_differentiates_between_units(self): prometheus_rel_data = self.harness.get_relation_data( prometheus_rel_id, self.harness.model.app.name ) - scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + scrape_jobs = json.loads(_decode_content(prometheus_rel_data.get("scrape_jobs", "[]"))) self.assertEqual(len(scrape_jobs), 1) self.assertEqual(len(scrape_jobs[0].get("static_configs")), 2) self.harness.remove_relation_unit(target_rel_id, "target-app/1") - scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + scrape_jobs = json.loads(_decode_content(prometheus_rel_data.get("scrape_jobs", "[]"))) self.assertEqual(len(scrape_jobs), 1) self.assertEqual(len(scrape_jobs[0].get("static_configs")), 1) @@ -614,13 +614,13 @@ def test_removing_alert_rules_differentiates_between_units(self): prometheus_rel_id, self.harness.model.app.name ) - alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + alert_rules = json.loads(_decode_content(prometheus_rel_data.get("alert_rules", "{}"))) groups = alert_rules.get("groups", []) self.assertEqual(len(groups), 1) self.harness.remove_relation_unit(alert_rules_rel_id, "rules-app/1") - alert_rules = json.loads(prometheus_rel_data.get("alert_rules", "{}")) + alert_rules = json.loads(_decode_content(prometheus_rel_data.get("alert_rules", "{}"))) groups = alert_rules.get("groups", []) self.assertEqual(len(groups), 1) @@ -682,7 +682,7 @@ def test_adding_prometheus_then_target_with_good_dns_adds_label(self): prometheus_rel_data = self.harness.get_relation_data( prometheus_rel_id, self.harness.model.app.name ) - scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + scrape_jobs = json.loads(_decode_content(prometheus_rel_data.get("scrape_jobs", "[]"))) expected_jobs = [ { "job_name": "juju_testmodel_12de4fa_target-app_prometheus_scrape", @@ -728,7 +728,7 @@ def test_adding_prometheus_then_target_with_bad_dns_keeps_input(self, mock_looku prometheus_rel_data = self.harness.get_relation_data( prometheus_rel_id, self.harness.model.app.name ) - scrape_jobs = json.loads(prometheus_rel_data.get("scrape_jobs", "[]")) + scrape_jobs = json.loads(_decode_content(prometheus_rel_data.get("scrape_jobs", "[]"))) expected_jobs = [ { "job_name": "juju_testmodel_12de4fa_target-app_prometheus_scrape", From a32b24b341754ab4c767c7f72b5d058983da0fce Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Wed, 24 Jul 2024 20:57:17 -0400 Subject: [PATCH 5/5] Add charm-binary-python-packages to the tester charm This was prompted by charmcraft pack failing with "error: can't find Rust compiler". --- tests/integration/prometheus-tester/charmcraft.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/integration/prometheus-tester/charmcraft.yaml b/tests/integration/prometheus-tester/charmcraft.yaml index f35c5da5..5d42256d 100644 --- a/tests/integration/prometheus-tester/charmcraft.yaml +++ b/tests/integration/prometheus-tester/charmcraft.yaml @@ -10,3 +10,11 @@ parts: charm: build-packages: - git + charm-binary-python-packages: + - jsonschema + - cryptography + - pyyaml + - requests + - ops + - cosl + - pydantic>=2