From 1a2e0332232ee03d6b28c8a88fa174b9ab2aba6e Mon Sep 17 00:00:00 2001 From: Michael Thamm Date: Thu, 19 Dec 2024 08:40:38 -0500 Subject: [PATCH 1/5] Update prometheus_scrape and prometheus_remote_write libs for generic host health rules --- .../prometheus_k8s/v0/prometheus_scrape.py | 43 +++++++++++++++++-- .../v1/prometheus_remote_write.py | 25 ++++++++++- 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index e3d35c6f..81c737e5 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -333,6 +333,7 @@ def _on_scrape_targets_changed(self, event): import socket import subprocess import tempfile +import textwrap from collections import defaultdict from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -362,13 +363,13 @@ def _on_scrape_targets_changed(self, event): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 47 +LIBPATCH = 48 -PYDEPS = ["cosl"] +# TODO This is pinned to a branch so we cannot test (i.e. Jhack sync) fast without pushing each time +PYDEPS = ["git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl"] logger = logging.getLogger(__name__) - ALLOWED_KEYS = { "job_name", "metrics_path", @@ -399,6 +400,39 @@ def _on_scrape_targets_changed(self, event): DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" +GENERIC_ALERT_RULES_GROUP = yaml.safe_load( + textwrap.dedent( + """ + groups: + - name: HostHealth + rules: + - alert: HostDown + expr: up < 1 + for: 5m + labels: + severity: critical + annotations: + summary: Host '{{ $labels.instance }}' is down. + description: >- + Host '{{ $labels.instance }}' is down. + VALUE = {{ $value }} + LABELS = {{ $labels }} + - alert: HostUnavailable + # This alert is applicable only when the provider is linked via an aggregator (such as grafana agent) + expr: absent(up) + for: 5m + labels: + severity: critical + annotations: + summary: Metrics not received from host '{{ $labels.instance }}'. + description: >- + Metrics not received from host '{{ $labels.instance }}'. + VALUE = {{ $value }} + LABELS = {{ $labels }} + """ + ) +) + class PrometheusConfig: """A namespace for utility functions for manipulating the prometheus config dict.""" @@ -541,7 +575,7 @@ def expand_wildcard_targets_into_individual_jobs( job_name = modified_job.get("job_name", "unnamed-job") + "-" + unit_num modified_job["job_name"] = job_name modified_job["metrics_path"] = unit_path + ( - job.get("metrics_path") or "/metrics" + job.get("metrics_path") or "/metrics" ) if topology: @@ -1531,6 +1565,7 @@ def set_scrape_job_spec(self, _=None): alert_rules = AlertRules(query_type="promql", topology=self.topology) alert_rules.add_path(self._alert_rules_path, recursive=True) + alert_rules.add(GENERIC_ALERT_RULES_GROUP, group_name_prefix=self.topology.identifier) alert_rules_as_dict = alert_rules.as_dict() for relation in self._charm.model.relations[self._relation_name]: diff --git a/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py b/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py index cf24b9f7..d21fbf69 100644 --- a/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py +++ b/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py @@ -21,6 +21,7 @@ import socket import subprocess import tempfile +import textwrap from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -48,7 +49,7 @@ # to 0 if you are raising the major API version LIBPATCH = 4 -PYDEPS = ["cosl"] +PYDEPS = ["git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl"] logger = logging.getLogger(__name__) @@ -60,6 +61,27 @@ DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" +# TODO Come up with a name that will not collide when Gagent is in model AggregatorHostHealth +GENERIC_ALERT_RULES_GROUP = yaml.safe_load( + textwrap.dedent( + """ + groups: + - name: AggregatorHostHealth + rules: + - alert: HostUnavailable + expr: absent(up) + for: 5m + labels: + severity: critical + annotations: + summary: Metrics not received from host '{{ $labels.instance }}'. + description: >- + Metrics not received from host '{{ $labels.instance }}'. + VALUE = {{ $value }} + LABELS = {{ $labels }} + """ + ) +) class RelationNotFoundError(Exception): """Raised if there is no relation with the given name.""" @@ -485,6 +507,7 @@ def _push_alerts_to_relation_databag(self, relation: Relation) -> None: alert_rules = AlertRules(query_type="promql", topology=self.topology) alert_rules.add_path(self._alert_rules_path) + alert_rules.add(GENERIC_ALERT_RULES_GROUP, group_name_prefix=self.topology.identifier) alert_rules_as_dict = alert_rules.as_dict() From 1b35ab815dbcee42155727b542fd342840db82ae Mon Sep 17 00:00:00 2001 From: Michael Thamm Date: Thu, 19 Dec 2024 18:15:30 -0500 Subject: [PATCH 2/5] chore: remove TODOs --- lib/charms/prometheus_k8s/v0/prometheus_scrape.py | 1 - lib/charms/prometheus_k8s/v1/prometheus_remote_write.py | 1 - 2 files changed, 2 deletions(-) diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index 81c737e5..8467c73a 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -365,7 +365,6 @@ def _on_scrape_targets_changed(self, event): # to 0 if you are raising the major API version LIBPATCH = 48 -# TODO This is pinned to a branch so we cannot test (i.e. Jhack sync) fast without pushing each time PYDEPS = ["git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl"] logger = logging.getLogger(__name__) diff --git a/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py b/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py index d21fbf69..71d8e11b 100644 --- a/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py +++ b/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py @@ -61,7 +61,6 @@ DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" -# TODO Come up with a name that will not collide when Gagent is in model AggregatorHostHealth GENERIC_ALERT_RULES_GROUP = yaml.safe_load( textwrap.dedent( """ From 48e9ce8b4b59628e361bd6654112214e7d6a3a45 Mon Sep 17 00:00:00 2001 From: Michael Thamm Date: Fri, 20 Dec 2024 08:51:15 -0500 Subject: [PATCH 3/5] Switch naming to `HostMetricsMissing` for absent metrics --- lib/charms/prometheus_k8s/v0/prometheus_scrape.py | 8 ++++---- lib/charms/prometheus_k8s/v1/prometheus_remote_write.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index 8467c73a..80c0236c 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -413,19 +413,19 @@ def _on_scrape_targets_changed(self, event): annotations: summary: Host '{{ $labels.instance }}' is down. description: >- - Host '{{ $labels.instance }}' is down. + Host '{{ $labels.instance }}' is down, failed to scrape. VALUE = {{ $value }} LABELS = {{ $labels }} - - alert: HostUnavailable + - alert: HostMetricsMissing # This alert is applicable only when the provider is linked via an aggregator (such as grafana agent) expr: absent(up) for: 5m labels: severity: critical annotations: - summary: Metrics not received from host '{{ $labels.instance }}'. + summary: Metrics not received from host '{{ $labels.instance }}', failed to remote write. description: >- - Metrics not received from host '{{ $labels.instance }}'. + Metrics not received from host '{{ $labels.instance }}', failed to remote write. VALUE = {{ $value }} LABELS = {{ $labels }} """ diff --git a/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py b/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py index 71d8e11b..a216389e 100644 --- a/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py +++ b/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py @@ -67,15 +67,15 @@ groups: - name: AggregatorHostHealth rules: - - alert: HostUnavailable + - alert: HostMetricsMissing expr: absent(up) for: 5m labels: severity: critical annotations: - summary: Metrics not received from host '{{ $labels.instance }}'. + summary: Metrics not received from host '{{ $labels.instance }}', failed to remote write. description: >- - Metrics not received from host '{{ $labels.instance }}'. + Metrics not received from host '{{ $labels.instance }}', failed to remote write. VALUE = {{ $value }} LABELS = {{ $labels }} """ From 082a34cfe383f456e99e100338853611cf77e007 Mon Sep 17 00:00:00 2001 From: Michael Thamm Date: Fri, 20 Dec 2024 12:14:08 -0500 Subject: [PATCH 4/5] test: update cosl dep to the branch for testing CI --- lib/charms/prometheus_k8s/v0/prometheus_scrape.py | 2 +- .../prometheus_k8s/v1/prometheus_remote_write.py | 1 + requirements.txt | 2 +- tests/unit/test_endpoint_provider.py | 11 +++++++---- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index 80c0236c..e5ebb619 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -574,7 +574,7 @@ def expand_wildcard_targets_into_individual_jobs( job_name = modified_job.get("job_name", "unnamed-job") + "-" + unit_num modified_job["job_name"] = job_name modified_job["metrics_path"] = unit_path + ( - job.get("metrics_path") or "/metrics" + job.get("metrics_path") or "/metrics" ) if topology: diff --git a/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py b/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py index a216389e..94dd78af 100644 --- a/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py +++ b/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py @@ -82,6 +82,7 @@ ) ) + class RelationNotFoundError(Exception): """Raised if there is no relation with the given name.""" diff --git a/requirements.txt b/requirements.txt index 08a78f21..7fc56b1e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -cosl>=0.0.46 +git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl cryptography jsonschema ops diff --git a/tests/unit/test_endpoint_provider.py b/tests/unit/test_endpoint_provider.py index 12651b69..f4d69d4f 100644 --- a/tests/unit/test_endpoint_provider.py +++ b/tests/unit/test_endpoint_provider.py @@ -324,7 +324,7 @@ def test_each_alert_rule_is_topology_labeled(self): self.assertIn("alert_rules", data) alerts = json.loads(data["alert_rules"]) self.assertIn("groups", alerts) - self.assertEqual(len(alerts["groups"]), 6) + self.assertEqual(len(alerts["groups"]), 7) for group in alerts["groups"]: for rule in group["rules"]: if "and_unit" not in group["name"]: @@ -360,7 +360,7 @@ def test_each_alert_expression_is_topology_labeled(self): self.assertIn("alert_rules", data) alerts = json.loads(data["alert_rules"]) self.assertIn("groups", alerts) - self.assertEqual(len(alerts["groups"]), 6) + self.assertEqual(len(alerts["groups"]), 7) group = alerts["groups"][0] for rule in group["rules"]: self.assertIn("expr", rule) @@ -753,10 +753,13 @@ def test_unit_label_is_retained_if_hard_coded(self): # check unit topology is present in labels and in alert rule expression relation = self.harness.charm.model.get_relation("metrics-endpoint") alert_rules = json.loads(relation.data[self.harness.charm.app].get("alert_rules")) + from pprint import pprint + pprint(alert_rules) for group in alert_rules["groups"]: for rule in group["rules"]: - self.assertIn("juju_unit", rule["labels"]) - self.assertIn("juju_unit=", rule["expr"]) + if "_HostHealth_alerts" not in group["name"]: # _HostHealth_alerts are injected alerts without juju_unit labels + self.assertIn("juju_unit", rule["labels"]) + self.assertIn("juju_unit=", rule["expr"]) class TestNoLeader(unittest.TestCase): From dbf93410c73552deeee3e2533967d2af20936ab4 Mon Sep 17 00:00:00 2001 From: Michael Thamm Date: Fri, 20 Dec 2024 13:14:53 -0500 Subject: [PATCH 5/5] chore: fmt and static fixes --- .../prometheus_k8s/v0/prometheus_scrape.py | 66 +++++++++---------- .../v1/prometheus_remote_write.py | 44 ++++++------- tests/unit/test_endpoint_provider.py | 6 +- tox.ini | 2 +- 4 files changed, 59 insertions(+), 59 deletions(-) diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index e5ebb619..2f475dc6 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -333,7 +333,6 @@ def _on_scrape_targets_changed(self, event): import socket import subprocess import tempfile -import textwrap from collections import defaultdict from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -399,38 +398,39 @@ def _on_scrape_targets_changed(self, event): DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" -GENERIC_ALERT_RULES_GROUP = yaml.safe_load( - textwrap.dedent( - """ - groups: - - name: HostHealth - rules: - - alert: HostDown - expr: up < 1 - for: 5m - labels: - severity: critical - annotations: - summary: Host '{{ $labels.instance }}' is down. - description: >- - Host '{{ $labels.instance }}' is down, failed to scrape. - VALUE = {{ $value }} - LABELS = {{ $labels }} - - alert: HostMetricsMissing - # This alert is applicable only when the provider is linked via an aggregator (such as grafana agent) - expr: absent(up) - for: 5m - labels: - severity: critical - annotations: - summary: Metrics not received from host '{{ $labels.instance }}', failed to remote write. - description: >- - Metrics not received from host '{{ $labels.instance }}', failed to remote write. - VALUE = {{ $value }} - LABELS = {{ $labels }} - """ - ) -) +GENERIC_ALERT_RULES_GROUP = { + "groups": [ + { + "name": "HostHealth", + "rules": [ + { + "alert": "HostDown", + "expr": "up < 1", + "for": "5m", + "labels": {"severity": "critical"}, + "annotations": { + "summary": "Host '{{ $labels.instance }}' is down.", + "description": """Host '{{ $labels.instance }}' is down, failed to scrape. + VALUE = {{ $value }} + LABELS = {{ $labels }}""", + }, + }, + { + "alert": "HostMetricsMissing", + "expr": "absent(up)", + "for": "5m", + "labels": {"severity": "critical"}, + "annotations": { + "summary": "Metrics not received from host '{{ $labels.instance }}', failed to remote write.", + "description": """Metrics not received from host '{{ $labels.instance }}', failed to remote write. + VALUE = {{ $value }} + LABELS = {{ $labels }}""", + }, + }, + ], + } + ] +} class PrometheusConfig: diff --git a/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py b/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py index 94dd78af..d7b0b9fa 100644 --- a/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py +++ b/lib/charms/prometheus_k8s/v1/prometheus_remote_write.py @@ -21,7 +21,6 @@ import socket import subprocess import tempfile -import textwrap from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -47,7 +46,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 4 +LIBPATCH = 5 PYDEPS = ["git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl"] @@ -61,26 +60,27 @@ DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" -GENERIC_ALERT_RULES_GROUP = yaml.safe_load( - textwrap.dedent( - """ - groups: - - name: AggregatorHostHealth - rules: - - alert: HostMetricsMissing - expr: absent(up) - for: 5m - labels: - severity: critical - annotations: - summary: Metrics not received from host '{{ $labels.instance }}', failed to remote write. - description: >- - Metrics not received from host '{{ $labels.instance }}', failed to remote write. - VALUE = {{ $value }} - LABELS = {{ $labels }} - """ - ) -) +GENERIC_ALERT_RULES_GROUP = { + "groups": [ + { + "name": "AggregatorHostHealth", + "rules": [ + { + "alert": "HostMetricsMissing", + "expr": "absent(up)", + "for": "5m", + "labels": {"severity": "critical"}, + "annotations": { + "summary": "Metrics not received from host '{{ $labels.instance }}', failed to remote write.", + "description": """Metrics not received from host '{{ $labels.instance }}', failed to remote write. + VALUE = {{ $value }} + LABELS = {{ $labels }}""", + }, + } + ], + } + ] +} class RelationNotFoundError(Exception): diff --git a/tests/unit/test_endpoint_provider.py b/tests/unit/test_endpoint_provider.py index f4d69d4f..ee2ad12e 100644 --- a/tests/unit/test_endpoint_provider.py +++ b/tests/unit/test_endpoint_provider.py @@ -753,11 +753,11 @@ def test_unit_label_is_retained_if_hard_coded(self): # check unit topology is present in labels and in alert rule expression relation = self.harness.charm.model.get_relation("metrics-endpoint") alert_rules = json.loads(relation.data[self.harness.charm.app].get("alert_rules")) - from pprint import pprint - pprint(alert_rules) for group in alert_rules["groups"]: for rule in group["rules"]: - if "_HostHealth_alerts" not in group["name"]: # _HostHealth_alerts are injected alerts without juju_unit labels + if ( + "_HostHealth_alerts" not in group["name"] + ): # _HostHealth_alerts are injected alerts without juju_unit labels self.assertIn("juju_unit", rule["labels"]) self.assertIn("juju_unit=", rule["expr"]) diff --git a/tox.ini b/tox.ini index b1c393ee..57b76083 100644 --- a/tox.ini +++ b/tox.ini @@ -45,7 +45,7 @@ commands = [testenv:static-{charm,lib,unit,integration}] description = Run static analysis checks deps = - cosl + git+https://github.com/canonical/cos-lib.git@feature/generic-alerts#egg=cosl pyright charm: -r{toxinidir}/requirements.txt lib: ops