From 88b0550d51c04fd4965a8538d35b1c95e1095b65 Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Wed, 17 Jan 2024 11:37:25 -0800 Subject: [PATCH 01/10] Fix status for retention config option --- src/charm.py | 29 +++++++++++++++----------- tests/unit/test_charm.py | 45 +++++++++++++++++++++++++++++++++++----- 2 files changed, 57 insertions(+), 17 deletions(-) diff --git a/src/charm.py b/src/charm.py index df8183ff..865a3cd0 100755 --- a/src/charm.py +++ b/src/charm.py @@ -7,6 +7,8 @@ import hashlib import logging import re +from typing import TypedDict + import socket import subprocess from pathlib import Path @@ -55,6 +57,7 @@ ModelError, OpenedPort, WaitingStatus, + StatusBase, ) from ops.pebble import Error as PebbleError from ops.pebble import ExecError, Layer @@ -91,6 +94,12 @@ class ConfigError(Exception): pass +class CompositeStatus(TypedDict): + retention_size: StatusBase + timespec: StatusBase + k8s_patch: StatusBase + + @trace_charm( tracing_endpoint="tempo", extra_types=[ @@ -106,6 +115,7 @@ class PrometheusCharm(CharmBase): def __init__(self, *args): super().__init__(*args) + self.status = CompositeStatus() self._name = "prometheus" self._port = 9090 @@ -379,7 +389,7 @@ def _on_ingress_revoked(self, event: IngressPerUnitRevokedForUnitEvent): self._configure(event) def _on_k8s_patch_failed(self, event: K8sResourcePatchFailedEvent): - self.unit.status = BlockedStatus(cast(str, event.message)) + self.status["k8s_patch"] = BlockedStatus(cast(str, event.message)) def _on_server_cert_changed(self, _): self._update_cert() @@ -564,13 +574,7 @@ def _configure(self, _): logger.info("Prometheus configuration reloaded") - if ( - isinstance(self.unit.status, BlockedStatus) - and self.unit.status not in early_return_statuses.values() - ): - return - - self.unit.status = ActiveStatus() + self.unit.status = StatusBase._get_highest_priority(list(self.status.values())) def _on_pebble_ready(self, event) -> None: """Pebble ready hook. 
@@ -681,7 +685,7 @@ def _generate_command(self) -> str:
 
         except ValueError as e:
             logger.warning(e)
-            self.unit.status = BlockedStatus(f"Invalid retention size: {e}")
+            self.status["retention_size"] = BlockedStatus(f"Invalid retention size: {e}")
 
         else:
             # `storage.tsdb.retention.size` uses the legacy binary format, so "GB" and not "GiB"
@@ -692,15 +696,16 @@ def _generate_command(self) -> str:
                     self._get_pvc_capacity(), ratio
                 )
             except ValueError as e:
-                self.unit.status = BlockedStatus(f"Error calculating retention size: {e}")
+                self.status["retention_size"] = BlockedStatus(f"Error calculating retention size: {e}")
             except LightkubeApiError as e:
-                self.unit.status = BlockedStatus(
+                self.status["retention_size"] = BlockedStatus(
                     "Error calculating retention size "
                     f"(try running `juju trust` on this application): {e}"
                 )
             else:
                 logger.debug("Retention size limit set to %s (%s%%)", capacity, ratio * 100)
                 args.append(f"--storage.tsdb.retention.size={capacity}")
+                self.status["retention_size"] = ActiveStatus()
 
         command = ["/bin/prometheus"] + args
 
@@ -808,7 +813,7 @@ def _is_valid_timespec(self, timeval: str) -> bool:
             r"^((([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?|0)$"
         )
         if not (matched := timespec_re.search(timeval)):
-            self.unit.status = BlockedStatus(f"Invalid time spec : {timeval}")
+            self.status["timespec"] = BlockedStatus(f"Invalid time spec : {timeval}")
 
         return bool(matched)
 
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index 9bc8a3b0..1d15d1ee 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -287,6 +287,14 @@ def test_default_maximum_retention_size_is_80_percent(self, *unused):
         plan = self.harness.get_container_pebble_plan("prometheus")
         self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.8GB")
 
+        # AND WHEN the config option is set and then unset
+        self.harness.update_config({"maximum_retention_size": "50%"})
+        self.harness.update_config(unset={"maximum_retention_size"})
+
+        # THEN the pebble plan is back to 80%
+        plan = self.harness.get_container_pebble_plan("prometheus")
+        self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.8GB")
+
     @k8s_resource_multipatch
     @patch("lightkube.core.client.GenericSyncClient")
     def test_multiplication_factor_applied_to_pvc_capacity(self, *unused):
@@ -294,16 +302,43 @@ def test_multiplication_factor_applied_to_pvc_capacity(self, *unused):
         # GIVEN a capacity limit in binary notation (k8s notation)
         self.mock_capacity.return_value = "1Gi"
 
-        # AND a multiplication factor as a config option
-        self.harness.update_config({"maximum_retention_size": "50%"})
-
         # WHEN the charm starts
         self.harness.begin_with_initial_hooks()
         self.harness.container_pebble_ready("prometheus")
 
-        # THEN the pebble plan the adjusted capacity
+        for (set_point, read_back) in [("0%", "0GB"), ("50%", "0.5GB"), ("100%", "1GB")]:
+            with self.subTest(limit=set_point):
+                # WHEN a limit is set
+                self.harness.update_config({"maximum_retention_size": set_point})
+
+                # THEN the pebble plan has the adjusted capacity
+                plan = self.harness.get_container_pebble_plan("prometheus")
+                self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), read_back)
+
+    @k8s_resource_multipatch
+    @patch("lightkube.core.client.GenericSyncClient")
+    def test_invalid_retention_size_config_option_string(self, *unused):
+        # GIVEN a running charm with default values
+        self.mock_capacity.return_value = "1Gi"
+        self.harness.begin_with_initial_hooks()
+        self.harness.container_pebble_ready("prometheus")
+        self.assertIsInstance(self.harness.model.unit.status, ActiveStatus)
+
+        # WHEN the config option is set to an invalid string
+        self.harness.update_config({"maximum_retention_size": "42"})
+
+        # THEN cli arg is unspecified and the unit is blocked
         plan = self.harness.get_container_pebble_plan("prometheus")
-        self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.5GB")
+        self.assertIsNone(cli_arg(plan, "--storage.tsdb.retention.size"))
+        self.assertIsInstance(self.harness.model.unit.status, BlockedStatus)
+
+        # AND WHEN the config option is corrected
+        self.harness.update_config({"maximum_retention_size": "42%"})
+
+        # THEN cli arg is updated and the unit goes back to active
+        plan = self.harness.get_container_pebble_plan("prometheus")
+        self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.42GB")
+        self.assertIsInstance(self.harness.model.unit.status, ActiveStatus)
 
 
 @prom_multipatch

From f4bc6d40711df7c035a8e4475c4b66693dafbc15 Mon Sep 17 00:00:00 2001
From: sed-i <82407168+sed-i@users.noreply.github.com>
Date: Wed, 17 Jan 2024 14:42:01 -0500
Subject: [PATCH 02/10] Lint

---
 src/charm.py             | 12 +++++++-----
 tests/unit/test_charm.py |  8 ++++----
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/charm.py b/src/charm.py
index 865a3cd0..05d29a0e 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -7,12 +7,10 @@
 import hashlib
 import logging
 import re
-from typing import TypedDict
-
 import socket
 import subprocess
 from pathlib import Path
-from typing import Dict, Optional, cast
+from typing import Dict, Optional, TypedDict, cast
 from urllib.parse import urlparse
 
 import yaml
@@ -56,8 +54,8 @@
     MaintenanceStatus,
     ModelError,
     OpenedPort,
-    WaitingStatus,
     StatusBase,
+    WaitingStatus,
 )
 from ops.pebble import Error as PebbleError
 from ops.pebble import ExecError, Layer
@@ -95,6 +93,8 @@ class ConfigError(Exception):
 
 
 class CompositeStatus(TypedDict):
+    """Per-component status holder."""
+
     retention_size: StatusBase
     timespec: StatusBase
     k8s_patch: StatusBase
@@ -696,7 +696,9 @@ def _generate_command(self) -> str:
                     self._get_pvc_capacity(), ratio
                 )
             except ValueError as e:
-                self.status["retention_size"] = BlockedStatus(f"Error calculating retention size: {e}")
+                self.status["retention_size"] = BlockedStatus(
+                    f"Error calculating retention size: {e}"
+                )
             except LightkubeApiError as e:
                 self.status["retention_size"] = BlockedStatus(
                     "Error calculating retention size "
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index 1d15d1ee..fcc41c57 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -287,10 +287,10 @@ def test_default_maximum_retention_size_is_80_percent(self, *unused):
         plan = self.harness.get_container_pebble_plan("prometheus")
         self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.8GB")
 
-        # AND WHEN the config option is set and then unset
+        # AND WHEN the config option is set and then unset
         self.harness.update_config({"maximum_retention_size": "50%"})
         self.harness.update_config(unset={"maximum_retention_size"})
-
+
         # THEN the pebble plan is back to 80%
         plan = self.harness.get_container_pebble_plan("prometheus")
         self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.8GB")
@@ -306,7 +306,7 @@ def test_multiplication_factor_applied_to_pvc_capacity(self, *unused):
         self.harness.begin_with_initial_hooks()
         self.harness.container_pebble_ready("prometheus")
 
-        for (set_point, read_back) in [("0%", "0GB"), ("50%", "0.5GB"), ("100%", "1GB")]:
("100%", "1GB")]: with self.subTest(limit=set_point): # WHEN a limit is set self.harness.update_config({"maximum_retention_size": set_point}) @@ -314,7 +314,7 @@ def test_multiplication_factor_applied_to_pvc_capacity(self, *unused): # THEN the pebble plan the adjusted capacity plan = self.harness.get_container_pebble_plan("prometheus") self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), read_back) - + @k8s_resource_multipatch @patch("lightkube.core.client.GenericSyncClient") def test_invalid_retention_size_config_option_string(self, *unused): From 1fce272658a66296e6265054731a76359ab13ebd Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Wed, 17 Jan 2024 11:57:00 -0800 Subject: [PATCH 03/10] Static --- src/charm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/charm.py b/src/charm.py index 05d29a0e..bfb786ba 100755 --- a/src/charm.py +++ b/src/charm.py @@ -7,6 +7,8 @@ import hashlib import logging import re +from typing import List, TypedDict, cast + import socket import subprocess from pathlib import Path @@ -574,7 +576,7 @@ def _configure(self, _): logger.info("Prometheus configuration reloaded") - self.unit.status = StatusBase._get_highest_priority(list(self.status.values())) + self.unit.status = StatusBase._get_highest_priority(cast(List[StatusBase], list(self.status.values()))) def _on_pebble_ready(self, event) -> None: """Pebble ready hook. From e5baf0c80db6d107c9fc4632205fa1de4517bd83 Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Wed, 17 Jan 2024 15:18:29 -0500 Subject: [PATCH 04/10] Lint --- src/charm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/charm.py b/src/charm.py index bfb786ba..104a9d95 100755 --- a/src/charm.py +++ b/src/charm.py @@ -7,12 +7,10 @@ import hashlib import logging import re -from typing import List, TypedDict, cast - import socket import subprocess from pathlib import Path -from typing import Dict, Optional, TypedDict, cast +from typing import Dict, List, Optional, TypedDict, cast from urllib.parse import urlparse import yaml @@ -576,7 +574,9 @@ def _configure(self, _): logger.info("Prometheus configuration reloaded") - self.unit.status = StatusBase._get_highest_priority(cast(List[StatusBase], list(self.status.values()))) + self.unit.status = StatusBase._get_highest_priority( + cast(List[StatusBase], list(self.status.values())) + ) def _on_pebble_ready(self, event) -> None: """Pebble ready hook. 
From 629a68d098eeb37e1cdb130ec7927b5944d6eacf Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Wed, 17 Jan 2024 12:42:28 -0800 Subject: [PATCH 05/10] Lint --- src/charm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/charm.py b/src/charm.py index 104a9d95..c1d915d3 100755 --- a/src/charm.py +++ b/src/charm.py @@ -115,7 +115,9 @@ class PrometheusCharm(CharmBase): def __init__(self, *args): super().__init__(*args) - self.status = CompositeStatus() + self.status = CompositeStatus( + retention_size=ActiveStatus(), timespec=ActiveStatus(), k8s_patch=ActiveStatus() + ) self._name = "prometheus" self._port = 9090 From e0a003dcd337a81f539c3a795a7cefa407f25989 Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:05:43 -0500 Subject: [PATCH 06/10] fetch-lib --- .../v2/tls_certificates.py | 228 ++++++++++++------ 1 file changed, 156 insertions(+), 72 deletions(-) diff --git a/lib/charms/tls_certificates_interface/v2/tls_certificates.py b/lib/charms/tls_certificates_interface/v2/tls_certificates.py index b8855bea..08c5cb50 100644 --- a/lib/charms/tls_certificates_interface/v2/tls_certificates.py +++ b/lib/charms/tls_certificates_interface/v2/tls_certificates.py @@ -308,13 +308,13 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 21 +LIBPATCH = 22 PYDEPS = ["cryptography", "jsonschema"] REQUIRER_JSON_SCHEMA = { "$schema": "http://json-schema.org/draft-04/schema#", - "$id": "https://canonical.github.io/charm-relation-interfaces/tls_certificates/v2/schemas/requirer.json", # noqa: E501 + "$id": "https://canonical.github.io/charm-relation-interfaces/interfaces/tls_certificates/v1/schemas/requirer.json", "type": "object", "title": "`tls_certificates` requirer root schema", "description": "The `tls_certificates` root schema comprises the entire requirer databag for this interface.", # noqa: E501 @@ -349,7 +349,7 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven PROVIDER_JSON_SCHEMA = { "$schema": "http://json-schema.org/draft-04/schema#", - "$id": "https://canonical.github.io/charm-relation-interfaces/tls_certificates/v2/schemas/provider.json", # noqa: E501 + "$id": "https://canonical.github.io/charm-relation-interfaces/interfaces/tls_certificates/v1/schemas/provider.json", "type": "object", "title": "`tls_certificates` provider root schema", "description": "The `tls_certificates` root schema comprises the entire provider databag for this interface.", # noqa: E501 @@ -623,6 +623,40 @@ def _load_relation_data(relation_data_content: RelationDataContent) -> dict: return certificate_data +def _get_closest_future_time( + expiry_notification_time: datetime, expiry_time: datetime +) -> datetime: + """Return expiry_notification_time if not in the past, otherwise return expiry_time. + + Args: + expiry_notification_time (datetime): Notification time of impending expiration + expiry_time (datetime): Expiration time + + Returns: + datetime: expiry_notification_time if not in the past, expiry_time otherwise + """ + return ( + expiry_notification_time if datetime.utcnow() < expiry_notification_time else expiry_time + ) + + +def _get_certificate_expiry_time(certificate: str) -> Optional[datetime]: + """Extract expiry time from a certificate string. 
+ + Args: + certificate (str): x509 certificate as a string + + Returns: + Optional[datetime]: Expiry datetime or None + """ + try: + certificate_object = x509.load_pem_x509_certificate(data=certificate.encode()) + return certificate_object.not_valid_after + except ValueError: + logger.warning("Could not load certificate.") + return None + + def generate_ca( private_key: bytes, subject: str, @@ -984,6 +1018,38 @@ def generate_csr( return signed_certificate.public_bytes(serialization.Encoding.PEM) +def csr_matches_certificate(csr: str, cert: str) -> bool: + """Check if a CSR matches a certificate. + + Args: + csr (str): Certificate Signing Request as a string + cert (str): Certificate as a string + Returns: + bool: True/False depending on whether the CSR matches the certificate. + """ + try: + csr_object = x509.load_pem_x509_csr(csr.encode("utf-8")) + cert_object = x509.load_pem_x509_certificate(cert.encode("utf-8")) + + if csr_object.public_key().public_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PublicFormat.SubjectPublicKeyInfo, + ) != cert_object.public_key().public_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PublicFormat.SubjectPublicKeyInfo, + ): + return False + if ( + csr_object.public_key().public_numbers().n # type: ignore[union-attr] + != cert_object.public_key().public_numbers().n # type: ignore[union-attr] + ): + return False + except ValueError: + logger.warning("Could not load certificate or CSR.") + return False + return True + + class CertificatesProviderCharmEvents(CharmEvents): """List of events that the TLS Certificates provider charm can leverage.""" @@ -1447,7 +1513,7 @@ def __init__( @property def _requirer_csrs(self) -> List[Dict[str, Union[bool, str]]]: - """Returns list of requirer's CSRs from relation data. + """Returns list of requirer's CSRs from relation unit data. Example: [ @@ -1592,6 +1658,92 @@ def request_certificate_renewal( ) logger.info("Certificate renewal request completed.") + def get_assigned_certificates(self) -> List[Dict[str, str]]: + """Get a list of certificates that were assigned to this unit. + + Returns: + List of certificates. For example: + [ + { + "ca": "-----BEGIN CERTIFICATE-----...", + "chain": [ + "-----BEGIN CERTIFICATE-----..." + ], + "certificate": "-----BEGIN CERTIFICATE-----...", + "certificate_signing_request": "-----BEGIN CERTIFICATE REQUEST-----...", + } + ] + """ + final_list = [] + for csr in self.get_certificate_signing_requests(fulfilled_only=True): + assert type(csr["certificate_signing_request"]) == str + if cert := self._find_certificate_in_relation_data(csr["certificate_signing_request"]): + final_list.append(cert) + return final_list + + def get_expiring_certificates(self) -> List[Dict[str, str]]: + """Get a list of certificates that were assigned to this unit that are expiring or expired. + + Returns: + List of certificates. For example: + [ + { + "ca": "-----BEGIN CERTIFICATE-----...", + "chain": [ + "-----BEGIN CERTIFICATE-----..." 
+ ], + "certificate": "-----BEGIN CERTIFICATE-----...", + "certificate_signing_request": "-----BEGIN CERTIFICATE REQUEST-----...", + } + ] + """ + final_list = [] + for csr in self.get_certificate_signing_requests(fulfilled_only=True): + assert type(csr["certificate_signing_request"]) == str + if cert := self._find_certificate_in_relation_data(csr["certificate_signing_request"]): + expiry_time = _get_certificate_expiry_time(cert["certificate"]) + if not expiry_time: + continue + expiry_notification_time = expiry_time - timedelta( + hours=self.expiry_notification_time + ) + if datetime.utcnow() > expiry_notification_time: + final_list.append(cert) + return final_list + + def get_certificate_signing_requests( + self, + fulfilled_only: bool = False, + unfulfilled_only: bool = False, + ) -> List[Dict[str, Union[bool, str]]]: + """Gets the list of CSR's that were sent to the provider. + + You can choose to get only the CSR's that have a certificate assigned or only the CSR's + that don't. + + Args: + fulfilled_only (bool): This option will discard CSRs that don't have certificates yet. + unfulfilled_only (bool): This option will discard CSRs that have certificates signed. + Returns: + List of CSR dictionaries. For example: + [ + { + "certificate_signing_request": "-----BEGIN CERTIFICATE REQUEST-----...", + "ca": false + } + ] + """ + + final_list = [] + for csr in self._requirer_csrs: + assert type(csr["certificate_signing_request"]) == str + cert = self._find_certificate_in_relation_data(csr["certificate_signing_request"]) + if (unfulfilled_only and cert) or (fulfilled_only and not cert): + continue + final_list.append(csr) + + return final_list + @staticmethod def _relation_data_is_valid(certificates_data: dict) -> bool: """Checks whether relation data is valid based on json schema. @@ -1802,71 +1954,3 @@ def _on_update_status(self, event: UpdateStatusEvent) -> None: certificate=certificate_dict["certificate"], expiry=expiry_time.isoformat(), ) - - -def csr_matches_certificate(csr: str, cert: str) -> bool: - """Check if a CSR matches a certificate. - - expects to get the original string representations. - - Args: - csr (str): Certificate Signing Request - cert (str): Certificate - Returns: - bool: True/False depending on whether the CSR matches the certificate. - """ - try: - csr_object = x509.load_pem_x509_csr(csr.encode("utf-8")) - cert_object = x509.load_pem_x509_certificate(cert.encode("utf-8")) - - if csr_object.public_key().public_bytes( - encoding=serialization.Encoding.PEM, - format=serialization.PublicFormat.SubjectPublicKeyInfo, - ) != cert_object.public_key().public_bytes( - encoding=serialization.Encoding.PEM, - format=serialization.PublicFormat.SubjectPublicKeyInfo, - ): - return False - if ( - csr_object.public_key().public_numbers().n # type: ignore[union-attr] - != cert_object.public_key().public_numbers().n # type: ignore[union-attr] - ): - return False - except ValueError: - logger.warning("Could not load certificate or CSR.") - return False - return True - - -def _get_closest_future_time( - expiry_notification_time: datetime, expiry_time: datetime -) -> datetime: - """Return expiry_notification_time if not in the past, otherwise return expiry_time. 
- - Args: - expiry_notification_time (datetime): Notification time of impending expiration - expiry_time (datetime): Expiration time - - Returns: - datetime: expiry_notification_time if not in the past, expiry_time otherwise - """ - return ( - expiry_notification_time if datetime.utcnow() < expiry_notification_time else expiry_time - ) - - -def _get_certificate_expiry_time(certificate: str) -> Optional[datetime]: - """Extract expiry time from a certificate string. - - Args: - certificate (str): x509 certificate as a string - - Returns: - Optional[datetime]: Expiry datetime or None - """ - try: - certificate_object = x509.load_pem_x509_certificate(data=certificate.encode()) - return certificate_object.not_valid_after - except ValueError: - logger.warning("Could not load certificate.") - return None From 0b90f1b9c314e1431f19cf6a696523a335f24f02 Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Thu, 25 Jan 2024 18:58:14 -0500 Subject: [PATCH 07/10] Refactor status setting --- src/charm.py | 58 ++++++++++++++++++++++----------- tests/unit/test_charm.py | 6 ++++ tests/unit/test_charm_status.py | 1 + tests/unit/test_remote_write.py | 1 + 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/src/charm.py b/src/charm.py index c1d915d3..9cda2e04 100755 --- a/src/charm.py +++ b/src/charm.py @@ -10,7 +10,7 @@ import socket import subprocess from pathlib import Path -from typing import Dict, List, Optional, TypedDict, cast +from typing import Dict, Optional, TypedDict, cast from urllib.parse import urlparse import yaml @@ -46,6 +46,7 @@ from lightkube.core.client import Client from lightkube.core.exceptions import ApiError as LightkubeApiError from lightkube.resources.core_v1 import PersistentVolumeClaim, Pod +from ops import CollectStatusEvent, StoredState from ops.charm import ActionEvent, CharmBase from ops.main import main from ops.model import ( @@ -96,7 +97,6 @@ class CompositeStatus(TypedDict): """Per-component status holder.""" retention_size: StatusBase - timespec: StatusBase k8s_patch: StatusBase @@ -113,10 +113,15 @@ class CompositeStatus(TypedDict): class PrometheusCharm(CharmBase): """A Juju Charm for Prometheus.""" + _stored = StoredState() + def __init__(self, *args): super().__init__(*args) - self.status = CompositeStatus( - retention_size=ActiveStatus(), timespec=ActiveStatus(), k8s_patch=ActiveStatus() + + # Prometheus has a mix of pull and push statuses. We need stored state for push statuses. 
+ # https://discourse.charmhub.io/t/its-probably-ok-for-a-unit-to-go-into-error-state/13022 + self._stored.set_default( + status=CompositeStatus(retention_size=ActiveStatus(), k8s_patch=ActiveStatus()) ) self._name = "prometheus" @@ -203,6 +208,30 @@ def __init__(self, *args): self.framework.observe(self.alertmanager_consumer.on.cluster_changed, self._configure) self.framework.observe(self.resources_patch.on.patch_failed, self._on_k8s_patch_failed) self.framework.observe(self.on.validate_configuration_action, self._on_validate_config) + self.framework.observe(self.on.collect_unit_status, self._on_collect_unit_status) + + def _on_collect_unit_status(self, event: CollectStatusEvent): + # "Pull" statuses + if not self.container.can_connect(): + event.add_status(MaintenanceStatus("Configuring Prometheus")) + + if not self._is_valid_timespec( + retention_time := self.model.config.get("metrics_retention_time", "") + ): + BlockedStatus(f"Invalid time spec : {retention_time}") + + # "Push" statuses + if self.resources_patch.is_ready(): + self._stored.status["k8s_patch"] = ActiveStatus() + else: + if isinstance(self._stored.status["k8s_patch"], ActiveStatus): + self._stored.status["k8s_patch"] = WaitingStatus( + "Waiting for resource limit patch to apply" + ) + return + + for status in self._stored.status.values(): + event.add_status(status) def set_ports(self): """Open necessary (and close no longer needed) workload ports.""" @@ -391,7 +420,7 @@ def _on_ingress_revoked(self, event: IngressPerUnitRevokedForUnitEvent): self._configure(event) def _on_k8s_patch_failed(self, event: K8sResourcePatchFailedEvent): - self.status["k8s_patch"] = BlockedStatus(cast(str, event.message)) + self._stored.status["k8s_patch"] = BlockedStatus(cast(str, event.message)) def _on_server_cert_changed(self, _): self._update_cert() @@ -489,12 +518,9 @@ def _configure(self, _): } if not self.resources_patch.is_ready(): - if isinstance(self.unit.status, ActiveStatus) or self.unit.status.message == "": - self.unit.status = WaitingStatus("Waiting for resource limit patch to apply") return if not self.container.can_connect(): - self.unit.status = MaintenanceStatus("Configuring Prometheus") return if self._is_cert_available() and not self._is_tls_ready(): @@ -576,10 +602,6 @@ def _configure(self, _): logger.info("Prometheus configuration reloaded") - self.unit.status = StatusBase._get_highest_priority( - cast(List[StatusBase], list(self.status.values())) - ) - def _on_pebble_ready(self, event) -> None: """Pebble ready hook. 
@@ -689,7 +711,7 @@ def _generate_command(self) -> str: except ValueError as e: logger.warning(e) - self.status["retention_size"] = BlockedStatus(f"Invalid retention size: {e}") + self._stored.status["retention_size"] = BlockedStatus(f"Invalid retention size: {e}") else: # `storage.tsdb.retention.size` uses the legacy binary format, so "GB" and not "GiB" @@ -700,18 +722,18 @@ def _generate_command(self) -> str: self._get_pvc_capacity(), ratio ) except ValueError as e: - self.status["retention_size"] = BlockedStatus( + self._stored.status["retention_size"] = BlockedStatus( f"Error calculating retention size: {e}" ) except LightkubeApiError as e: - self.status["retention_size"] = BlockedStatus( + self._stored.status["retention_size"] = BlockedStatus( "Error calculating retention size " f"(try running `juju trust` on this application): {e}" ) else: logger.debug("Retention size limit set to %s (%s%%)", capacity, ratio * 100) args.append(f"--storage.tsdb.retention.size={capacity}") - self.status["retention_size"] = ActiveStatus() + self._stored.status["retention_size"] = ActiveStatus() command = ["/bin/prometheus"] + args @@ -818,9 +840,7 @@ def _is_valid_timespec(self, timeval: str) -> bool: timespec_re = re.compile( r"^((([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?|0)$" ) - if not (matched := timespec_re.search(timeval)): - self.status["timespec"] = BlockedStatus(f"Invalid time spec : {timeval}") - + matched = timespec_re.search(timeval) return bool(matched) def _percent_string_to_ratio(self, percentage: str) -> float: diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index fcc41c57..bce2f9e5 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -216,6 +216,7 @@ def test_configuration_reload(self, trigger_configuration_reload, *unused): def test_configuration_reload_success(self, trigger_configuration_reload, *unused): trigger_configuration_reload.return_value = True self.harness.update_config({"evaluation_interval": "1234m"}) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, ActiveStatus) @k8s_resource_multipatch @@ -322,6 +323,7 @@ def test_invalid_retention_size_config_option_string(self, *unused): self.mock_capacity.return_value = "1Gi" self.harness.begin_with_initial_hooks() self.harness.container_pebble_ready("prometheus") + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, ActiveStatus) # WHEN the config option is set to an invalid string @@ -330,6 +332,7 @@ def test_invalid_retention_size_config_option_string(self, *unused): # THEN cli arg is unspecified and the unit is blocked plan = self.harness.get_container_pebble_plan("prometheus") self.assertIsNone(cli_arg(plan, "--storage.tsdb.retention.size")) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, BlockedStatus) # AND WHEN the config option is corrected @@ -338,6 +341,7 @@ def test_invalid_retention_size_config_option_string(self, *unused): # THEN cli arg is updated and the unit is goes back to active plan = self.harness.get_container_pebble_plan("prometheus") self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.42GB") + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, ActiveStatus) @@ -689,6 +693,7 @@ def test_ca_file(self, *_): }, ) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, ActiveStatus) container = self.harness.charm.unit.get_container("prometheus") 
self.assertEqual(container.pull("/etc/prometheus/job1-ca.crt").read(), "CA 1") @@ -725,6 +730,7 @@ def test_no_tls_config(self, *_): }, ) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, ActiveStatus) @k8s_resource_multipatch diff --git a/tests/unit/test_charm_status.py b/tests/unit/test_charm_status.py index b7aae061..13bce76d 100644 --- a/tests/unit/test_charm_status.py +++ b/tests/unit/test_charm_status.py @@ -58,6 +58,7 @@ def test_unit_is_active_if_deployed_without_relations_or_config(self, *unused): # WHEN no config is provided or relations created # THEN the unit goes into active state + self.harness.evaluate_status() self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus) # AND pebble plan is not empty diff --git a/tests/unit/test_remote_write.py b/tests/unit/test_remote_write.py index b97e1b55..3a1fc31d 100644 --- a/tests/unit/test_remote_write.py +++ b/tests/unit/test_remote_write.py @@ -241,6 +241,7 @@ def test_port_is_set(self, *unused): self.harness.get_relation_data(rel_id, self.harness.charm.unit.name), {"remote_write": json.dumps({"url": "http://fqdn:9090/api/v1/write"})}, ) + self.harness.evaluate_status() self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus) @k8s_resource_multipatch From 2ceeee6a4e838a0648140806974e6feba2ba42b5 Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Thu, 25 Jan 2024 19:27:40 -0500 Subject: [PATCH 08/10] Push config from _configure to collect-status via _stored --- src/charm.py | 30 +++++++++++++++++++++--------- tests/unit/test_charm.py | 4 ++++ tests/unit/test_charm_status.py | 1 + 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/charm.py b/src/charm.py index 9cda2e04..c964480e 100755 --- a/src/charm.py +++ b/src/charm.py @@ -98,6 +98,7 @@ class CompositeStatus(TypedDict): retention_size: StatusBase k8s_patch: StatusBase + config: StatusBase @trace_charm( @@ -121,7 +122,9 @@ def __init__(self, *args): # Prometheus has a mix of pull and push statuses. We need stored state for push statuses. # https://discourse.charmhub.io/t/its-probably-ok-for-a-unit-to-go-into-error-state/13022 self._stored.set_default( - status=CompositeStatus(retention_size=ActiveStatus(), k8s_patch=ActiveStatus()) + status=CompositeStatus( + retention_size=ActiveStatus(), k8s_patch=ActiveStatus(), config=ActiveStatus() + ) ) self._name = "prometheus" @@ -546,19 +549,23 @@ def _configure(self, _): ) except ConfigError as e: logger.error("Failed to generate configuration: %s", e) - self.unit.status = BlockedStatus(str(e)) + self._stored.status["config"] = BlockedStatus(str(e)) return except PebbleError as e: logger.error("Failed to push updated config/alert files: %s", e) - self.unit.status = early_return_statuses["push_fail"] + self._stored.status["config"] = early_return_statuses["push_fail"] return + else: + self._stored.status["config"] = ActiveStatus() try: layer_changed = self._update_layer() except (TypeError, PebbleError) as e: logger.error("Failed to update prometheus service: %s", e) - self.unit.status = early_return_statuses["layer_fail"] + self._stored.status["config"] = early_return_statuses["layer_fail"] return + else: + self._stored.status["config"] = ActiveStatus() try: output, err = self._promtool_check_config() @@ -566,12 +573,14 @@ def _configure(self, _): logger.error( "Invalid prometheus configuration. 
Stdout: %s Stderr: %s", output, err ) - self.unit.status = early_return_statuses["config_invalid"] + self._stored.status["config"] = early_return_statuses["config_invalid"] return except PebbleError as e: logger.error("Failed to validate prometheus config: %s", e) - self.unit.status = early_return_statuses["validation_fail"] + self._stored.status["config"] = early_return_statuses["validation_fail"] return + else: + self._stored.status["config"] = ActiveStatus() try: # If a config is invalid then prometheus would exit immediately. @@ -585,8 +594,10 @@ def _configure(self, _): self._prometheus_layer.to_dict(), e, ) - self.unit.status = early_return_statuses["restart_fail"] + self._stored.status["config"] = early_return_statuses["restart_fail"] return + else: + self._stored.status["config"] = ActiveStatus() # We only need to reload if pebble didn't replan (if pebble replanned, then new config # would be picked up on startup anyway). @@ -594,13 +605,14 @@ def _configure(self, _): reloaded = self._prometheus_client.reload_configuration() if not reloaded: logger.error("Prometheus failed to reload the configuration") - self.unit.status = early_return_statuses["cfg_load_fail"] + self._stored.status["config"] = early_return_statuses["cfg_load_fail"] return if reloaded == "read_timeout": - self.unit.status = early_return_statuses["cfg_load_timeout"] + self._stored.status["config"] = early_return_statuses["cfg_load_timeout"] return logger.info("Prometheus configuration reloaded") + self._stored.status["config"] = ActiveStatus() def _on_pebble_ready(self, event) -> None: """Pebble ready hook. diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index bce2f9e5..9df7ae06 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -225,6 +225,7 @@ def test_configuration_reload_success(self, trigger_configuration_reload, *unuse def test_configuration_reload_error(self, trigger_configuration_reload, *unused): trigger_configuration_reload.return_value = False self.harness.update_config({"evaluation_interval": "1234m"}) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, BlockedStatus) @k8s_resource_multipatch @@ -233,6 +234,7 @@ def test_configuration_reload_error(self, trigger_configuration_reload, *unused) def test_configuration_reload_read_timeout(self, trigger_configuration_reload, *unused): trigger_configuration_reload.return_value = "read_timeout" self.harness.update_config({"evaluation_interval": "1234m"}) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, MaintenanceStatus) @@ -766,6 +768,7 @@ def test_tls_config_missing_cert(self, *_): }, ) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, BlockedStatus) @k8s_resource_multipatch @@ -801,4 +804,5 @@ def test_tls_config_missing_key(self, *_): }, ) + self.harness.evaluate_status() self.assertIsInstance(self.harness.model.unit.status, BlockedStatus) diff --git a/tests/unit/test_charm_status.py b/tests/unit/test_charm_status.py index 13bce76d..e0517e67 100644 --- a/tests/unit/test_charm_status.py +++ b/tests/unit/test_charm_status.py @@ -91,6 +91,7 @@ def test_unit_is_blocked_if_reload_configuration_fails(self, *unused): # WHEN no config is provided or relations created # THEN the unit goes into blocked state + self.harness.evaluate_status() self.assertIsInstance(self.harness.charm.unit.status, BlockedStatus) # AND pebble plan is not empty From fd2c84d9d7e6f8d6a9d5e5454221c495923a514f Mon Sep 17 00:00:00 2001 From: sed-i 
<82407168+sed-i@users.noreply.github.com> Date: Thu, 25 Jan 2024 23:23:43 -0500 Subject: [PATCH 09/10] Tuple[str, str] instead of StatusBase --- src/charm.py | 101 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 59 insertions(+), 42 deletions(-) diff --git a/src/charm.py b/src/charm.py index c964480e..bbe052f3 100755 --- a/src/charm.py +++ b/src/charm.py @@ -10,7 +10,7 @@ import socket import subprocess from pathlib import Path -from typing import Dict, Optional, TypedDict, cast +from typing import Dict, Optional, Tuple, TypedDict, cast from urllib.parse import urlparse import yaml @@ -96,9 +96,21 @@ class ConfigError(Exception): class CompositeStatus(TypedDict): """Per-component status holder.""" - retention_size: StatusBase - k8s_patch: StatusBase - config: StatusBase + # These are going to go into stored state, so we must use marshallable objects. + # They are passed to StatusBase.from_name(). + retention_size: Tuple[str, str] + k8s_patch: Tuple[str, str] + config: Tuple[str, str] + + +def to_tuple(status: StatusBase) -> Tuple[str, str]: + """Convert a StatusBase to tuple, so it is marshallable into StoredState.""" + return status.name, status.message + + +def to_status(tpl: Tuple[str, str]) -> StatusBase: + """Convert a tuple to a StatusBase, so it could be used natively with ops.""" + return StatusBase.from_name(*tpl) @trace_charm( @@ -123,7 +135,9 @@ def __init__(self, *args): # https://discourse.charmhub.io/t/its-probably-ok-for-a-unit-to-go-into-error-state/13022 self._stored.set_default( status=CompositeStatus( - retention_size=ActiveStatus(), k8s_patch=ActiveStatus(), config=ActiveStatus() + retention_size=to_tuple(ActiveStatus()), + k8s_patch=to_tuple(ActiveStatus()), + config=to_tuple(ActiveStatus()), ) ) @@ -215,26 +229,14 @@ def __init__(self, *args): def _on_collect_unit_status(self, event: CollectStatusEvent): # "Pull" statuses - if not self.container.can_connect(): - event.add_status(MaintenanceStatus("Configuring Prometheus")) - if not self._is_valid_timespec( retention_time := self.model.config.get("metrics_retention_time", "") ): - BlockedStatus(f"Invalid time spec : {retention_time}") + event.add_status(BlockedStatus(f"Invalid time spec : {retention_time}")) # "Push" statuses - if self.resources_patch.is_ready(): - self._stored.status["k8s_patch"] = ActiveStatus() - else: - if isinstance(self._stored.status["k8s_patch"], ActiveStatus): - self._stored.status["k8s_patch"] = WaitingStatus( - "Waiting for resource limit patch to apply" - ) - return - for status in self._stored.status.values(): - event.add_status(status) + event.add_status(to_status(status)) def set_ports(self): """Open necessary (and close no longer needed) workload ports.""" @@ -423,7 +425,7 @@ def _on_ingress_revoked(self, event: IngressPerUnitRevokedForUnitEvent): self._configure(event) def _on_k8s_patch_failed(self, event: K8sResourcePatchFailedEvent): - self._stored.status["k8s_patch"] = BlockedStatus(cast(str, event.message)) + self._stored.status["k8s_patch"] = to_tuple(BlockedStatus(cast(str, event.message))) def _on_server_cert_changed(self, _): self._update_cert() @@ -520,10 +522,21 @@ def _configure(self, _): ), } - if not self.resources_patch.is_ready(): + # "is_ready" is a racy check, so we do it once here (instead of in collect-status) + if self.resources_patch.is_ready(): + self._stored.status["k8s_patch"] = to_tuple(ActiveStatus()) + else: + if isinstance(to_status(self._stored.status["k8s_patch"]), ActiveStatus): + self._stored.status["k8s_patch"] = to_tuple( + 
WaitingStatus("Waiting for resource limit patch to apply") + ) return - if not self.container.can_connect(): + # "can_connect" is a racy check, so we do it once here (instead of in collect-status) + if self.container.can_connect(): + self._stored.status["config"] = to_tuple(ActiveStatus()) + else: + self._stored.status["config"] = to_tuple(MaintenanceStatus("Configuring Prometheus")) return if self._is_cert_available() and not self._is_tls_ready(): @@ -549,23 +562,23 @@ def _configure(self, _): ) except ConfigError as e: logger.error("Failed to generate configuration: %s", e) - self._stored.status["config"] = BlockedStatus(str(e)) + self._stored.status["config"] = to_tuple(BlockedStatus(str(e))) return except PebbleError as e: logger.error("Failed to push updated config/alert files: %s", e) - self._stored.status["config"] = early_return_statuses["push_fail"] + self._stored.status["config"] = to_tuple(early_return_statuses["push_fail"]) return else: - self._stored.status["config"] = ActiveStatus() + self._stored.status["config"] = to_tuple(ActiveStatus()) try: layer_changed = self._update_layer() except (TypeError, PebbleError) as e: logger.error("Failed to update prometheus service: %s", e) - self._stored.status["config"] = early_return_statuses["layer_fail"] + self._stored.status["config"] = to_tuple(early_return_statuses["layer_fail"]) return else: - self._stored.status["config"] = ActiveStatus() + self._stored.status["config"] = to_tuple(ActiveStatus()) try: output, err = self._promtool_check_config() @@ -573,14 +586,14 @@ def _configure(self, _): logger.error( "Invalid prometheus configuration. Stdout: %s Stderr: %s", output, err ) - self._stored.status["config"] = early_return_statuses["config_invalid"] + self._stored.status["config"] = to_tuple(early_return_statuses["config_invalid"]) return except PebbleError as e: logger.error("Failed to validate prometheus config: %s", e) - self._stored.status["config"] = early_return_statuses["validation_fail"] + self._stored.status["config"] = to_tuple(early_return_statuses["validation_fail"]) return else: - self._stored.status["config"] = ActiveStatus() + self._stored.status["config"] = to_tuple(ActiveStatus()) try: # If a config is invalid then prometheus would exit immediately. @@ -594,10 +607,10 @@ def _configure(self, _): self._prometheus_layer.to_dict(), e, ) - self._stored.status["config"] = early_return_statuses["restart_fail"] + self._stored.status["config"] = to_tuple(early_return_statuses["restart_fail"]) return else: - self._stored.status["config"] = ActiveStatus() + self._stored.status["config"] = to_tuple(ActiveStatus()) # We only need to reload if pebble didn't replan (if pebble replanned, then new config # would be picked up on startup anyway). @@ -605,14 +618,14 @@ def _configure(self, _): reloaded = self._prometheus_client.reload_configuration() if not reloaded: logger.error("Prometheus failed to reload the configuration") - self._stored.status["config"] = early_return_statuses["cfg_load_fail"] + self._stored.status["config"] = to_tuple(early_return_statuses["cfg_load_fail"]) return if reloaded == "read_timeout": - self._stored.status["config"] = early_return_statuses["cfg_load_timeout"] + self._stored.status["config"] = to_tuple(early_return_statuses["cfg_load_timeout"]) return logger.info("Prometheus configuration reloaded") - self._stored.status["config"] = ActiveStatus() + self._stored.status["config"] = to_tuple(ActiveStatus()) def _on_pebble_ready(self, event) -> None: """Pebble ready hook. 
@@ -723,7 +736,9 @@ def _generate_command(self) -> str: except ValueError as e: logger.warning(e) - self._stored.status["retention_size"] = BlockedStatus(f"Invalid retention size: {e}") + self._stored.status["retention_size"] = to_tuple( + BlockedStatus(f"Invalid retention size: {e}") + ) else: # `storage.tsdb.retention.size` uses the legacy binary format, so "GB" and not "GiB" @@ -734,18 +749,20 @@ def _generate_command(self) -> str: self._get_pvc_capacity(), ratio ) except ValueError as e: - self._stored.status["retention_size"] = BlockedStatus( - f"Error calculating retention size: {e}" + self._stored.status["retention_size"] = to_tuple( + BlockedStatus(f"Error calculating retention size: {e}") ) except LightkubeApiError as e: - self._stored.status["retention_size"] = BlockedStatus( - "Error calculating retention size " - f"(try running `juju trust` on this application): {e}" + self._stored.status["retention_size"] = to_tuple( + BlockedStatus( + "Error calculating retention size " + f"(try running `juju trust` on this application): {e}" + ) ) else: logger.debug("Retention size limit set to %s (%s%%)", capacity, ratio * 100) args.append(f"--storage.tsdb.retention.size={capacity}") - self._stored.status["retention_size"] = ActiveStatus() + self._stored.status["retention_size"] = to_tuple(ActiveStatus()) command = ["/bin/prometheus"] + args From 06e7cff09395078ac36517b9ca60d2576d3adb92 Mon Sep 17 00:00:00 2001 From: Leon <82407168+sed-i@users.noreply.github.com> Date: Fri, 26 Jan 2024 21:41:06 -0500 Subject: [PATCH 10/10] Apply suggestions from code review Co-authored-by: Ben Hoyt --- src/charm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/charm.py b/src/charm.py index bbe052f3..b0a51de6 100755 --- a/src/charm.py +++ b/src/charm.py @@ -110,7 +110,8 @@ def to_tuple(status: StatusBase) -> Tuple[str, str]: def to_status(tpl: Tuple[str, str]) -> StatusBase: """Convert a tuple to a StatusBase, so it could be used natively with ops.""" - return StatusBase.from_name(*tpl) + name, message = tpl + return StatusBase.from_name(name, message) @trace_charm( @@ -229,9 +230,8 @@ def __init__(self, *args): def _on_collect_unit_status(self, event: CollectStatusEvent): # "Pull" statuses - if not self._is_valid_timespec( - retention_time := self.model.config.get("metrics_retention_time", "") - ): + retention_time = self.model.config.get("metrics_retention_time", "") + if not self._is_valid_timespec(retention_time): event.add_status(BlockedStatus(f"Invalid time spec : {retention_time}")) # "Push" statuses
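
Taken together, the series converges on a small, reusable pattern: event handlers record per-component statuses into `StoredState` as plain `(name, message)` pairs, and a single collect-status handler replays them, letting ops surface the highest-priority one. The sketch below distills that final state into a standalone toy charm. It is illustrative only: the `DemoCharm` name, the component keys, and the `maximum_retention_size` default are stand-ins, and it assumes an ops release that ships `collect_unit_status`.

```python
#!/usr/bin/env python3
"""Toy distillation of the composite-status pattern from this series (not the real charm)."""
from typing import Tuple, TypedDict

from ops import CollectStatusEvent, StoredState
from ops.charm import CharmBase
from ops.main import main
from ops.model import ActiveStatus, BlockedStatus, StatusBase


class CompositeStatus(TypedDict):
    """One entry per component, stored as (name, message) so StoredState can marshal it."""

    retention_size: Tuple[str, str]
    config: Tuple[str, str]


def to_tuple(status: StatusBase) -> Tuple[str, str]:
    """Flatten a status into a marshallable pair."""
    return status.name, status.message


def to_status(tpl: Tuple[str, str]) -> StatusBase:
    """Rehydrate the correct StatusBase subclass from a stored pair."""
    name, message = tpl
    return StatusBase.from_name(name, message)


class DemoCharm(CharmBase):
    """Handlers push component statuses into stored state; collect-status replays them."""

    _stored = StoredState()

    def __init__(self, *args):
        super().__init__(*args)
        self._stored.set_default(
            status=CompositeStatus(
                retention_size=to_tuple(ActiveStatus()),
                config=to_tuple(ActiveStatus()),
            )
        )
        self.framework.observe(self.on.config_changed, self._on_config_changed)
        self.framework.observe(self.on.collect_unit_status, self._on_collect_unit_status)

    def _on_config_changed(self, _event):
        # Record the outcome for this component instead of assigning unit.status directly.
        size = str(self.model.config.get("maximum_retention_size", "80%"))
        if not size.endswith("%"):
            self._stored.status["retention_size"] = to_tuple(
                BlockedStatus(f"Invalid retention size: {size}")
            )
        else:
            self._stored.status["retention_size"] = to_tuple(ActiveStatus())

    def _on_collect_unit_status(self, event: CollectStatusEvent):
        # Replay every recorded component status; ops keeps the highest-priority one
        # (blocked > maintenance > waiting > active), so no manual bookkeeping is needed.
        for status in self._stored.status.values():
            event.add_status(to_status(status))


if __name__ == "__main__":
    main(DemoCharm)
```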
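
The `Tuple[str, str]` detour in patch 09 exists because `StoredState` can only persist simple, marshallable types, while `StatusBase.from_name` can rebuild the right subclass from its registered name. A quick round-trip check, runnable in a plain Python session with ops installed (the message text is made up):

```python
from ops.model import BlockedStatus, StatusBase

status = BlockedStatus("Invalid time spec : 1h2x")
pair = (status.name, status.message)    # ("blocked", "Invalid time spec : 1h2x")
restored = StatusBase.from_name(*pair)  # looks up the subclass registered as "blocked"

assert isinstance(restored, BlockedStatus)
assert restored.message == status.message
```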
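
On the test side, the recurring `self.harness.evaluate_status()` additions in patches 07–08 are the counterpart of this move: once statuses flow through collect-status, the Harness only computes `unit.status` when explicitly asked. A minimal sketch against the `DemoCharm` toy above (again an assumption-laden illustration, requiring a Harness version that provides `evaluate_status`):

```python
import unittest

from ops.model import ActiveStatus
from ops.testing import Harness


class TestDemoCharmStatus(unittest.TestCase):
    def setUp(self):
        self.harness = Harness(DemoCharm)  # DemoCharm from the sketch above
        self.addCleanup(self.harness.cleanup)
        self.harness.begin()

    def test_unit_starts_active(self):
        # collect_unit_status only fires when the harness is told to evaluate;
        # without this call, unit.status would still hold its pre-hook default.
        self.harness.evaluate_status()
        self.assertIsInstance(self.harness.model.unit.status, ActiveStatus)


if __name__ == "__main__":
    unittest.main()
```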