From fb9de0b6bd3bd3f54e030a0ce9cf9430ab9286b6 Mon Sep 17 00:00:00 2001 From: Mateusz Kulewicz Date: Tue, 7 Jan 2025 19:12:19 +0100 Subject: [PATCH 1/7] Don't expose tracing endpoints over cos_agent when backend isn't available --- lib/charms/grafana_agent/v0/cos_agent.py | 101 ++++++++++++++--------- src/charm.py | 22 +++++ src/grafana_agent.py | 28 ++----- tests/scenario/test_cos_agent_e2e.py | 1 + tests/scenario/test_peer_relation.py | 1 + 5 files changed, 91 insertions(+), 62 deletions(-) diff --git a/lib/charms/grafana_agent/v0/cos_agent.py b/lib/charms/grafana_agent/v0/cos_agent.py index 1ea79a6..5a54078 100644 --- a/lib/charms/grafana_agent/v0/cos_agent.py +++ b/lib/charms/grafana_agent/v0/cos_agent.py @@ -252,7 +252,7 @@ class _MetricsEndpointDict(TypedDict): LIBID = "dc15fa84cef84ce58155fb84f6c6213a" LIBAPI = 0 -LIBPATCH = 12 +LIBPATCH = 13 PYDEPS = ["cosl", "pydantic"] @@ -316,7 +316,11 @@ class NotReadyError(TracingError): """Raised by the provider wrapper if a requirer hasn't published the required data (yet).""" -class ProtocolNotRequestedError(TracingError): +class ProtocolNotFoundError(TracingError): + """Raised if the user doesn't receive an endpoint for a protocol it requested.""" + + +class ProtocolNotRequestedError(ProtocolNotFoundError): """Raised if the user attempts to obtain an endpoint for a protocol it did not request.""" @@ -578,7 +582,7 @@ class Receiver(pydantic.BaseModel): """Specification of an active receiver.""" protocol: ProtocolType = pydantic.Field(..., description="Receiver protocol name and type.") - url: str = pydantic.Field( + url: Optional[str] = pydantic.Field( ..., description="""URL at which the receiver is reachable. If there's an ingress, it would be the external URL. Otherwise, it would be the service's fqdn or internal IP. @@ -802,18 +806,22 @@ def get_all_endpoints( def _get_tracing_endpoint( self, relation: Optional[Relation], protocol: ReceiverProtocol ) -> Optional[str]: + """Return a tracing endpoint URL if it is available or raise a ProtocolNotFoundError.""" unit_data = self.get_all_endpoints(relation) if not unit_data: - return None + # we didn't find the protocol because the remote end didn't publish any data yet + # it might also mean that grafana-agent doesn't have a relation to the tracing backend + raise ProtocolNotFoundError(protocol) receivers: List[Receiver] = [i for i in unit_data.receivers if i.protocol.name == protocol] if not receivers: - logger.error(f"no receiver found with protocol={protocol!r}") - return None + # we didn't find the protocol because grafana-agent didn't return us the protocol that we requested + # the caller might want to verify that we did indeed request this protocol + # TODO put this in the error message maybe? + raise ProtocolNotFoundError(protocol) if len(receivers) > 1: - logger.error( + logger.warning( f"too many receivers with protocol={protocol!r}; using first one. Found: {receivers}" ) - return None receiver = receivers[0] return receiver.url @@ -821,9 +829,28 @@ def _get_tracing_endpoint( def get_tracing_endpoint( self, protocol: ReceiverProtocol, relation: Optional[Relation] = None ) -> Optional[str]: - """Receiver endpoint for the given protocol.""" - endpoint = self._get_tracing_endpoint(relation or self._relation, protocol=protocol) - if not endpoint: + """Receiver endpoint for the given protocol. + + It could happen that this function gets called before the provider publishes the endpoints. + In such a scenario, if a non-leader unit calls this function, a permission denied exception will be raised due to + restricted access. To prevent this, this function needs to be guarded by the `is_ready` check. + + Raises: + ProtocolNotRequestedError: + If the charm unit is the leader unit and attempts to obtain an endpoint for a protocol it did not request. + ProtocolNotFoundError: + If the charm attempts to obtain an endpoint when grafana-agent isn't related to a tracing backend. + """ + try: + endpoint = self._get_tracing_endpoint(relation or self._relation, protocol=protocol) + # because of the backwards compatibility requirements, the requirer is returning None + # (written as string to the databag) when tracing backend isn't connected. + # See more in the comment in `COSAgentRequirer#update_tracing_receivers` in this file. + if endpoint == "None": + return None + return endpoint + except ProtocolNotFoundError: + # let's see if we didn't find it because we didn't request the endpoint requested_protocols = set() relations = [relation] if relation else self.relations for relation in relations: @@ -838,8 +865,7 @@ def get_tracing_endpoint( if protocol not in requested_protocols: raise ProtocolNotRequestedError(protocol, relation) - return None - return endpoint + raise class COSAgentDataChanged(EventBase): @@ -982,18 +1008,26 @@ def _on_relation_data_changed(self, event: RelationChangedEvent): def update_tracing_receivers(self): """Updates the list of exposed tracing receivers in all relations.""" + requested_tracing_protocols = self.requested_tracing_protocols() + try: for relation in self._charm.model.relations[self._relation_name]: CosAgentRequirerUnitData( receivers=[ Receiver( - url=f"{self._get_tracing_receiver_url(protocol)}", + # if tracing isn't ready, we don't want the wrong receiver URLs present in the databag. + # however, because of the backwards compatibility requirements, we need to still provide + # the protocols list so that the charm with older cos_agent version doesn't error its hooks. + # before this change was added, the charm with old cos_agent version threw exceptions with + # connections to grafana-agent timing out, after that it will still throw exceptions, but + # faster: `MissingSchema: Invalid URL 'None/v1/traces': No scheme supplied.` + url=f"{self._get_tracing_receiver_url(protocol) if self._charm.tracing.is_ready() else None}", # type: ignore protocol=ProtocolType( name=protocol, type=receiver_protocol_to_transport_protocol[protocol], ), ) - for protocol in self.requested_tracing_protocols() + for protocol in requested_tracing_protocols ], ).dump(relation.data[self._charm.unit]) @@ -1306,6 +1340,7 @@ def dashboards(self) -> List[Dict[str, str]]: def charm_tracing_config( endpoint_requirer: COSAgentProvider, cert_path: Optional[Union[Path, str]] ) -> Tuple[Optional[str], Optional[str]]: + # TODO this probably needs an update - surely about blocked state """Utility function to determine the charm_tracing config you will likely want. If no endpoint is provided: @@ -1318,39 +1353,27 @@ def charm_tracing_config( proceed with charm tracing (with or without tls, as appropriate) Usage: - If you are using charm_tracing >= v1.9: - >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm - >>> from lib.charms.tempo_k8s.v0.cos_agent import charm_tracing_config + >>> from lib.charms.tempo_coordinator_k8s.v0.charm_tracing import trace_charm + >>> from lib.charms.tempo_coordinator_k8s.v0.tracing import charm_tracing_config >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") >>> class MyCharm(...): >>> _cert_path = "/path/to/cert/on/charm/container.crt" >>> def __init__(self, ...): - >>> self.cos_agent = COSAgentProvider(...) + >>> self.tracing = TracingEndpointRequirer(...) >>> self.my_endpoint, self.cert_path = charm_tracing_config( - ... self.cos_agent, self._cert_path) - - If you are using charm_tracing < v1.9: - >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm - >>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config - >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") - >>> class MyCharm(...): - >>> _cert_path = "/path/to/cert/on/charm/container.crt" - >>> def __init__(self, ...): - >>> self.cos_agent = COSAgentProvider(...) - >>> self.my_endpoint, self.cert_path = charm_tracing_config( - ... self.cos_agent, self._cert_path) - >>> @property - >>> def my_endpoint(self): - >>> return self._my_endpoint - >>> @property - >>> def cert_path(self): - >>> return self._cert_path - + ... self.tracing, self._cert_path) """ if not endpoint_requirer.is_ready(): return None, None - endpoint = endpoint_requirer.get_tracing_endpoint("otlp_http") + try: + endpoint = endpoint_requirer.get_tracing_endpoint("otlp_http") + except ProtocolNotFoundError: + logger.warn( + "Endpoint for tracing wasn't provided as tracing backend isn't ready yet. If grafana-agent isn't connected to a tracing backend, integrate it. Otherwise this issue should resolve itself in a few events." + ) + return None, None + if not endpoint: return None, None diff --git a/src/charm.py b/src/charm.py index e59f406..fbac0c2 100755 --- a/src/charm.py +++ b/src/charm.py @@ -196,6 +196,16 @@ def __init__(self, *args): self.framework.observe(self.on.start, self._on_start) self.framework.observe(self.on.stop, self._on_stop) self.framework.observe(self.on.remove, self._on_remove) + # we need to observe these events in the charm as change of tracing endpoint may result in removing a receiver + # in the databags for charms related with us over cos-agent + self.framework.observe( + self.tracing.on.endpoint_changed, # pyright: ignore + self._on_tracing_endpoint_changed, + ) + self.framework.observe( + self.tracing.on.endpoint_removed, # pyright: ignore + self._on_tracing_endpoint_removed, + ) @property def snap(self): @@ -229,6 +239,18 @@ def _on_cos_validation_error(self, event): self._update_status() + def _on_tracing_endpoint_changed(self, _event) -> None: + """Event handler for the tracing endpoint-changed event.""" + super()._update_config() + super()._update_status() + self._cos.update_tracing_receivers() + + def _on_tracing_endpoint_removed(self, _event) -> None: + """Event handler for the tracing endpoint-removed event.""" + super()._update_config() + super()._update_status() + self._cos.update_tracing_receivers() + def _verify_snap_track(self) -> None: try: # install_ga_snap calls snap.ensure so it should do the right thing whether the track diff --git a/src/grafana_agent.py b/src/grafana_agent.py index 0c0dfba..92338e8 100644 --- a/src/grafana_agent.py +++ b/src/grafana_agent.py @@ -175,7 +175,7 @@ def __init__(self, *args): ) self.framework.observe(self.cert.on.cert_changed, self._on_cert_changed) # pyright: ignore - self._tracing = TracingEndpointRequirer( + self.tracing = TracingEndpointRequirer( self, protocols=[ "otlp_http", # for charm traces @@ -183,19 +183,11 @@ def __init__(self, *args): ], ) self._charm_tracing_endpoint, self._server_cert = charm_tracing_config( - self._tracing, self._ca_path + self.tracing, self._ca_path ) self._cloud = GrafanaCloudConfigRequirer(self) - self.framework.observe( - self._tracing.on.endpoint_changed, # pyright: ignore - self.on_tracing_endpoint_changed, - ) - self.framework.observe( - self._tracing.on.endpoint_removed, # pyright: ignore - self.on_tracing_endpoint_removed, - ) self.framework.observe( self._cloud.on.cloud_config_available, # pyright: ignore self._on_cloud_config_available, @@ -489,16 +481,6 @@ def on_remote_write_changed(self, _event) -> None: self._update_status() self._update_metrics_alerts() - def on_tracing_endpoint_changed(self, _event) -> None: - """Event handler for the tracing endpoint-changed event.""" - self._update_config() - self._update_status() - - def on_tracing_endpoint_removed(self, _event) -> None: - """Event handler for the tracing endpoint-removed event.""" - self._update_config() - self._update_status() - def _update_status(self, *_): """Determine the charm status based on relation health and grafana-agent service readiness. @@ -685,13 +667,13 @@ def _tempo_endpoints_with_tls(self) -> List[Dict[str, Any]]: FIXME: these should be separate concerns. """ tempo_endpoints = [] - if self._tracing.is_ready(): + if self.tracing.is_ready(): tempo_endpoints.append( { # outgoing traces are all otlp/grpc # cit: While Tempo and the Agent both can ingest in multiple formats, # the Agent only exports in OTLP gRPC and HTTP. - "endpoint": self._tracing.get_endpoint("otlp_grpc"), + "endpoint": self.tracing.get_endpoint("otlp_grpc"), "insecure": False if self.cert.enabled else True, } ) @@ -831,7 +813,7 @@ def _tracing_receivers(self) -> Dict[str, Union[Any, List[Any]]]: Returns: a dict with the receivers config. """ - if not self._tracing.is_ready(): + if not self.tracing.is_ready(): return {} receivers_set = self.requested_tracing_protocols diff --git a/tests/scenario/test_cos_agent_e2e.py b/tests/scenario/test_cos_agent_e2e.py index 6639312..ac7eab5 100644 --- a/tests/scenario/test_cos_agent_e2e.py +++ b/tests/scenario/test_cos_agent_e2e.py @@ -104,6 +104,7 @@ def __init__(self, framework: Framework): self, refresh_events=[self.on.cos_agent_relation_changed], ) + self.tracing = MagicMock() return MySubordinate diff --git a/tests/scenario/test_peer_relation.py b/tests/scenario/test_peer_relation.py index e2993ea..d26f234 100644 --- a/tests/scenario/test_peer_relation.py +++ b/tests/scenario/test_peer_relation.py @@ -65,6 +65,7 @@ def __init__(self, framework: Framework): super().__init__(framework) self.cosagent = COSAgentRequirer(self) self.prom = PrometheusRemoteWriteConsumer(self) + self.tracing = MagicMock() framework.observe(self.cosagent.on.data_changed, self._on_cosagent_data_changed) def _on_cosagent_data_changed(self, _): From 1e2169eb540e25a543cf930c125ccd3fc0cf8317 Mon Sep 17 00:00:00 2001 From: Mateusz Kulewicz Date: Wed, 8 Jan 2025 12:14:57 +0100 Subject: [PATCH 2/7] Use real null values, not a string named None --- lib/charms/grafana_agent/v0/cos_agent.py | 25 ++++++++++++------------ 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/lib/charms/grafana_agent/v0/cos_agent.py b/lib/charms/grafana_agent/v0/cos_agent.py index 5a54078..2fe3cdd 100644 --- a/lib/charms/grafana_agent/v0/cos_agent.py +++ b/lib/charms/grafana_agent/v0/cos_agent.py @@ -805,7 +805,7 @@ def get_all_endpoints( def _get_tracing_endpoint( self, relation: Optional[Relation], protocol: ReceiverProtocol - ) -> Optional[str]: + ) -> str: """Return a tracing endpoint URL if it is available or raise a ProtocolNotFoundError.""" unit_data = self.get_all_endpoints(relation) if not unit_data: @@ -816,7 +816,6 @@ def _get_tracing_endpoint( if not receivers: # we didn't find the protocol because grafana-agent didn't return us the protocol that we requested # the caller might want to verify that we did indeed request this protocol - # TODO put this in the error message maybe? raise ProtocolNotFoundError(protocol) if len(receivers) > 1: logger.warning( @@ -824,11 +823,14 @@ def _get_tracing_endpoint( ) receiver = receivers[0] + if not receiver.url: + # grafana-agent isn't connected to the tracing backend yet + raise ProtocolNotFoundError(protocol) return receiver.url def get_tracing_endpoint( self, protocol: ReceiverProtocol, relation: Optional[Relation] = None - ) -> Optional[str]: + ) -> str: """Receiver endpoint for the given protocol. It could happen that this function gets called before the provider publishes the endpoints. @@ -842,13 +844,7 @@ def get_tracing_endpoint( If the charm attempts to obtain an endpoint when grafana-agent isn't related to a tracing backend. """ try: - endpoint = self._get_tracing_endpoint(relation or self._relation, protocol=protocol) - # because of the backwards compatibility requirements, the requirer is returning None - # (written as string to the databag) when tracing backend isn't connected. - # See more in the comment in `COSAgentRequirer#update_tracing_receivers` in this file. - if endpoint == "None": - return None - return endpoint + return self._get_tracing_endpoint(relation or self._relation, protocol=protocol) except ProtocolNotFoundError: # let's see if we didn't find it because we didn't request the endpoint requested_protocols = set() @@ -1019,9 +1015,12 @@ def update_tracing_receivers(self): # however, because of the backwards compatibility requirements, we need to still provide # the protocols list so that the charm with older cos_agent version doesn't error its hooks. # before this change was added, the charm with old cos_agent version threw exceptions with - # connections to grafana-agent timing out, after that it will still throw exceptions, but - # faster: `MissingSchema: Invalid URL 'None/v1/traces': No scheme supplied.` - url=f"{self._get_tracing_receiver_url(protocol) if self._charm.tracing.is_ready() else None}", # type: ignore + # connections to grafana-agent timing out. After the change, the charm will fail validating + # databag contents (as it expects a string in URL) but that won't cause any errors as + # tracing endpoints are the only content in the grafana-agent's side of the databag. + url=f"{self._get_tracing_receiver_url(protocol)}" + if self._charm.tracing.is_ready() # type: ignore + else None, protocol=ProtocolType( name=protocol, type=receiver_protocol_to_transport_protocol[protocol], From 0e0486ac91e9c365f8ea2e1bf21f4f699eace2ab Mon Sep 17 00:00:00 2001 From: Mateusz Kulewicz Date: Wed, 8 Jan 2025 12:50:40 +0100 Subject: [PATCH 3/7] comment leftovers --- lib/charms/grafana_agent/v0/cos_agent.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/charms/grafana_agent/v0/cos_agent.py b/lib/charms/grafana_agent/v0/cos_agent.py index 2fe3cdd..9a64f2e 100644 --- a/lib/charms/grafana_agent/v0/cos_agent.py +++ b/lib/charms/grafana_agent/v0/cos_agent.py @@ -1339,7 +1339,6 @@ def dashboards(self) -> List[Dict[str, str]]: def charm_tracing_config( endpoint_requirer: COSAgentProvider, cert_path: Optional[Union[Path, str]] ) -> Tuple[Optional[str], Optional[str]]: - # TODO this probably needs an update - surely about blocked state """Utility function to determine the charm_tracing config you will likely want. If no endpoint is provided: From 8b6e7aa7664955151c44e72d0318d927c62f0ca7 Mon Sep 17 00:00:00 2001 From: Mateusz Kulewicz Date: Wed, 8 Jan 2025 16:47:32 +0100 Subject: [PATCH 4/7] pin cosl to <0.0.49 --- requirements.txt | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9114d86..de9661d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ # FIXME: Packing the charm with 2.2.0+139.gd011d92 will not include dependencies in PYDEPS key: # https://chat.charmhub.io/charmhub/pl/wngp665ycjnb78ar9ojrfhxjkr # That's why we are including cosl here until the bug in charmcraft is solved -cosl +cosl < 0.0.49 ops > 2.5.0 pydantic < 2 requests diff --git a/tox.ini b/tox.ini index a30244b..b7f86a3 100644 --- a/tox.ini +++ b/tox.ini @@ -69,7 +69,7 @@ description = Run scenario tests on LXD deps = -r{toxinidir}/requirements.txt pytest - cosl + cosl<0.0.49 ops[testing] commands = pytest -vv --tb native --log-cli-level=INFO -s {posargs} {[vars]tst_path}/scenario From fcc9317add7eb6fe92de74bf281a94a567791f8a Mon Sep 17 00:00:00 2001 From: Mateusz Kulewicz Date: Wed, 8 Jan 2025 17:37:12 +0100 Subject: [PATCH 5/7] better naming --- lib/charms/grafana_agent/v0/cos_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/charms/grafana_agent/v0/cos_agent.py b/lib/charms/grafana_agent/v0/cos_agent.py index 9a64f2e..46e547a 100644 --- a/lib/charms/grafana_agent/v0/cos_agent.py +++ b/lib/charms/grafana_agent/v0/cos_agent.py @@ -1346,7 +1346,7 @@ def charm_tracing_config( If https endpoint is provided but cert_path is not found on disk: disable charm tracing. If https endpoint is provided and cert_path is None: - ERROR + raise TracingError Else: proceed with charm tracing (with or without tls, as appropriate) From 3a218b4e374517e2ab15838af0d8aa0e7a937320 Mon Sep 17 00:00:00 2001 From: Michael Thamm Date: Wed, 8 Jan 2025 11:45:41 -0500 Subject: [PATCH 6/7] fix scenario test group naming --- requirements.txt | 2 +- tests/scenario/test_alert_labels.py | 2 +- tox.ini | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index de9661d..9114d86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ # FIXME: Packing the charm with 2.2.0+139.gd011d92 will not include dependencies in PYDEPS key: # https://chat.charmhub.io/charmhub/pl/wngp665ycjnb78ar9ojrfhxjkr # That's why we are including cosl here until the bug in charmcraft is solved -cosl < 0.0.49 +cosl ops > 2.5.0 pydantic < 2 requests diff --git a/tests/scenario/test_alert_labels.py b/tests/scenario/test_alert_labels.py index bfeda06..c7f436a 100644 --- a/tests/scenario/test_alert_labels.py +++ b/tests/scenario/test_alert_labels.py @@ -120,7 +120,7 @@ def test_metrics_alert_rule_labels(charm_config): ) for group in alert_rules["groups"]: for rule in group["rules"]: - if "grafana-agent_alertgroup_alerts" in group["name"]: + if "grafana_agent_alertgroup_alerts" in group["name"]: assert ( rule["labels"]["juju_application"] == "primary" or rule["labels"]["juju_application"] == "subordinate" diff --git a/tox.ini b/tox.ini index b7f86a3..a30244b 100644 --- a/tox.ini +++ b/tox.ini @@ -69,7 +69,7 @@ description = Run scenario tests on LXD deps = -r{toxinidir}/requirements.txt pytest - cosl<0.0.49 + cosl ops[testing] commands = pytest -vv --tb native --log-cli-level=INFO -s {posargs} {[vars]tst_path}/scenario From 4f0da8877d4e1aa83e674c6156b6bf2efe68b1a0 Mon Sep 17 00:00:00 2001 From: Mateusz Kulewicz Date: Wed, 15 Jan 2025 10:08:04 +0100 Subject: [PATCH 7/7] Don't use a variable, LIBPATCH is now 14 --- lib/charms/grafana_agent/v0/cos_agent.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/charms/grafana_agent/v0/cos_agent.py b/lib/charms/grafana_agent/v0/cos_agent.py index 0d3f075..3ff52f9 100644 --- a/lib/charms/grafana_agent/v0/cos_agent.py +++ b/lib/charms/grafana_agent/v0/cos_agent.py @@ -252,7 +252,7 @@ class _MetricsEndpointDict(TypedDict): LIBID = "dc15fa84cef84ce58155fb84f6c6213a" LIBAPI = 0 -LIBPATCH = 13 +LIBPATCH = 14 PYDEPS = ["cosl", "pydantic"] @@ -1004,8 +1004,6 @@ def _on_relation_data_changed(self, event: RelationChangedEvent): def update_tracing_receivers(self): """Updates the list of exposed tracing receivers in all relations.""" - requested_tracing_protocols = self.requested_tracing_protocols() - try: for relation in self._charm.model.relations[self._relation_name]: CosAgentRequirerUnitData( @@ -1026,7 +1024,7 @@ def update_tracing_receivers(self): type=receiver_protocol_to_transport_protocol[protocol], ), ) - for protocol in requested_tracing_protocols + for protocol in self.requested_tracing_protocols() ], ).dump(relation.data[self._charm.unit])