Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't expose tracing endpoints over cos_agent when backend isn't available #228

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 63 additions & 42 deletions lib/charms/grafana_agent/v0/cos_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ class _MetricsEndpointDict(TypedDict):

LIBID = "dc15fa84cef84ce58155fb84f6c6213a"
LIBAPI = 0
LIBPATCH = 12
LIBPATCH = 13

PYDEPS = ["cosl", "pydantic"]

Expand Down Expand Up @@ -316,7 +316,11 @@ class NotReadyError(TracingError):
"""Raised by the provider wrapper if a requirer hasn't published the required data (yet)."""


class ProtocolNotRequestedError(TracingError):
class ProtocolNotFoundError(TracingError):
"""Raised if the user doesn't receive an endpoint for a protocol it requested."""


class ProtocolNotRequestedError(ProtocolNotFoundError):
"""Raised if the user attempts to obtain an endpoint for a protocol it did not request."""


Expand Down Expand Up @@ -578,7 +582,7 @@ class Receiver(pydantic.BaseModel):
"""Specification of an active receiver."""

protocol: ProtocolType = pydantic.Field(..., description="Receiver protocol name and type.")
url: str = pydantic.Field(
url: Optional[str] = pydantic.Field(
...,
description="""URL at which the receiver is reachable. If there's an ingress, it would be the external URL.
Otherwise, it would be the service's fqdn or internal IP.
Expand Down Expand Up @@ -801,29 +805,48 @@ def get_all_endpoints(

def _get_tracing_endpoint(
self, relation: Optional[Relation], protocol: ReceiverProtocol
) -> Optional[str]:
) -> str:
"""Return a tracing endpoint URL if it is available or raise a ProtocolNotFoundError."""
unit_data = self.get_all_endpoints(relation)
if not unit_data:
return None
# we didn't find the protocol because the remote end didn't publish any data yet
# it might also mean that grafana-agent doesn't have a relation to the tracing backend
raise ProtocolNotFoundError(protocol)
receivers: List[Receiver] = [i for i in unit_data.receivers if i.protocol.name == protocol]
if not receivers:
logger.error(f"no receiver found with protocol={protocol!r}")
return None
# we didn't find the protocol because grafana-agent didn't return us the protocol that we requested
# the caller might want to verify that we did indeed request this protocol
raise ProtocolNotFoundError(protocol)
if len(receivers) > 1:
logger.error(
logger.warning(
f"too many receivers with protocol={protocol!r}; using first one. Found: {receivers}"
)
return None

receiver = receivers[0]
if not receiver.url:
# grafana-agent isn't connected to the tracing backend yet
raise ProtocolNotFoundError(protocol)
return receiver.url

def get_tracing_endpoint(
self, protocol: ReceiverProtocol, relation: Optional[Relation] = None
) -> Optional[str]:
"""Receiver endpoint for the given protocol."""
endpoint = self._get_tracing_endpoint(relation or self._relation, protocol=protocol)
if not endpoint:
) -> str:
"""Receiver endpoint for the given protocol.

It could happen that this function gets called before the provider publishes the endpoints.
In such a scenario, if a non-leader unit calls this function, a permission denied exception will be raised due to
restricted access. To prevent this, this function needs to be guarded by the `is_ready` check.

Raises:
ProtocolNotRequestedError:
If the charm unit is the leader unit and attempts to obtain an endpoint for a protocol it did not request.
ProtocolNotFoundError:
If the charm attempts to obtain an endpoint when grafana-agent isn't related to a tracing backend.
"""
try:
return self._get_tracing_endpoint(relation or self._relation, protocol=protocol)
except ProtocolNotFoundError:
# let's see if we didn't find it because we didn't request the endpoint
requested_protocols = set()
relations = [relation] if relation else self.relations
for relation in relations:
Expand All @@ -838,8 +861,7 @@ def get_tracing_endpoint(
if protocol not in requested_protocols:
raise ProtocolNotRequestedError(protocol, relation)

return None
return endpoint
raise


class COSAgentDataChanged(EventBase):
Expand Down Expand Up @@ -982,18 +1004,29 @@ def _on_relation_data_changed(self, event: RelationChangedEvent):

def update_tracing_receivers(self):
"""Updates the list of exposed tracing receivers in all relations."""
requested_tracing_protocols = self.requested_tracing_protocols()

try:
for relation in self._charm.model.relations[self._relation_name]:
CosAgentRequirerUnitData(
receivers=[
Receiver(
url=f"{self._get_tracing_receiver_url(protocol)}",
# if tracing isn't ready, we don't want the wrong receiver URLs present in the databag.
# however, because of the backwards compatibility requirements, we need to still provide
# the protocols list so that the charm with older cos_agent version doesn't error its hooks.
# before this change was added, the charm with old cos_agent version threw exceptions with
# connections to grafana-agent timing out. After the change, the charm will fail validating
# databag contents (as it expects a string in URL) but that won't cause any errors as
# tracing endpoints are the only content in the grafana-agent's side of the databag.
url=f"{self._get_tracing_receiver_url(protocol)}"
if self._charm.tracing.is_ready() # type: ignore
else None,
protocol=ProtocolType(
name=protocol,
type=receiver_protocol_to_transport_protocol[protocol],
),
)
for protocol in self.requested_tracing_protocols()
for protocol in requested_tracing_protocols
],
).dump(relation.data[self._charm.unit])

Expand Down Expand Up @@ -1313,44 +1346,32 @@ def charm_tracing_config(
If https endpoint is provided but cert_path is not found on disk:
disable charm tracing.
If https endpoint is provided and cert_path is None:
ERROR
raise TracingError
Else:
proceed with charm tracing (with or without tls, as appropriate)

Usage:
If you are using charm_tracing >= v1.9:
>>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm
>>> from lib.charms.tempo_k8s.v0.cos_agent import charm_tracing_config
>>> from lib.charms.tempo_coordinator_k8s.v0.charm_tracing import trace_charm
>>> from lib.charms.tempo_coordinator_k8s.v0.tracing import charm_tracing_config
>>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path")
>>> class MyCharm(...):
>>> _cert_path = "/path/to/cert/on/charm/container.crt"
>>> def __init__(self, ...):
>>> self.cos_agent = COSAgentProvider(...)
>>> self.tracing = TracingEndpointRequirer(...)
>>> self.my_endpoint, self.cert_path = charm_tracing_config(
... self.cos_agent, self._cert_path)

If you are using charm_tracing < v1.9:
>>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm
>>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config
>>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path")
>>> class MyCharm(...):
>>> _cert_path = "/path/to/cert/on/charm/container.crt"
>>> def __init__(self, ...):
>>> self.cos_agent = COSAgentProvider(...)
>>> self.my_endpoint, self.cert_path = charm_tracing_config(
... self.cos_agent, self._cert_path)
>>> @property
>>> def my_endpoint(self):
>>> return self._my_endpoint
>>> @property
>>> def cert_path(self):
>>> return self._cert_path

... self.tracing, self._cert_path)
"""
if not endpoint_requirer.is_ready():
return None, None

endpoint = endpoint_requirer.get_tracing_endpoint("otlp_http")
try:
endpoint = endpoint_requirer.get_tracing_endpoint("otlp_http")
except ProtocolNotFoundError:
logger.warn(
"Endpoint for tracing wasn't provided as tracing backend isn't ready yet. If grafana-agent isn't connected to a tracing backend, integrate it. Otherwise this issue should resolve itself in a few events."
)
return None, None

if not endpoint:
return None, None

Expand Down
22 changes: 22 additions & 0 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,16 @@ def __init__(self, *args):
self.framework.observe(self.on.start, self._on_start)
self.framework.observe(self.on.stop, self._on_stop)
self.framework.observe(self.on.remove, self._on_remove)
# we need to observe these events in the charm as change of tracing endpoint may result in removing a receiver
# in the databags for charms related with us over cos-agent
self.framework.observe(
self.tracing.on.endpoint_changed, # pyright: ignore
self._on_tracing_endpoint_changed,
)
self.framework.observe(
self.tracing.on.endpoint_removed, # pyright: ignore
self._on_tracing_endpoint_removed,
)

@property
def snap(self):
Expand Down Expand Up @@ -229,6 +239,18 @@ def _on_cos_validation_error(self, event):

self._update_status()

def _on_tracing_endpoint_changed(self, _event) -> None:
"""Event handler for the tracing endpoint-changed event."""
super()._update_config()
super()._update_status()
self._cos.update_tracing_receivers()

def _on_tracing_endpoint_removed(self, _event) -> None:
"""Event handler for the tracing endpoint-removed event."""
super()._update_config()
super()._update_status()
self._cos.update_tracing_receivers()

def _verify_snap_track(self) -> None:
try:
# install_ga_snap calls snap.ensure so it should do the right thing whether the track
Expand Down
28 changes: 5 additions & 23 deletions src/grafana_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,27 +175,19 @@ def __init__(self, *args):
)
self.framework.observe(self.cert.on.cert_changed, self._on_cert_changed) # pyright: ignore

self._tracing = TracingEndpointRequirer(
self.tracing = TracingEndpointRequirer(
self,
protocols=[
"otlp_http", # for charm traces
"otlp_grpc", # for forwarding workload traces
],
)
self._charm_tracing_endpoint, self._server_cert = charm_tracing_config(
self._tracing, self._ca_path
self.tracing, self._ca_path
)

self._cloud = GrafanaCloudConfigRequirer(self)

self.framework.observe(
self._tracing.on.endpoint_changed, # pyright: ignore
self.on_tracing_endpoint_changed,
)
self.framework.observe(
self._tracing.on.endpoint_removed, # pyright: ignore
self.on_tracing_endpoint_removed,
)
self.framework.observe(
self._cloud.on.cloud_config_available, # pyright: ignore
self._on_cloud_config_available,
Expand Down Expand Up @@ -489,16 +481,6 @@ def on_remote_write_changed(self, _event) -> None:
self._update_status()
self._update_metrics_alerts()

def on_tracing_endpoint_changed(self, _event) -> None:
"""Event handler for the tracing endpoint-changed event."""
self._update_config()
self._update_status()

def on_tracing_endpoint_removed(self, _event) -> None:
"""Event handler for the tracing endpoint-removed event."""
self._update_config()
self._update_status()

def _update_status(self, *_):
"""Determine the charm status based on relation health and grafana-agent service readiness.

Expand Down Expand Up @@ -685,13 +667,13 @@ def _tempo_endpoints_with_tls(self) -> List[Dict[str, Any]]:
FIXME: these should be separate concerns.
"""
tempo_endpoints = []
if self._tracing.is_ready():
if self.tracing.is_ready():
tempo_endpoints.append(
{
# outgoing traces are all otlp/grpc
# cit: While Tempo and the Agent both can ingest in multiple formats,
# the Agent only exports in OTLP gRPC and HTTP.
"endpoint": self._tracing.get_endpoint("otlp_grpc"),
"endpoint": self.tracing.get_endpoint("otlp_grpc"),
"insecure": False if self.cert.enabled else True,
}
)
Expand Down Expand Up @@ -831,7 +813,7 @@ def _tracing_receivers(self) -> Dict[str, Union[Any, List[Any]]]:
Returns:
a dict with the receivers config.
"""
if not self._tracing.is_ready():
if not self.tracing.is_ready():
return {}

receivers_set = self.requested_tracing_protocols
Expand Down
2 changes: 1 addition & 1 deletion tests/scenario/test_alert_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def test_metrics_alert_rule_labels(charm_config):
)
for group in alert_rules["groups"]:
for rule in group["rules"]:
if "grafana-agent_alertgroup_alerts" in group["name"]:
if "grafana_agent_alertgroup_alerts" in group["name"]:
assert (
rule["labels"]["juju_application"] == "primary"
or rule["labels"]["juju_application"] == "subordinate"
Expand Down
1 change: 1 addition & 0 deletions tests/scenario/test_cos_agent_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def __init__(self, framework: Framework):
self,
refresh_events=[self.on.cos_agent_relation_changed],
)
self.tracing = MagicMock()

return MySubordinate

Expand Down
1 change: 1 addition & 0 deletions tests/scenario/test_peer_relation.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def __init__(self, framework: Framework):
super().__init__(framework)
self.cosagent = COSAgentRequirer(self)
self.prom = PrometheusRemoteWriteConsumer(self)
self.tracing = MagicMock()
framework.observe(self.cosagent.on.data_changed, self._on_cosagent_data_changed)

def _on_cosagent_data_changed(self, _):
Expand Down
Loading