diff --git a/charmcraft-22.04.yaml b/charmcraft-22.04.yaml index 29d9de3..633fa3f 100644 --- a/charmcraft-22.04.yaml +++ b/charmcraft-22.04.yaml @@ -175,3 +175,27 @@ config: even if there is no integration currently requesting it. type: boolean default: false + tracing_sample_rate_charm: + description: > + This property defines the percentage of charm traces that are sent to the tracing backend. + Setting it to 100 would mean all charm traces are kept, setting to 0 means charm traces + aren't sent to the tracing backend at all. Anything outside of the 0-100 range will be normalised + to this range by Grafana Agent. + type: float + default: 100.0 + tracing_sample_rate_workload: + description: > + This property defines the percentage of workload traces that are sent to the tracing backend. + Setting it to 100 would mean all workload traces are kept, setting to 0 means workload traces + aren't sent to the tracing backend at all. Anything outside of the 0-100 range will be normalised + to this range by Grafana Agent. + type: float + default: 1.0 + tracing_sample_rate_error: + description: > + This property defines the percentage of error traces (regardless of the type) that are sent to the tracing backend. + Setting it to 100 would mean all error traces are kept, setting to 0 means error traces + aren't sent to the tracing backend at all. Anything outside of the 0-100 range will be normalised + to this range by Grafana Agent. + type: float + default: 100.0 \ No newline at end of file diff --git a/charmcraft-24.04.yaml b/charmcraft-24.04.yaml index e726f67..08dde62 100644 --- a/charmcraft-24.04.yaml +++ b/charmcraft-24.04.yaml @@ -140,3 +140,24 @@ config: even if there is no integration currently requesting it. type: boolean default: false + tracing_sample_rate_charm: + description: > + This property defines the percentage of charm traces that are sent to the tracing backend. 
+ Setting it to 100 would mean all charm traces are kept, setting to 0 means charm traces + aren't sent to the tracing backend at all. + type: float + default: 100.0 + tracing_sample_rate_workload: + description: > + This property defines the percentage of workload traces that are sent to the tracing backend. + Setting it to 100 would mean all workload traces are kept, setting to 0 means workload traces + aren't sent to the tracing backend at all. + type: float + default: 1.0 + tracing_sample_rate_error: + description: > + This property defines the percentage of error traces (from all sources) that are sent to the tracing backend. + Setting it to 100 would mean all error traces are kept, setting to 0 means error traces + aren't sent to the tracing backend at all. + type: float + default: 100.0 diff --git a/src/grafana_agent.py b/src/grafana_agent.py index 5e46bba..1c85b8d 100644 --- a/src/grafana_agent.py +++ b/src/grafana_agent.py @@ -868,6 +868,94 @@ def _receiver_config(protocol: str): return config + @property + def _tracing_sampling(self) -> Dict[str, Any]: + # Tail-sampling policies, in the format defined by the tail sampling processor: + # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor + # Each policy is evaluated separately, and the processor then decides whether to pass the trace through. + # See the tail sampling processor documentation linked above for the full decision tree. + return { + "policies": [ + { + "name": "error-traces-policy", + "type": "and", + "and": { + "and_sub_policy": [ + { + "name": "trace-status-policy", + "type": "status_code", + "status_code": {"status_codes": ["ERROR"]}, + # The status_code policy relies on the span status of the spans within a trace; + # see https://opentelemetry.io/docs/concepts/signals/traces/#span-status for reference. + }, + { + "name": "probabilistic-policy", + "type": "probabilistic", + "probabilistic": { + "sampling_percentage": self.config.get( + "tracing_sample_rate_error" + 
) + }, + }, + ] + }, + }, + { + "name": "charm-traces-policy", + "type": "and", + "and": { + "and_sub_policy": [ + { + "name": "service-name-policy", + "type": "string_attribute", + "string_attribute": { + "key": "service.name", + "values": [".+-charm"], + "enabled_regex_matching": True, + }, + }, + { + "name": "probabilistic-policy", + "type": "probabilistic", + "probabilistic": { + "sampling_percentage": self.config.get( + "tracing_sample_rate_charm" + ) + }, + }, + ] + }, + }, + { + "name": "workload-traces-policy", + "type": "and", + "and": { + "and_sub_policy": [ + { + "name": "service-name-policy", + "type": "string_attribute", + "string_attribute": { + "key": "service.name", + "values": [".+-charm"], + "enabled_regex_matching": True, + "invert_match": True,  # presumably selects traces whose service.name does NOT match the pattern - confirm against the processor docs + }, + }, + { + "name": "probabilistic-policy", + "type": "probabilistic", + "probabilistic": { + "sampling_percentage": self.config.get( + "tracing_sample_rate_workload" + ) + }, + }, + ] + }, + }, + ] + } + @property def _tempo_config(self) -> Dict[str, Union[Any, List[Any]]]: """The tracing section of the config. 
@@ -877,6 +965,7 @@ def _tempo_config(self) -> Dict[str, Union[Any, List[Any]]]: """ endpoints = self._tempo_endpoints_with_tls() receivers = self._tracing_receivers + sampling = self._tracing_sampling if not receivers: # pushing a config with an empty receivers section will cause gagent to error out @@ -888,6 +977,7 @@ def _tempo_config(self) -> Dict[str, Union[Any, List[Any]]]: "name": "tempo", "remote_write": endpoints, "receivers": receivers, + "tail_sampling": sampling, } ] } diff --git a/tests/scenario/test_machine_charm/test_tracing_configuration.py b/tests/scenario/test_machine_charm/test_tracing_configuration.py index 00e4e11..5ff5abc 100644 --- a/tests/scenario/test_machine_charm/test_tracing_configuration.py +++ b/tests/scenario/test_machine_charm/test_tracing_configuration.py @@ -1,11 +1,18 @@ from typing import get_args +from unittest.mock import patch import pytest +import yaml from charms.grafana_agent.v0.cos_agent import ReceiverProtocol from charms.tempo_k8s.v2.tracing import ReceiverProtocol as TracingReceiverProtocol -from scenario import Context, State +from scenario import Context, Relation, State, SubordinateRelation from charm import GrafanaAgentMachineCharm +from lib.charms.grafana_agent.v0.cos_agent import ( + CosAgentProviderUnitData, + Receiver, +) +from lib.charms.tempo_k8s.v2.tracing import TracingProviderAppData def test_cos_agent_receiver_protocols_match_with_tracing(): @@ -28,3 +35,56 @@ def test_always_enable_config_variables_are_generated_for_tracing_protocols( with context.manager("config-changed", state) as mgr: charm: GrafanaAgentMachineCharm = mgr.charm assert protocol in charm.requested_tracing_protocols + + +@pytest.mark.parametrize( + "sampling_config", + ( + { + "always_enable_otlp_http": True, + }, + { + "always_enable_otlp_http": True, + "tracing_sample_rate_charm": 23.0, + "tracing_sample_rate_workload": 13.13, + "tracing_sample_rate_error": 42.42, + }, + ), +) +def test_tracing_sampling_config_is_present( + vroot, 
placeholder_cfg_path, mock_config_path, sampling_config +): + # GIVEN a cos-agent relation declaring tracing protocols and a tracing relation offering a receiver + context = Context( + charm_type=GrafanaAgentMachineCharm, + charm_root=vroot, + ) + tracing_provider = SubordinateRelation( + "cos-agent", + remote_unit_data=CosAgentProviderUnitData( + metrics_alert_rules={}, + log_alert_rules={}, + metrics_scrape_jobs=[], + log_slots=[], + dashboards=[], + subordinate=True, + tracing_protocols=["otlp_http", "otlp_grpc"], + ).dump(), + ) + tracing = Relation( + "tracing", + remote_app_data=TracingProviderAppData( + receivers=[ + Receiver(protocol={"name": "otlp_grpc", "type": "grpc"}, url="http:foo.com:1111")  # NOTE(review): URL has no "//" after the scheme - confirm this is intentional + ] + ).dump(), + ) + + state = State(leader=True, relations=[tracing, tracing_provider], config=sampling_config) + # WHEN a config_changed event is processed (with is_ready patched to True) + with patch("charm.GrafanaAgentMachineCharm.is_ready", True): + context.run("config_changed", state) + + yml = yaml.safe_load(placeholder_cfg_path.read_text()) + + assert yml["traces"]["configs"][0]["tail_sampling"]  # THEN the config read from placeholder_cfg_path has a non-empty tail_sampling section