From 2595b5e6cba926421ee71846bf099e1d515c77f5 Mon Sep 17 00:00:00 2001 From: Max Falk Date: Fri, 13 Oct 2023 09:53:28 +0200 Subject: [PATCH] add and emit pool owner metadata for alerting (#327) We'd like to have `pool_owner` metadata on each pool for both informational and alerting purposes. The new attribute on the pool config, `pool_owner`, defaults to `compute_infra`. Signed-off-by: Max Falk --- .../local-dev/default.kubernetes | 1 + .../local-dev/default.mesos | 1 + clusterman/autoscaler/autoscaler.py | 18 +++++++++++------- clusterman/autoscaler/pool_manager.py | 1 + clusterman/simulator/simulated_pool_manager.py | 1 + examples/schemas/pool.json | 3 ++- itests/environment.py | 2 ++ tests/autoscaler/autoscaler_test.py | 18 ++++++++++++++---- tests/conftest.py | 2 ++ 9 files changed, 35 insertions(+), 12 deletions(-) diff --git a/acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes b/acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes index 11859c701..f0a8f9792 100644 --- a/acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes +++ b/acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes @@ -21,3 +21,4 @@ autoscaling: instance_loss_threshold: 3 alert_on_max_capacity: false +pool_owner: compute_infra diff --git a/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos b/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos index 4fd9aec12..6bb9f05a7 100644 --- a/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos +++ b/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos @@ -29,3 +29,4 @@ autoscale_signal: minute_range: 10 alert_on_max_capacity: false +pool_owner: compute_infra diff --git a/clusterman/autoscaler/autoscaler.py b/clusterman/autoscaler/autoscaler.py index 0424c7cba..6eccbe58f 100644 --- a/clusterman/autoscaler/autoscaler.py +++ b/clusterman/autoscaler/autoscaler.py @@ -178,12 +178,9 @@ def run(self, dry_run: bool = False, timestamp: Optional[arrow.Arrow] = None) -> else: capacity_offset = get_capacity_offset(self.cluster, self.pool, self.scheduler, timestamp) new_target_capacity = self._compute_target_capacity(resource_request) + capacity_offset - self.target_capacity_gauge.set(new_target_capacity, {"dry_run": dry_run}) - self.max_capacity_gauge.set( - self.pool_manager.max_capacity, - {"dry_run": dry_run, "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity}, - ) - self.setpoint_gauge.set(self.autoscaling_config.setpoint, {"dry_run": dry_run}) + self.target_capacity_gauge.set(new_target_capacity, self.add_metric_labels(dry_run)) + self.max_capacity_gauge.set(self.pool_manager.max_capacity, self.add_metric_labels(dry_run)) + self.setpoint_gauge.set(self.autoscaling_config.setpoint, self.add_metric_labels(dry_run)) self._emit_requested_resource_metrics(resource_request, dry_run=dry_run) try: @@ -202,7 +199,14 @@ def run(self, dry_run: bool = False, timestamp: Optional[arrow.Arrow] = None) -> def _emit_requested_resource_metrics(self, resource_request: SignalResourceRequest, dry_run: bool) -> None: for resource_type, resource_gauge in self.resource_request_gauges.items(): if getattr(resource_request, resource_type) is not None: - resource_gauge.set(getattr(resource_request, resource_type), {"dry_run": dry_run}) + resource_gauge.set(getattr(resource_request, resource_type), self.add_metric_labels(dry_run)) + + def add_metric_labels(self, dry_run): + return { + "dry_run": dry_run, + "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity, + "team": self.pool_manager.pool_owner, + } def _get_signal_for_app(self, app: str) -> Signal: """Load the signal object to use for autoscaling for a particular app diff --git a/clusterman/autoscaler/pool_manager.py b/clusterman/autoscaler/pool_manager.py index 6ca514dc3..c06226c2c 100644 --- a/clusterman/autoscaler/pool_manager.py +++ b/clusterman/autoscaler/pool_manager.py @@ -86,6 +86,7 @@ def __init__( "autoscaling.killable_nodes_prioritizing_v2", default=False ) self.alert_on_max_capacity = self.pool_config.read_bool("alert_on_max_capacity", default=True) + self.pool_owner = self.pool_config.read_string("pool_owner", default="compute_infra") monitoring_info = {"cluster": cluster, "pool": pool} self.killable_nodes_counter = get_monitoring_client().create_counter(SFX_KILLABLE_NODES_COUNT, monitoring_info) diff --git a/clusterman/simulator/simulated_pool_manager.py b/clusterman/simulator/simulated_pool_manager.py index d796f8387..05446c098 100644 --- a/clusterman/simulator/simulated_pool_manager.py +++ b/clusterman/simulator/simulated_pool_manager.py @@ -59,6 +59,7 @@ def __init__( MAX_MIN_NODE_SCALEIN_UPTIME_SECONDS, ) self.alert_on_max_capacity = self.pool_config.read_bool("alert_on_max_capacity", default=True) + self.pool_owner = self.pool_config.read_string("pool_owner", default="compute_infra") self.killable_nodes_prioritizing_v2 = self.pool_config.read_bool( "autoscaling.killable_nodes_prioritizing_v2", default=False ) diff --git a/examples/schemas/pool.json b/examples/schemas/pool.json index f4fd7fbf5..d8fb9a6c7 100644 --- a/examples/schemas/pool.json +++ b/examples/schemas/pool.json @@ -64,7 +64,8 @@ "additionalProperties": false }, "sensu_config": {"$ref": "definitions.json#sensu_config"}, - "alert_on_max_capacity": {"type": "boolean"} + "alert_on_max_capacity": {"type": "boolean"}, + "pool_owner": {"type": "string"} }, "additionalProperties": false } diff --git a/itests/environment.py b/itests/environment.py index eabe17174..a65546b34 100644 --- a/itests/environment.py +++ b/itests/environment.py @@ -121,6 +121,7 @@ def setup_configurations(context): ], }, "alert_on_max_capacity": True, + "pool_owner": "compute_infra", } kube_pool_config = { "resource_groups": [ @@ -144,6 +145,7 @@ def setup_configurations(context): "period_minutes": 7, }, "alert_on_max_capacity": True, + "pool_owner": "compute_infra", } with staticconf.testing.MockConfiguration( boto_config, namespace=CREDENTIALS_NAMESPACE diff --git a/tests/autoscaler/autoscaler_test.py b/tests/autoscaler/autoscaler_test.py index c4555a152..335c13e92 100644 --- a/tests/autoscaler/autoscaler_test.py +++ b/tests/autoscaler/autoscaler_test.py @@ -49,6 +49,7 @@ def pool_configs(): "max_weight_to_remove": 10, }, "alert_on_max_capacity": True, + "pool_owner": "compute_infra", }, namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"), ): @@ -91,6 +92,10 @@ def mock_autoscaler(): "alert_on_max_capacity", namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"), ) + mock_autoscaler.pool_manager.pool_owner = staticconf.read_string( + "pool_owner", + namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"), + ) mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0 mock_autoscaler.target_capacity_gauge = mock.Mock(spec=GaugeProtocol) @@ -158,17 +163,22 @@ def test_autoscaler_run(dry_run, mock_autoscaler, run_timestamp): ), pytest.raises(ValueError): mock_autoscaler.run(dry_run=dry_run, timestamp=run_timestamp) - assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call(100, {"dry_run": dry_run}) + assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call( + 100, {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"} + ) assert mock_autoscaler.max_capacity_gauge.set.call_args == mock.call( - mock_autoscaler.pool_manager.max_capacity, {"dry_run": dry_run, "alert_on_max_capacity": True} + mock_autoscaler.pool_manager.max_capacity, + {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"}, + ) + assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call( + 0.7, {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"} ) - assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call(0.7, {"dry_run": dry_run}) assert mock_autoscaler._compute_target_capacity.call_args == mock.call(resource_request) assert mock_autoscaler.pool_manager.modify_target_capacity.call_count == 1 assert mock_autoscaler.resource_request_gauges["cpus"].set.call_args == mock.call( resource_request.cpus, - {"dry_run": dry_run}, + {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"}, ) assert mock_autoscaler.resource_request_gauges["mem"].set.call_count == 0 assert mock_autoscaler.resource_request_gauges["disk"].set.call_count == 0 diff --git a/tests/conftest.py b/tests/conftest.py index 55f5606f1..9651b2516 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -145,6 +145,7 @@ def clusterman_pool_config(): ], }, "alert_on_max_capacity": True, + "pool_owner": "compute_infra", } with staticconf.testing.MockConfiguration(config, namespace="bar.mesos_config"): yield @@ -202,6 +203,7 @@ def clusterman_k8s_pool_config(): "disable_autoscaling": False, }, "alert_on_max_capacity": False, + "pool_owner": "foo", } with staticconf.testing.MockConfiguration(config, namespace="bar.kubernetes_config"): yield