From 6934faa0d52dd6491e95e4869011d938713d0d8d Mon Sep 17 00:00:00 2001 From: Max Falk Date: Tue, 27 Jun 2023 18:59:14 +0200 Subject: [PATCH 01/10] add and emit pool owner metadata for alerting Signed-off-by: Max Falk --- .../clusterman-clusters/local-dev/default.kubernetes | 1 + .../srv-configs/clusterman-clusters/local-dev/default.mesos | 1 + clusterman/autoscaler/autoscaler.py | 2 +- clusterman/autoscaler/pool_manager.py | 1 + clusterman/simulator/simulated_pool_manager.py | 1 + examples/schemas/pool.json | 3 ++- itests/environment.py | 2 ++ tests/autoscaler/autoscaler_test.py | 4 +++- tests/conftest.py | 2 ++ 9 files changed, 14 insertions(+), 3 deletions(-) diff --git a/acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes b/acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes index 11859c701..f0a8f9792 100644 --- a/acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes +++ b/acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes @@ -21,3 +21,4 @@ autoscaling: instance_loss_threshold: 3 alert_on_max_capacity: false +pool_owner: compute_infra diff --git a/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos b/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos index 4fd9aec12..48fa3e976 100644 --- a/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos +++ b/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos @@ -29,3 +29,4 @@ autoscale_signal: minute_range: 10 alert_on_max_capacity: false +pool_owner: compute_infra \ No newline at end of file diff --git a/clusterman/autoscaler/autoscaler.py b/clusterman/autoscaler/autoscaler.py index 0424c7cba..3cd619690 100644 --- a/clusterman/autoscaler/autoscaler.py +++ b/clusterman/autoscaler/autoscaler.py @@ -181,7 +181,7 @@ def run(self, dry_run: bool = False, timestamp: Optional[arrow.Arrow] = None) -> self.target_capacity_gauge.set(new_target_capacity, {"dry_run": dry_run}) self.max_capacity_gauge.set( self.pool_manager.max_capacity, - {"dry_run": dry_run, "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity}, + {"dry_run": dry_run, "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity, "team": self.pool_manager.pool_owner}, ) self.setpoint_gauge.set(self.autoscaling_config.setpoint, {"dry_run": dry_run}) self._emit_requested_resource_metrics(resource_request, dry_run=dry_run) diff --git a/clusterman/autoscaler/pool_manager.py b/clusterman/autoscaler/pool_manager.py index 6ca514dc3..c06226c2c 100644 --- a/clusterman/autoscaler/pool_manager.py +++ b/clusterman/autoscaler/pool_manager.py @@ -86,6 +86,7 @@ def __init__( "autoscaling.killable_nodes_prioritizing_v2", default=False ) self.alert_on_max_capacity = self.pool_config.read_bool("alert_on_max_capacity", default=True) + self.pool_owner = self.pool_config.read_string("pool_owner", default="compute_infra") monitoring_info = {"cluster": cluster, "pool": pool} self.killable_nodes_counter = get_monitoring_client().create_counter(SFX_KILLABLE_NODES_COUNT, monitoring_info) diff --git a/clusterman/simulator/simulated_pool_manager.py b/clusterman/simulator/simulated_pool_manager.py index d796f8387..05446c098 100644 --- a/clusterman/simulator/simulated_pool_manager.py +++ b/clusterman/simulator/simulated_pool_manager.py @@ -59,6 +59,7 @@ def __init__( MAX_MIN_NODE_SCALEIN_UPTIME_SECONDS, ) self.alert_on_max_capacity = self.pool_config.read_bool("alert_on_max_capacity", default=True) + self.pool_owner = self.pool_config.read_string("pool_owner", default="compute_infra") self.killable_nodes_prioritizing_v2 = self.pool_config.read_bool( "autoscaling.killable_nodes_prioritizing_v2", default=False ) diff --git a/examples/schemas/pool.json b/examples/schemas/pool.json index f4fd7fbf5..d8fb9a6c7 100644 --- a/examples/schemas/pool.json +++ b/examples/schemas/pool.json @@ -64,7 +64,8 @@ "additionalProperties": false }, "sensu_config": {"$ref": "definitions.json#sensu_config"}, - "alert_on_max_capacity": {"type": "boolean"} + "alert_on_max_capacity": {"type": "boolean"}, + "pool_owner": {"type": "string"} }, "additionalProperties": false } diff --git a/itests/environment.py b/itests/environment.py index eabe17174..a65546b34 100644 --- a/itests/environment.py +++ b/itests/environment.py @@ -121,6 +121,7 @@ def setup_configurations(context): ], }, "alert_on_max_capacity": True, + "pool_owner": "compute_infra", } kube_pool_config = { "resource_groups": [ @@ -144,6 +145,7 @@ def setup_configurations(context): "period_minutes": 7, }, "alert_on_max_capacity": True, + "pool_owner": "compute_infra", } with staticconf.testing.MockConfiguration( boto_config, namespace=CREDENTIALS_NAMESPACE diff --git a/tests/autoscaler/autoscaler_test.py b/tests/autoscaler/autoscaler_test.py index c4555a152..9f1b1f499 100644 --- a/tests/autoscaler/autoscaler_test.py +++ b/tests/autoscaler/autoscaler_test.py @@ -49,6 +49,7 @@ def pool_configs(): "max_weight_to_remove": 10, }, "alert_on_max_capacity": True, + "pool_owner": "compute_infra", }, namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"), ): @@ -91,6 +92,7 @@ def mock_autoscaler(): "alert_on_max_capacity", namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"), ) + mock_autoscaler.pool_manager.pool_owner = "compute_infra" mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0 mock_autoscaler.target_capacity_gauge = mock.Mock(spec=GaugeProtocol) @@ -160,7 +162,7 @@ def test_autoscaler_run(dry_run, mock_autoscaler, run_timestamp): assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call(100, {"dry_run": dry_run}) assert mock_autoscaler.max_capacity_gauge.set.call_args == mock.call( - mock_autoscaler.pool_manager.max_capacity, {"dry_run": dry_run, "alert_on_max_capacity": True} + mock_autoscaler.pool_manager.max_capacity, {"dry_run": dry_run, "alert_on_max_capacity": True, "pool_owner": "compute_infra"} ) assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call(0.7, {"dry_run": dry_run}) assert mock_autoscaler._compute_target_capacity.call_args == mock.call(resource_request) diff --git a/tests/conftest.py b/tests/conftest.py index 55f5606f1..9651b2516 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -145,6 +145,7 @@ def clusterman_pool_config(): ], }, "alert_on_max_capacity": True, + "pool_owner": "compute_infra", } with staticconf.testing.MockConfiguration(config, namespace="bar.mesos_config"): yield @@ -202,6 +203,7 @@ def clusterman_k8s_pool_config(): "disable_autoscaling": False, }, "alert_on_max_capacity": False, + "pool_owner": "foo", } with staticconf.testing.MockConfiguration(config, namespace="bar.kubernetes_config"): yield From d5a6c23a71df3c7262ef1f494d50eec80e69fb5f Mon Sep 17 00:00:00 2001 From: Max Falk Date: Thu, 29 Jun 2023 10:29:05 +0200 Subject: [PATCH 02/10] newline --- .../srv-configs/clusterman-clusters/local-dev/default.mesos | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos b/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos index 48fa3e976..6bb9f05a7 100644 --- a/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos +++ b/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos @@ -29,4 +29,4 @@ autoscale_signal: minute_range: 10 alert_on_max_capacity: false -pool_owner: compute_infra \ No newline at end of file +pool_owner: compute_infra From f32e3879d74880a0a47d23846dfb865dd461ace7 Mon Sep 17 00:00:00 2001 From: Max Falk Date: Thu, 29 Jun 2023 04:51:11 -0700 Subject: [PATCH 03/10] tests failing locally Signed-off-by: Max Falk --- tests/autoscaler/autoscaler_test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/autoscaler/autoscaler_test.py b/tests/autoscaler/autoscaler_test.py index 9f1b1f499..582323e42 100644 --- a/tests/autoscaler/autoscaler_test.py +++ b/tests/autoscaler/autoscaler_test.py @@ -92,7 +92,10 @@ def mock_autoscaler(): "alert_on_max_capacity", namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"), ) - mock_autoscaler.pool_manager.pool_owner = "compute_infra" + mock_autoscaler.pool_manager.pool_owner = staticconf.read_string( + "pool_owner", + namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"), + ) mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0 mock_autoscaler.target_capacity_gauge = mock.Mock(spec=GaugeProtocol) From 327c8faeb4e9fac57a1568c8ccdb60db122e569f Mon Sep 17 00:00:00 2001 From: Max Falk Date: Wed, 5 Jul 2023 00:37:01 -0700 Subject: [PATCH 04/10] fix formatting Signed-off-by: Max Falk --- clusterman/autoscaler/autoscaler.py | 6 +++++- tests/autoscaler/autoscaler_test.py | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/clusterman/autoscaler/autoscaler.py b/clusterman/autoscaler/autoscaler.py index 3cd619690..35965ef06 100644 --- a/clusterman/autoscaler/autoscaler.py +++ b/clusterman/autoscaler/autoscaler.py @@ -181,7 +181,11 @@ def run(self, dry_run: bool = False, timestamp: Optional[arrow.Arrow] = None) -> self.target_capacity_gauge.set(new_target_capacity, {"dry_run": dry_run}) self.max_capacity_gauge.set( self.pool_manager.max_capacity, - {"dry_run": dry_run, "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity, "team": self.pool_manager.pool_owner}, + { + "dry_run": dry_run, + "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity, + "team": self.pool_manager.pool_owner, + }, ) self.setpoint_gauge.set(self.autoscaling_config.setpoint, {"dry_run": dry_run}) self._emit_requested_resource_metrics(resource_request, dry_run=dry_run) diff --git a/tests/autoscaler/autoscaler_test.py b/tests/autoscaler/autoscaler_test.py index 582323e42..4c01ed48c 100644 --- a/tests/autoscaler/autoscaler_test.py +++ b/tests/autoscaler/autoscaler_test.py @@ -165,7 +165,8 @@ def test_autoscaler_run(dry_run, mock_autoscaler, run_timestamp): assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call(100, {"dry_run": dry_run}) assert mock_autoscaler.max_capacity_gauge.set.call_args == mock.call( - mock_autoscaler.pool_manager.max_capacity, {"dry_run": dry_run, "alert_on_max_capacity": True, "pool_owner": "compute_infra"} + mock_autoscaler.pool_manager.max_capacity, + {"dry_run": dry_run, "alert_on_max_capacity": True, "pool_owner": "compute_infra"}, ) assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call(0.7, {"dry_run": dry_run}) assert mock_autoscaler._compute_target_capacity.call_args == mock.call(resource_request) From 553ee0d23e36a51848c81d3f783455127dc32616 Mon Sep 17 00:00:00 2001 From: Max Falk Date: Mon, 21 Aug 2023 09:34:01 +0200 Subject: [PATCH 05/10] add team label to all autoscaler metrics Signed-off-by: Max Falk --- clusterman/autoscaler/autoscaler.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/clusterman/autoscaler/autoscaler.py b/clusterman/autoscaler/autoscaler.py index 35965ef06..6eccbe58f 100644 --- a/clusterman/autoscaler/autoscaler.py +++ b/clusterman/autoscaler/autoscaler.py @@ -178,16 +178,9 @@ def run(self, dry_run: bool = False, timestamp: Optional[arrow.Arrow] = None) -> else: capacity_offset = get_capacity_offset(self.cluster, self.pool, self.scheduler, timestamp) new_target_capacity = self._compute_target_capacity(resource_request) + capacity_offset - self.target_capacity_gauge.set(new_target_capacity, {"dry_run": dry_run}) - self.max_capacity_gauge.set( - self.pool_manager.max_capacity, - { - "dry_run": dry_run, - "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity, - "team": self.pool_manager.pool_owner, - }, - ) - self.setpoint_gauge.set(self.autoscaling_config.setpoint, {"dry_run": dry_run}) + self.target_capacity_gauge.set(new_target_capacity, self.add_metric_labels(dry_run)) + self.max_capacity_gauge.set(self.pool_manager.max_capacity, self.add_metric_labels(dry_run)) + self.setpoint_gauge.set(self.autoscaling_config.setpoint, self.add_metric_labels(dry_run)) self._emit_requested_resource_metrics(resource_request, dry_run=dry_run) try: @@ -206,7 +199,14 @@ def run(self, dry_run: bool = False, timestamp: Optional[arrow.Arrow] = None) -> def _emit_requested_resource_metrics(self, resource_request: SignalResourceRequest, dry_run: bool) -> None: for resource_type, resource_gauge in self.resource_request_gauges.items(): if getattr(resource_request, resource_type) is not None: - resource_gauge.set(getattr(resource_request, resource_type), {"dry_run": dry_run}) + resource_gauge.set(getattr(resource_request, resource_type), self.add_metric_labels(dry_run)) + + def add_metric_labels(self, dry_run): + return { + "dry_run": dry_run, + "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity, + "team": self.pool_manager.pool_owner, + } def _get_signal_for_app(self, app: str) -> Signal: """Load the signal object to use for autoscaling for a particular app From 53fcf7c36384f1d791a0a76235723c3ec52f6b26 Mon Sep 17 00:00:00 2001 From: Max Falk Date: Mon, 21 Aug 2023 10:03:33 +0200 Subject: [PATCH 06/10] update tests Signed-off-by: Max Falk --- tests/autoscaler/autoscaler_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/autoscaler/autoscaler_test.py b/tests/autoscaler/autoscaler_test.py index 4c01ed48c..37f477f4d 100644 --- a/tests/autoscaler/autoscaler_test.py +++ b/tests/autoscaler/autoscaler_test.py @@ -163,18 +163,20 @@ def test_autoscaler_run(dry_run, mock_autoscaler, run_timestamp): ), pytest.raises(ValueError): mock_autoscaler.run(dry_run=dry_run, timestamp=run_timestamp) - assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call(100, {"dry_run": dry_run}) + assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call(100, {"dry_run": dry_run, + "alert_on_max_capacity": True, "pool_owner": "compute_infra"}) assert mock_autoscaler.max_capacity_gauge.set.call_args == mock.call( mock_autoscaler.pool_manager.max_capacity, {"dry_run": dry_run, "alert_on_max_capacity": True, "pool_owner": "compute_infra"}, ) - assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call(0.7, {"dry_run": dry_run}) + assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call(0.7, {"dry_run": dry_run, + "alert_on_max_capacity": True, "pool_owner": "compute_infra"}) assert mock_autoscaler._compute_target_capacity.call_args == mock.call(resource_request) assert mock_autoscaler.pool_manager.modify_target_capacity.call_count == 1 assert mock_autoscaler.resource_request_gauges["cpus"].set.call_args == mock.call( resource_request.cpus, - {"dry_run": dry_run}, + {"dry_run": dry_run, "alert_on_max_capacity": True, "pool_owner": "compute_infra"}, ) assert mock_autoscaler.resource_request_gauges["mem"].set.call_count == 0 assert mock_autoscaler.resource_request_gauges["disk"].set.call_count == 0 From b912f4ba98c3b448343e619b5826efad0271f8c5 Mon Sep 17 00:00:00 2001 From: Max Falk Date: Mon, 21 Aug 2023 01:23:10 -0700 Subject: [PATCH 07/10] formatting Signed-off-by: Max Falk --- tests/autoscaler/autoscaler_test.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/autoscaler/autoscaler_test.py b/tests/autoscaler/autoscaler_test.py index 37f477f4d..a7c85fc9e 100644 --- a/tests/autoscaler/autoscaler_test.py +++ b/tests/autoscaler/autoscaler_test.py @@ -163,14 +163,16 @@ def test_autoscaler_run(dry_run, mock_autoscaler, run_timestamp): ), pytest.raises(ValueError): mock_autoscaler.run(dry_run=dry_run, timestamp=run_timestamp) - assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call(100, {"dry_run": dry_run, - "alert_on_max_capacity": True, "pool_owner": "compute_infra"}) + assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call( + 100, {"dry_run": dry_run, "alert_on_max_capacity": True, "pool_owner": "compute_infra"} + ) assert mock_autoscaler.max_capacity_gauge.set.call_args == mock.call( mock_autoscaler.pool_manager.max_capacity, {"dry_run": dry_run, "alert_on_max_capacity": True, "pool_owner": "compute_infra"}, ) - assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call(0.7, {"dry_run": dry_run, - "alert_on_max_capacity": True, "pool_owner": "compute_infra"}) + assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call( + 0.7, {"dry_run": dry_run, "alert_on_max_capacity": True, "pool_owner": "compute_infra"} + ) assert mock_autoscaler._compute_target_capacity.call_args == mock.call(resource_request) assert mock_autoscaler.pool_manager.modify_target_capacity.call_count == 1 From 69d93083314b91099fcba21fd1f3123f31faad01 Mon Sep 17 00:00:00 2001 From: Max Falk Date: Wed, 11 Oct 2023 08:20:34 -0700 Subject: [PATCH 08/10] dummy commit Signed-off-by: Max Falk --- .../srv-configs/clusterman-clusters/local-dev/default.mesos | 1 + 1 file changed, 1 insertion(+) diff --git a/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos b/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos index 6bb9f05a7..044a4e3f1 100644 --- a/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos +++ b/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos @@ -30,3 +30,4 @@ autoscale_signal: alert_on_max_capacity: false pool_owner: compute_infra + From ffc4439c0fa49f024c830a49d14adb1d162469ea Mon Sep 17 00:00:00 2001 From: Max Falk Date: Wed, 11 Oct 2023 08:30:38 -0700 Subject: [PATCH 09/10] precommit Signed-off-by: Max Falk --- .../srv-configs/clusterman-clusters/local-dev/default.mesos | 1 - 1 file changed, 1 deletion(-) diff --git a/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos b/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos index 044a4e3f1..6bb9f05a7 100644 --- a/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos +++ b/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos @@ -30,4 +30,3 @@ autoscale_signal: alert_on_max_capacity: false pool_owner: compute_infra - From 987f308b744560b741de643683f9f21f9b2b9453 Mon Sep 17 00:00:00 2001 From: Max Falk Date: Thu, 12 Oct 2023 01:10:39 -0700 Subject: [PATCH 10/10] use team instead of pool_owner for emitted metrics Signed-off-by: Max Falk --- tests/autoscaler/autoscaler_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/autoscaler/autoscaler_test.py b/tests/autoscaler/autoscaler_test.py index a7c85fc9e..335c13e92 100644 --- a/tests/autoscaler/autoscaler_test.py +++ b/tests/autoscaler/autoscaler_test.py @@ -164,21 +164,21 @@ def test_autoscaler_run(dry_run, mock_autoscaler, run_timestamp): mock_autoscaler.run(dry_run=dry_run, timestamp=run_timestamp) assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call( - 100, {"dry_run": dry_run, "alert_on_max_capacity": True, "pool_owner": "compute_infra"} + 100, {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"} ) assert mock_autoscaler.max_capacity_gauge.set.call_args == mock.call( mock_autoscaler.pool_manager.max_capacity, - {"dry_run": dry_run, "alert_on_max_capacity": True, "pool_owner": "compute_infra"}, + {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"}, ) assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call( - 0.7, {"dry_run": dry_run, "alert_on_max_capacity": True, "pool_owner": "compute_infra"} + 0.7, {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"} ) assert mock_autoscaler._compute_target_capacity.call_args == mock.call(resource_request) assert mock_autoscaler.pool_manager.modify_target_capacity.call_count == 1 assert mock_autoscaler.resource_request_gauges["cpus"].set.call_args == mock.call( resource_request.cpus, - {"dry_run": dry_run, "alert_on_max_capacity": True, "pool_owner": "compute_infra"}, + {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"}, ) assert mock_autoscaler.resource_request_gauges["mem"].set.call_count == 0 assert mock_autoscaler.resource_request_gauges["disk"].set.call_count == 0