
Commit

Permalink
add and emit pool owner metadata for alerting (#327)
We'd like to have `pool_owner` metadata on each pool for both informational and alerting purposes.
The new attribute on the pool config, `pool_owner`, defaults to `compute_infra`.
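
For reference, a pool config that sets the new attribute could look like the sketch below (a minimal example; the keys mirror the example configs touched in this commit, and `my_team` is a hypothetical owner value). The autoscaler emits the configured value as the `team` label on its metrics.

    # hypothetical pool config snippet (YAML)
    alert_on_max_capacity: false
    pool_owner: my_team  # defaults to compute_infra when not set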

Signed-off-by: Max Falk <[email protected]>
gmdfalk authored Oct 13, 2023
1 parent 6c4b8bb commit 2595b5e
Showing 9 changed files with 35 additions and 12 deletions.
@@ -21,3 +21,4 @@ autoscaling:
   instance_loss_threshold: 3
 
 alert_on_max_capacity: false
+pool_owner: compute_infra
@@ -29,3 +29,4 @@ autoscale_signal:
   minute_range: 10
 
 alert_on_max_capacity: false
+pool_owner: compute_infra
18 changes: 11 additions & 7 deletions clusterman/autoscaler/autoscaler.py
@@ -178,12 +178,9 @@ def run(self, dry_run: bool = False, timestamp: Optional[arrow.Arrow] = None) ->
         else:
             capacity_offset = get_capacity_offset(self.cluster, self.pool, self.scheduler, timestamp)
             new_target_capacity = self._compute_target_capacity(resource_request) + capacity_offset
-        self.target_capacity_gauge.set(new_target_capacity, {"dry_run": dry_run})
-        self.max_capacity_gauge.set(
-            self.pool_manager.max_capacity,
-            {"dry_run": dry_run, "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity},
-        )
-        self.setpoint_gauge.set(self.autoscaling_config.setpoint, {"dry_run": dry_run})
+        self.target_capacity_gauge.set(new_target_capacity, self.add_metric_labels(dry_run))
+        self.max_capacity_gauge.set(self.pool_manager.max_capacity, self.add_metric_labels(dry_run))
+        self.setpoint_gauge.set(self.autoscaling_config.setpoint, self.add_metric_labels(dry_run))
         self._emit_requested_resource_metrics(resource_request, dry_run=dry_run)
 
         try:
@@ -202,7 +199,14 @@ def run(self, dry_run: bool = False, timestamp: Optional[arrow.Arrow] = None) ->
     def _emit_requested_resource_metrics(self, resource_request: SignalResourceRequest, dry_run: bool) -> None:
         for resource_type, resource_gauge in self.resource_request_gauges.items():
             if getattr(resource_request, resource_type) is not None:
-                resource_gauge.set(getattr(resource_request, resource_type), {"dry_run": dry_run})
+                resource_gauge.set(getattr(resource_request, resource_type), self.add_metric_labels(dry_run))
 
+    def add_metric_labels(self, dry_run):
+        return {
+            "dry_run": dry_run,
+            "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity,
+            "team": self.pool_manager.pool_owner,
+        }
+
     def _get_signal_for_app(self, app: str) -> Signal:
         """Load the signal object to use for autoscaling for a particular app
1 change: 1 addition & 0 deletions clusterman/autoscaler/pool_manager.py
@@ -86,6 +86,7 @@ def __init__(
             "autoscaling.killable_nodes_prioritizing_v2", default=False
         )
         self.alert_on_max_capacity = self.pool_config.read_bool("alert_on_max_capacity", default=True)
+        self.pool_owner = self.pool_config.read_string("pool_owner", default="compute_infra")
         monitoring_info = {"cluster": cluster, "pool": pool}
         self.killable_nodes_counter = get_monitoring_client().create_counter(SFX_KILLABLE_NODES_COUNT, monitoring_info)
 
1 change: 1 addition & 0 deletions clusterman/simulator/simulated_pool_manager.py
@@ -59,6 +59,7 @@ def __init__(
             MAX_MIN_NODE_SCALEIN_UPTIME_SECONDS,
         )
         self.alert_on_max_capacity = self.pool_config.read_bool("alert_on_max_capacity", default=True)
+        self.pool_owner = self.pool_config.read_string("pool_owner", default="compute_infra")
         self.killable_nodes_prioritizing_v2 = self.pool_config.read_bool(
             "autoscaling.killable_nodes_prioritizing_v2", default=False
         )
3 changes: 2 additions & 1 deletion examples/schemas/pool.json
@@ -64,7 +64,8 @@
            "additionalProperties": false
        },
        "sensu_config": {"$ref": "definitions.json#sensu_config"},
-       "alert_on_max_capacity": {"type": "boolean"}
+       "alert_on_max_capacity": {"type": "boolean"},
+       "pool_owner": {"type": "string"}
     },
     "additionalProperties": false
 }
2 changes: 2 additions & 0 deletions itests/environment.py
@@ -121,6 +121,7 @@ def setup_configurations(context):
             ],
         },
         "alert_on_max_capacity": True,
+        "pool_owner": "compute_infra",
     }
     kube_pool_config = {
         "resource_groups": [
@@ -144,6 +145,7 @@ def setup_configurations(context):
             "period_minutes": 7,
         },
         "alert_on_max_capacity": True,
+        "pool_owner": "compute_infra",
     }
     with staticconf.testing.MockConfiguration(
         boto_config, namespace=CREDENTIALS_NAMESPACE
18 changes: 14 additions & 4 deletions tests/autoscaler/autoscaler_test.py
@@ -49,6 +49,7 @@ def pool_configs():
                 "max_weight_to_remove": 10,
             },
             "alert_on_max_capacity": True,
+            "pool_owner": "compute_infra",
         },
         namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"),
     ):
@@ -91,6 +92,10 @@ def mock_autoscaler():
         "alert_on_max_capacity",
         namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"),
     )
+    mock_autoscaler.pool_manager.pool_owner = staticconf.read_string(
+        "pool_owner",
+        namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"),
+    )
     mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0
 
     mock_autoscaler.target_capacity_gauge = mock.Mock(spec=GaugeProtocol)
@@ -158,17 +163,22 @@ def test_autoscaler_run(dry_run, mock_autoscaler, run_timestamp):
     ), pytest.raises(ValueError):
         mock_autoscaler.run(dry_run=dry_run, timestamp=run_timestamp)
 
-    assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call(100, {"dry_run": dry_run})
+    assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call(
+        100, {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"}
+    )
     assert mock_autoscaler.max_capacity_gauge.set.call_args == mock.call(
-        mock_autoscaler.pool_manager.max_capacity, {"dry_run": dry_run, "alert_on_max_capacity": True}
+        mock_autoscaler.pool_manager.max_capacity,
+        {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"},
     )
-    assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call(0.7, {"dry_run": dry_run})
+    assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call(
+        0.7, {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"}
+    )
     assert mock_autoscaler._compute_target_capacity.call_args == mock.call(resource_request)
     assert mock_autoscaler.pool_manager.modify_target_capacity.call_count == 1
 
     assert mock_autoscaler.resource_request_gauges["cpus"].set.call_args == mock.call(
         resource_request.cpus,
-        {"dry_run": dry_run},
+        {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"},
     )
     assert mock_autoscaler.resource_request_gauges["mem"].set.call_count == 0
     assert mock_autoscaler.resource_request_gauges["disk"].set.call_count == 0
2 changes: 2 additions & 0 deletions tests/conftest.py
@@ -145,6 +145,7 @@ def clusterman_pool_config():
             ],
         },
         "alert_on_max_capacity": True,
+        "pool_owner": "compute_infra",
     }
     with staticconf.testing.MockConfiguration(config, namespace="bar.mesos_config"):
         yield
@@ -202,6 +203,7 @@ def clusterman_k8s_pool_config():
             "disable_autoscaling": False,
         },
         "alert_on_max_capacity": False,
+        "pool_owner": "foo",
     }
     with staticconf.testing.MockConfiguration(config, namespace="bar.kubernetes_config"):
         yield
