
Commit

Permalink
add and emit pool owner metadata for alerting (#327)
We'd like to have `pool_owner` metadata on each pool for both informational and alerting purposes.
The new attribute on the pool config, `pool_owner`, defaults to `compute_infra`.
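
For reference, a pool config that sets the new attribute could look like the sketch below (a minimal example; the keys mirror the example configs touched in this commit, and `my_team` is a hypothetical owner value). The autoscaler emits the configured value as the `team` label on its metrics.

    # hypothetical pool config snippet (YAML)
    alert_on_max_capacity: false
    pool_owner: my_team  # defaults to compute_infra when not set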

Signed-off-by: Max Falk <[email protected]>
gmdfalk authored Oct 13, 2023
1 parent 6c4b8bb commit 2595b5e
Showing 9 changed files with 35 additions and 12 deletions.
@@ -21,3 +21,4 @@ autoscaling:
   instance_loss_threshold: 3
 
 alert_on_max_capacity: false
+pool_owner: compute_infra
@@ -29,3 +29,4 @@ autoscale_signal:
   minute_range: 10
 
 alert_on_max_capacity: false
+pool_owner: compute_infra
18 changes: 11 additions & 7 deletions clusterman/autoscaler/autoscaler.py
@@ -178,12 +178,9 @@ def run(self, dry_run: bool = False, timestamp: Optional[arrow.Arrow] = None) ->
         else:
             capacity_offset = get_capacity_offset(self.cluster, self.pool, self.scheduler, timestamp)
             new_target_capacity = self._compute_target_capacity(resource_request) + capacity_offset
-        self.target_capacity_gauge.set(new_target_capacity, {"dry_run": dry_run})
-        self.max_capacity_gauge.set(
-            self.pool_manager.max_capacity,
-            {"dry_run": dry_run, "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity},
-        )
-        self.setpoint_gauge.set(self.autoscaling_config.setpoint, {"dry_run": dry_run})
+        self.target_capacity_gauge.set(new_target_capacity, self.add_metric_labels(dry_run))
+        self.max_capacity_gauge.set(self.pool_manager.max_capacity, self.add_metric_labels(dry_run))
+        self.setpoint_gauge.set(self.autoscaling_config.setpoint, self.add_metric_labels(dry_run))
         self._emit_requested_resource_metrics(resource_request, dry_run=dry_run)
 
         try:
@@ -202,7 +199,14 @@ def run(self, dry_run: bool = False, timestamp: Optional[arrow.Arrow] = None) ->
     def _emit_requested_resource_metrics(self, resource_request: SignalResourceRequest, dry_run: bool) -> None:
         for resource_type, resource_gauge in self.resource_request_gauges.items():
             if getattr(resource_request, resource_type) is not None:
-                resource_gauge.set(getattr(resource_request, resource_type), {"dry_run": dry_run})
+                resource_gauge.set(getattr(resource_request, resource_type), self.add_metric_labels(dry_run))
 
+    def add_metric_labels(self, dry_run):
+        return {
+            "dry_run": dry_run,
+            "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity,
+            "team": self.pool_manager.pool_owner,
+        }
+
     def _get_signal_for_app(self, app: str) -> Signal:
         """Load the signal object to use for autoscaling for a particular app
1 change: 1 addition & 0 deletions clusterman/autoscaler/pool_manager.py
@@ -86,6 +86,7 @@ def __init__(
             "autoscaling.killable_nodes_prioritizing_v2", default=False
         )
         self.alert_on_max_capacity = self.pool_config.read_bool("alert_on_max_capacity", default=True)
+        self.pool_owner = self.pool_config.read_string("pool_owner", default="compute_infra")
         monitoring_info = {"cluster": cluster, "pool": pool}
         self.killable_nodes_counter = get_monitoring_client().create_counter(SFX_KILLABLE_NODES_COUNT, monitoring_info)
 
1 change: 1 addition & 0 deletions clusterman/simulator/simulated_pool_manager.py
@@ -59,6 +59,7 @@ def __init__(
             MAX_MIN_NODE_SCALEIN_UPTIME_SECONDS,
         )
         self.alert_on_max_capacity = self.pool_config.read_bool("alert_on_max_capacity", default=True)
+        self.pool_owner = self.pool_config.read_string("pool_owner", default="compute_infra")
         self.killable_nodes_prioritizing_v2 = self.pool_config.read_bool(
             "autoscaling.killable_nodes_prioritizing_v2", default=False
         )
3 changes: 2 additions & 1 deletion examples/schemas/pool.json
@@ -64,7 +64,8 @@
            "additionalProperties": false
        },
        "sensu_config": {"$ref": "definitions.json#sensu_config"},
-       "alert_on_max_capacity": {"type": "boolean"}
+       "alert_on_max_capacity": {"type": "boolean"},
+       "pool_owner": {"type": "string"}
     },
     "additionalProperties": false
 }
2 changes: 2 additions & 0 deletions itests/environment.py
@@ -121,6 +121,7 @@ def setup_configurations(context):
             ],
         },
         "alert_on_max_capacity": True,
+        "pool_owner": "compute_infra",
     }
     kube_pool_config = {
         "resource_groups": [
@@ -144,6 +145,7 @@ def setup_configurations(context):
             "period_minutes": 7,
         },
         "alert_on_max_capacity": True,
+        "pool_owner": "compute_infra",
     }
     with staticconf.testing.MockConfiguration(
         boto_config, namespace=CREDENTIALS_NAMESPACE
18 changes: 14 additions & 4 deletions tests/autoscaler/autoscaler_test.py
@@ -49,6 +49,7 @@ def pool_configs():
                 "max_weight_to_remove": 10,
             },
             "alert_on_max_capacity": True,
+            "pool_owner": "compute_infra",
         },
         namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"),
     ):
@@ -91,6 +92,10 @@ def mock_autoscaler():
         "alert_on_max_capacity",
         namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"),
     )
+    mock_autoscaler.pool_manager.pool_owner = staticconf.read_string(
+        "pool_owner",
+        namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"),
+    )
     mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0
 
     mock_autoscaler.target_capacity_gauge = mock.Mock(spec=GaugeProtocol)
@@ -158,17 +163,22 @@ def test_autoscaler_run(dry_run, mock_autoscaler, run_timestamp):
     ), pytest.raises(ValueError):
         mock_autoscaler.run(dry_run=dry_run, timestamp=run_timestamp)
 
-    assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call(100, {"dry_run": dry_run})
+    assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call(
+        100, {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"}
+    )
     assert mock_autoscaler.max_capacity_gauge.set.call_args == mock.call(
-        mock_autoscaler.pool_manager.max_capacity, {"dry_run": dry_run, "alert_on_max_capacity": True}
+        mock_autoscaler.pool_manager.max_capacity,
+        {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"},
     )
-    assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call(0.7, {"dry_run": dry_run})
+    assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call(
+        0.7, {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"}
+    )
     assert mock_autoscaler._compute_target_capacity.call_args == mock.call(resource_request)
     assert mock_autoscaler.pool_manager.modify_target_capacity.call_count == 1
 
     assert mock_autoscaler.resource_request_gauges["cpus"].set.call_args == mock.call(
         resource_request.cpus,
-        {"dry_run": dry_run},
+        {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"},
     )
     assert mock_autoscaler.resource_request_gauges["mem"].set.call_count == 0
     assert mock_autoscaler.resource_request_gauges["disk"].set.call_count == 0
2 changes: 2 additions & 0 deletions tests/conftest.py
@@ -145,6 +145,7 @@ def clusterman_pool_config():
             ],
         },
         "alert_on_max_capacity": True,
+        "pool_owner": "compute_infra",
     }
     with staticconf.testing.MockConfiguration(config, namespace="bar.mesos_config"):
         yield
@@ -202,6 +203,7 @@ def clusterman_k8s_pool_config():
             "disable_autoscaling": False,
         },
         "alert_on_max_capacity": False,
+        "pool_owner": "foo",
     }
     with staticconf.testing.MockConfiguration(config, namespace="bar.kubernetes_config"):
         yield
