From 499a26e03a3a0d2ffba6dbc7df92770532c1d4d2 Mon Sep 17 00:00:00 2001
From: Luis Perez
Date: Wed, 18 Oct 2023 11:23:48 -0700
Subject: [PATCH 1/2] Revert "add and emit pool owner metadata for alerting (#327)"

This reverts commit 2595b5e6cba926421ee71846bf099e1d515c77f5.
---
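Notes: this revert removes the pool_owner pool-config key (and its entry in
the examples/schemas/pool.json schema) together with the add_metric_labels()
helper, so the autoscaler goes back to building each gauge's label dict
inline and no metric carries a "team" label anymore. A minimal sketch of the
resulting gauge calls, mirroring the autoscaler.py hunk below (the gauge
objects are assumed to satisfy GaugeProtocol, as in the tests):

    # Only the max-capacity gauge keeps the alert_on_max_capacity label;
    # the "team" label sourced from pool_owner is gone everywhere.
    self.target_capacity_gauge.set(new_target_capacity, {"dry_run": dry_run})
    self.max_capacity_gauge.set(
        self.pool_manager.max_capacity,
        {"dry_run": dry_run, "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity},
    )
    self.setpoint_gauge.set(self.autoscaling_config.setpoint, {"dry_run": dry_run})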
 .../local-dev/default.kubernetes               |  1 -
 .../local-dev/default.mesos                    |  1 -
 clusterman/autoscaler/autoscaler.py            | 18 +++++++-----------
 clusterman/autoscaler/pool_manager.py          |  1 -
 clusterman/simulator/simulated_pool_manager.py |  1 -
 examples/schemas/pool.json                     |  3 +--
 itests/environment.py                          |  2 --
 tests/autoscaler/autoscaler_test.py            | 18 ++++--------------
 tests/conftest.py                              |  2 --
 9 files changed, 12 insertions(+), 35 deletions(-)

diff --git a/acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes b/acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes
index f0a8f9792..11859c701 100644
--- a/acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes
+++ b/acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes
@@ -21,4 +21,3 @@ autoscaling:
   instance_loss_threshold: 3
 
 alert_on_max_capacity: false
-pool_owner: compute_infra
diff --git a/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos b/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos
index 6bb9f05a7..4fd9aec12 100644
--- a/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos
+++ b/acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos
@@ -29,4 +29,3 @@ autoscale_signal:
     minute_range: 10
 
 alert_on_max_capacity: false
-pool_owner: compute_infra
diff --git a/clusterman/autoscaler/autoscaler.py b/clusterman/autoscaler/autoscaler.py
index 6eccbe58f..0424c7cba 100644
--- a/clusterman/autoscaler/autoscaler.py
+++ b/clusterman/autoscaler/autoscaler.py
@@ -178,9 +178,12 @@ def run(self, dry_run: bool = False, timestamp: Optional[arrow.Arrow] = None) ->
         else:
             capacity_offset = get_capacity_offset(self.cluster, self.pool, self.scheduler, timestamp)
             new_target_capacity = self._compute_target_capacity(resource_request) + capacity_offset
-        self.target_capacity_gauge.set(new_target_capacity, self.add_metric_labels(dry_run))
-        self.max_capacity_gauge.set(self.pool_manager.max_capacity, self.add_metric_labels(dry_run))
-        self.setpoint_gauge.set(self.autoscaling_config.setpoint, self.add_metric_labels(dry_run))
+        self.target_capacity_gauge.set(new_target_capacity, {"dry_run": dry_run})
+        self.max_capacity_gauge.set(
+            self.pool_manager.max_capacity,
+            {"dry_run": dry_run, "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity},
+        )
+        self.setpoint_gauge.set(self.autoscaling_config.setpoint, {"dry_run": dry_run})
         self._emit_requested_resource_metrics(resource_request, dry_run=dry_run)
 
         try:
@@ -199,14 +202,7 @@ def run(self, dry_run: bool = False, timestamp: Optional[arrow.Arrow] = None) ->
     def _emit_requested_resource_metrics(self, resource_request: SignalResourceRequest, dry_run: bool) -> None:
         for resource_type, resource_gauge in self.resource_request_gauges.items():
             if getattr(resource_request, resource_type) is not None:
-                resource_gauge.set(getattr(resource_request, resource_type), self.add_metric_labels(dry_run))
-
-    def add_metric_labels(self, dry_run):
-        return {
-            "dry_run": dry_run,
-            "alert_on_max_capacity": self.pool_manager.alert_on_max_capacity,
-            "team": self.pool_manager.pool_owner,
-        }
+                resource_gauge.set(getattr(resource_request, resource_type), {"dry_run": dry_run})
 
     def _get_signal_for_app(self, app: str) -> Signal:
         """Load the signal object to use for autoscaling for a particular app
diff --git a/clusterman/autoscaler/pool_manager.py b/clusterman/autoscaler/pool_manager.py
index c06226c2c..6ca514dc3 100644
--- a/clusterman/autoscaler/pool_manager.py
+++ b/clusterman/autoscaler/pool_manager.py
@@ -86,7 +86,6 @@ def __init__(
             "autoscaling.killable_nodes_prioritizing_v2", default=False
         )
         self.alert_on_max_capacity = self.pool_config.read_bool("alert_on_max_capacity", default=True)
-        self.pool_owner = self.pool_config.read_string("pool_owner", default="compute_infra")
 
         monitoring_info = {"cluster": cluster, "pool": pool}
         self.killable_nodes_counter = get_monitoring_client().create_counter(SFX_KILLABLE_NODES_COUNT, monitoring_info)
diff --git a/clusterman/simulator/simulated_pool_manager.py b/clusterman/simulator/simulated_pool_manager.py
index 05446c098..d796f8387 100644
--- a/clusterman/simulator/simulated_pool_manager.py
+++ b/clusterman/simulator/simulated_pool_manager.py
@@ -59,7 +59,6 @@ def __init__(
             MAX_MIN_NODE_SCALEIN_UPTIME_SECONDS,
         )
         self.alert_on_max_capacity = self.pool_config.read_bool("alert_on_max_capacity", default=True)
-        self.pool_owner = self.pool_config.read_string("pool_owner", default="compute_infra")
         self.killable_nodes_prioritizing_v2 = self.pool_config.read_bool(
             "autoscaling.killable_nodes_prioritizing_v2", default=False
         )
diff --git a/examples/schemas/pool.json b/examples/schemas/pool.json
index d8fb9a6c7..f4fd7fbf5 100644
--- a/examples/schemas/pool.json
+++ b/examples/schemas/pool.json
@@ -64,8 +64,7 @@
             "additionalProperties": false
         },
         "sensu_config": {"$ref": "definitions.json#sensu_config"},
-        "alert_on_max_capacity": {"type": "boolean"},
-        "pool_owner": {"type": "string"}
+        "alert_on_max_capacity": {"type": "boolean"}
     },
     "additionalProperties": false
 }
diff --git a/itests/environment.py b/itests/environment.py
index a65546b34..eabe17174 100644
--- a/itests/environment.py
+++ b/itests/environment.py
@@ -121,7 +121,6 @@ def setup_configurations(context):
             ],
         },
         "alert_on_max_capacity": True,
-        "pool_owner": "compute_infra",
     }
     kube_pool_config = {
         "resource_groups": [
@@ -145,7 +144,6 @@ def setup_configurations(context):
            "period_minutes": 7,
         },
         "alert_on_max_capacity": True,
-        "pool_owner": "compute_infra",
     }
     with staticconf.testing.MockConfiguration(
         boto_config, namespace=CREDENTIALS_NAMESPACE
diff --git a/tests/autoscaler/autoscaler_test.py b/tests/autoscaler/autoscaler_test.py
index 335c13e92..c4555a152 100644
--- a/tests/autoscaler/autoscaler_test.py
+++ b/tests/autoscaler/autoscaler_test.py
@@ -49,7 +49,6 @@ def pool_configs():
                 "max_weight_to_remove": 10,
             },
             "alert_on_max_capacity": True,
-            "pool_owner": "compute_infra",
         },
         namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"),
     ):
@@ -92,10 +91,6 @@ def mock_autoscaler():
         "alert_on_max_capacity",
         namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"),
     )
-    mock_autoscaler.pool_manager.pool_owner = staticconf.read_string(
-        "pool_owner",
-        namespace=POOL_NAMESPACE.format(pool="bar", scheduler="mesos"),
-    )
     mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0
 
     mock_autoscaler.target_capacity_gauge = mock.Mock(spec=GaugeProtocol)
@@ -163,22 +158,17 @@ def test_autoscaler_run(dry_run, mock_autoscaler, run_timestamp):
     ), pytest.raises(ValueError):
         mock_autoscaler.run(dry_run=dry_run, timestamp=run_timestamp)
 
-    assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call(
-        100, {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"}
-    )
+    assert mock_autoscaler.target_capacity_gauge.set.call_args == mock.call(100, {"dry_run": dry_run})
     assert mock_autoscaler.max_capacity_gauge.set.call_args == mock.call(
-        mock_autoscaler.pool_manager.max_capacity,
-        {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"},
-    )
-    assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call(
-        0.7, {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"}
+        mock_autoscaler.pool_manager.max_capacity, {"dry_run": dry_run, "alert_on_max_capacity": True}
     )
+    assert mock_autoscaler.setpoint_gauge.set.call_args == mock.call(0.7, {"dry_run": dry_run})
     assert mock_autoscaler._compute_target_capacity.call_args == mock.call(resource_request)
     assert mock_autoscaler.pool_manager.modify_target_capacity.call_count == 1
 
     assert mock_autoscaler.resource_request_gauges["cpus"].set.call_args == mock.call(
         resource_request.cpus,
-        {"dry_run": dry_run, "alert_on_max_capacity": True, "team": "compute_infra"},
+        {"dry_run": dry_run},
     )
     assert mock_autoscaler.resource_request_gauges["mem"].set.call_count == 0
     assert mock_autoscaler.resource_request_gauges["disk"].set.call_count == 0
diff --git a/tests/conftest.py b/tests/conftest.py
index 9651b2516..55f5606f1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -145,7 +145,6 @@ def clusterman_pool_config():
             ],
         },
         "alert_on_max_capacity": True,
-        "pool_owner": "compute_infra",
     }
     with staticconf.testing.MockConfiguration(config, namespace="bar.mesos_config"):
         yield
@@ -203,7 +202,6 @@ def clusterman_k8s_pool_config():
             "disable_autoscaling": False,
         },
         "alert_on_max_capacity": False,
-        "pool_owner": "foo",
     }
     with staticconf.testing.MockConfiguration(config, namespace="bar.kubernetes_config"):
         yield
From 8b8ad6e6fa894b4d7c1efc2eb9cd18cef4e3f019 Mon Sep 17 00:00:00 2001
From: Luis Perez
Date: Wed, 18 Oct 2023 11:23:52 -0700
Subject: [PATCH 2/2] Revert "CLUSTERMAN-812: upgrade k8s client library (#334)"

This reverts commit 6c4b8bb424fd84f1087552fb19d992180cf17834.
---
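Notes: this revert pins the kubernetes client back to 10.0.1, where pod
evictions still go through the pre-GA V1beta1Eviction model and a missing
kubeconfig surfaces as a TypeError rather than kubernetes' ConfigException.
A minimal sketch of an eviction under the 10.0.1 client, mirroring the
kubernetes_cluster_connector.py hunk below (core_api and pod are
illustrative names for a CoreV1Api instance and a V1Pod):

    from kubernetes.client import V1beta1Eviction
    from kubernetes.client import V1ObjectMeta

    # Evict a single pod; on the 10.0.1 client the eviction body is the
    # beta model, not V1Eviction.
    core_api.create_namespaced_pod_eviction(
        name=pod.metadata.name,
        namespace=pod.metadata.namespace,
        body=V1beta1Eviction(
            metadata=V1ObjectMeta(
                name=pod.metadata.name,
                namespace=pod.metadata.namespace,
            ),
        ),
    )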
 clusterman/kubernetes/kubernetes_cluster_connector.py | 4 ++--
 clusterman/kubernetes/util.py                         | 3 +--
 requirements.txt                                      | 2 +-
 tests/kubernetes/util_test.py                         | 3 +--
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/clusterman/kubernetes/kubernetes_cluster_connector.py b/clusterman/kubernetes/kubernetes_cluster_connector.py
index 4a4c3dfd1..b03fc948d 100644
--- a/clusterman/kubernetes/kubernetes_cluster_connector.py
+++ b/clusterman/kubernetes/kubernetes_cluster_connector.py
@@ -24,8 +24,8 @@
 import colorlog
 import kubernetes
 import staticconf
+from kubernetes.client import V1beta1Eviction
 from kubernetes.client import V1DeleteOptions
-from kubernetes.client import V1Eviction
 from kubernetes.client import V1ObjectMeta
 from kubernetes.client.models.v1_node import V1Node as KubernetesNode
 from kubernetes.client.models.v1_pod import V1Pod as KubernetesPod
@@ -356,7 +356,7 @@ def _evict_pod(self, pod: KubernetesPod):
         self._core_api.create_namespaced_pod_eviction(
             name=pod.metadata.name,
             namespace=pod.metadata.namespace,
-            body=V1Eviction(
+            body=V1beta1Eviction(
                 metadata=V1ObjectMeta(
                     name=pod.metadata.name,
                     namespace=pod.metadata.namespace,
diff --git a/clusterman/kubernetes/util.py b/clusterman/kubernetes/util.py
index 4fed406b7..0e9253a5f 100644
--- a/clusterman/kubernetes/util.py
+++ b/clusterman/kubernetes/util.py
@@ -30,7 +30,6 @@
 from kubernetes.client.models.v1_node_selector_requirement import V1NodeSelectorRequirement
 from kubernetes.client.models.v1_node_selector_term import V1NodeSelectorTerm
 from kubernetes.client.models.v1_pod import V1Pod as KubernetesPod
-from kubernetes.config.config_exception import ConfigException
 
 from clusterman.util import ClustermanResources
 
@@ -73,7 +72,7 @@ def __init__(self, kubeconfig_path: str, client_class: Type) -> None:
                 kubernetes.config.load_incluster_config()
             else:
                 kubernetes.config.load_kube_config(kubeconfig_path, context=os.getenv("KUBECONTEXT"))
-        except (TypeError, ConfigException):
+        except TypeError:
             error_msg = "Could not load KUBECONFIG; is this running on Kubernetes master?"
             if "yelpcorp" in socket.getfqdn():
                 error_msg += "\nHint: try using the clusterman-k8s- wrapper script!"
diff --git a/requirements.txt b/requirements.txt
index e0308153d..8e4359b62 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,7 +16,7 @@ idna==2.8
 jmespath==0.9.4
 jsonpickle==1.4.2
 kiwisolver==1.1.0
-kubernetes==24.2.0
+kubernetes==10.0.1
 matplotlib==3.4.2
 mypy-extensions==0.4.3
 numpy==1.21.6
diff --git a/tests/kubernetes/util_test.py b/tests/kubernetes/util_test.py
index 92942f9ac..3316a4660 100644
--- a/tests/kubernetes/util_test.py
+++ b/tests/kubernetes/util_test.py
@@ -5,7 +5,6 @@
 import pytest
 from kubernetes.client.models.v1_node_selector_requirement import V1NodeSelectorRequirement
 from kubernetes.client.models.v1_node_selector_term import V1NodeSelectorTerm
-from kubernetes.config import ConfigException
 
 from clusterman.kubernetes.util import CachedCoreV1Api
 from clusterman.kubernetes.util import ConciseCRDApi
@@ -22,7 +21,7 @@ def mock_cached_core_v1_api():
 
 
 def test_cached_corev1_api_no_kubeconfig(caplog):
-    with pytest.raises(ConfigException):
+    with pytest.raises(TypeError):
         CachedCoreV1Api("/foo/bar/admin.conf")
     assert "Could not load KUBECONFIG" in caplog.text