From be86afdfb5ad3554a2ba4ad4298310458e4b8871 Mon Sep 17 00:00:00 2001 From: Mimi Liao Date: Mon, 25 Nov 2024 13:11:49 -0600 Subject: [PATCH] Add logging for missing node_id in ray_nodes_idle_duration_ms_by_id and set default value Signed-off-by: Mimi Liao --- python/ray/autoscaler/_private/load_metrics.py | 4 +++- python/ray/autoscaler/_private/monitor.py | 12 ++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/python/ray/autoscaler/_private/load_metrics.py b/python/ray/autoscaler/_private/load_metrics.py index 177fcfaa0d0e..930229aa634f 100644 --- a/python/ray/autoscaler/_private/load_metrics.py +++ b/python/ray/autoscaler/_private/load_metrics.py @@ -102,7 +102,6 @@ def update( self.static_resources_by_ip[ip] = static_resources self.raylet_id_by_ip[ip] = raylet_id self.cluster_full_of_actors_detected = cluster_full_of_actors_detected - self.ray_nodes_last_used_time_by_ip[ip] = node_last_used_time_s if not waiting_bundles: waiting_bundles = [] @@ -122,6 +121,9 @@ def update( self.dynamic_resources_by_ip[ip] = dynamic_resources_update now = time.time() + self.ray_nodes_last_used_time_by_ip[ip] = ( + node_last_used_time_s if node_last_used_time_s else now + ) self.last_heartbeat_time_by_ip[ip] = now self.waiting_bundles = waiting_bundles self.infeasible_bundles = infeasible_bundles diff --git a/python/ray/autoscaler/_private/monitor.py b/python/ray/autoscaler/_private/monitor.py index e5ce6cc01cda..f9872d9d0202 100644 --- a/python/ray/autoscaler/_private/monitor.py +++ b/python/ray/autoscaler/_private/monitor.py @@ -320,7 +320,13 @@ def update_load_metrics(self): else: ip = resource_message.node_manager_address - idle_duration_ms = ray_nodes_idle_duration_ms_by_id[node_id] + idle_duration_ms = 0.0 + if node_id in ray_nodes_idle_duration_ms_by_id: + idle_duration_ms = ray_nodes_idle_duration_ms_by_id[node_id] + else: + logger.warning( + f"node_id {node_id} not found in ray_nodes_idle_duration_ms_by_id" + ) self.load_metrics.update( ip, @@ -331,7 +337,9 @@ def update_load_metrics(self): infeasible_bundles, pending_placement_groups, cluster_full, - time.time() - idle_duration_ms / 1000, # last_used_time + time.time() + - idle_duration_ms + / 1000, # node_last_used_time_s = now - idle_duration ) if self.readonly_config: self.readonly_config["available_node_types"].update(mirror_node_types)