Skip to content

Commit

Permalink
fix ping-and-up test for den launched clusters
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexandra Belousov authored and Alexandra Belousov committed Jan 15, 2025
1 parent e722b62 commit 6524d80
Showing 1 changed file with 21 additions and 3 deletions.
24 changes: 21 additions & 3 deletions runhouse/resources/hardware/on_demand_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,16 +526,33 @@ def _populate_connection_from_status_dict(self, cluster_dict: Dict[str, Any]):
self._kube_namespace = self.compute_properties.get("kube_namespace")
self._kube_context = self.compute_properties.get("kube_context")

def _update_from_status(self, dryrun: bool = False):
    """Refresh this cluster's connection info from the status source of its launcher.

    A locally launched cluster is updated from ``self._sky_status``. A Den launched cluster
    is updated from a single status entry (``limit=1``) fetched from the Den API. Shared
    clusters are returned from immediately without any lookup.

    Args:
        dryrun (bool, optional): When true, the local Sky status is read without a refresh
            (``refresh=not dryrun``). Only consulted on the local launcher path.
            (Default: ``False``)
    """
    if self.is_shared:
        # If the cluster is shared we can skip the update, since the sky data will only be saved
        # on the machine where the cluster was initially upped
        return

    if self.launcher == LauncherType.LOCAL:
        # Locally launched: try to get the cluster status from SkyDB
        cluster_dict = self._sky_status(refresh=not dryrun)
        self._populate_connection_from_status_dict(cluster_dict)
    elif self.launcher == LauncherType.DEN:
        cluster_uri = rns_client.format_rns_address(self.rns_address or self.name)
        # NOTE(review): no timeout is passed to requests.get, so a stalled connection blocks the
        # caller indefinitely - consider adding one that matches the ping timeout.
        response = requests.get(
            f"{rns_client.api_server_url}/resource/{cluster_uri}/cluster/status?limit=1",
            headers=rns_client.request_headers(),
        )
        cluster_den_status = response.json().get("data", None)
        if not cluster_den_status:
            return

        resource_info = cluster_den_status[0].get("resource_info")
        if not resource_info:
            return

        cluster_dict = resource_info.get("cluster_config")
        if cluster_dict is None:
            # The status entry carries no cluster config, so there is nothing to apply;
            # handing None to the launcher helper would presumably fail - verify against
            # DenLauncher._update_from_den_response.
            return

        DenLauncher._update_from_den_response(cluster=self, config=cluster_dict)

def get_instance_type(self):
"""Returns instance type of the cluster."""
Expand Down Expand Up @@ -792,6 +809,7 @@ def _ping(self, timeout=5, retry=False):
return True

if retry:
self._update_from_sky_status(dryrun=False)
dryrun = False if self.launcher == LauncherType.LOCAL else None
self._update_from_status(dryrun=dryrun)
return super()._ping(timeout=timeout, retry=False)
return False

0 comments on commit 6524d80

Please sign in to comment.