From 3c2b01b5684dff92770733bc4f70329969ca31fd Mon Sep 17 00:00:00 2001 From: mjibril Date: Tue, 17 Sep 2024 18:43:19 +0100 Subject: [PATCH] [FluidStack] Add NVLINK GPUs * Add NVLINK GPUs as distinct GPUs --- .../service_catalog/data_fetchers/fetch_fluidstack.py | 6 ++++-- sky/provision/fluidstack/instance.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py b/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py index cf943541e08..dde268e295e 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py @@ -148,8 +148,8 @@ GPU_MAP = { 'H100_PCIE_80GB': 'H100', - 'H100_NVLINK_80GB': 'H100', - 'A100_NVLINK_80GB': 'A100-80GB', + 'H100_NVLINK_80GB': 'H100-NVLINK', + 'A100_NVLINK_80GB': 'A100-80GB-NVLINK', 'A100_SXM4_80GB': 'A100-80GB', 'A100_PCIE_80GB': 'A100-80GB', 'A100_SXM4_40GB': 'A100', @@ -185,6 +185,8 @@ def create_catalog(output_dir: str) -> None: with open(DEFAULT_FLUIDSTACK_API_KEY_PATH, 'r', encoding='UTF-8') as f: api_key = f.read().strip() response = requests.get(ENDPOINT, headers={'api-key': api_key}) + if not response.ok: + raise Exception(response.text) plans = response.json() with open(os.path.join(output_dir, 'vms.csv'), mode='w', diff --git a/sky/provision/fluidstack/instance.py b/sky/provision/fluidstack/instance.py index 538aafc8887..266841d87ec 100644 --- a/sky/provision/fluidstack/instance.py +++ b/sky/provision/fluidstack/instance.py @@ -298,7 +298,7 @@ def query_instances( 'pending': status_lib.ClusterStatus.INIT, 'stopped': status_lib.ClusterStatus.STOPPED, 'running': status_lib.ClusterStatus.UP, - 'unhealthy': status_lib.ClusterStatus.INIT, + 'failed': status_lib.ClusterStatus.INIT, 'terminated': None, } statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}