From 8ca950c7075c7676ded721554a50ec281be00d52 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:21:09 +0000 Subject: [PATCH 01/23] Show multiple kubernetes in the optimizer table --- sky/optimizer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sky/optimizer.py b/sky/optimizer.py index 4326329579d..559039def37 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -821,12 +821,15 @@ def format_number(x): return row def _get_resource_group_hash(resources: 'resources_lib.Resources'): - return json.dumps( - { + resource_key_dict = { 'cloud': f'{resources.cloud}', 'accelerators': f'{resources.accelerators}', 'use_spot': resources.use_spot - }, + } + if isinstance(resources.cloud, clouds.Kubernetes): + resource_key_dict['region'] = resources.region + return json.dumps( + resource_key_dict, sort_keys=True) # Print the list of resouces that the optimizer considered. From de8a688478b2a55536cfce4c2e573e1aa01f5f61 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:21:19 +0000 Subject: [PATCH 02/23] Add docs for multiple kubernetes --- .../reference/kubernetes/multi-kubernetes.rst | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 docs/source/reference/kubernetes/multi-kubernetes.rst diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst new file mode 100644 index 00000000000..e42a7d0d377 --- /dev/null +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -0,0 +1,114 @@ +.. _multi-kubernetes: + +Across Multiple Kubernetes Clusters +=================================== + + +SkyPilot allows you to manage dev pods, jobs and services across multiple Kubernetes clusters in a single pane of glass. + +You may have multiple Kubernetes clusters for a variety of reasons: + +* Clusters for different purposes: e.g.,a production cluster and a development/testing cluster. 
+* Clusters in different regions or clouds: e.g., US and EU regions; or AWS and Lambda clouds. +* Clusters for different accelerators: e.g., NVIDIA H100 cluster and a Google TPU cluster. +* Clusters with different configurations: e.g., a small cluster for a single node and a large cluster for multiple nodes. + + +.. image:: /images/kubernetes/multi-kubernetes.png + + +Set Up Credentials for Multiple Kubernetes Clusters +--------------------------------------------------- + +To work with multiple Kubernetes clusters, you need to ensure you have the necessary credentials for each cluster. To get +it work with SkyPilot, you don't have to do any additional setup than having those credentials in your local ``~/.kube/config`` file. + +For example, a ``~/.kube/config`` file may look like this: + +.. code-block:: yaml + + apiVersion: v1 + clusters: + - cluster: + certificate-authority-data: + ... + server: https://xx.xx.xx.xx:45819 + name: my-h100-cluster + - cluster: + certificate-authority-data: + ... + server: https://yy.yy.yy.yy:45819 + name: my-tpu-cluster + contexts: + - context: + cluster: my-h100-cluster + user: my-h100-cluster + name: my-h100-cluster + - context: + cluster: my-tpu-cluster + namespace: my-namespace + user: my-tpu-cluster + name: my-tpu-cluster + current-context: my-h100-cluster + ... + + +In this example, we have two Kubernetes clusters: ``my-h100-cluster`` and ``my-tpu-cluster``, and each Kubernetes cluster has a context for it. + +Point to a Kubernetes Cluster and Launch +----------------------------------------- + +SkyPilot borrows the ``region`` concept from clouds to denote a Kubernetes cluster. You can point to a Kubernetes cluster +by specifying the ``--region`` with the context name for that cluster. + +.. 
code-block:: console + + # Check the GPUs available in a Kubernetes cluster + $ sky show-gpus --cloud kubernetes --region my-h100-cluster + + Kubernetes GPUs (Context: my-h100-cluster) + GPU QTY_PER_NODE TOTAL_GPUS TOTAL_FREE_GPUS + H100 1, 2, 3, 4, 5, 6, 7, 8 8 8 + + Kubernetes per node GPU availability + NODE_NAME GPU_NAME TOTAL_GPUS FREE_GPUS + gke-test-zhwu-default-pool-20159504-hbzn H100 8 8 + gke-test-zhwu-default-pool-20159504-w5x7 None 0 0 + +When launching a SkyPilot cluster or task, you can also specify the context name with ``--region`` to launch the cluster or task in. + +.. code-block:: console + + $ sky launch --cloud kubernetes --region my-tpu-cluster echo 'Hello World' + + +.. note:: + + When you don't specify a region, SkyPilot will use the current context. + + +Failover across Multiple Kubernetes Clusters +-------------------------------------------- + +SkyPilot enables you to failover across multiple Kubernetes clusters. It is useful when you have multiple Kubernetes clusters +across different clouds and regions, and you want to launch a task in any of the clusters with available GPUs. + +Different from cloud providers, SkyPilot does not failover through different regions (contexts) by default, because multiple +Kubernetes clusters can be for different purposes. To enable the failover, you can specify the ``kubernetes.allowed_contexts`` +in SkyPilot config, ``~/.sky/config.yaml`` (See config YAML spec: :ref:`config-yaml`). + +.. code-block:: yaml + + kubernetes: + allowed_contexts: + - my-h100-cluster-gke + - my-h100-cluster-eks + +With this global config, SkyPilot will failover through the Kubernetes clusters in the ``allowed_contexts`` with in the same +order as they are specified. + + +.. 
code-block:: console + + $ sky launch --cloud kubernetes echo 'Hello World' + From 55c26c3c8eb43c2d65228f9b9ea0a75c67164210 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:43:01 +0000 Subject: [PATCH 03/23] Add dynamic update --- .../reference/kubernetes/multi-kubernetes.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index e42a7d0d377..6798a1c1969 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -12,6 +12,7 @@ You may have multiple Kubernetes clusters for a variety of reasons: * Clusters in different regions or clouds: e.g., US and EU regions; or AWS and Lambda clouds. * Clusters for different accelerators: e.g., NVIDIA H100 cluster and a Google TPU cluster. * Clusters with different configurations: e.g., a small cluster for a single node and a large cluster for multiple nodes. +* Clusters for different Kubernetes versions: e.g., to upgrade a cluster from Kubernetes 1.20 to 1.21, you may create a new Kubernetes cluster to avoid downtime or unexpected errors. .. image:: /images/kubernetes/multi-kubernetes.png @@ -112,3 +113,18 @@ order as they are specified. 
$ sky launch --cloud kubernetes echo 'Hello World' + Considered resources (1 node): + ------------------------------------------------------------------------------------------------------------ + CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN + ------------------------------------------------------------------------------------------------------------ + Kubernetes 2CPU--8GB--1H100 2 8 H100:1 my-h100-cluster-gke 0.00 ✔ + Kubernetes 2CPU--8GB--1H100 2 8 H100:1 my-h100-cluster-eks 0.00 + ------------------------------------------------------------------------------------------------------------ + + + +Dynamically Update Kubernetes Clusters to Use +---------------------------------------------- + +To see how to dynamically update Kubernetes clusters to use, refer to :ref:`dynamic-kubernetes-contexts-update-policy`. + From 92b4ded1bdc2d4fb411b4488d9ade1868c72caa7 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:44:10 +0000 Subject: [PATCH 04/23] format --- sky/optimizer.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sky/optimizer.py b/sky/optimizer.py index 559039def37..e42dda8d8ed 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -822,15 +822,13 @@ def format_number(x): def _get_resource_group_hash(resources: 'resources_lib.Resources'): resource_key_dict = { - 'cloud': f'{resources.cloud}', - 'accelerators': f'{resources.accelerators}', - 'use_spot': resources.use_spot - } + 'cloud': f'{resources.cloud}', + 'accelerators': f'{resources.accelerators}', + 'use_spot': resources.use_spot + } if isinstance(resources.cloud, clouds.Kubernetes): resource_key_dict['region'] = resources.region - return json.dumps( - resource_key_dict, - sort_keys=True) + return json.dumps(resource_key_dict, sort_keys=True) # Print the list of resouces that the optimizer considered. 
resource_fields = [ From 6a487572bf23dc1ebdb10e02b4e9192dbce6f60c Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:46:06 +0000 Subject: [PATCH 05/23] Add new button --- docs/source/_static/custom.js | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 1fa28105186..c06b974ec97 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -33,6 +33,7 @@ document.addEventListener('DOMContentLoaded', () => { { selector: '.toctree-l1 > a', text: 'Llama 3.2 (Meta)' }, { selector: '.toctree-l1 > a', text: 'Admin Policy Enforcement' }, { selector: '.toctree-l1 > a', text: 'Using Existing Machines' }, + { selector: '.toctree-l1 > a', text: 'Across Multiple Kubernetes Clusters' }, ]; newItems.forEach(({ selector, text }) => { document.querySelectorAll(selector).forEach((el) => { From 867a239ca14448e868be881bfb4242c511b82e75 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:46:29 +0000 Subject: [PATCH 06/23] Add to index --- docs/source/docs/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst index d83bf7821c3..e0a81a836d1 100644 --- a/docs/source/docs/index.rst +++ b/docs/source/docs/index.rst @@ -151,6 +151,7 @@ Read the research: ../reservations/reservations Using Existing Machines <../reservations/existing-machines> ../reference/kubernetes/index + ../reference/kubernetes/multi-kubernetes .. 
toctree:: :hidden: From fe4c8c402de6801a80112b988e46b03128274015 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:59:19 +0000 Subject: [PATCH 07/23] fix --- .../reference/kubernetes/multi-kubernetes.rst | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index 6798a1c1969..eb2e6ea9463 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -1,18 +1,18 @@ .. _multi-kubernetes: -Across Multiple Kubernetes Clusters -=================================== +Multiple Kubernetes Clusters +============================= SkyPilot allows you to manage dev pods, jobs and services across multiple Kubernetes clusters in a single pane of glass. -You may have multiple Kubernetes clusters for a variety of reasons: +You may have multiple Kubernetes clusters for different: -* Clusters for different purposes: e.g.,a production cluster and a development/testing cluster. -* Clusters in different regions or clouds: e.g., US and EU regions; or AWS and Lambda clouds. -* Clusters for different accelerators: e.g., NVIDIA H100 cluster and a Google TPU cluster. -* Clusters with different configurations: e.g., a small cluster for a single node and a large cluster for multiple nodes. -* Clusters for different Kubernetes versions: e.g., to upgrade a cluster from Kubernetes 1.20 to 1.21, you may create a new Kubernetes cluster to avoid downtime or unexpected errors. +* **Use cases:** e.g., a production cluster and a development/testing cluster. +* **Regions or clouds:** e.g., US and EU regions; or AWS and Lambda clouds. +* **Accelerators:** e.g., NVIDIA H100 cluster and a Google TPU cluster. +* **Configurations:** e.g., a small cluster for a single node and a large cluster for multiple nodes. 
+* **Kubernetes versions:** e.g., to upgrade a cluster from Kubernetes 1.20 to 1.21, you may create a new Kubernetes cluster to avoid downtime or unexpected errors. .. image:: /images/kubernetes/multi-kubernetes.png @@ -21,8 +21,9 @@ You may have multiple Kubernetes clusters for a variety of reasons: Set Up Credentials for Multiple Kubernetes Clusters --------------------------------------------------- -To work with multiple Kubernetes clusters, you need to ensure you have the necessary credentials for each cluster. To get -it work with SkyPilot, you don't have to do any additional setup than having those credentials in your local ``~/.kube/config`` file. +To work with multiple Kubernetes clusters, you need to ensure you have the necessary credentials for each cluster. +Check that your local ``~/.kube/config`` file has the credentials for each cluster. For setting up clusters and their credentials, +see :ref:`kubernetes-setup-deploy`. For example, a ``~/.kube/config`` file may look like this: @@ -73,8 +74,8 @@ by specifying the ``--region`` with the context name for that cluster. Kubernetes per node GPU availability NODE_NAME GPU_NAME TOTAL_GPUS FREE_GPUS - gke-test-zhwu-default-pool-20159504-hbzn H100 8 8 - gke-test-zhwu-default-pool-20159504-w5x7 None 0 0 + my-h100-cluster-hbzn H100 8 8 + my-h100-cluster-w5x7 None 0 0 When launching a SkyPilot cluster or task, you can also specify the context name with ``--region`` to launch the cluster or task in. @@ -91,12 +92,12 @@ When launching a SkyPilot cluster or task, you can also specify the context name Failover across Multiple Kubernetes Clusters -------------------------------------------- -SkyPilot enables you to failover across multiple Kubernetes clusters. It is useful when you have multiple Kubernetes clusters -across different clouds and regions, and you want to launch a task in any of the clusters with available GPUs. +SkyPilot enables you to failover across multiple Kubernetes clusters. 
It is useful when you want to launch a task in any of the clusters with available GPUs. Different from cloud providers, SkyPilot does not failover through different regions (contexts) by default, because multiple -Kubernetes clusters can be for different purposes. To enable the failover, you can specify the ``kubernetes.allowed_contexts`` -in SkyPilot config, ``~/.sky/config.yaml`` (See config YAML spec: :ref:`config-yaml`). +Kubernetes clusters can be for different purposes. + +To enable the failover, you can specify the ``kubernetes.allowed_contexts`` in SkyPilot config, ``~/.sky/config.yaml`` (See config YAML spec: :ref:`config-yaml`). .. code-block:: yaml From 254b3937a5813ccb5f259b3e6e1ccc2ddd2d7fa4 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 Jan 2025 20:10:57 +0000 Subject: [PATCH 08/23] Add figure for multi-k8s docs --- docs/source/images/multi-kubernetes.svg | 1 + docs/source/reference/kubernetes/multi-kubernetes.rst | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/source/images/multi-kubernetes.svg diff --git a/docs/source/images/multi-kubernetes.svg b/docs/source/images/multi-kubernetes.svg new file mode 100644 index 00000000000..af2b1d934eb --- /dev/null +++ b/docs/source/images/multi-kubernetes.svg @@ -0,0 +1 @@ + diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index eb2e6ea9463..294faa57a51 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -15,7 +15,7 @@ You may have multiple Kubernetes clusters for different: * **Kubernetes versions:** e.g., to upgrade a cluster from Kubernetes 1.20 to 1.21, you may create a new Kubernetes cluster to avoid downtime or unexpected errors. -.. image:: /images/kubernetes/multi-kubernetes.png +.. 
image:: /images/multi-kubernetes.svg Set Up Credentials for Multiple Kubernetes Clusters From 903a7eecaee6511febfd75fcaa5bcd239b3c3948 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 Jan 2025 20:15:44 +0000 Subject: [PATCH 09/23] Fix new badge --- docs/source/_static/custom.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 4336825e11f..73dd2e556ea 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -29,7 +29,7 @@ document.addEventListener('DOMContentLoaded', () => { { selector: '.toctree-l1 > a', text: 'Many Parallel Jobs' }, { selector: '.toctree-l1 > a', text: 'Admin Policy Enforcement' }, { selector: '.toctree-l1 > a', text: 'Using Existing Machines' }, - { selector: '.toctree-l1 > a', text: 'Across Multiple Kubernetes Clusters' }, + { selector: '.toctree-l1 > a', text: 'Multiple Kubernetes Clusters' }, ]; newItems.forEach(({ selector, text }) => { document.querySelectorAll(selector).forEach((el) => { From b6f6c0f85000d3b2c622df376fcb1767661e103d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 31 Jan 2025 09:55:27 -0800 Subject: [PATCH 10/23] Update docs/source/reference/kubernetes/multi-kubernetes.rst Co-authored-by: Romil Bhardwaj --- docs/source/reference/kubernetes/multi-kubernetes.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index 294faa57a51..4a2eb792098 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -21,9 +21,9 @@ You may have multiple Kubernetes clusters for different: Set Up Credentials for Multiple Kubernetes Clusters --------------------------------------------------- -To work with multiple Kubernetes clusters, you need to ensure you have the necessary credentials for each cluster. 
-Check that your local ``~/.kube/config`` file has the credentials for each cluster. For setting up clusters and their credentials, -see :ref:`kubernetes-setup-deploy`. +To work with multiple Kubernetes clusters, their credentials must be set up as individual `contexts `_ in your local ``~/.kube/config`` file. + +For deploying new clusters and getting credentials, see :ref:`kubernetes-setup-deploy`. For example, a ``~/.kube/config`` file may look like this: From fd097f4512cb776963b89d9f1392fcd40e0dfa1f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 31 Jan 2025 18:28:33 +0000 Subject: [PATCH 11/23] update --- docs/source/docs/index.rst | 1 - docs/source/reference/kubernetes/index.rst | 1 + .../reference/kubernetes/multi-kubernetes.rst | 86 +++++++++++-------- 3 files changed, 49 insertions(+), 39 deletions(-) diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst index 9d50d404dda..17f8d545fa6 100644 --- a/docs/source/docs/index.rst +++ b/docs/source/docs/index.rst @@ -156,7 +156,6 @@ Read the research: ../reservations/reservations Using Existing Machines <../reservations/existing-machines> ../reference/kubernetes/index - ../reference/kubernetes/multi-kubernetes .. toctree:: :hidden: diff --git a/docs/source/reference/kubernetes/index.rst b/docs/source/reference/kubernetes/index.rst index 639b5b633ed..6ea14ed8858 100644 --- a/docs/source/reference/kubernetes/index.rst +++ b/docs/source/reference/kubernetes/index.rst @@ -103,3 +103,4 @@ Table of Contents Getting Started kubernetes-setup kubernetes-troubleshooting + multi-kubernetes diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index 294faa57a51..481cb29b0f1 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -16,10 +16,15 @@ You may have multiple Kubernetes clusters for different: .. image:: /images/multi-kubernetes.svg + :width: 80% +.. 
original image: https://docs.google.com/presentation/d/1_NzqS_ccihsQKfbOTewPaH8D496zaHMuh-fvPsPf9y0/edit#slide=id.p -Set Up Credentials for Multiple Kubernetes Clusters ---------------------------------------------------- +Setup your Multiple Kubernetes Clusters +----------------------------------------- + +Step 1: Set Up Credentials +~~~~~~~~~~~~~~~~~~~~~~~~~~~ To work with multiple Kubernetes clusters, you need to ensure you have the necessary credentials for each cluster. Check that your local ``~/.kube/config`` file has the credentials for each cluster. For setting up clusters and their credentials, @@ -57,56 +62,35 @@ For example, a ``~/.kube/config`` file may look like this: In this example, we have two Kubernetes clusters: ``my-h100-cluster`` and ``my-tpu-cluster``, and each Kubernetes cluster has a context for it. -Point to a Kubernetes Cluster and Launch ------------------------------------------ - -SkyPilot borrows the ``region`` concept from clouds to denote a Kubernetes cluster. You can point to a Kubernetes cluster -by specifying the ``--region`` with the context name for that cluster. - -.. code-block:: console +Step 2: Configure SkyPilot to Access Multiple Kubernetes Clusters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Check the GPUs available in a Kubernetes cluster - $ sky show-gpus --cloud kubernetes --region my-h100-cluster +By default, SkyPilot will only use the current context in the kubeconfig, e.g., ``current-context: my-h100-cluster`` or you can get the current context with ``kubectl config current-context``. +To allow SkyPilot to access multiple Kubernetes clusters, you can set the ``kubernetes.allowed_contexts`` in the SkyPilot config. - Kubernetes GPUs (Context: my-h100-cluster) - GPU QTY_PER_NODE TOTAL_GPUS TOTAL_FREE_GPUS - H100 1, 2, 3, 4, 5, 6, 7, 8 8 8 +.. 
code-block:: yaml - Kubernetes per node GPU availability - NODE_NAME GPU_NAME TOTAL_GPUS FREE_GPUS - my-h100-cluster-hbzn H100 8 8 - my-h100-cluster-w5x7 None 0 0 + kubernetes: + allowed_contexts: + - my-h100-cluster + - my-tpu-cluster -When launching a SkyPilot cluster or task, you can also specify the context name with ``--region`` to launch the cluster or task in. +To check the enabled Kubernetes clusters, you can run ``sky check kubernetes``. .. code-block:: console - $ sky launch --cloud kubernetes --region my-tpu-cluster echo 'Hello World' - + $ sky check kubernetes -.. note:: + Enabled Kubernetes clusters: + - my-h100-cluster + - my-tpu-cluster - When you don't specify a region, SkyPilot will use the current context. Failover across Multiple Kubernetes Clusters -------------------------------------------- -SkyPilot enables you to failover across multiple Kubernetes clusters. It is useful when you want to launch a task in any of the clusters with available GPUs. - -Different from cloud providers, SkyPilot does not failover through different regions (contexts) by default, because multiple -Kubernetes clusters can be for different purposes. - -To enable the failover, you can specify the ``kubernetes.allowed_contexts`` in SkyPilot config, ``~/.sky/config.yaml`` (See config YAML spec: :ref:`config-yaml`). - -.. code-block:: yaml - - kubernetes: - allowed_contexts: - - my-h100-cluster-gke - - my-h100-cluster-eks - -With this global config, SkyPilot will failover through the Kubernetes clusters in the ``allowed_contexts`` with in the same +With the ``kubernetes.allowed_contexts`` global config, SkyPilot will fail over through the Kubernetes clusters in the ``allowed_contexts`` in the same order as they are specified. @@ -123,6 +107,32 @@ order as they are specified. 
------------------------------------------------------------------------------------------------------------ +Point to a Kubernetes Cluster and Launch +----------------------------------------- + +SkyPilot borrows the ``region`` concept from clouds to denote a Kubernetes context. You can point to a Kubernetes cluster +by specifying the ``--region`` with the context name for that cluster. + +.. code-block:: console + + # Check the GPUs available in a Kubernetes cluster + $ sky show-gpus --cloud kubernetes --region my-h100-cluster + + Kubernetes GPUs (Context: my-h100-cluster) + GPU QTY_PER_NODE TOTAL_GPUS TOTAL_FREE_GPUS + H100 1, 2, 3, 4, 5, 6, 7, 8 8 8 + + Kubernetes per node GPU availability + NODE_NAME GPU_NAME TOTAL_GPUS FREE_GPUS + my-h100-cluster-hbzn H100 8 8 + my-h100-cluster-w5x7 None 0 0 + +When launching a SkyPilot cluster or task, you can also specify the context name with ``--region`` to launch the cluster or task in. + +.. code-block:: console + + $ sky launch --cloud kubernetes --region my-tpu-cluster echo 'Hello World' + Dynamically Update Kubernetes Clusters to Use ---------------------------------------------- From 03d91229fee346a49a5b9008681b5a3cfae9cd4d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 31 Jan 2025 11:09:20 -0800 Subject: [PATCH 12/23] Update docs/source/reference/kubernetes/multi-kubernetes.rst Co-authored-by: Romil Bhardwaj --- docs/source/reference/kubernetes/multi-kubernetes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index 4a2eb792098..e8047e6ac7a 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -127,5 +127,5 @@ order as they are specified. 
Dynamically Update Kubernetes Clusters to Use ---------------------------------------------- -To see how to dynamically update Kubernetes clusters to use, refer to :ref:`dynamic-kubernetes-contexts-update-policy`. +You can configure SkyPilot to dynamically fetch Kubernetes cluster configs and enforce restrictions on which clusters are used. Refer to :ref:`dynamic-kubernetes-contexts-update-policy` for more. From c6cf6149eb36a475ccc6d08f1d8e1a61953a1eca Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 31 Jan 2025 19:12:46 +0000 Subject: [PATCH 13/23] update --- .../reference/kubernetes/multi-kubernetes.rst | 16 ++++++++++------ sky/check.py | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index 481cb29b0f1..06c9e1deb72 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -65,7 +65,10 @@ In this example, we have two Kubernetes clusters: ``my-h100-cluster`` and ``my-t Step 2: Configure SkyPilot to Access Multiple Kubernetes Clusters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -By default, SkyPilot will only use the current context in the kubeconfig, e.g., ``current-context: my-h100-cluster`` or you can get the current context with ``kubectl config current-context``. +Unlike clouds, SkyPilot does not failover through different Kubernetes clusters (regions) by default because each Kubernetes clusters can have a different purpose. + +By default, SkyPilot only uses the current context in the kubeconfig, e.g., ``current-context: my-h100-cluster`` or you can get the current context with ``kubectl config current-context``. + To allow SkyPilot to access multiple Kubernetes clusters, you can set the ``kubernetes.allowed_contexts`` in the SkyPilot config. .. 
code-block:: yaml @@ -81,10 +84,11 @@ To check the enabled Kubernetes clusters, you can run ``sky check kubernetes``. $ sky check kubernetes - Enabled Kubernetes clusters: - - my-h100-cluster - - my-tpu-cluster - + 🎉 Enabled clouds 🎉 + ✔ Kubernetes + Allowed contexts: + ├── my-h100-cluster + └── my-tpu-cluster Failover across Multiple Kubernetes Clusters @@ -96,7 +100,7 @@ order as they are specified. .. code-block:: console - $ sky launch --cloud kubernetes echo 'Hello World' + $ sky launch --gpus H100 --cloud kubernetes echo 'Hello World' Considered resources (1 node): ------------------------------------------------------------------------------------------------------------ diff --git a/sky/check.py b/sky/check.py index f32e4985079..3118b28ff78 100644 --- a/sky/check.py +++ b/sky/check.py @@ -245,10 +245,10 @@ def _format_enabled_cloud(cloud_name: str) -> str: # here we are using rich. We should migrate this file to # use colorama as we do in the rest of the codebase. symbol = ('└── ' if i == len(existing_contexts) - 1 else '├── ') - contexts_formatted.append(f'\n {symbol}{context}') + contexts_formatted.append(f'\n {symbol}{context}') context_info = f'Allowed contexts:{"".join(contexts_formatted)}' else: context_info = f'Active context: {existing_contexts[0]}' - return f'{cloud_name}[/green][dim]\n └── {context_info}[/dim][green]' + return f'{cloud_name}[/green][dim]\n {context_info}[/dim][green]' return cloud_name From bd6eeb67d3fa876c406b958e003828df5c81ddd0 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 31 Jan 2025 19:17:25 +0000 Subject: [PATCH 14/23] fix --- docs/source/reference/kubernetes/multi-kubernetes.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index a203204ecb8..40a8459c1a8 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst 
@@ -17,11 +17,12 @@ You may have multiple Kubernetes clusters for different: .. image:: /images/multi-kubernetes.svg :width: 80% + :align: center .. original image: https://docs.google.com/presentation/d/1_NzqS_ccihsQKfbOTewPaH8D496zaHMuh-fvPsPf9y0/edit#slide=id.p -Setup your Multiple Kubernetes Clusters ------------------------------------------ +Configuration +------------- Step 1: Set Up Credentials ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -62,8 +63,8 @@ For example, a ``~/.kube/config`` file may look like this: In this example, we have two Kubernetes clusters: ``my-h100-cluster`` and ``my-tpu-cluster``, and each Kubernetes cluster has a context for it. -Step 2: Configure SkyPilot to Access Multiple Kubernetes Clusters -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Step 2: Set Up SkyPilot to Access Multiple Kubernetes Clusters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Unlike clouds, SkyPilot does not failover through different Kubernetes clusters (regions) by default because each Kubernetes clusters can have a different purpose. 
From e3584371e88c2f9063acd29741876ae9e4e00a02 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 31 Jan 2025 19:21:02 +0000 Subject: [PATCH 15/23] fix --- docs/source/_static/custom.js | 2 +- docs/source/reference/kubernetes/multi-kubernetes.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 73dd2e556ea..77e8bb4b6b8 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -29,7 +29,7 @@ document.addEventListener('DOMContentLoaded', () => { { selector: '.toctree-l1 > a', text: 'Many Parallel Jobs' }, { selector: '.toctree-l1 > a', text: 'Admin Policy Enforcement' }, { selector: '.toctree-l1 > a', text: 'Using Existing Machines' }, - { selector: '.toctree-l1 > a', text: 'Multiple Kubernetes Clusters' }, + { selector: '.toctree-l2 > a', text: 'Multi-Kubernetes Clusters' }, ]; newItems.forEach(({ selector, text }) => { document.querySelectorAll(selector).forEach((el) => { diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index 40a8459c1a8..bf11e54c98c 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -1,6 +1,6 @@ .. 
_multi-kubernetes: -Multiple Kubernetes Clusters +Multi-Kubernetes Clusters ============================= From 9dd445a7abd73d7483db876cc691e07ce1ee35a6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 3 Feb 2025 00:23:05 -0800 Subject: [PATCH 16/23] Update docs/source/reference/kubernetes/multi-kubernetes.rst Co-authored-by: Romil Bhardwaj --- docs/source/reference/kubernetes/multi-kubernetes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index bf11e54c98c..a009ea428e8 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -29,7 +29,7 @@ Step 1: Set Up Credentials To work with multiple Kubernetes clusters, their credentials must be set up as individual `contexts <https://kubernetes.io/docs/tasks/access-application-cluster/configure-access-multiple-clusters/>`_ in your local ``~/.kube/config`` file. -For deploying new clusters and getting credentials, see :ref:`kubernetes-setup-deploy`. +For deploying new clusters and getting credentials, see :ref:`kubernetes-deployment`. For example, a ``~/.kube/config`` file may look like this: From 9a906a44d692628d0bc75bed2fbf0e201587e35c Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 3 Feb 2025 08:25:13 +0000 Subject: [PATCH 17/23] rename --- docs/source/reference/kubernetes/multi-kubernetes.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index bf11e54c98c..b3a518142d3 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -1,7 +1,7 @@ .. _multi-kubernetes: -Multi-Kubernetes Clusters -============================= +Multiple Kubernetes Clusters +================================ SkyPilot allows you to manage dev pods, jobs and services across multiple Kubernetes clusters in a single pane of glass. 
From 935c0869c768b90674a6ae06e011e1dde52b734b Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 3 Feb 2025 00:25:25 -0800 Subject: [PATCH 18/23] Update docs/source/reference/kubernetes/multi-kubernetes.rst Co-authored-by: Romil Bhardwaj --- docs/source/reference/kubernetes/multi-kubernetes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index a009ea428e8..0a13ae4649d 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -4,7 +4,7 @@ Multi-Kubernetes Clusters ============================= -SkyPilot allows you to manage dev pods, jobs and services across multiple Kubernetes clusters in a single pane of glass. +SkyPilot allows you to manage dev pods, jobs and services across multiple Kubernetes clusters through a single pane of glass. You may have multiple Kubernetes clusters for different: From bb0a981faa1415b2e70fb75c4367a26900a2f01c Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 3 Feb 2025 00:25:47 -0800 Subject: [PATCH 19/23] Update docs/source/reference/kubernetes/multi-kubernetes.rst Co-authored-by: Romil Bhardwaj --- docs/source/reference/kubernetes/multi-kubernetes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index 0a13ae4649d..a1cdfe0637a 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -68,7 +68,7 @@ Step 2: Setup SkyPilot to Access Multiple Kubernetes Clusters Unlike clouds, SkyPilot does not failover through different Kubernetes clusters (regions) by default because each Kubernetes clusters can have a different purpose. 
-By default, SkyPilot only uses the current context in the kubeconfig, e.g., ``current-context: my-h100-cluster`` or you can get the current context with ``kubectl config current-context``. +By default, SkyPilot only uses the context set as the ``current-context`` in the kubeconfig. You can get the current context with ``kubectl config current-context``. To allow SkyPilot to access multiple Kubernetes clusters, you can set the ``kubernetes.allowed_contexts`` in the SkyPilot config. From 34d6bbec71805499f81f267dc05fc4b7c459fad2 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 3 Feb 2025 00:26:03 -0800 Subject: [PATCH 20/23] Update docs/source/reference/kubernetes/multi-kubernetes.rst Co-authored-by: Romil Bhardwaj --- docs/source/reference/kubernetes/multi-kubernetes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index a1cdfe0637a..0495b231710 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -118,7 +118,7 @@ Point to a Kubernetes Cluster and Launch SkyPilot borrows the ``region`` concept from clouds to denote a Kubernetes context. You can point to a Kubernetes cluster by specifying the ``--region`` with the context name for that cluster. -.. code-block:: console +.. 
code-block:: bash # Check the GPUs available in a Kubernetes cluster $ sky show-gpus --cloud kubernetes --region my-h100-cluster From 4368e4f9ee692dad7493b1794c16e32b89446bff Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 3 Feb 2025 08:29:56 +0000 Subject: [PATCH 21/23] revert to console, fix comment color --- docs/source/reference/kubernetes/multi-kubernetes.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index 93875c45050..4a4c3939b9e 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -118,9 +118,9 @@ Point to a Kubernetes Cluster and Launch SkyPilot borrows the ``region`` concept from clouds to denote a Kubernetes context. You can point to a Kubernetes cluster by specifying the ``--region`` with the context name for that cluster. -.. code-block:: bash +.. code-block:: console - # Check the GPUs available in a Kubernetes cluster + $ # Check the GPUs available in a Kubernetes cluster $ sky show-gpus --cloud kubernetes --region my-h100-cluster Kubernetes GPUs (Context: my-h100-cluster) From 7aac76a6ca8546df5f19fc27056183720aa4154d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 3 Feb 2025 21:43:23 +0000 Subject: [PATCH 22/23] new badge --- docs/source/_static/custom.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 77e8bb4b6b8..97831517ce7 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -29,7 +29,7 @@ document.addEventListener('DOMContentLoaded', () => { { selector: '.toctree-l1 > a', text: 'Many Parallel Jobs' }, { selector: '.toctree-l1 > a', text: 'Admin Policy Enforcement' }, { selector: '.toctree-l1 > a', text: 'Using Existing Machines' }, - { selector: '.toctree-l2 > a', text: 'Multi-Kubernetes Clusters' }, + { 
selector: '.toctree-l2 > a', text: 'Multiple Kubernetes Clusters' }, ]; newItems.forEach(({ selector, text }) => { document.querySelectorAll(selector).forEach((el) => { From eb5b16178ab292df0dbc59bf117001d830a2acea Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 3 Feb 2025 22:09:37 +0000 Subject: [PATCH 23/23] use comma instead --- docs/source/reference/kubernetes/multi-kubernetes.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index 4a4c3939b9e..135fa8747d3 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -8,11 +8,11 @@ SkyPilot allows you to manage dev pods, jobs and services across multiple Kubern You may have multiple Kubernetes clusters for different: -* **Use cases:** e.g., a production cluster and a development/testing cluster. -* **Regions or clouds:** e.g., US and EU regions; or AWS and Lambda clouds. -* **Accelerators:** e.g., NVIDIA H100 cluster and a Google TPU cluster. -* **Configurations:** e.g., a small cluster for a single node and a large cluster for multiple nodes. -* **Kubernetes versions:** e.g., to upgrade a cluster from Kubernetes 1.20 to 1.21, you may create a new Kubernetes cluster to avoid downtime or unexpected errors. +* **Use cases**, e.g., a production cluster and a development/testing cluster. +* **Regions or clouds**, e.g., US and EU regions; or AWS and Lambda clouds. +* **Accelerators**, e.g., NVIDIA H100 cluster and a Google TPU cluster. +* **Configurations**, e.g., a small cluster for a single node and a large cluster for multiple nodes. +* **Kubernetes versions**, e.g., to upgrade a cluster from Kubernetes 1.20 to 1.21, you may create a new Kubernetes cluster to avoid downtime or unexpected errors. .. image:: /images/multi-kubernetes.svg