From 1064e8a074909c5d13f862357ed0e3186079a8e8 Mon Sep 17 00:00:00 2001 From: Kaiyuan Eric Chen Date: Tue, 11 Feb 2025 00:32:35 -0800 Subject: [PATCH 01/18] [Examples] Update Vector DB Doc (#4690) * update the doc with images * blog readme * revert deepseek example * Add blank line in DeepSeek-R1 README for formatting --- examples/vector_database/README.md | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/examples/vector_database/README.md b/examples/vector_database/README.md index 0bb44cd2fb5..f127d2c176e 100644 --- a/examples/vector_database/README.md +++ b/examples/vector_database/README.md @@ -1,6 +1,10 @@ -# Building A Million Scale Image Vector Database With SkyPilot +# Building Large-Scale Image Search using VectorDB & OpenAI CLIP -### Semantic Search at Million (Billion) Scale +

+VectorDB with SkyPilot +

+ +### Large-Scale Image Search As the volume of image data grows, the need for efficient and powerful search methods becomes critical. Traditional keyword-based or metadata-based search often fails to capture the full semantic meaning in images. A vector database enables semantic search: you can find images that conceptually match a query (e.g., "a photo of a cloud") rather than relying on textual tags. In particular: @@ -11,6 +15,8 @@ In particular: SkyPilot streamlines the process of running such large-scale jobs in the cloud. It abstracts away much of the complexity of managing infrastructure and helps you run compute-intensive tasks efficiently and cost-effectively through managed jobs. +Please find the complete blog post [here](https://blog.skypilot.co/large-scale-vector-database/) + ### Step 0: Set Up The Environment Install the following Prerequisites: * SkyPilot: Make sure you have SkyPilot installed and `sky check` should succeed. Refer to [SkyPilot’s documentation](https://docs.skypilot.co/en/latest/getting-started/installation.html) for instructions. @@ -39,7 +45,11 @@ This will automatically find available machines to compute the vectors. Expect: (clip-batch-compute-vectors, pid=2523) 2025-01-28 00:06:25,009 - root - INFO - Saved partition 6 to /output/embeddings_90000_100000.parquet_part_6/data.parquet ... ``` -You can also use `sky jobs queue` and `sky jobs dashboard` to see the status of jobs. +You can also use `sky jobs queue` and `sky jobs dashboard` to see the status of jobs. Figure below shows our jobs are launched across different regions: + +

+SkyPilot Dashboard +

### Step 2: Construct the Vector Database from Computed Embeddings Once you have the image embeddings, you need a specialized engine to perform rapid similarity searches at scale. In this example, we use [ChromaDB](https://docs.trychroma.com/getting-started) to store and query the embeddings. This step ingests the embeddings from Step 1 into a vector database to enable real-time or near real-time search over millions of vectors. @@ -86,4 +96,8 @@ If you run through `sky serve`, you may run sky serve status vectordb --endpoint ``` -to get the endpoint address of the service. \ No newline at end of file +to get the endpoint address of the service. + +

+Image Search Website +

\ No newline at end of file From ecba7d3b50ebe381073a9acc5e6571bef52b9e88 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 11 Feb 2025 06:32:57 -0800 Subject: [PATCH 02/18] [k8s] Fix sky show-gpus not showing GPU name on GKE (#4688) * Fix show-gpus node list * comment * lint --- sky/provision/kubernetes/utils.py | 70 ++++++++++++++++--------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 4e6da8de491..594e3752b41 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -2178,52 +2178,54 @@ def get_kubernetes_node_info( lf, _ = detect_gpu_label_formatter(context) if not lf: - label_key = None + label_keys = [] else: label_keys = lf.get_label_keys() node_info_dict: Dict[str, KubernetesNodeInfo] = {} - for label_key in label_keys: - for node in nodes: - allocated_qty = 0 + for node in nodes: + accelerator_name = None + # Determine the accelerator name from the node labels and pick the + # first one found. We assume that the node has only one accelerator type + # (e.g., either GPU or TPU). 
+ for label_key in label_keys: if lf is not None and label_key in node.metadata.labels: accelerator_name = lf.get_accelerator_from_label_value( node.metadata.labels.get(label_key)) - else: - accelerator_name = None + break - accelerator_count = get_node_accelerator_count( - node.status.allocatable) + allocated_qty = 0 + accelerator_count = get_node_accelerator_count(node.status.allocatable) - if pods is None: - accelerators_available = -1 + if pods is None: + accelerators_available = -1 - else: - for pod in pods: - # Get all the pods running on the node - if (pod.spec.node_name == node.metadata.name and - pod.status.phase in ['Running', 'Pending']): - # Iterate over all the containers in the pod and sum the - # GPU requests - for container in pod.spec.containers: - if container.resources.requests: - allocated_qty += get_node_accelerator_count( - container.resources.requests) - - accelerators_available = accelerator_count - allocated_qty - - # Exclude multi-host TPUs from being processed. - # TODO(Doyoung): Remove the logic when adding support for - # multi-host TPUs. - if is_multi_host_tpu(node.metadata.labels): - continue + else: + for pod in pods: + # Get all the pods running on the node + if (pod.spec.node_name == node.metadata.name and + pod.status.phase in ['Running', 'Pending']): + # Iterate over all the containers in the pod and sum the + # GPU requests + for container in pod.spec.containers: + if container.resources.requests: + allocated_qty += get_node_accelerator_count( + container.resources.requests) + + accelerators_available = accelerator_count - allocated_qty + + # Exclude multi-host TPUs from being processed. + # TODO(Doyoung): Remove the logic when adding support for + # multi-host TPUs. 
+ if is_multi_host_tpu(node.metadata.labels): + continue - node_info_dict[node.metadata.name] = KubernetesNodeInfo( - name=node.metadata.name, - accelerator_type=accelerator_name, - total={'accelerator_count': int(accelerator_count)}, - free={'accelerators_available': int(accelerators_available)}) + node_info_dict[node.metadata.name] = KubernetesNodeInfo( + name=node.metadata.name, + accelerator_type=accelerator_name, + total={'accelerator_count': int(accelerator_count)}, + free={'accelerators_available': int(accelerators_available)}) return node_info_dict From ebc77cfda85fbac8b5c86433b1ecc320ec92368f Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 11 Feb 2025 06:33:55 -0800 Subject: [PATCH 03/18] [Tests] Bug fixes for v0.8.0 (#4689) * Fix tests * Fix dpkg installation * lint --- examples/multi_echo.py | 4 +++- sky/templates/kubernetes-ray.yml.j2 | 5 +++-- tests/smoke_tests/test_basic.py | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/multi_echo.py b/examples/multi_echo.py index 7f310a4bb6b..0c911dd4540 100644 --- a/examples/multi_echo.py +++ b/examples/multi_echo.py @@ -23,7 +23,9 @@ def run(cluster: Optional[str] = None, cloud: Optional[str] = None): # Create the cluster. with sky.Dag() as dag: - cluster_resources = sky.Resources(cloud, accelerators={'T4': 1}) + cluster_resources = sky.Resources(cloud, + cpus='4+', + accelerators={'T4': 1}) task = sky.Task(num_nodes=2).set_resources(cluster_resources) # `detach_run` will only detach the `run` command. The provision and # `setup` are still blocking. diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 2087d9c6e9d..c246141c03f 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -373,15 +373,16 @@ available_node_types: done; if [ ! 
-z "$INSTALL_FIRST" ]; then echo "Installing core packages: $INSTALL_FIRST"; - DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $INSTALL_FIRST; + DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $INSTALL_FIRST; fi; # SSH and other packages are not necessary, so we disable set -e set +e if [ ! -z "$MISSING_PACKAGES" ]; then echo "Installing missing packages: $MISSING_PACKAGES"; - DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES; + DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $MISSING_PACKAGES; fi; + $(prefix_cmd) mkdir -p /var/run/sshd; $(prefix_cmd) sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config; $(prefix_cmd) sed "s@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g" -i /etc/pam.d/sshd; diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py index 9d88cb99a22..0345a4b6d65 100644 --- a/tests/smoke_tests/test_basic.py +++ b/tests/smoke_tests/test_basic.py @@ -345,6 +345,7 @@ def test_core_api_sky_launch_exec(): # The sky launch CLI has some additional checks to make sure the cluster is up/ # restarted. However, the core API doesn't have these; make sure it still works +@pytest.mark.no_kubernetes def test_core_api_sky_launch_fast(generic_cloud: str): name = smoke_tests_utils.get_cluster_name() cloud = sky.clouds.CLOUD_REGISTRY.from_str(generic_cloud) From 713b13b2944cba456356e72f51268a6e5a6a20a7 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Tue, 11 Feb 2025 07:37:40 -0800 Subject: [PATCH 04/18] [Docs] New pages organization; new docs; cleanup. (#4664) * wip: tightening docs TOC * updates * updatees * Updates * Updates * updates * updates * yaml * Updates * Updates * updates * Fix lints. * WIP * 'Sentence case' all subheadings. 
* tip --- CONTRIBUTING.md | 2 +- docs/README.md | 5 + docs/build.sh | 18 +++ docs/source/_static/custom.js | 5 +- .../cloud-setup/cloud-permissions/aws.rst | 6 +- .../cloud-setup/cloud-permissions/gcp.rst | 16 +-- .../cloud-permissions/kubernetes.rst | 6 +- .../cloud-setup/cloud-permissions/vsphere.rst | 4 +- docs/source/cloud-setup/policy.rst | 22 +-- docs/source/compute/cloud-vm.rst | 25 ++++ docs/source/compute/gpus.rst | 98 +++++++++++++ docs/source/compute/show-gpus-all.txt | 60 ++++++++ docs/source/compute/show-gpus-h100-8.txt | 17 +++ docs/source/docs/index.rst | 61 ++++---- docs/source/examples/auto-failover.rst | 20 +-- docs/source/examples/docker-containers.rst | 8 +- .../examples/interactive-development.rst | 13 +- docs/source/examples/managed-jobs.rst | 18 +-- docs/source/examples/ports.rst | 2 +- docs/source/examples/spot-jobs.rst | 23 --- docs/source/getting-started/quickstart.rst | 2 + docs/source/overview.rst | 4 +- docs/source/reference/benchmark/callback.rst | 134 ------------------ docs/source/reference/benchmark/cli.rst | 78 ---------- docs/source/reference/benchmark/config.rst | 48 ------- docs/source/reference/benchmark/index.rst | 43 ------ docs/source/reference/cli.rst | 29 ++-- docs/source/reference/faq.rst | 2 +- docs/source/reference/job-queue.rst | 57 ++++---- docs/source/reference/kubernetes/index.rst | 1 + .../kubernetes/kubernetes-getting-started.rst | 14 +- .../reference/kubernetes/kubernetes-setup.rst | 6 +- .../reference/kubernetes/multi-kubernetes.rst | 10 +- .../skypilot-and-vanilla-k8s.rst} | 14 +- docs/source/reference/storage.rst | 4 +- docs/source/reference/tpu.rst | 2 +- docs/source/reference/yaml-spec.rst | 106 ++++++++++++-- docs/source/reservations/reservations.rst | 8 +- docs/source/running-jobs/distributed-jobs.rst | 50 ++++--- docs/source/running-jobs/many-jobs.rst | 22 ++- docs/source/serving/auth.rst | 10 +- docs/source/serving/autoscaling.rst | 8 +- docs/source/serving/https.rst | 2 +- 
docs/source/serving/service-yaml-spec.rst | 85 ----------- docs/source/serving/sky-serve.rst | 4 +- docs/source/serving/spot-policy.rst | 62 ++++---- docs/source/serving/update.rst | 7 +- docs/source/serving/user-guides.rst | 1 + sky/cli.py | 2 +- 49 files changed, 576 insertions(+), 668 deletions(-) create mode 100644 docs/source/compute/cloud-vm.rst create mode 100644 docs/source/compute/gpus.rst create mode 100644 docs/source/compute/show-gpus-all.txt create mode 100644 docs/source/compute/show-gpus-h100-8.txt delete mode 100644 docs/source/examples/spot-jobs.rst delete mode 100644 docs/source/reference/benchmark/callback.rst delete mode 100644 docs/source/reference/benchmark/cli.rst delete mode 100644 docs/source/reference/benchmark/config.rst delete mode 100644 docs/source/reference/benchmark/index.rst rename docs/source/reference/{comparison.rst => kubernetes/skypilot-and-vanilla-k8s.rst} (90%) delete mode 100644 docs/source/serving/service-yaml-spec.rst diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 25c6421c347..e227419c127 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -10,7 +10,7 @@ all contributions to the project, including but not limited to: * Documentation * Tutorials, blog posts and talks on SkyPilot -## Contributing Code +## Contributing code We use GitHub to track issues and features. For new contributors, we recommend looking at issues labeled ["good first issue"](https://github.com/sky-proj/sky/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22+). diff --git a/docs/README.md b/docs/README.md index 7b710f478af..d08a67f73a0 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,11 @@ # Documentation Sphinx docs based on ReadTheDocs. +## Styleguide + +- Each page's title is in `Title Case `_. +- Each subsection's title is in `Sentence case `_. 
+ ## Build ```bash pip install -r requirements-docs.txt diff --git a/docs/build.sh b/docs/build.sh index 87dd2dbf500..112084ef3b5 100755 --- a/docs/build.sh +++ b/docs/build.sh @@ -1,5 +1,23 @@ #!/bin/bash +# Function to check if file exists and is less than 24 hours old +check_file_age() { + if [ -f "$1" ] && [ $(( $(date +%s) - $(stat -f %m "$1" 2>/dev/null || stat -c %Y "$1" 2>/dev/null) )) -lt 86400 ]; then + return 0 # File exists and is recent + fi + return 1 # File doesn't exist or is old +} + +# Only run sky show-gpus commands if output files don't exist or are old +if ! check_file_age "source/compute/show-gpus-all.txt"; then + sky show-gpus -a > source/compute/show-gpus-all.txt + sed -i '' '/^tpu-v2-128/,$d' source/compute/show-gpus-all.txt && echo "... [omitted long outputs] ..." >> source/compute/show-gpus-all.txt +fi + +if ! check_file_age "source/compute/show-gpus-h100-8.txt"; then + sky show-gpus H100:8 > source/compute/show-gpus-h100-8.txt +fi + rm -rf build docs # MacOS and GNU `script` have different usages diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 93c233765a3..eb1f07ab9bb 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -27,10 +27,9 @@ document.addEventListener('DOMContentLoaded', () => { // New items: const newItems = [ { selector: '.toctree-l1 > a', text: 'Many Parallel Jobs' }, - { selector: '.toctree-l1 > a', text: 'Admin Policy Enforcement' }, - { selector: '.toctree-l1 > a', text: 'Using Existing Machines' }, + { selector: '.toctree-l1 > a', text: 'Admin Policies' }, { selector: '.toctree-l2 > a', text: 'Multiple Kubernetes Clusters' }, - { selector: '.toctree-l1 > a', text: 'HTTPS Encryption' }, + { selector: '.toctree-l2 > a', text: 'HTTPS Encryption' }, ]; newItems.forEach(({ selector, text }) => { document.querySelectorAll(selector).forEach((el) => { diff --git a/docs/source/cloud-setup/cloud-permissions/aws.rst b/docs/source/cloud-setup/cloud-permissions/aws.rst 
index 57fc7ac9732..d340145b540 100644 --- a/docs/source/cloud-setup/cloud-permissions/aws.rst +++ b/docs/source/cloud-setup/cloud-permissions/aws.rst @@ -6,7 +6,7 @@ AWS .. _cloud-permissions-aws-user-creation: -Minimal Permissions +Minimal permissions ----------------------- Minimizing AWS permissions should be set up in two places: @@ -14,7 +14,7 @@ Minimizing AWS permissions should be set up in two places: 1. **User Account**: the user account is the individual account of an user created by the administrator. 2. **IAM role**: the IAM role is assigned to all EC2 instances created by SkyPilot, which is used by the instances to access AWS resources, e.g., read/write S3 buckets or create other EC2 nodes. The IAM role is shared by all users under the same organization/root account. (If a user account has the permission to create IAM roles, SkyPilot can automatically create the role.) -User Account +User account ~~~~~~~~~~~~~~~~~~ AWS accounts can be attached with a policy that limits the permissions of the account. Follow these steps to create an AWS user with the minimum permissions required by SkyPilot: @@ -195,7 +195,7 @@ With the steps above you are almost ready to have the users in your organization 2. Alternatively, you can create the ``skypilot-v1`` IAM role manually. The following section describes how to create the IAM role manually. -IAM Role Creation +IAM role creation ~~~~~~~~~~~~~~~~~~ .. note:: diff --git a/docs/source/cloud-setup/cloud-permissions/gcp.rst b/docs/source/cloud-setup/cloud-permissions/gcp.rst index a1c05532892..7617d82de3a 100644 --- a/docs/source/cloud-setup/cloud-permissions/gcp.rst +++ b/docs/source/cloud-setup/cloud-permissions/gcp.rst @@ -12,7 +12,7 @@ Generally, the administrator can choose among three "levels" of permissions, fro .. 
_gcp-medium-permissions: -Medium Permissions +Medium permissions ----------------------- The easiest way to grant permissions to a user access your GCP project without the ``Owner`` role is to add the following roles to the user principals: @@ -41,7 +41,7 @@ You can grant those accesses via GCP's `IAM & Admin console ` assigns admin permissions for some GCP services to the user. If you would like to grant finer-grained and more minimal permissions to your users in your organization / project, you can create a custom role by following the steps below: @@ -178,7 +178,7 @@ User .. _gcp-service-account-creation: -Service Account +Service account ~~~~~~~~~~~~~~~~~~~ .. note:: @@ -210,7 +210,7 @@ Medium Permissions roles as described in the previous sections. .. _gcp-minimum-firewall-rules: -Firewall Rules +Firewall rules ~~~~~~~~~~~~~~~~~~~ By default, users do not need to set up any special firewall rules to start @@ -286,7 +286,7 @@ The custom VPC should contain the :ref:`required firewall rules ` for exposing ports, you wi .. _k8s-sa-example: -Example using Custom Service Account +Example using custom service account ------------------------------------ To create a service account that has all necessary permissions for SkyPilot (including for accessing object stores), you can use the following YAML. diff --git a/docs/source/cloud-setup/cloud-permissions/vsphere.rst b/docs/source/cloud-setup/cloud-permissions/vsphere.rst index f82ce060cdd..1b6cd2f1a97 100644 --- a/docs/source/cloud-setup/cloud-permissions/vsphere.rst +++ b/docs/source/cloud-setup/cloud-permissions/vsphere.rst @@ -7,7 +7,7 @@ This document is provided for users who use VMware vSphere provider and helps th .. _cloud-prepare-vsphere-tags: -Prepare Category & Tag +Prepare category & tag ~~~~~~~~~~~~~~~~~~~~~~~ The Categories and Tags is needed when using the vSphere provider, please follow bellow steps to create them. 
@@ -79,7 +79,7 @@ The Categories and Tags is needed when using the vSphere provider, please follow .. _cloud-prepare-vsphere-storage-policy: -Create VM Storage Policies +Create VM storage policies ~~~~~~~~~~~~~~~~~~~~~~~~~~ The vSphere provider depends on the VM Storage Policies to place the VM. A Shared Datastore is recommended. diff --git a/docs/source/cloud-setup/policy.rst b/docs/source/cloud-setup/policy.rst index 663b2db7f28..302d986e77e 100644 --- a/docs/source/cloud-setup/policy.rst +++ b/docs/source/cloud-setup/policy.rst @@ -14,11 +14,11 @@ Example usage: - :ref:`use-spot-for-gpu-policy` - :ref:`enforce-autostop-policy` - :ref:`dynamic-kubernetes-contexts-update-policy` - + To implement and use an admin policy: -- Admins writes a simple Python package with a policy class that implements SkyPilot's ``sky.AdminPolicy`` interface; +- Admins writes a simple Python package with a policy class that implements SkyPilot's ``sky.AdminPolicy`` interface; - Admins distributes this package to users; - Users simply set the ``admin_policy`` field in the SkyPilot config file ``~/.sky/config.yaml`` for the policy to go into effect. @@ -117,7 +117,7 @@ The ``sky.Config`` and ``sky.RequestOptions`` classes are defined as follows: The ``sky.AdminPolicy`` should be idempotent. In other words, it should be safe to apply the policy multiple times to the same user request. -Example Policies +Example policies ---------------- We have provided a few example policies in `examples/admin_policy/example_policy `_. You can test these policies by installing the example policy package in your Python environment. @@ -128,8 +128,8 @@ We have provided a few example policies in `examples/admin_policy/example_policy cd skypilot pip install examples/admin_policy/example_policy -Reject All -~~~~~~~~~~ +Reject all tasks +~~~~~~~~~~~~~~~~ .. literalinclude:: ../../../examples/admin_policy/example_policy/example_policy/skypilot_policy.py :language: python @@ -142,7 +142,7 @@ Reject All .. 
_kubernetes-labels-policy: -Add Labels for all Tasks on Kubernetes +Add labels for all tasks on Kubernetes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. literalinclude:: ../../../examples/admin_policy/example_policy/example_policy/skypilot_policy.py @@ -156,8 +156,8 @@ Add Labels for all Tasks on Kubernetes .. _disable-public-ip-policy: - -Always Disable Public IP for AWS Tasks + +Always disable public IP for AWS tasks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. literalinclude:: ../../../examples/admin_policy/example_policy/example_policy/skypilot_policy.py @@ -171,7 +171,7 @@ Always Disable Public IP for AWS Tasks .. _use-spot-for-gpu-policy: -Use Spot for all GPU Tasks +Use spot for all GPU tasks ~~~~~~~~~~~~~~~~~~~~~~~~~~ .. @@ -186,7 +186,7 @@ Use Spot for all GPU Tasks .. _enforce-autostop-policy: -Enforce Autostop for all Tasks +Enforce autostop for all tasks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. literalinclude:: ../../../examples/admin_policy/example_policy/example_policy/skypilot_policy.py @@ -201,7 +201,7 @@ Enforce Autostop for all Tasks .. _dynamic-kubernetes-contexts-update-policy: -Dynamically Update Kubernetes Contexts to Use +Dynamically update Kubernetes contexts to use ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. literalinclude:: ../../../examples/admin_policy/example_policy/example_policy/skypilot_policy.py diff --git a/docs/source/compute/cloud-vm.rst b/docs/source/compute/cloud-vm.rst new file mode 100644 index 00000000000..107430a9376 --- /dev/null +++ b/docs/source/compute/cloud-vm.rst @@ -0,0 +1,25 @@ +.. _cloud-vm: + +Using Cloud VMs +===================== + +SkyPilot supports launching cloud instances (virtual machines, or VMs) on all major cloud providers. +You can get started with :ref:`quickstart`. + +See :ref:`concept-cloud-vms` for an overview. + + +.. Administrator Guides +.. ~~~~~~~~~~~~~~~~~~~~~ + +.. For administrators, the following optional guides may be helpful: + +.. 
The following guides are optional and may be helpful for administrators: + +.. - :ref:`cloud-permissions` +.. - :ref:`cloud-auth` +.. - :ref:`quota` + +.. - :ref:`cloud-permissions`: Set up specific IAM roles, permissions, or service accounts for SkyPilot to use. +.. - :ref:`cloud-auth`: Guides for different authentication methods for the clouds. +.. - :ref:`quota`: Guides for requesting quota increases. diff --git a/docs/source/compute/gpus.rst b/docs/source/compute/gpus.rst new file mode 100644 index 00000000000..ce57643235e --- /dev/null +++ b/docs/source/compute/gpus.rst @@ -0,0 +1,98 @@ +.. _accelerators: + +GPUs and Accelerators +============================ + +SkyPilot supports a wide range of GPUs, TPUs, and other accelerators. + +Supported accelerators +---------------------- + +.. code-block:: console + + $ sky show-gpus -a + +.. literalinclude:: show-gpus-all.txt + :language: text + +Behind the scenes, these details are encoded in the SkyPilot Catalog: https://github.com/skypilot-org/skypilot-catalog. + +Accelerators in Kubernetes +-------------------------- + +Your Kubernetes clusters may contain only certain accelerators. + +You can query the accelerators available in your Kubernetes clusters with: + +.. code-block:: console + + $ sky show-gpus --cloud k8s + + +.. code-block:: text + + Kubernetes GPUs + GPU REQUESTABLE_QTY_PER_NODE TOTAL_GPUS TOTAL_FREE_GPUS + L4 1, 2, 4 12 12 + H100 1, 2, 4, 8 16 16 + + Kubernetes per node GPU availability + NODE_NAME GPU_NAME TOTAL_GPUS FREE_GPUS + my-cluster-0 L4 4 4 + my-cluster-1 L4 4 4 + my-cluster-2 L4 2 2 + my-cluster-3 L4 2 2 + my-cluster-4 H100 8 8 + my-cluster-5 H100 8 8 + +Querying accelerator details +---------------------------- + +You can query the details of a supported accelerator config, ``accelerator:count``: + +.. code-block:: console + + $ sky show-gpus H100:8 + +.. 
literalinclude:: show-gpus-h100-8.txt + :language: text + +Requesting accelerators +---------------------------- + +You can use ``accelerator:count`` in various places that accept accelerator specifications. + +.. code-block:: console + + $ sky launch --gpus H100:8 + $ sky launch --gpus H100 # If count is omitted, default to 1. + $ sky exec my-h100-8-cluster --gpus H100:0.5 job.yaml + +.. code-block:: yaml + + # In SkyPilot YAML: + + resources: + accelerators: H100:8 + + # Set: ask SkyPilot to auto-choose the cheapest and available option. + resources: + accelerators: {H100:8, A100:8} + + # List: ask SkyPilot to try each one in order. + resources: + accelerators: [L4:8, L40S:8, A10G:8, A10:8] + +See :ref:`auto-failover` for more examples. + +Google TPUs +----------------- + +See :ref:`tpu`. + +.. toctree:: + :maxdepth: 1 + :hidden: + + Using Google TPUs <../../reference/tpu> + diff --git a/docs/source/compute/show-gpus-all.txt b/docs/source/compute/show-gpus-all.txt new file mode 100644 index 00000000000..8dd70f8d6e3 --- /dev/null +++ b/docs/source/compute/show-gpus-all.txt @@ -0,0 +1,60 @@ +COMMON_GPU AVAILABLE_QUANTITIES +A10 1, 2, 4 +A10G 1, 4, 8 +A100 1, 2, 4, 8, 16 +A100-80GB 1, 2, 4, 8 +H100 1, 2, 4, 8, 12 +L4 1, 2, 4, 8 +L40S 1, 2, 4, 8 +P100 1, 2, 4 +T4 1, 2, 4, 8 +V100 1, 2, 4, 8 +V100-32GB 1, 2, 4, 8 + +GOOGLE_TPU AVAILABLE_QUANTITIES +tpu-v2-8 1 +tpu-v3-8 1 +tpu-v4-8 1 +tpu-v4-16 1 +tpu-v4-32 1 +tpu-v5litepod-1 1 +tpu-v5litepod-4 1 +tpu-v5litepod-8 1 +tpu-v5p-8 1 +tpu-v5p-16 1 +tpu-v5p-32 1 +tpu-v6e-1 1 +tpu-v6e-4 1 +tpu-v6e-8 1 + +OTHER_GPU AVAILABLE_QUANTITIES +A100-80GB-SXM 1, 2, 4, 8 +A40 1, 2, 4, 8 +A4000 1, 2, 4 +A6000 1, 2, 4 +GH200 1 +Gaudi HL-205 8 +H100-MEGA 8 +H100-SXM 1, 2, 4, 8 +H200 8 +K80 1, 2, 4, 8, 16 +L40 1, 2, 4, 8 +M4000 1 +M60 1, 2, 4 +P4 1, 2, 4 +P4000 1, 2 +RTX3060 1, 2 +RTX3080 1 +RTX3090 1, 2, 4, 8 +RTX4000-Ada 1, 2, 4, 8 +RTX4090 1, 2, 3, 4, 6, 8, 12 +RTX6000 1 +RTX6000-Ada 1, 2, 4, 8 +RTXA4000 1, 2, 4, 8 +RTXA4500 1, 2, 4, 8 
+RTXA5000 1, 2, 4, 8 +RTXA6000 1, 2, 4, 8 +Radeon MI25 1 +Radeon Pro V520 1, 2, 4 +T4g 1, 2 +... [omitted long outputs] ... diff --git a/docs/source/compute/show-gpus-h100-8.txt b/docs/source/compute/show-gpus-h100-8.txt new file mode 100644 index 00000000000..7c0bb80fee3 --- /dev/null +++ b/docs/source/compute/show-gpus-h100-8.txt @@ -0,0 +1,17 @@ +GPU QTY CLOUD INSTANCE_TYPE DEVICE_MEM vCPUs HOST_MEM HOURLY_PRICE HOURLY_SPOT_PRICE REGION +H100 8 Vast 8x-H100_NVL-32-65536 749GB 32 64GB $ 16.000 $ 16.000 Australia, AU, OC +H100 8 Vast 8x-H100_SXM-32-65536 637GB 32 64GB $ 21.000 $ 10.670 Iceland, IS, EU +H100 8 Lambda gpu_8x_h100_sxm5 80GB 208 1800GB $ 23.920 - europe-central-1 +H100 8 Fluidstack H100_NVLINK_80GB::8 80GB 252 1440GB $ 23.920 - FINLAND +H100 8 RunPod 8x_H100_SECURE - 128 640GB $ 35.920 - CA +H100 8 GCP a3-highgpu-8g 80GB 208 1872GB $ 46.021 $ 35.133 us-central1 +H100 8 Paperspace H100x8 - 128 640GB $ 47.600 - East Coast (NY2) +H100 8 DO gpu-h100x8-640gb 80GB 160 1920GB $ 47.600 - tor1 +H100 8 OCI BM.GPU.H100.8 80GB 224 2048GB $ 80.000 - eu-amsterdam-1 +H100 8 AWS p5.48xlarge 80GB 192 2048GB $ 98.320 $ 9.832 us-east-1 + +GPU QTY CLOUD INSTANCE_TYPE DEVICE_MEM vCPUs HOST_MEM HOURLY_PRICE HOURLY_SPOT_PRICE REGION +H100-MEGA 8 GCP a3-megagpu-8g 80GB 208 1872GB $ 92.214 $ 36.886 us-central1 + +GPU QTY CLOUD INSTANCE_TYPE DEVICE_MEM vCPUs HOST_MEM HOURLY_PRICE HOURLY_SPOT_PRICE REGION +H100-SXM 8 RunPod 8x_H100-SXM_SECURE - 208 640GB $ 37.520 - CA diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst index 0a99a5f4b9d..2e9ca6859c6 100644 --- a/docs/source/docs/index.rst +++ b/docs/source/docs/index.rst @@ -136,90 +136,87 @@ Read the research: ../overview ../getting-started/installation ../getting-started/quickstart - ../examples/interactive-development - ../getting-started/tutorial + Example: AI Training <../getting-started/tutorial> ../sky-computing - .. 
toctree:: :hidden: :maxdepth: 1 - :caption: Running Jobs + :caption: Clusters - ../examples/managed-jobs - ../reference/job-queue + Start a Cluster <../examples/interactive-development> ../examples/auto-failover - ../running-jobs/distributed-jobs - ../running-jobs/many-jobs + ../reference/auto-stop .. toctree:: :hidden: :maxdepth: 1 - :caption: Reserved & Existing Clusters + :caption: Jobs + + Cluster Jobs <../reference/job-queue> + ../examples/managed-jobs + Multi-Node Jobs <../running-jobs/distributed-jobs> + Many Parallel Jobs <../running-jobs/many-jobs> - ../reservations/reservations - Using Existing Machines <../reservations/existing-machines> - ../reference/kubernetes/index .. toctree:: :hidden: :maxdepth: 1 - :caption: SkyServe: Model Serving + :caption: Model Serving - ../serving/sky-serve + Getting Started <../serving/sky-serve> ../serving/user-guides - ../serving/service-yaml-spec - ../serving/https .. toctree:: :hidden: :maxdepth: 1 - :caption: Cutting Cloud Costs + :caption: Infra Choices + + ../compute/gpus + Using Cloud VMs <../compute/cloud-vm> + ../reference/kubernetes/index + Using Existing Machines <../reservations/existing-machines> + ../reservations/reservations + - Managed Spot Jobs <../examples/spot-jobs> - ../reference/auto-stop - ../reference/benchmark/index .. toctree:: :hidden: :maxdepth: 1 - :caption: Using Data + :caption: Data - ../examples/syncing-code-artifacts ../reference/storage + ../examples/syncing-code-artifacts .. toctree:: :hidden: :maxdepth: 1 :caption: User Guides - ../running-jobs/environment-variables - ../examples/docker-containers + Secrets and Environment Variables <../running-jobs/environment-variables> + Docker Containers <../examples/docker-containers> ../examples/ports - ../reference/tpu ../reference/logging ../reference/faq - SkyPilot vs. Other Systems <../reference/comparison> - .. 
toctree:: :hidden: :maxdepth: 1 - :caption: Cloud Admin and Usage + :caption: Administrator Guides ../cloud-setup/cloud-permissions/index ../cloud-setup/cloud-auth ../cloud-setup/quota - ../cloud-setup/policy + Admin Policies <../cloud-setup/policy> .. toctree:: :hidden: :maxdepth: 1 :caption: References - ../reference/yaml-spec - ../reference/cli - ../reference/api + Task YAML <../reference/yaml-spec> + CLI <../reference/cli> + Python API <../reference/api> ../reference/config ../developers/index diff --git a/docs/source/examples/auto-failover.rst b/docs/source/examples/auto-failover.rst index 8ac9d5c71bf..596e9d2c415 100644 --- a/docs/source/examples/auto-failover.rst +++ b/docs/source/examples/auto-failover.rst @@ -1,7 +1,7 @@ .. _auto-failover: -Auto-provisioning GPUs -========================== +Provisioning Compute +==================== SkyPilot comes with an auto-failover provisioner, which **automatically retries provisioning** a cluster in different regions (or @@ -62,7 +62,7 @@ provisioner handles such a request: $ sky launch -c gpu --gpus A100 ... - Launching a new cluster 'gpu'. Proceed? [Y/n]: + Launching a new cluster 'gpu'. Proceed? [Y/n]: ⚙️ Launching on GCP us-central1 (us-central1-a). W 10-11 18:25:57 instance_utils.py:112] Got return codes 'VM_MIN_COUNT_NOT_REACHED', 'ZONE_RESOURCE_POOL_EXHAUSTED_WITH_DETAILS' in us-central1-a: 'Requested minimum count of 1 VMs could not be created'; "The zone 'projects/xxxxxx/zones/us-central1-a' does not have enough resources available to fulfill the request. '(resource type:compute)'" ... 
@@ -91,13 +91,13 @@ GCP, where it succeeded after one region: Considered resources (1 node): ---------------------------------------------------------------------------------------------------- - CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN + CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN ---------------------------------------------------------------------------------------------------- - Azure Standard_ND96asr_v4 96 900 A100:8 eastus 27.20 ✔ - GCP a2-highgpu-8g 96 680 A100:8 us-central1-a 29.39 - AWS p4d.24xlarge 96 1152 A100:8 us-east-1 32.77 + Azure Standard_ND96asr_v4 96 900 A100:8 eastus 27.20 ✔ + GCP a2-highgpu-8g 96 680 A100:8 us-central1-a 29.39 + AWS p4d.24xlarge 96 1152 A100:8 us-east-1 32.77 ---------------------------------------------------------------------------------------------------- - Launching a new cluster 'a100-8'. Proceed? [Y/n]: + Launching a new cluster 'a100-8'. Proceed? [Y/n]: ... ⚙️ Launching on Azure eastus. @@ -114,7 +114,7 @@ GCP, where it succeeded after one region: ✓ Cluster launched: a100-8. View logs at: ~/sky_logs/sky-2024-10-11-18-24-14-357884/provision.log -Multiple Candidate GPUs +Multiple candidate GPUs ------------------------- If a task can be run on different GPUs, the user can specify multiple candidate GPUs, @@ -155,7 +155,7 @@ In the above example, SkyPilot will first try to provision an A10 GPU, then an A .. 
_multiple-resources: -Multiple Candidate Resources +Multiple candidate resources -------------------------------------------- If a task would like to specify multiple candidate resources (not only GPUs), the user can specify a list of candidate resources with a preference annotation: diff --git a/docs/source/examples/docker-containers.rst b/docs/source/examples/docker-containers.rst index 41a5e13a027..9e7fd8e9a11 100644 --- a/docs/source/examples/docker-containers.rst +++ b/docs/source/examples/docker-containers.rst @@ -15,7 +15,7 @@ SkyPilot can run a container either as a task, or as the runtime environment of .. _docker-containers-as-tasks: -Running Containers as Tasks +Running containers as tasks --------------------------- .. note:: @@ -42,7 +42,7 @@ For example, to run a HuggingFace TGI serving container: # `docker run` is blocking, so any commands after it # will NOT be run inside the container. -Private Registries +Private registries ^^^^^^^^^^^^^^^^^^ When using this mode, to access Docker images hosted on private registries, @@ -98,7 +98,7 @@ Our GitHub repository has more examples, including running `Detectron2 in a Dock .. _docker-containers-as-runtime-environments: -Using Containers as Runtime Environments +Using containers as runtime environments ---------------------------------------- When a container is used as the runtime environment, everything happens inside the container: @@ -174,7 +174,7 @@ Any GPUs assigned to the task will be automatically mapped to your Docker contai to limitations in the RunPod API, so ensure that you choose a container with a default entrypoint (i.e. :code:`/bin/bash`). -Private Registries +Private registries ^^^^^^^^^^^^^^^^^^ .. 
note:: diff --git a/docs/source/examples/interactive-development.rst b/docs/source/examples/interactive-development.rst index 40920934597..7271045abb5 100644 --- a/docs/source/examples/interactive-development.rst +++ b/docs/source/examples/interactive-development.rst @@ -6,7 +6,7 @@ Start a Development Cluster SkyPilot makes interactive development easy on Kubernetes or cloud VMs. It helps you: -#. :ref:`Launch `: Quickly get a cluster with GPU or other resource requirement with a single command. +#. :ref:`Launch `: Quickly get a cluster with GPUs or other resources with a single command. #. :ref:`Autostop `: Automatically stop the cluster after some idle time for cost savings. #. :ref:`Connect `: Easily connect to the cluster using the cluster name: @@ -25,7 +25,7 @@ To launch a cluster with a cheap GPU for development: # Launch a cluster with 1 NVIDIA GPU and sync the local working directory to the # cluster. - sky launch -c dev --gpus T4 --workdir . + sky launch -c dev --gpus L4 --workdir . This can be launched as a pod in your Kubernetes cluster or a VM on any cloud. @@ -70,7 +70,7 @@ Or add an additional flag :code:`-i` during the launch: .. code-block:: bash # Launch a cluster with auto stop after 5 hours - sky launch -c dev --gpus T4 --workdir . -i 300 + sky launch -c dev --gpus L4 --workdir . -i 300 For more details of auto stopping, check out: :ref:`auto-stop`. This feature is designed to prevent idle clusters from incurring unnecessary costs, ensuring your cluster @@ -170,7 +170,7 @@ The following :code:`jupyter.yaml` is an example of a task specification that ca name: jupyter resources: - accelerators: T4:1 + accelerators: L4:1 file_mounts: /covid: @@ -209,3 +209,8 @@ You can verify that this notebook has access to the mounted storage bucket. + +Working with clusters +--------------------- + +To see a typical workflow of working with clusters, you can refer to :ref:`quickstart`. 
diff --git a/docs/source/examples/managed-jobs.rst b/docs/source/examples/managed-jobs.rst index 5d7037ff92f..97d4b503d9e 100644 --- a/docs/source/examples/managed-jobs.rst +++ b/docs/source/examples/managed-jobs.rst @@ -50,7 +50,7 @@ Managed jobs have several benefits: .. _spot-jobs: -Managed Spot Jobs +Managed spot jobs ----------------- Managed jobs can run on spot instances, and preemptions are auto-recovered by SkyPilot. @@ -275,7 +275,7 @@ Real-World Examples * PyTorch Lightning DDP, CIFAR-10: `YAML `__ -Managed On-Demand/Reserved Jobs +Managed on-demand/reserved jobs ------------------------------- The same ``sky jobs launch`` and YAML interfaces can run jobs on auto-recovering @@ -314,7 +314,7 @@ will be spot instances. If spot instances are not available, SkyPilot will fall .. _failure-recovery: -Jobs Restarts on User Code Failure +Jobs restarts on user code failure ----------------------------------- By default, SkyPilot will try to recover a job when its underlying cluster is preempted or failed. Any user code failures (non-zero exit codes) are not auto-recovered. @@ -335,7 +335,7 @@ More advanced policies for resource selection, such as the `Can't Be Late `__ (NSDI'24) paper, may be supported in the future. -Running Many Parallel Jobs +Running many parallel jobs -------------------------- For batch jobs such as **data processing** or **hyperparameter sweeps**, you can launch many jobs in parallel. See :ref:`many-jobs`. @@ -353,7 +353,7 @@ See all managed jobs: .. code-block:: console - Fetching managed job statuses... + Fetching managed jobs... Managed jobs: ID NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS 2 roberta 1x [A100:8][Spot] 2 hrs ago 2h 47m 18s 2h 36m 18s 0 RUNNING @@ -379,7 +379,7 @@ Cancel a managed job: .. _pipeline: -Managed Pipelines +Managed pipelines ----------------- A pipeline is a managed job that contains a sequence of tasks running one after another. 
@@ -451,7 +451,7 @@ To submit the pipeline, the same command :code:`sky jobs launch` is used. The pi $ sky jobs launch -n pipeline pipeline.yaml $ sky jobs queue - Fetching managed job statuses... + Fetching managed jobs... Managed jobs In progress jobs: 1 RECOVERING ID TASK NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS @@ -467,7 +467,7 @@ To submit the pipeline, the same command :code:`sky jobs launch` is used. The pi -Job Dashboard +Job dashboard ------------- Use ``sky jobs dashboard`` to open a dashboard to see all jobs: @@ -525,7 +525,7 @@ When using a custom bucket (:code:`jobs.bucket`), the job-specific directories ( Multiple users can share the same intermediate bucket. Each user's jobs will have their own unique job-specific directories, ensuring that files are kept separate and organized. -How It Works: The Jobs Controller +How it works: The jobs controller --------------------------------- The jobs controller is a small on-demand CPU VM or pod running in the cloud that manages all jobs of a user. diff --git a/docs/source/examples/ports.rst b/docs/source/examples/ports.rst index 8452efe6967..c2424822e85 100644 --- a/docs/source/examples/ports.rst +++ b/docs/source/examples/ports.rst @@ -63,7 +63,7 @@ SkyPilot also support opening ports through the CLI: $ sky launch -c jupyter --ports 8888 jupyter_lab.yaml -Security and Lifecycle Considerations +Security and lifecycle considerations ------------------------------------- Before you start opening ports, there are a few things you need to bear in mind: diff --git a/docs/source/examples/spot-jobs.rst b/docs/source/examples/spot-jobs.rst deleted file mode 100644 index 2b3df600425..00000000000 --- a/docs/source/examples/spot-jobs.rst +++ /dev/null @@ -1,23 +0,0 @@ -Managed Spot Jobs -================== - -.. 
raw:: html - - diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index 4c42156bac8..d5df7071805 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -12,6 +12,8 @@ This guide will walk you through: Be sure to complete the :ref:`installation instructions ` first before continuing with this guide. +.. _hello-skypilot: + Hello, SkyPilot! ------------------ diff --git a/docs/source/overview.rst b/docs/source/overview.rst index 67bf070166a..8095d4e8c3a 100644 --- a/docs/source/overview.rst +++ b/docs/source/overview.rst @@ -121,7 +121,7 @@ A *job* is a program you want to run. Two types of jobs are supported: -A job can contain one or :ref:`more ` tasks. In most cases, a job has just one task; we'll refer to them interchangeably. +A job can contain one or :ref:`more ` *tasks*. In most cases, a job has just one task; we'll refer to them interchangeably. @@ -206,6 +206,8 @@ SkyPilot easily connects to your existing infra---clouds, Kubernetes clusters, or on-prem machines---using each infra's native authentication (cloud credentials, kubeconfig, SSH). +.. _concept-cloud-vms: + Cloud VMs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/reference/benchmark/callback.rst b/docs/source/reference/benchmark/callback.rst deleted file mode 100644 index 449ed4bedcf..00000000000 --- a/docs/source/reference/benchmark/callback.rst +++ /dev/null @@ -1,134 +0,0 @@ -.. _benchmark-skycallback: - -SkyCallback -=========== - -SkyCallback is a simple Python library that works in conjunction with SkyPilot Benchmark. -It enables SkyPilot to provide a more detailed benchmark report without the need to wait until the task finishes. - -What SkyCallback is for --------------------------------------------- - -SkyCallback is designed for **machine learning tasks** which have a loop iterating many `steps`. 
-SkyCallback measures the average time taken by each step, and extrapolates it to the total execution time of the task. - -Installing SkyCallback --------------------------------------------- - -Unlike SkyPilot, SkyCallback must be installed and imported `in your program`. -To install it, add the following line in the ``setup`` section of your task YAML. - -.. code-block:: yaml - - setup: - # Activate conda or virtualenv if you use one - # Then, install SkyCallback - pip install "git+https://github.com/skypilot-org/skypilot.git#egg=sky-callback&subdirectory=sky/callbacks/" - - -Using SkyCallback generic APIs --------------------------------------------- - -The SkyCallback generic APIs are for **PyTorch, TensorFlow, and JAX** programs where training loops are exposed to the users. -Below we provide the instructions for using the APIs. - -First, import the SkyCallback package and initialize it using ``init``. - -.. code-block:: python - - import sky_callback - sky_callback.init() - -Next, mark the beginning and end of each step using one of the three equivalent methods. - -.. code-block:: python - - # Method 1: wrap your iterable (e.g., dataloader) with `step_iterator`. - from sky_callback import step_iterator - for batch in step_iterator(train_dataloader): - ... - - # Method 2: wrap your loop body with the `step` context manager. - for batch in train_dataloader: - with sky_callback.step(): - ... - - # Method 3: call `step_begin` and `step_end` directly. - for batch in train_dataloader: - sky_callback.step_begin() - ... - sky_callback.step_end() - -That's it. -Now you can launch your task and get a detailed benchmark report using SkyPilot Benchmark CLI. -`Here `__ we provide an example of applying SkyCallback to Pytorch ImageNet training. - -.. note:: - - Optionally in ``sky_callback.init``, you can specify the total number of steps that the task will iterate through. - This information is needed to estimate the total execution time/cost of your task. - - .. 
code-block:: python - - sky_callback.init( - total_steps=num_epochs * len(train_dataloader), # Optional - ) - -.. note:: - In distributed training, ``global_rank`` should be additionally passed to ``sky_callback.init`` as follows: - - .. code-block:: python - - # PyTorch DDP users - global_rank = torch.distributed.get_rank() - - # Horovod users - global_rank = hvd.rank() - - sky_callback.init( - global_rank=global_rank, - total_steps=num_epochs * len(train_dataloader), # Optional - ) - -Integrations with ML frameworks ----------------------------------------------------------- - -Using SkyCallback is even easier for **Keras, PytorchLightning, and HuggingFace Transformers** programs where trainer APIs are used. -SkyCallback natively supports these frameworks with simple interface. - -* Keras example - -.. code-block:: python - - from sky_callback import SkyKerasCallback - - # Add the callback to your Keras model. - model.fit(..., callbacks=[SkyKerasCallback()]) - -`Here `__ you can find an example of applying SkyCallback to Keras ASR model training. - -* PytorchLightning example - -.. code-block:: python - - from sky_callback import SkyLightningCallback - - # Add the callback to your trainer. - trainer = pl.Trainer(..., callbacks=[SkyLightningCallback()]) - -`Here `__ you can find an example of applying SkyCallback to PyTorchLightning GAN model training. - -* HuggingFace Transformers example - -.. code-block:: python - - from sky_callback import SkyTransformersCallback - - # Add the callback to your trainer. - trainer = transformers.Trainer(..., callbacks=[SkyTransformersCallback()]) - -`Here `__ you can find an example of applying SkyCallback to HuggingFace BERT fine-tuning. - -.. note:: - When using the framework-integrated callbacks, do not call ``sky_callback.init`` for initialization. - The callbacks will do it for you. 
diff --git a/docs/source/reference/benchmark/cli.rst b/docs/source/reference/benchmark/cli.rst deleted file mode 100644 index 6d8c12b5ccc..00000000000 --- a/docs/source/reference/benchmark/cli.rst +++ /dev/null @@ -1,78 +0,0 @@ -.. _benchmark-cli: - -CLI -=== - -Workflow --------- - -You can use SkyPilot Benchmark by simply replacing your ``sky launch`` command with ``sky bench launch``: - -.. code-block:: bash - - # Launch mytask on a V100 VM and a T4 VM - $ sky bench launch mytask.yaml --gpus V100,T4 --benchmark mybench - -The second command will launch ``mytask.yaml`` on a V100 VM and a T4 VM simultaneously, with a benchmark name ``mybench``. -After the task finishes, you can check the benchmark results using ``sky bench show``: - -.. code-block:: bash - - # Show the benchmark report on `mybench` - $ sky bench show mybench - - CLUSTER RESOURCES STATUS DURATION SPENT($) STEPS SEC/STEP $/STEP EST(hr) EST($) - sky-bench-mybench-0 1x GCP(n1-highmem-8, {'V100': 1}) FINISHED 12m 51s 0.6317 - - - - - - sky-bench-mybench-1 1x AWS(g4dn.xlarge, {'T4': 1}) FINISHED 16m 19s 0.1430 - - - - - - -In the report, SkyPilot shows the duration and cost of ``mybench`` on each VM. -The VMs can be terminated by either ``sky bench down`` or ``sky down``: - -.. code-block:: bash - - # Terminate all the clusters used for `mybench` - $ sky bench down mybench - - # Terminate all the clusters used for `mybench` except `sky-bench-mybench-0` - $ sky bench down mybench --exclude sky-bench-mybench-0 - - # Terminate individual clusters as usual - $ sky down sky-bench-mybench-0 - -.. note:: - - Each cluster launched by ``sky bench launch`` will automatically **stop** itself 5 minutes after the task is finished. - However, you don't have to restart those clusters. - Regardless of the status of the clusters, ``sky bench show`` will provide the benchmark results. - -.. note:: - - SkyPilot Benchmark does not consider the time/cost of provisioning and setup. 
- The columns (such as ``DURATION`` and ``SPENT($)``) in the report indicate the time/cost spent in executing the ``run`` section of your task YAML. - -.. note:: - - Here, the columns other than ``DURATION`` and ``SPENT($)`` are empty. - To get a complete benchmark report, please refer to :ref:`SkyCallback `. - - -Managing benchmark reports ---------------------------- - -``sky bench ls`` shows the list of the benchmark reports you have: - -.. code-block:: bash - - # List all the benchmark reports - $ sky bench ls - - BENCHMARK TASK LAUNCHED CANDIDATE 1 CANDIDATE 2 CANDIDATE 3 CANDIDATE 4 - bert bert_qa 2022-08-10 10:07:27 1x Standard_NC6_Promo (K80:1) 1x g4dn.xlarge (T4:1) 1x g5.xlarge (A10G:1) 1x n1-highmem-8 (V100:1) - mybench mytask 2022-08-10 11:24:27 1x n1-highmem-8 (V100:1) 1x g4dn.xlarge (T4:1) - -To delete a benchmark report, use ``sky bench delete``: - -.. code-block:: bash - - # Delete the benchmark report on `mybench` - $ sky bench delete mybench diff --git a/docs/source/reference/benchmark/config.rst b/docs/source/reference/benchmark/config.rst deleted file mode 100644 index b1865c8db26..00000000000 --- a/docs/source/reference/benchmark/config.rst +++ /dev/null @@ -1,48 +0,0 @@ -.. _benchmark-yaml: - -YAML Configuration -================== - -The resources to benchmark can be configured in the SkyPilot YAML interface. -Below we provide an example: - -.. code-block:: yaml - - # Only shows `resources` as other fields do not change. - resources: - cloud: gcp # Works as a default value for `cloud`. - - # Added only for SkyPilot Benchmark. - candidates: - - {accelerators: A100} - - {accelerators: V100, instance_type: n1-highmem-16} - - {accelerators: T4, cloud: aws} # Overrides `cloud` to `aws`. - -For SkyPilot Benchmark, ``candidates`` is newly added under the ``resources`` field. -``candidates`` is the list of dictionaries that configure the resources to benchmark. -Any subfield of ``resources`` (``accelerators``, ``instance_type``, etc.) 
can be re-defined in the dictionaries. -Subfields defined outside ``candidates`` (e.g. ``cloud`` in this example) are used as default values and are overriden by those defined in the dictionaries. -Thus, the above example can be interpreted as follows: - -.. code-block:: yaml - - # Configuration of the first candidate. - resources: - cloud: gcp - accelerators: A100 - - # Configuration of the second candidate. - resources: - cloud: gcp - accelerators: V100 - instance_type: n1-highmem-16 - - # Configuration of the third candidate. - resources: - cloud: aws - accelerators: T4 - -.. note:: - - Currently, SkyPilot Benchmark does not support on-prem jobs and managed spot jobs. - While you can set ``use_spot: True`` to benchmark spot VMs, automatic recovery will not be provided when preemption occurs. diff --git a/docs/source/reference/benchmark/index.rst b/docs/source/reference/benchmark/index.rst deleted file mode 100644 index e8f1fba77e4..00000000000 --- a/docs/source/reference/benchmark/index.rst +++ /dev/null @@ -1,43 +0,0 @@ -.. _benchmark-overview: - -Benchmark: Find the Best Hardware for Your Jobs -================================================ - -SkyPilot allows **easy measurement of performance and cost of different kinds of cloud resources** through the benchmark feature. -With minimal effort, you can find the right cloud resource for your task that fits your performance goals and budget constraints. - -For example, say you want to fine-tune a BERT model and you do not know which GPU type is the best for you. -With SkyPilot Benchmark, you can quickly run your task on different types of VMs and get a benchmark report like the following: - -.. code-block:: bash - - Legend: - - STEPS: Number of steps taken. - - SEC/STEP, $/STEP: Average time (cost) per step. - - EST(hr), EST($): Estimated total time (cost) to complete the benchmark. 
- - CLUSTER RESOURCES STATUS DURATION SPENT($) STEPS SEC/STEP $/STEP EST(hr) EST($) - sky-bench-bert-0 1x Azure(Standard_NC6_Promo, {'K80': 1}) TERMINATED 12m 48s 0.0384 1415 1.1548 0.000058 10.60 1.91 - sky-bench-bert-1 1x AWS(g4dn.xlarge, {'T4': 1}) TERMINATED 14m 2s 0.1230 2387 0.6429 0.000094 5.92 3.11 - sky-bench-bert-2 1x AWS(g5.xlarge, {'A10G': 1}) TERMINATED 13m 57s 0.2339 7423 0.1859 0.000052 1.75 1.76 - sky-bench-bert-3 1x GCP(n1-highmem-8, {'V100': 1}) TERMINATED 13m 45s 0.6768 7306 0.2005 0.000165 1.87 5.51 - -The report shows the benchmarking results of 4 VMs each with a different GPU type. -Based on the report, you can pick the VM with either the lowest cost (``EST($)``) or the fastest execution time (``EST(hr)``), or find a sweet spot between them. -In this example, AWS g5.xlarge (NVIDIA A10G GPU) turns out to be the best choice in terms of both cost and time. - -Using SkyPilot Benchmark ------------------------- - -A part of the SkyPilot Benchmark report relies on the :ref:`SkyCallback ` library instrumented in the training code to report step completion. -Depending on the level of detail required by you in the benchmark report, SkyPilot Benchmark can be used in two modes: - -1. Without SkyCallback - You can get a basic benchmark report using SkyPilot Benchmark :ref:`benchmark-cli`. **This requires zero changes in your code**. -2. With SkyCallback - You can get a more detailed benchmark report **by a few lines of code changes**. Please refer to :ref:`SkyCallback `. - -Table of Contents ------------------ -.. toctree:: - cli - config - callback diff --git a/docs/source/reference/cli.rst b/docs/source/reference/cli.rst index 985f63482b6..0362cb97f66 100644 --- a/docs/source/reference/cli.rst +++ b/docs/source/reference/cli.rst @@ -11,11 +11,6 @@ Cluster CLI :prog: sky launch :nested: full -.. _sky-exec: -.. click:: sky.cli:exec - :prog: sky exec - :nested: full - .. _sky-stop: .. 
click:: sky.cli:stop :prog: sky stop @@ -41,22 +36,31 @@ Cluster CLI :prog: sky autostop :nested: full + +Cluster Jobs CLI +---------------- + +.. _sky-exec: +.. click:: sky.cli:exec + :prog: sky exec + :nested: full + .. _sky-queue: .. click:: sky.cli:queue :prog: sky queue :nested: full -.. _sky-logs: -.. click:: sky.cli:logs - :prog: sky logs - :nested: full - .. _sky-cancel: .. click:: sky.cli:cancel :prog: sky cancel :nested: full -Managed (Spot) Jobs CLI +.. _sky-logs: +.. click:: sky.cli:logs + :prog: sky logs + :nested: full + +Managed Jobs CLI --------------------------- .. _sky-job-launch: @@ -79,8 +83,7 @@ Managed (Spot) Jobs CLI :prog: sky jobs logs :nested: full - -SkyServe CLI +Serving CLI ------------- .. click:: sky.cli:serve_up diff --git a/docs/source/reference/faq.rst b/docs/source/reference/faq.rst index 6a8a598c1ca..0538de77111 100644 --- a/docs/source/reference/faq.rst +++ b/docs/source/reference/faq.rst @@ -192,7 +192,7 @@ For example, if you have access to special regions of GCP, add the data to ``~/. Also, you can update the catalog for a specific cloud by deleting the CSV file (e.g., ``rm ~/.sky/catalogs//gcp.csv``). SkyPilot will automatically download the latest catalog in the next run. -Package Installation +Package installation --------------------- Unable to import PyTorch in a SkyPilot task. diff --git a/docs/source/reference/job-queue.rst b/docs/source/reference/job-queue.rst index 4cb8d3b915c..58a3cd39878 100644 --- a/docs/source/reference/job-queue.rst +++ b/docs/source/reference/job-queue.rst @@ -1,24 +1,25 @@ .. _job-queue: -Cluster Job Queue -================= +Cluster Jobs +============= -SkyPilot's **job queue** allows multiple jobs to be scheduled on a cluster. +You can run jobs on an existing cluster, which are automatically queued and scheduled. + +This is ideal for interactive development on an existing cluster and reusing its setup. 
Getting started -------------------------------- -Each task submitted by :code:`sky exec` is automatically queued and scheduled -for execution on an existing cluster: +Use :code:`sky exec` to submit jobs to an existing cluster: .. code-block:: bash # Launch the job 5 times. - sky exec mycluster task.yaml -d - sky exec mycluster task.yaml -d - sky exec mycluster task.yaml -d - sky exec mycluster task.yaml -d - sky exec mycluster task.yaml -d + sky exec mycluster job.yaml -d + sky exec mycluster job.yaml -d + sky exec mycluster job.yaml -d + sky exec mycluster job.yaml -d + sky exec mycluster job.yaml -d The :code:`-d / --detach` flag detaches logging from the terminal, which is useful for launching many long-running jobs concurrently. @@ -46,10 +47,15 @@ To cancel a job: # Cancel all jobs on a cluster. sky cancel mycluster --all +.. tip:: + + The ``sky launch`` command/CLI performs many steps in one call, including + submitting jobs to either an existing or newly provisioned cluster. See :ref:`here `. + Multi-node jobs -------------------------------- -Jobs that run on multiple nodes are also supported by the job queue. +Jobs that run on multiple nodes are also supported. First, create a :code:`cluster.yaml` to specify the desired cluster: .. code-block:: yaml @@ -67,7 +73,7 @@ First, create a :code:`cluster.yaml` to specify the desired cluster: Use :code:`sky launch -c mycluster cluster.yaml` to provision a 4-node (each having 8 H100 GPUs) cluster. The :code:`num_nodes` field is used to specify how many nodes are required. -Next, create a :code:`task.yaml` to specify each task: +Next, create a :code:`job.yaml` to specify each job: .. code-block:: yaml # Run training script. ... -This specifies a task that needs to be run on 2 nodes, each of which must have 4 free H100s. - -Use :code:`sky exec mycluster task.yaml` to submit this task, which will be scheduled correctly by the job queue. 
+This specifies a job that needs to be run on 2 nodes, each of which must have 4 free H100s. +You can then use :code:`sky exec mycluster job.yaml` to submit this job. See :ref:`dist-jobs` for more details. @@ -89,18 +94,18 @@ Using ``CUDA_VISIBLE_DEVICES`` -------------------------------- The environment variable ``CUDA_VISIBLE_DEVICES`` will be automatically set to -the devices allocated to each task on each node. This variable is set -when a task's ``run`` commands are invoked. +the devices allocated to each job on each node. This variable is set +when a job's ``run`` commands are invoked. -For example, ``task.yaml`` above launches a 4-GPU task on each node that has 8 -GPUs, so the task's ``run`` commands will be invoked with +For example, ``job.yaml`` above launches a 4-GPU job on each node that has 8 +GPUs, so the job's ``run`` commands will be invoked with ``CUDA_VISIBLE_DEVICES`` populated with 4 device IDs. If your ``run`` commands use Docker/``docker run``, simply pass ``--gpus=all``; the correct environment variable would be set inside the container (only the allocated device IDs will be set). -Example: Grid Search +Example: Grid search ---------------------- To submit multiple trials with different hyperparameters to a cluster: @@ -150,18 +155,18 @@ Scheduling behavior SkyPilot's scheduler serves two goals: 1. **Preventing resource oversubscription**: SkyPilot schedules jobs on a cluster - using their resource requirements---either specified in a task YAML's + using their resource requirements---either specified in a job YAML's :code:`resources` field, or via the :code:`--gpus` option of the :code:`sky exec` CLI command. SkyPilot honors these resource requirements while ensuring that no resource in the cluster is oversubscribed. For example, if a node has 4 - GPUs, it cannot host a combination of tasks whose sum of GPU requirements + GPUs, it cannot host a combination of jobs whose sum of GPU requirements exceeds 4. 2. 
**Minimizing resource idleness**: If a resource is idle, SkyPilot will schedule a queued job that can utilize that resource. We illustrate the scheduling behavior by revisiting :ref:`Tutorial: AI Training `. -In that tutorial, we have a task YAML that specifies these resource requirements: +In that tutorial, we have a job YAML that specifies these resource requirements: .. code-block:: yaml @@ -173,14 +178,14 @@ In that tutorial, we have a task YAML that specifies these resource requirements Since a new cluster was created when we ran :code:`sky launch -c lm-cluster dnn.yaml`, SkyPilot provisioned the cluster with exactly the same resources as those -required for the task. Thus, :code:`lm-cluster` has 4 H100 GPUs. +required for the job. Thus, :code:`lm-cluster` has 4 H100 GPUs. -While this initial job is running, let us submit more tasks: +While this initial job is running, let us submit more jobs: .. code-block:: console $ # Launch 4 jobs, perhaps with different hyperparameters. - $ # You can override the task name with `-n` (optional) and + $ # You can override the job name with `-n` (optional) and $ # the resource requirement with `--gpus` (optional). $ sky exec lm-cluster dnn.yaml -d -n job2 --gpus=H100:1 $ sky exec lm-cluster dnn.yaml -d -n job3 --gpus=H100:1 diff --git a/docs/source/reference/kubernetes/index.rst b/docs/source/reference/kubernetes/index.rst index 6ea14ed8858..a11db5da1f9 100644 --- a/docs/source/reference/kubernetes/index.rst +++ b/docs/source/reference/kubernetes/index.rst @@ -104,3 +104,4 @@ Table of Contents kubernetes-setup kubernetes-troubleshooting multi-kubernetes + SkyPilot vs. 
Vanilla Kubernetes diff --git a/docs/source/reference/kubernetes/kubernetes-getting-started.rst b/docs/source/reference/kubernetes/kubernetes-getting-started.rst index 8caa99dec78..281ef9108d1 100644 --- a/docs/source/reference/kubernetes/kubernetes-getting-started.rst +++ b/docs/source/reference/kubernetes/kubernetes-getting-started.rst @@ -170,9 +170,7 @@ You can also inspect the real-time GPU usage on the cluster with :code:`sky show my-cluster-5 H100 8 8 -.. _kubernetes-custom-images: - -Using Custom Images +Using custom images ------------------- By default, we maintain and use two SkyPilot container images for use on Kubernetes clusters: @@ -217,7 +215,7 @@ To use images from private repositories (e.g., Private DockerHub, Amazon ECR, Go If you use Amazon ECR, your secret credentials may expire every 12 hours. Consider using `k8s-ecr-login-renew `_ to automatically refresh your secrets. -Opening Ports +Opening ports ------------- Opening ports on SkyPilot clusters running on Kubernetes is supported through two modes: @@ -258,17 +256,17 @@ After launching the cluster with :code:`sky launch -c myclus task.yaml`, you can To learn more about opening ports in SkyPilot tasks, see :ref:`Opening Ports `. -Customizing SkyPilot pods +Customizing SkyPilot Pods ------------------------- You can override the pod configuration used by SkyPilot by setting the :code:`pod_config` key in :code:`~/.sky/config.yaml`. -The value of :code:`pod_config` should be a dictionary that follows the `Kubernetes Pod API `_. This will apply to all pods created by SkyPilot. +The value of :code:`pod_config` should be a dictionary that follows the `Kubernetes Pod API `_. This will apply to all pods created by SkyPilot. For example, to set custom environment variables and use GPUDirect RDMA, you can add the following to your :code:`~/.sky/config.yaml` file: .. 
code-block:: yaml - # ~/.sky/config.yaml + # ~/.sky/config.yaml kubernetes: pod_config: spec: @@ -331,7 +329,7 @@ FAQs * **Are autoscaling Kubernetes clusters supported?** To run on autoscaling clusters, set the :code:`provision_timeout` key in :code:`~/.sky/config.yaml` to a large value to give enough time for the cluster autoscaler to provision new nodes. - This will direct SkyPilot to wait for the cluster to scale up before failing over to the next candidate resource (e.g., next cloud). + This will direct SkyPilot to wait for the cluster to scale up before failing over to the next candidate resource (e.g., next cloud). If you are using GPUs in a scale-to-zero setting, you should also set the :code:`autoscaler` key to the autoscaler type of your cluster. More details in :ref:`config-yaml`. diff --git a/docs/source/reference/kubernetes/kubernetes-setup.rst b/docs/source/reference/kubernetes/kubernetes-setup.rst index dcc870fe145..64395c190d1 100644 --- a/docs/source/reference/kubernetes/kubernetes-setup.rst +++ b/docs/source/reference/kubernetes/kubernetes-setup.rst @@ -245,7 +245,7 @@ You may distribute the generated kubeconfig file to users who can then use it to .. _kubernetes-setup-verify: -Verifying Setup +Verifying setup --------------- Once the cluster is deployed and you have placed your kubeconfig at ``~/.kube/config``, verify your setup by running :code:`sky check`: @@ -278,7 +278,7 @@ You can also check the GPUs available on your nodes by running: .. _kubernetes-observability: -Observability for Administrators +Observability for administrators -------------------------------- All SkyPilot tasks are run in pods inside a Kubernetes cluster. As a cluster administrator, you can inspect running pods (e.g., with :code:`kubectl get pods -n namespace`) to check which @@ -357,7 +357,7 @@ Note that this dashboard can only be accessed from the machine where the ``kubec for more information on how to set up access control for the dashboard. 
-Troubleshooting Kubernetes Setup +Troubleshooting Kubernetes setup -------------------------------- If you encounter issues while setting up your Kubernetes cluster, please refer to the :ref:`troubleshooting guide ` to diagnose and fix issues. diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index 7ca4533ec74..7e9ea7895e3 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -24,7 +24,7 @@ You may have multiple Kubernetes clusters for different: Configuration ------------- -Step 1: Set Up Credentials +Step 1: Set up credentials ~~~~~~~~~~~~~~~~~~~~~~~~~~~ To work with multiple Kubernetes clusters, their credentials must be set up as individual `contexts `_ in your local ``~/.kube/config`` file. @@ -63,7 +63,7 @@ For example, a ``~/.kube/config`` file may look like this: In this example, we have two Kubernetes clusters: ``my-h100-cluster`` and ``my-tpu-cluster``, and each Kubernetes cluster has a context for it. -Step 2: Set up SkyPilot to Access Multiple Kubernetes Clusters +Step 2: Set up SkyPilot to access multiple Kubernetes clusters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Unlike clouds, SkyPilot does not failover through different Kubernetes clusters @@ -97,7 +97,7 @@ To check the enabled Kubernetes clusters, you can run ``sky check k8s``. 
└── my-tpu-cluster -Failover across Multiple Kubernetes Clusters +Failover across multiple Kubernetes clusters -------------------------------------------- With the ``kubernetes.allowed_contexts`` config set, SkyPilot will failover @@ -117,7 +117,7 @@ through the Kubernetes clusters in the same order as they are specified in the f ------------------------------------------------------------------------------------------------------------ -Launching in a Specific Kubernetes Cluster +Launching in a specific Kubernetes cluster ------------------------------------------ SkyPilot uses the ``region`` field to denote a Kubernetes context. You can point to a Kubernetes cluster @@ -144,7 +144,7 @@ by specifying the ``--region`` with the context name for that cluster. When launching a SkyPilot cluster or task, you can also specify the context name with ``--region`` to launch the cluster or task in. -Dynamically Updating Clusters to Use +Dynamically updating clusters to use ---------------------------------------------- You can configure SkyPilot to dynamically fetch Kubernetes cluster configs and enforce restrictions on which clusters are used. Refer to :ref:`dynamic-kubernetes-contexts-update-policy` for more. diff --git a/docs/source/reference/comparison.rst b/docs/source/reference/kubernetes/skypilot-and-vanilla-k8s.rst similarity index 90% rename from docs/source/reference/comparison.rst rename to docs/source/reference/kubernetes/skypilot-and-vanilla-k8s.rst index 23985e5081b..a93bf89405e 100644 --- a/docs/source/reference/comparison.rst +++ b/docs/source/reference/kubernetes/skypilot-and-vanilla-k8s.rst @@ -1,15 +1,7 @@ .. _sky-compare: -Comparing SkyPilot with other systems -===================================== - -SkyPilot is a framework for running AI and batch workloads on any infrastructure. While SkyPilot offers unique capabilities, certain functionalities like job scheduling overlap with existing systems (e.g., Kubernetes, Slurm). 
That said, SkyPilot can be used in conjunction with them to provide additional benefits. - -This page provides a comparison of SkyPilot with other systems, focusing on the unique benefits provided by SkyPilot. We welcome feedback and contributions to this page. - - SkyPilot vs Vanilla Kubernetes ------------------------------- +============================== Kubernetes is a powerful system for managing containerized applications. :ref:`Using SkyPilot to access your Kubernetes cluster ` boosts developer productivity and allows you to scale your infra beyond a single Kubernetes cluster. @@ -18,7 +10,7 @@ Kubernetes is a powerful system for managing containerized applications. :ref:`U Light: https://docs.google.com/drawings/d/1REe_W49SPJ44N-o4NRCKcIRhCkXG9o03ZXHh1mfLUzk/edit?usp=sharing Dark: https://docs.google.com/drawings/d/1MefAOXRNHj05B9raO3dCPhAyMJN3oWYM6nvUNgo8aoA/edit?usp=sharing -.. figure:: ../images/k8s-skypilot-architecture-dark.png +.. figure:: ../../images/k8s-skypilot-architecture-dark.png :width: 55% :align: center :alt: SkyPilot on Kubernetes @@ -26,7 +18,7 @@ Kubernetes is a powerful system for managing containerized applications. :ref:`U SkyPilot layers on top of your Kubernetes cluster to deliver a better developer experience. -.. figure:: ../images/k8s-skypilot-architecture-light.png +.. figure:: ../../images/k8s-skypilot-architecture-light.png :width: 55% :align: center :alt: SkyPilot on Kubernetes diff --git a/docs/source/reference/storage.rst b/docs/source/reference/storage.rst index 467b23e0d53..b39d051054a 100644 --- a/docs/source/reference/storage.rst +++ b/docs/source/reference/storage.rst @@ -1,7 +1,7 @@ .. _sky-storage: -Cloud Object Storage -==================== +Cloud Buckets +============== SkyPilot tasks can access data from buckets in cloud object storages such as AWS S3, Google Cloud Storage (GCS), Cloudflare R2, OCI Object Storage or IBM COS. 
diff --git a/docs/source/reference/tpu.rst b/docs/source/reference/tpu.rst index a753c26bd31..307288cef56 100644 --- a/docs/source/reference/tpu.rst +++ b/docs/source/reference/tpu.rst @@ -31,7 +31,7 @@ After the command finishes, you will be dropped into a TPU host VM and can start Below, we show examples of using SkyPilot to (1) train LLMs on TPU VMs/Pods and (2) train MNIST on TPU Nodes (legacy). -TPU Architectures +TPU architectures ================= Two different TPU architectures are available on GCP: diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst index d2f0506993a..c380c3993d0 100644 --- a/docs/source/reference/yaml-spec.rst +++ b/docs/source/reference/yaml-spec.rst @@ -1,11 +1,13 @@ .. _yaml-spec: -Task YAML -========= +SkyPilot YAML +============= -SkyPilot provides an intuitive YAML interface to specify a task (resource requirements, setup commands, run commands, file mounts, storage mounts, and so on). +SkyPilot provides an intuitive YAML interface to specify clusters, jobs, or +services (resource requirements, setup commands, run commands, file mounts, +storage mounts, and so on). -Task YAMLs can be used with the :ref:`CLI `, or the programmatic API (:meth:`sky.Task.from_yaml`). +YAMLs can be used with the :ref:`CLI `, or the programmatic API (e.g., :meth:`sky.Task.from_yaml`). Available fields: @@ -19,10 +21,10 @@ Available fields: # # Commands in "setup" and "run" will be executed under it. # - # If a relative path is used, it's evaluated relative to the location from + # If a relative path is used, it's evaluated relative to the location from # which `sky` is called. 
# - # To exclude files from syncing, see + # To exclude files from syncing, see # https://docs.skypilot.co/en/latest/examples/syncing-code-artifacts.html#exclude-uploading-files workdir: ~/my-task-code @@ -98,7 +100,7 @@ Available fields: # # If `FAILOVER` is specified, the job will be restarted in the same region # if the node fails, and go to the next region if no available resources - # are found in the same region. + # are found in the same region. # # If `EAGER_NEXT_REGION` is specified, the job will go to the next region # directly if the node fails. This is useful for spot instances, as in @@ -217,7 +219,7 @@ Available fields: # # OCI # To find OCI images: https://docs.oracle.com/en-us/iaas/images - # You can choose the image with OS version from the following image tags + # You can choose the image with OS version from the following image tags # provided by SkyPilot: # image_id: skypilot:gpu-ubuntu-2204 # image_id: skypilot:gpu-ubuntu-2004 @@ -360,7 +362,7 @@ Available fields: .. _task-yaml-experimental: -Experimental Configurations +Experimental configurations --------------------------- .. note:: @@ -387,3 +389,89 @@ In additional to the above fields, SkyPilot also supports the following experime managed_instance_group: ... nvidia_gpus: disable_ecc: ... + + + +.. _service-yaml-spec: + +SkyServe services +================= + +To define a YAML for use for :ref:`services `, use previously +mentioned fields to describe each replica, then add a ``service`` section to +describe the entire service. + +.. code-block:: yaml + + service: + + # Readiness probe (required). Used by SkyServe to check if your service + # replicas are ready for accepting traffic. If the readiness probe returns + # a 200, SkyServe will start routing traffic to that replica. + readiness_probe: + # Path to probe (required). + path: /v1/models + # Post data (optional). 
If this is specified, the readiness probe will use + # POST instead of GET, and the post data will be sent as the request body. + post_data: {'model_name': 'model'} + # Initial delay in seconds (optional). Defaults to 1200 seconds (20 minutes). + # Any readiness probe failures during this period will be ignored. This is + # highly related to your service, so it is recommended to set this value + # based on your service's startup time. + initial_delay_seconds: 1200 + # The Timeout in seconds for a readiness probe request (optional). + # Defaults to 15 seconds. If the readiness probe takes longer than this + # time to respond, the probe will be considered as failed. This is + # useful when your service is slow to respond to readiness probe + # requests. Note, having a too high timeout will delay the detection + # of a real failure of your service replica. + timeout_seconds: 15 + + # Simplified version of readiness probe that only contains the readiness + # probe path. If you want to use GET method for readiness probe and the + # default initial delay, you can use the following syntax: + readiness_probe: /v1/models + + # One of the two following fields (replica_policy or replicas) is required. + + # Replica autoscaling policy. This describes how SkyServe autoscales + # your service based on the QPS (queries per second) of your service. + replica_policy: + # Minimum number of replicas (required). + min_replicas: 1 + # Maximum number of replicas (optional). If not specified, SkyServe will + # use a fixed number of replicas (the same as min_replicas) and ignore + # any QPS threshold specified below. + max_replicas: 3 + # Following specs describe the autoscaling policy. + # Target query per second per replica (optional). SkyServe will scale your + # service so that, ultimately, each replica manages approximately + # target_qps_per_replica queries per second. 
**Autoscaling will only be + # enabled if this value is specified.** + target_qps_per_replica: 5 + # Upscale and downscale delay in seconds (optional). Defaults to 300 seconds + # (5 minutes) and 1200 seconds (20 minutes) respectively. To avoid aggressive + # autoscaling, SkyServe will only upscale or downscale your service if the + # QPS of your service is higher or lower than the target QPS for a period + # of time. This period of time is controlled by upscale_delay_seconds and + # downscale_delay_seconds. The default values should work in most cases. + # If you want to scale your service more aggressively, you can set + # these values to a smaller number. + upscale_delay_seconds: 300 + downscale_delay_seconds: 1200 + # Simplified version of replica policy that uses a fixed number of + # replicas: + replicas: 2 + + ##### Fields below describe each replica ##### + + # Besides the `service` section, the rest is a regular SkyPilot task YAML. + + resources: + # Port to run your service on each replica (required). This port will be + # automatically exposed to the public internet by SkyServe. + ports: 8080 + # Other resources config... + + # Other fields of your SkyPilot task YAML... + diff --git a/docs/source/reservations/reservations.rst b/docs/source/reservations/reservations.rst index ce39b9c1293..c46784cecf4 100644 --- a/docs/source/reservations/reservations.rst +++ b/docs/source/reservations/reservations.rst @@ -1,7 +1,7 @@ .. 
_reservation: -Reserved, Capacity Blocks, DWS +Using Reservations =================================== @@ -79,7 +79,7 @@ For example, if you are launching a cluster with the following SkyPilot YAML: resources: cloud: aws accelerators: A100:8 - + num_nodes: 2 @@ -166,9 +166,9 @@ In case you want to specify the DWS configuration for each job/cluster, you can resources: cloud: gcp accelerators: A100:8 - + num_nodes: 4 - + Using DWS on GKE with Kueue ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/running-jobs/distributed-jobs.rst b/docs/source/running-jobs/distributed-jobs.rst index 7c3421aa276..d39ed6cf571 100644 --- a/docs/source/running-jobs/distributed-jobs.rst +++ b/docs/source/running-jobs/distributed-jobs.rst @@ -43,7 +43,7 @@ For example, here is a simple example to train a GPT-like model (inspired by Kar In the above, -- :code:`num_nodes: 2` specifies that this task is to be run on 2 nodes, with each node having 8 A100s; +- :code:`num_nodes: 2` specifies that this job is to be run on 2 nodes, with each node having 8 A100s; - The highlighted lines in the ``run`` section show common environment variables that are useful for launching distributed training, explained below. .. note:: @@ -58,42 +58,46 @@ In the above, You can find more `distributed training examples `_ (including `using rdvz backend for pytorch `_) in our `GitHub repository `_. +Unless otherwise specified, descriptions below apply to both :ref:`cluster jobs ` and :ref:`managed jobs `. + Environment variables ----------------------------------------- -SkyPilot exposes these environment variables that can be accessed in a task's ``run`` commands: +SkyPilot exposes these environment variables that can be accessed in a job's ``run`` commands: - :code:`SKYPILOT_NODE_RANK`: rank (an integer ID from 0 to :code:`num_nodes-1`) of - the node executing the task. + the node executing the job. 
- :code:`SKYPILOT_NODE_IPS`: a string of IP addresses of the nodes reserved to execute - the task, where each line contains one IP address. -- :code:`SKYPILOT_NUM_NODES`: number of nodes reserved for the task, which can be specified by ``num_nodes: ``. Same value as :code:`echo "$SKYPILOT_NODE_IPS" | wc -l`. + the job, where each line contains one IP address. +- :code:`SKYPILOT_NUM_NODES`: number of nodes reserved for the job, which can be specified by ``num_nodes: ``. Same value as :code:`echo "$SKYPILOT_NODE_IPS" | wc -l`. - :code:`SKYPILOT_NUM_GPUS_PER_NODE`: number of GPUs reserved on each node to execute the - task; the same as the count in ``accelerators: :`` (rounded up if a fraction). + job; the same as the count in ``accelerators: :`` (rounded up if a fraction). See :ref:`sky-env-vars` for more details. -Launching a multi-node task (new cluster) +Launching a multi-node job (new cluster) ------------------------------------------------- -When using ``sky launch`` to launch a multi-node task on **a new cluster**, the following happens in sequence: +When using ``sky launch`` to launch a multi-node job on **a new cluster**, the following happens in sequence: 1. Nodes are provisioned. (barrier) 2. Workdir/file_mounts are synced to all nodes. (barrier) 3. ``setup`` commands are executed on all nodes. (barrier) 4. ``run`` commands are executed on all nodes. -Launching a multi-node task (existing cluster) +When using ``sky jobs launch`` to launch a :ref:`managed ` multi-node job, the same behavior applies. + +Launching a multi-node job (existing cluster) ------------------------------------------------- -When using ``sky launch`` to launch a multi-node task on **an existing cluster**, the cluster may have more nodes than the current task's ``num_nodes`` requirement. +When using ``sky launch`` to launch a multi-node job on **an existing cluster**, the cluster may have more nodes than the current job's ``num_nodes`` requirement. The following happens in sequence: 1. 
SkyPilot checks the runtime on all nodes are up-to-date. (barrier) 2. Workdir/file_mounts are synced to all nodes. (barrier) 3. ``setup`` commands are executed on **all nodes** of the cluster. (barrier) -4. ``run`` commands are executed on **the subset of nodes** scheduled to execute the task, which may be fewer than the cluster size. +4. ``run`` commands are executed on **the subset of nodes** scheduled to execute the job, which may be fewer than the cluster size. .. tip:: @@ -101,9 +105,9 @@ The following happens in sequence: (performs steps 1, 2, 4 above) or ``sky exec`` (performs step 2 (workdir only) and step 4). -Executing a task on the head node only +Executing a job on the head node only -------------------------------------- -To execute a task on the head node only (a common scenario for tools like +To execute a job on the head node only (a common scenario for tools like ``mpirun``), use the ``SKYPILOT_NODE_RANK`` environment variable as follows: .. code-block:: yaml @@ -134,22 +138,28 @@ This allows you directly to SSH into the worker nodes, if required. $ ssh mycluster-worker1 $ ssh mycluster-worker2 +SSH access is only available for :ref:`clusters ` (designed for interactive development), not for :ref:`managed jobs ` (designed for production, scale-out runs). -Executing a Distributed Ray Program +Executing a distributed Ray program ------------------------------------ -To execute a distributed Ray program on many nodes, you can download the `training script `_ and launch the `task yaml `_: +To execute a distributed Ray program on many nodes, you can download the `training script `_ and launch the `job yaml `_: .. code-block:: console $ wget https://raw.githubusercontent.com/skypilot-org/skypilot/master/examples/distributed_ray_train/train.py + + $ # Use a cluster (ideal for interactive development) $ sky launch ray_train.yaml + $ # Use a managed job (ideal for production, scale-out runs) + $ sky jobs launch ray_train.yaml + .. 
code-block:: yaml - + resources: accelerators: L4:2 memory: 64+ - + num_nodes: 2 workdir: . @@ -160,11 +170,11 @@ To execute a distributed Ray program on many nodes, you can download the `traini conda create -n ray python=3.10 -y conda activate ray fi - + pip install "ray[train]" pip install tqdm pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 - + run: | sudo chmod 777 -R /var/tmp HEAD_IP=`echo "$SKYPILOT_NODE_IPS" | head -n1` @@ -177,7 +187,7 @@ To execute a distributed Ray program on many nodes, you can download the `traini ps aux | grep ray | grep 6379 &> /dev/null || ray start --address $HEAD_IP:6379 --disable-usage-stats fi -.. warning:: +.. warning:: When using Ray, avoid calling ``ray stop`` as that will also cause the SkyPilot runtime to be stopped. diff --git a/docs/source/running-jobs/many-jobs.rst b/docs/source/running-jobs/many-jobs.rst index 074db86f4cf..f32720662f9 100644 --- a/docs/source/running-jobs/many-jobs.rst +++ b/docs/source/running-jobs/many-jobs.rst @@ -1,4 +1,3 @@ - .. _many-jobs: Many Parallel Jobs @@ -15,7 +14,7 @@ This guide shows a typical workflow for running many jobs with SkyPilot. .. TODO: Show the components in a GIF. -Why Use SkyPilot to Run Many Jobs +Why use SkyPilot to run many jobs ------------------------------------- - **Unified**: Use any or multiple of your own infrastructure (Kubernetes, cloud VMs, reservations, etc.). @@ -24,7 +23,7 @@ Why Use SkyPilot to Run Many Jobs - **Robust**: Automatically recover jobs from failures. - **Observable**: Monitor and manage all jobs in a single pane of glass. -Write a YAML for One Job +Write a YAML for one job ----------------------------------- Before scaling up to many jobs, write a SkyPilot YAML for a single job first and ensure it runs correctly. This can save time by avoiding debugging many jobs at once. 
@@ -100,7 +99,7 @@ Sometimes, it may be more efficient to log into the cluster and interactively de Next, after confirming the job is working correctly, **add (hyper)parameters** to the job YAML so that all job variants can be specified. -1. Add Hyperparameters +1. Add hyperparameters ~~~~~~~~~~~~~~~~~~~~~~ To launch jobs with different hyperparameters, add them as :ref:`environment variables ` to the SkyPilot YAML, and make your main program read these environment variables: @@ -172,7 +171,7 @@ Alternative, store the environment variable values in a dotenv file and use ``-- -2. Logging Job Outputs +2. Logging job outputs ~~~~~~~~~~~~~~~~~~~~~~~ When running many jobs, it is useful to log the outputs of all jobs. You can use tools like `W&B `__ for this purpose: @@ -234,13 +233,10 @@ You can now launch the job with the following command (``WANDB_API_KEY`` should -Scale Out to Many Jobs +Scale out to many jobs ----------------------- -With the above setup, you can now scale out to run many jobs in parallel. You -can either use SkyPilot CLI with many config files or use SkyPilot Python API. - -With CLI and Config Files +With CLI and config files ~~~~~~~~~~~~~~~~~~~~~~~~~~ You can run many jobs in parallel by (1) creating multiple config files and (2) @@ -309,10 +305,10 @@ Job statuses can be checked via ``sky jobs queue``: $ sky jobs queue - Fetching managed job statuses... + Fetching managed jobs... Managed jobs In progress tasks: 10 RUNNING - ID TASK NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS + ID TASK NAME RESOURCES SUBMITTED TOT. 
DURATION JOB DURATION #RECOVERIES STATUS 10 - train-job10 1x[V100:4] 5 mins ago 5m 5s 1m 12s 0 RUNNING 9 - train-job9 1x[V100:4] 6 mins ago 6m 11s 2m 23s 0 RUNNING 8 - train-job8 1x[V100:4] 7 mins ago 7m 15s 3m 31s 0 RUNNING @@ -346,7 +342,7 @@ To have more customized control over generation of job variants, you can also us job_idx += 1 -Best Practices for Scaling +Best practices for scaling -------------------------- By default, around 90 jobs can be managed at once. However, with some simple configuration, SkyPilot can reliably support **2000 jobs running in parallel**. See :ref:`the best practices ` for more info. diff --git a/docs/source/serving/auth.rst b/docs/source/serving/auth.rst index 91e02a64b07..ff805e25e6a 100644 --- a/docs/source/serving/auth.rst +++ b/docs/source/serving/auth.rst @@ -5,7 +5,7 @@ Authorization SkyServe provides robust authorization capabilities at the replica level, allowing you to control access to service endpoints with API keys. -Setup API Keys +Setup API keys -------------- SkyServe relies on the authorization of the service running on underlying service replicas, e.g., the inference engine. We take the vLLM inference engine as an example, which supports static API key authorization with an argument :code:`--api-key`. @@ -78,10 +78,10 @@ To send a request to the service endpoint, a service client need to include the .. raw:: HTML
- + Example output - - + + .. code-block:: console { @@ -107,7 +107,7 @@ To send a request to the service endpoint, a service client need to include the "completion_tokens": 134 } } - + .. raw:: html
diff --git a/docs/source/serving/autoscaling.rst b/docs/source/serving/autoscaling.rst index f41912c251d..84f55119941 100644 --- a/docs/source/serving/autoscaling.rst +++ b/docs/source/serving/autoscaling.rst @@ -5,7 +5,7 @@ Autoscaling SkyServe provides out-of-the-box autoscaling for your services. -Fixed Replicas +Fixed replicas -------------- In a service YAML, the number of replicas to launch is specified in the ``service`` section's ``replicas`` field: @@ -22,7 +22,7 @@ In a service YAML, the number of replicas to launch is specified in the ``servic In this case, SkyServe will launch 2 replicas of your service. However, this deployment is fixed and cannot adjust to dynamic traffic. SkyServe provides autoscaling to help you scale your service up and down based on traffic, as shown below. -Enabling Autoscaling +Enabling autoscaling -------------------- Here is a minimal example to enable autoscaling for your service: @@ -71,7 +71,7 @@ Specifically, the current target number of replicas is calculated as: :code:`target_qps_per_replica` can be any positive floating point number. If processing one request takes two seconds in one replica, we can use :code:`target_qps_per_replica=0.5`. -Scaling Delay +Scaling delay ------------- SkyServe will not scale up or down immediately. Instead, SkyServe will only @@ -99,7 +99,7 @@ change the scaling delay by specifying the :code:`upscale_delay_seconds` and If you want more aggressive scaling, set those values to a lower number and vice versa. -Scale-to-Zero +Scale-to-zero ------------- SkyServe supports scale-to-zero. diff --git a/docs/source/serving/https.rst b/docs/source/serving/https.rst index 1cc21aeded5..632ab46cbc2 100644 --- a/docs/source/serving/https.rst +++ b/docs/source/serving/https.rst @@ -9,7 +9,7 @@ SkyServe enables secure serving of models over HTTPS, which is essential for han To learn more about TLS and HTTPS, see `here `_. 
-HTTPS Encrypted Endpoint +HTTPS encrypted endpoint ------------------------ To create an HTTPS encrypted endpoint, you need to provide a certificate and a private key. Obtaining these from a trusted Certificate Authority (CA) is the most secure method. `Let's Encrypt `_ is one of the most popular free solution. However, for development and testing purposes, you can generate a self-signed certificate and private key using the :code:`openssl` command-line tool. Here is an example of how to generate them: diff --git a/docs/source/serving/service-yaml-spec.rst b/docs/source/serving/service-yaml-spec.rst deleted file mode 100644 index 4d3ffc06d48..00000000000 --- a/docs/source/serving/service-yaml-spec.rst +++ /dev/null @@ -1,85 +0,0 @@ -.. _service-yaml-spec: - -Service YAML -============ - -SkyServe provides an intuitive YAML interface to specify a service. It is an extension to the :ref:`SkyPilot task YAML `: with an additional ``service`` section in your original task YAML, you could change it to a service YAML. - -Available fields: - - -.. code-block:: yaml - - # The `service` section turns a skypilot task yaml into a service yaml. - service: - - # Readiness probe (required). Used by SkyServe to check if your service - # replicas are ready for accepting traffic. If the readiness probe returns - # a 200, SkyServe will start routing traffic to that replica. - readiness_probe: - # Path to probe (required). - path: /v1/models - # Post data (optional). If this is specified, the readiness probe will use - # POST instead of GET, and the post data will be sent as the request body. - post_data: {'model_name': 'model'} - # Initial delay in seconds (optional). Defaults to 1200 seconds (20 minutes). - # Any readiness probe failures during this period will be ignored. This is - # highly related to your service, so it is recommended to set this value - # based on your service's startup time. 
- initial_delay_seconds: 1200 - # The Timeout in seconds for a readiness probe request (optional). - # Defaults to 15 seconds. If the readiness probe takes longer than this - # time to respond, the probe will be considered as failed. This is - # useful when your service is slow to respond to readiness probe - # requests. Note, having a too high timeout will delay the detection - # of a real failure of your service replica. - timeout_seconds: 15 - - # Simplified version of readiness probe that only contains the readiness - # probe path. If you want to use GET method for readiness probe and the - # default initial delay, you can use the following syntax: - readiness_probe: /v1/models - - # One of the two following fields (replica_policy or replicas) is required. - - # Replica autoscaling policy. This describes how SkyServe autoscales - # your service based on the QPS (queries per second) of your service. - replica_policy: - # Minimum number of replicas (required). - min_replicas: 1 - # Maximum number of replicas (optional). If not specified, SkyServe will - # use a fixed number of replicas (the same as min_replicas) and ignore - # any QPS threshold specified below. - max_replicas: 3 - # Following specs describe the autoscaling policy. - # Target query per second per replica (optional). SkyServe will scale your - # service so that, ultimately, each replica manages approximately - # target_qps_per_replica queries per second. **Autoscaling will only be - # enabled if this value is specified.** - target_qps_per_replica: 5 - # Upscale and downscale delay in seconds (optional). Defaults to 300 seconds - # (5 minutes) and 1200 seconds (20 minutes) respectively. To avoid aggressive - # autoscaling, SkyServe will only upscale or downscale your service if the - # QPS of your service is higher or lower than the target QPS for a period - # of time. This period of time is controlled by upscale_delay_seconds and - # downscale_delay_seconds. 
The default values should work in most cases. - # If you want to scale your service more aggressively, you can set - # these values to a smaller number. - upscale_delay_seconds: 300 - downscale_delay_seconds: 1200 - # Simplified version of replica policy that uses a fixed number of - # replicas: - replicas: 2 - - ##### Fields below describe each replica ##### - - # Besides the `service` section, the rest is a regular SkyPilot task YAML. - - resources: - # Port to run your service on each replica (required). This port will be - # automatically exposed to the public internet by SkyServe. - ports: 8080 - # Other resources config... - - # Other fields of your SkyPilot task YAML... - diff --git a/docs/source/serving/sky-serve.rst b/docs/source/serving/sky-serve.rst index 693102c0550..9e8ab0e4a61 100644 --- a/docs/source/serving/sky-serve.rst +++ b/docs/source/serving/sky-serve.rst @@ -277,7 +277,7 @@ sending requests to :code:`` (e.g., ``44.201.119.3:30001``): -Tutorial: Serve a Chatbot LLM! +Tutorial: Serve a chatbot LLM! ------------------------------ Let's bring up a real LLM chat service with FastChat + Vicuna. We'll use the `Vicuna OpenAI API Endpoint YAML `_ as an example: @@ -453,7 +453,7 @@ Authorization See :ref:`Authorization ` for more information. -SkyServe Architecture +SkyServe architecture --------------------- .. image:: ../images/sky-serve-architecture.png diff --git a/docs/source/serving/spot-policy.rst b/docs/source/serving/spot-policy.rst index f9785d0eeb0..02af9a79f26 100644 --- a/docs/source/serving/spot-policy.rst +++ b/docs/source/serving/spot-policy.rst @@ -3,10 +3,9 @@ Using Spot Instances for Serving ================================ -SkyServe supports serving models on a mixture of spot and on-demand replicas with two options: :code:`base_ondemand_fallback_replicas` and :code:`dynamic_ondemand_fallback`. Currently, SkyServe relies on the user side to retry in the event of spot instance preemptions. 
+SkyServe supports serving models on a mixture of spot and on-demand replicas with two options: :code:`base_ondemand_fallback_replicas` and :code:`dynamic_ondemand_fallback`. Currently, SkyServe relies on the user side to retry in the event of spot instance preemptions. - -Base on-demand Fallback +Base on-demand fallback ----------------------- :code:`base_ondemand_fallback_replicas` sets the number of on-demand replicas to keep running at all times. This is useful for ensuring service availability and making sure that there is always some capacity available, even if spot replicas are not available. :code:`use_spot` should be set to :code:`true` to enable spot replicas. @@ -36,8 +35,7 @@ Base on-demand Fallback Kubernetes instances are considered on-demand instances. You can use the :code:`base_ondemand_fallback_replicas` option to have some replicas run on Kubernetes, while others run on cloud spot instances. - -Dynamic on-demand Fallback +Dynamic on-demand fallback -------------------------- SkyServe supports dynamically fallback to on-demand replicas when spot replicas are not available. @@ -73,7 +71,7 @@ When spot replicas are available, SkyServe will automatically switch back to usi Example ------- -The following example demonstrates how to use spot replicas with SkyServe with dynamic fallback. The example is a simple HTTP server that listens on port 8081 with :code:`dynamic_ondemand_fallback: true`. To run: +The following example demonstrates how to use spot replicas with SkyServe with dynamic fallback. The example is a simple HTTP server that listens on port 8081 with :code:`dynamic_ondemand_fallback: true`. To run: .. 
code-block:: console @@ -86,31 +84,31 @@ When the service is up, we can check the status of the service and the replicas $ sky serve status http-server Services - NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT http-server 1 1m 17s NO_REPLICA 0/4 54.227.229.217:30001 Service Replicas - SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION - http-server 1 1 - 1 min ago 1x GCP([Spot]vCPU=2) PROVISIONING us-east1 - http-server 2 1 - 1 min ago 1x GCP([Spot]vCPU=2) PROVISIONING us-central1 - http-server 3 1 - 1 mins ago 1x GCP(vCPU=2) PROVISIONING us-east1 + SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION + http-server 1 1 - 1 min ago 1x GCP([Spot]vCPU=2) PROVISIONING us-east1 + http-server 2 1 - 1 min ago 1x GCP([Spot]vCPU=2) PROVISIONING us-central1 + http-server 3 1 - 1 mins ago 1x GCP(vCPU=2) PROVISIONING us-east1 http-server 4 1 - 1 min ago 1x GCP(vCPU=2) PROVISIONING us-central1 -When the required number of spot replicas are not available, SkyServe will provision on-demand replicas to meet the target number of replicas. For example, when the target number is 2 and no spot replicas are ready, SkyServe will provision 2 on-demand replicas to meet the target number of replicas. +When the required number of spot replicas are not available, SkyServe will provision on-demand replicas to meet the target number of replicas. For example, when the target number is 2 and no spot replicas are ready, SkyServe will provision 2 on-demand replicas to meet the target number of replicas. .. 
code-block:: console $ sky serve status http-server Services - NAME VERSION UPTIME STATUS REPLICAS ENDPOINT - http-server 1 1m 17s READY 2/4 54.227.229.217:30001 + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + http-server 1 1m 17s READY 2/4 54.227.229.217:30001 Service Replicas - SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION - http-server 1 1 http://34.23.22.160:8081 3 min ago 1x GCP([Spot]vCPU=2) READY us-east1 - http-server 2 1 http://34.68.226.193:8081 3 min ago 1x GCP([Spot]vCPU=2) READY us-central1 - http-server 3 1 - 3 mins ago 1x GCP(vCPU=2) SHUTTING_DOWN us-east1 + SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION + http-server 1 1 http://34.23.22.160:8081 3 min ago 1x GCP([Spot]vCPU=2) READY us-east1 + http-server 2 1 http://34.68.226.193:8081 3 min ago 1x GCP([Spot]vCPU=2) READY us-central1 + http-server 3 1 - 3 mins ago 1x GCP(vCPU=2) SHUTTING_DOWN us-east1 http-server 4 1 - 3 min ago 1x GCP(vCPU=2) SHUTTING_DOWN us-central1 When the spot replicas are ready, SkyServe will automatically scale down on-demand replicas to maximize cost savings. 
@@ -120,13 +118,13 @@ When the spot replicas are ready, SkyServe will automatically scale down on-dema $ sky serve status http-server Services - NAME VERSION UPTIME STATUS REPLICAS ENDPOINT - http-server 1 3m 59s READY 2/2 54.227.229.217:30001 + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + http-server 1 3m 59s READY 2/2 54.227.229.217:30001 Service Replicas - SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION - http-server 1 1 http://34.23.22.160:8081 4 mins ago 1x GCP([Spot]vCPU=2) READY us-east1 - http-server 2 1 http://34.68.226.193:8081 4 mins ago 1x GCP([Spot]vCPU=2) READY us-central1 + SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION + http-server 1 1 http://34.23.22.160:8081 4 mins ago 1x GCP([Spot]vCPU=2) READY us-east1 + http-server 2 1 http://34.68.226.193:8081 4 mins ago 1x GCP([Spot]vCPU=2) READY us-central1 In the event of spot instance interruptions (e.g. replica 1), SkyServe will automatically fallback to on-demand replicas (e.g. launch one on-demand replica) to meet the required capacity of replicas. SkyServe will continue trying to provision one spot replica in the event where spot availability is back. Note that SkyServe will try different regions and clouds to maximize the chance of successfully provisioning spot instances. @@ -135,13 +133,13 @@ In the event of spot instance interruptions (e.g. 
replica 1), SkyServe will auto $ sky serve status http-server Services - NAME VERSION UPTIME STATUS REPLICAS ENDPOINT - http-server 1 7m 2s READY 1/3 54.227.229.217:30001 + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + http-server 1 7m 2s READY 1/3 54.227.229.217:30001 Service Replicas - SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION - http-server 2 1 http://34.68.226.193:8081 7 mins ago 1x GCP([Spot]vCPU=2) READY us-central1 - http-server 5 1 - 13 secs ago 1x GCP([Spot]vCPU=2) PROVISIONING us-central1 + SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION + http-server 2 1 http://34.68.226.193:8081 7 mins ago 1x GCP([Spot]vCPU=2) READY us-central1 + http-server 5 1 - 13 secs ago 1x GCP([Spot]vCPU=2) PROVISIONING us-central1 http-server 6 1 - 13 secs ago 1x GCP(vCPU=2) PROVISIONING us-central1 Eventually, when the spot availability is back, SkyServe will automatically scale down on-demand replicas. @@ -151,10 +149,10 @@ Eventually, when the spot availability is back, SkyServe will automatically scal $ sky serve status http-server Services - NAME VERSION UPTIME STATUS REPLICAS ENDPOINT - http-server 1 10m 5s READY 2/3 54.227.229.217:30001 + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + http-server 1 10m 5s READY 2/3 54.227.229.217:30001 Service Replicas - SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION - http-server 2 1 http://34.68.226.193:8081 10 mins ago 1x GCP([Spot]vCPU=2) READY us-central1 + SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION + http-server 2 1 http://34.68.226.193:8081 10 mins ago 1x GCP([Spot]vCPU=2) READY us-central1 http-server 5 1 http://34.121.49.94:8081 1 min ago 1x GCP([Spot]vCPU=2) READY us-central1 diff --git a/docs/source/serving/update.rst b/docs/source/serving/update.rst index 2e34036dc69..b0c9f75e99c 100644 --- a/docs/source/serving/update.rst +++ b/docs/source/serving/update.rst @@ -13,10 +13,9 @@ During an update, the service will remain accessible with no 
downtime and its endpoint will remain the same. By default, :ref:`rolling update ` is applied, while you can also specify a :ref:`blue-green update `. - .. _rolling-update: -Rolling Update +Rolling update --------------- To update an existing service, use ``sky serve update``: @@ -168,7 +167,7 @@ Eventually, we will only have new replicas ready to serve user requests. .. _blue-green-update: -Blue-Green Update +Blue-green update ------------------ SkyServe also supports blue-green updates, by the following command: @@ -177,7 +176,7 @@ SkyServe also supports blue-green updates, by the following command: $ sky serve update --mode blue_green service-name new_service.yaml - + In this update mode, SkyServe will launch new replicas described by ``new_service.yaml`` with the following behavior: * An update is initiated, and traffic will continue to be redirected to existing (old) replicas. diff --git a/docs/source/serving/user-guides.rst b/docs/source/serving/user-guides.rst index 8b9cba92b45..be4b1149f5b 100644 --- a/docs/source/serving/user-guides.rst +++ b/docs/source/serving/user-guides.rst @@ -7,3 +7,4 @@ Serving User Guides update auth spot-policy + https diff --git a/sky/cli.py b/sky/cli.py index bf689dc1c62..62842190e00 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3785,7 +3785,7 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool): watch -n60 sky jobs queue """ - click.secho('Fetching managed job statuses...', fg='yellow') + click.secho('Fetching managed jobs...', fg='yellow') with rich_utils.safe_status( ux_utils.spinner_message('Checking managed jobs')): _, msg = _get_managed_jobs(refresh=refresh, From 4dda53fa66b8b7378d6ae5c331a56225d3f31f11 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Tue, 11 Feb 2025 11:38:06 -0800 Subject: [PATCH 05/18] [CLI UX]: Clean up task/job terminology a bit. (#4685) * [CLI UX]: Clean up task/job terminology a bit. 
* message * Unbold optimizer; lint * lower case * remove arg --- sky/cli.py | 32 ++++++++++++++------------------ sky/execution.py | 4 ++-- sky/optimizer.py | 6 ++---- 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 62842190e00..8ce3051e7df 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -623,7 +623,8 @@ def _launch_with_confirm( click.confirm(prompt, default=True, abort=True, show_default=True) if not confirm_shown: - click.secho(f'Running task on cluster {cluster}...', fg='yellow') + click.secho('Running on cluster: ', fg='cyan', nl=False) + click.secho(cluster) sky.launch( dag, @@ -722,7 +723,6 @@ def _pop_and_ignore_fields_in_override_params( def _make_task_or_dag_from_entrypoint_with_overrides( entrypoint: Tuple[str, ...], *, - entrypoint_name: str = 'Task', name: Optional[str] = None, workdir: Optional[str] = None, cloud: Optional[str] = None, @@ -754,19 +754,15 @@ def _make_task_or_dag_from_entrypoint_with_overrides( entrypoint: Optional[str] if is_yaml: # Treat entrypoint as a yaml. - click.secho(f'{entrypoint_name} from YAML spec: ', - fg='yellow', - nl=False) - click.secho(entrypoint, bold=True) + click.secho('YAML to run: ', fg='cyan', nl=False) + click.secho(entrypoint) else: if not entrypoint: entrypoint = None else: # Treat entrypoint as a bash command. 
- click.secho(f'{entrypoint_name} from command: ', - fg='yellow', - nl=False) - click.secho(entrypoint, bold=True) + click.secho('Command to run: ', fg='cyan', nl=False) + click.secho(entrypoint) override_params = _parse_override_params(cloud=cloud, region=region, @@ -1333,7 +1329,8 @@ def exec( 'supports a single task only.') task = task_or_dag - click.secho(f'Executing task on cluster {cluster}...', fg='yellow') + click.secho('Submitting job to cluster: ', fg='cyan', nl=False) + click.secho(cluster) sky.exec(task, backend=backend, cluster_name=cluster, detach_run=detach_run) @@ -1982,7 +1979,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin def queue(clusters: List[str], skip_finished: bool, all_users: bool): # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Show the job queue for cluster(s).""" - click.secho('Fetching and parsing job queue...', fg='yellow') + click.secho('Fetching and parsing job queue...', fg='cyan') if clusters: clusters = _get_glob_clusters(clusters) else: @@ -3785,7 +3782,7 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool): watch -n60 sky jobs queue """ - click.secho('Fetching managed jobs...', fg='yellow') + click.secho('Fetching managed jobs...', fg='cyan') with rich_utils.safe_status( ux_utils.spinner_message('Checking managed jobs')): _, msg = _get_managed_jobs(refresh=refresh, @@ -3938,7 +3935,7 @@ def jobs_dashboard(port: Optional[int]): # see if the controller is UP first, which is slow; (2) not have to run SSH # port forwarding first (we'd just launch a local dashboard which would make # REST API calls to the controller dashboard server). - click.secho('Checking if jobs controller is up...', fg='yellow') + click.secho('Checking if jobs controller is up...', fg='cyan') hint = ('Dashboard is not available if jobs controller is not up. 
Run a ' 'managed job first.') backend_utils.is_controller_accessible( @@ -4032,7 +4029,6 @@ def _generate_task_with_service( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - entrypoint_name='Service', ) if isinstance(task, sky.Dag): raise click.UsageError( @@ -4197,7 +4193,7 @@ def serve_up( ports=ports, not_supported_cmd='sky serve up', ) - click.secho('Service Spec:', fg='cyan') + click.secho('Service spec:', fg='cyan') click.echo(task.service) click.secho('Each replica will use the following resources (estimated):', @@ -4315,7 +4311,7 @@ def serve_update( ports=ports, not_supported_cmd='sky serve update', ) - click.secho('Service Spec:', fg='cyan') + click.secho('Service spec:', fg='cyan') click.echo(task.service) click.secho('New replica will use the following resources (estimated):', @@ -4767,7 +4763,7 @@ def benchmark_launch( 'Please provide a YAML file.') assert config is not None, (is_yaml, config) - click.secho('Benchmarking a task from YAML spec: ', fg='yellow', nl=False) + click.secho('Benchmarking a task from YAML: ', fg='cyan', nl=False) click.secho(entrypoint, bold=True) candidates = _get_candidate_configs(entrypoint) diff --git a/sky/execution.py b/sky/execution.py index 7392d510b17..bb8ba725314 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -259,8 +259,8 @@ def _execute( bold = colorama.Style.BRIGHT reset = colorama.Style.RESET_ALL logger.info( - f'{yellow}Launching an unmanaged spot task, which does not ' - f'automatically recover from preemptions.{reset}\n{yellow}To ' + f'{yellow}Launching a spot job that does not ' + f'automatically recover from preemptions. 
To ' 'get automatic recovery, use managed job instead: ' f'{reset}{bold}sky jobs launch{reset} {yellow}or{reset} ' f'{bold}sky.jobs.launch(){reset}.') diff --git a/sky/optimizer.py b/sky/optimizer.py index 5aab31d7750..b5d0114476b 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -884,10 +884,8 @@ def _get_resource_group_hash(resources: 'resources_lib.Resources'): # Add a new line for better readability, when there are multiple # tasks. logger.info('') - logger.info( - f'{colorama.Style.BRIGHT}Considered resources {task_str}' - f'({task.num_nodes} node{plural}):' - f'{colorama.Style.RESET_ALL}') + logger.info(f'Considered resources {task_str}' + f'({task.num_nodes} node{plural}):') # Only print 1 row per cloud. # The following code is to generate the table From a1619c9d378727da38bd512e162bb18a53840481 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 11 Feb 2025 11:41:57 -0800 Subject: [PATCH 06/18] Add heartbeat for usage collection (#4499) * Add heartbeat usage * upgrade loki 3.3 * fix type * embed version in type * Use the same type * use schema 1 * remove entrypoint from labels * Add interval seconds * Respect env var for skylet * format * Add comment * Increase grpc limit * Use send time * Add comments * Add comment --- sky/provision/instance_setup.py | 43 +++++++++++++++++++--- sky/skylet/events.py | 9 +++++ sky/skylet/skylet.py | 2 ++ sky/usage/constants.py | 3 +- sky/usage/loki-s3-config.yaml | 2 +- sky/usage/usage_lib.py | 64 +++++++++++++++++++++++++++------ sky/utils/env_options.py | 6 ++++ 7 files changed, 111 insertions(+), 18 deletions(-) diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index 86d1c59f36c..c5df48c8bae 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -15,9 +15,12 @@ from sky.provision import logging as provision_logging from sky.provision import metadata_utils from sky.skylet import constants +from sky.usage import constants as usage_constants +from sky.usage 
import usage_lib from sky.utils import accelerator_registry from sky.utils import command_runner from sky.utils import common_utils +from sky.utils import env_options from sky.utils import subprocess_utils from sky.utils import timeline from sky.utils import ux_utils @@ -67,6 +70,30 @@ 'sky.skylet.attempt_skylet;') +def _set_usage_run_id_cmd() -> str: + """Gets the command to set the usage run id. + + The command saves the current usage run id to the file, so that the skylet + can use it to report the heartbeat. + + We use a function instead of a constant so that the usage run id is the + latest one when the function is called. + """ + return ( + f'cat {usage_constants.USAGE_RUN_ID_FILE} || ' + # The run id is retrieved locally for the current run, so that the + # remote cluster will be set with the same run id as the initial + # launch operation. + f'echo "{usage_lib.messages.usage.run_id}" > ' + f'{usage_constants.USAGE_RUN_ID_FILE}') + + +def _set_skypilot_env_var_cmd() -> str: + """Sets the skypilot environment variables on the remote machine.""" + env_vars = env_options.Options.all_options() + return '; '.join([f'export {k}={v}' for k, v in env_vars.items()]) + + def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True): """Decorator that retries the function if it fails. @@ -450,11 +477,17 @@ def start_skylet_on_head_node(cluster_name: str, logger.info(f'Running command on head node: {MAYBE_SKYLET_RESTART_CMD}') # We need to source bashrc for skylet to make sure the autostop event can # access the path to the cloud CLIs. - returncode, stdout, stderr = head_runner.run(MAYBE_SKYLET_RESTART_CMD, - stream_logs=False, - require_outputs=True, - log_path=log_path_abs, - source_bashrc=True) + set_usage_run_id_cmd = _set_usage_run_id_cmd() + # Set the skypilot environment variables, including the usage type, debug + # info, and other options. 
+ set_skypilot_env_var_cmd = _set_skypilot_env_var_cmd() + returncode, stdout, stderr = head_runner.run( + f'{set_usage_run_id_cmd}; {set_skypilot_env_var_cmd}; ' + f'{MAYBE_SKYLET_RESTART_CMD}', + stream_logs=False, + require_outputs=True, + log_path=log_path_abs, + source_bashrc=True) if returncode: raise RuntimeError('Failed to start skylet on the head node ' f'(exit code {returncode}). Error: ' diff --git a/sky/skylet/events.py b/sky/skylet/events.py index e909a5e8f23..b93d7bd490c 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -20,6 +20,7 @@ from sky.skylet import autostop_lib from sky.skylet import constants from sky.skylet import job_lib +from sky.usage import usage_lib from sky.utils import cluster_yaml_utils from sky.utils import common_utils from sky.utils import ux_utils @@ -90,6 +91,14 @@ def _run(self): serve_utils.update_service_status() +class UsageHeartbeatReportEvent(SkyletEvent): + """Skylet event for reporting usage.""" + EVENT_INTERVAL_SECONDS = 600 + + def _run(self): + usage_lib.send_heartbeat(interval_seconds=self.EVENT_INTERVAL_SECONDS) + + class AutostopEvent(SkyletEvent): """Skylet event for autostop. diff --git a/sky/skylet/skylet.py b/sky/skylet/skylet.py index 85c2cb5c4de..522010825e7 100644 --- a/sky/skylet/skylet.py +++ b/sky/skylet/skylet.py @@ -25,6 +25,8 @@ # unhealthy, this event will correctly update the controller # status to CONTROLLER_FAILED. events.ServiceUpdateEvent(), + # Report usage heartbeat every 10 minutes. 
+ events.UsageHeartbeatReportEvent(), ] while True: diff --git a/sky/usage/constants.py b/sky/usage/constants.py index c2f8c6d067b..65cd9f89584 100644 --- a/sky/usage/constants.py +++ b/sky/usage/constants.py @@ -3,7 +3,6 @@ LOG_URL = 'http://usage.skypilot.co:9090/loki/api/v1/push' # pylint: disable=line-too-long USAGE_MESSAGE_SCHEMA_VERSION = 1 - PRIVACY_POLICY_PATH = '~/.sky/privacy_policy' USAGE_POLICY_MESSAGE = ( @@ -15,3 +14,5 @@ USAGE_MESSAGE_REDACT_KEYS = ['setup', 'run', 'envs'] USAGE_MESSAGE_REDACT_TYPES = {str, dict} + +USAGE_RUN_ID_FILE = '~/.sky/usage_run_id' diff --git a/sky/usage/loki-s3-config.yaml b/sky/usage/loki-s3-config.yaml index 35a939e4fad..c51f7c15561 100644 --- a/sky/usage/loki-s3-config.yaml +++ b/sky/usage/loki-s3-config.yaml @@ -26,8 +26,8 @@ schema_config: ingester: chunk_idle_period: 3m - chunk_block_size: 262144 chunk_retain_period: 1m + chunk_block_size: 1048576 # 1 MB wal: enabled: true dir: /loki/wal diff --git a/sky/usage/usage_lib.py b/sky/usage/usage_lib.py index 3cc630b3a98..58f518f4e14 100644 --- a/sky/usage/usage_lib.py +++ b/sky/usage/usage_lib.py @@ -44,6 +44,7 @@ def _get_current_timestamp_ns() -> int: class MessageType(enum.Enum): """Types for messages to be sent to Loki.""" USAGE = 'usage' + HEARTBEAT = 'heartbeat' # TODO(zhwu): Add more types, e.g., cluster_lifecycle. 
@@ -67,8 +68,9 @@ def get_properties(self) -> Dict[str, Any]: properties = self.__dict__.copy() return {k: v for k, v in properties.items() if not k.startswith('_')} - def __repr__(self): - raise NotImplementedError + def __repr__(self) -> str: + d = self.get_properties() + return json.dumps(d) class UsageMessageToReport(MessageToReport): @@ -160,10 +162,6 @@ def __init__(self) -> None: self.exception: Optional[str] = None # entrypoint_context self.stacktrace: Optional[str] = None # entrypoint_context - def __repr__(self) -> str: - d = self.get_properties() - return json.dumps(d) - def update_entrypoint(self, msg: str): self.entrypoint = msg @@ -275,16 +273,43 @@ def update_runtime(self, name_or_fn: str): name_or_fn) +class HeartbeatMessageToReport(MessageToReport): + """Message to be reported to Grafana Loki for heartbeat on a cluster.""" + + def __init__(self, interval_seconds: int = 600): + super().__init__(constants.USAGE_MESSAGE_SCHEMA_VERSION) + # This interval_seconds is mainly for recording the heartbeat interval + # in the heartbeat message, so that the collector can use it. + self.interval_seconds = interval_seconds + + def get_properties(self) -> Dict[str, Any]: + properties = super().get_properties() + # The run id is set by the skylet, which will always be the same for + # the entire lifetime of the run. 
+ with open(os.path.expanduser(constants.USAGE_RUN_ID_FILE), + 'r', + encoding='utf-8') as f: + properties['run_id'] = f.read().strip() + return properties + + class MessageCollection: """A collection of messages.""" def __init__(self): - self._messages = {MessageType.USAGE: UsageMessageToReport()} + self._messages = { + MessageType.USAGE: UsageMessageToReport(), + MessageType.HEARTBEAT: HeartbeatMessageToReport() + } @property - def usage(self): + def usage(self) -> UsageMessageToReport: return self._messages[MessageType.USAGE] + @property + def heartbeat(self) -> HeartbeatMessageToReport: + return self._messages[MessageType.HEARTBEAT] + def reset(self, message_type: MessageType): self._messages[message_type] = self._messages[message_type].__class__() @@ -308,13 +333,25 @@ def _send_to_loki(message_type: MessageType): message = messages[message_type] + # In case the message has no start time, set it to the current time. + message.start() message.send_time = _get_current_timestamp_ns() - log_timestamp = message.start_time + # Use send time instead of start time to avoid the message being dropped + # by Loki, due to the timestamp being too old. We still have the start time + # in the message for dashboard. 
+ log_timestamp = message.send_time environment = 'prod' if env_options.Options.IS_DEVELOPER.get(): environment = 'dev' - prom_labels = {'type': message_type.value, 'environment': environment} + prom_labels = { + 'type': message_type.value, + 'environment': environment, + 'schema_version': message.schema_version, + } + if message_type == MessageType.USAGE: + prom_labels['new_cluster'] = (message.original_cluster_status != 'UP' + and message.final_cluster_status == 'UP') headers = {'Content-type': 'application/json'} payload = { @@ -392,7 +429,7 @@ def prepare_json_from_yaml_config( def _send_local_messages(): """Send all messages not been uploaded to Loki.""" for msg_type, message in messages.items(): - if not message.message_sent: + if not message.message_sent and msg_type != MessageType.HEARTBEAT: # Avoid the fallback entrypoint to send the message again # in normal case. try: @@ -402,6 +439,11 @@ def _send_local_messages(): f'exception caught: {type(e)}({e})') +def send_heartbeat(interval_seconds: int = 600): + messages.heartbeat.interval_seconds = interval_seconds + _send_to_loki(MessageType.HEARTBEAT) + + @contextlib.contextmanager def entrypoint_context(name: str, fallback: bool = False): """Context manager for entrypoint. 
diff --git a/sky/utils/env_options.py b/sky/utils/env_options.py index cfc20a76253..b1dd10a219b 100644 --- a/sky/utils/env_options.py +++ b/sky/utils/env_options.py @@ -1,6 +1,7 @@ """Global environment options for sky.""" import enum import os +from typing import Dict class Options(enum.Enum): @@ -35,3 +36,8 @@ def get(self) -> bool: def env_key(self) -> str: """The environment variable key name.""" return self.value[0] + + @classmethod + def all_options(cls) -> Dict[str, bool]: + """Returns all options as a dictionary.""" + return {option.env_key: option.get() for option in list(Options)} From 0a077c48616f853eee9e598f353384dfee2eb689 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Tue, 11 Feb 2025 12:57:50 -0800 Subject: [PATCH 07/18] [Docs] Installation: move up k8s. (#4696) * [Docs] Installation: move up k8s. * update --- docs/source/getting-started/installation.rst | 58 ++++++++++---------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst index 06b7f8d8663..b20f6839cbe 100644 --- a/docs/source/getting-started/installation.rst +++ b/docs/source/getting-started/installation.rst @@ -107,27 +107,27 @@ To use more than one cloud, combine the pip extras: .. code-block:: shell - pip install -U "skypilot[aws,gcp]" + pip install -U "skypilot[kubernetes,aws,gcp]" .. tab-item:: Nightly :sync: nightly-tab .. code-block:: shell - pip install -U "skypilot-nightly[aws,gcp]" + pip install -U "skypilot-nightly[kubernetes,aws,gcp]" .. tab-item:: From Source :sync: from-source-tab .. code-block:: shell - pip install -e ".[aws,gcp]" + pip install -e ".[kubernetes,aws,gcp]" Alternatively, we also provide a :ref:`Docker image ` as a quick way to try out SkyPilot. .. 
_verify-cloud-access: -Verifying cloud access +Verify cloud access ------------------------------------ After installation, run :code:`sky check` to verify that credentials are correctly set up: @@ -156,8 +156,6 @@ This will produce a summary like: Cloudflare (for R2 object store): enabled Kubernetes: enabled - SkyPilot will use only the enabled clouds to run tasks. To change this, configure cloud credentials, and run sky check. - If any cloud's credentials or dependencies are missing, ``sky check`` will output hints on how to resolve them. You can also refer to the cloud setup section :ref:`below `. @@ -173,16 +171,29 @@ section :ref:`below `. .. _cloud-account-setup: -Cloud account setup -------------------- - -SkyPilot currently supports these cloud providers: AWS, GCP, Azure, OCI, Lambda Cloud, RunPod, Fluidstack, Paperspace, Cudo, -IBM, SCP, VMware vSphere and Cloudflare (for R2 object store). +Set up Kubernetes or clouds +--------------------------- -If you already have cloud access set up on your local machine, run ``sky check`` to :ref:`verify that SkyPilot can properly access your enabled clouds`. +SkyPilot supports most major cloud providers. +If you already have cloud access set up on your local machine, run ``sky check`` to :ref:`verify that SkyPilot can access your enabled clouds`. Otherwise, configure access to at least one cloud using the following guides. +Kubernetes +~~~~~~~~~~ + +SkyPilot can run workloads on on-prem or cloud-hosted Kubernetes clusters +(e.g., EKS, GKE). The only requirement is a valid kubeconfig at +:code:`~/.kube/config`. + +.. code-block:: shell + + # Place your kubeconfig at ~/.kube/config + mkdir -p ~/.kube + cp /path/to/kubeconfig ~/.kube/config + +See :ref:`SkyPilot on Kubernetes ` for more. + .. _aws-installation: Amazon Web Services (AWS) @@ -303,7 +314,7 @@ Vast `Vast `__ is a cloud provider that offers low-cost GPUs. To configure Vast access, go to the `Account `_ page on your Vast console to get your **API key**. 
Then, run: .. code-block:: shell - + pip install "vastai-sdk>=0.1.12" echo "" > ~/.vast_api_key @@ -313,7 +324,7 @@ RunPod `RunPod `__ is a specialized AI cloud provider that offers low-cost GPUs. To configure RunPod access, go to the `Settings `_ page on your RunPod console and generate an **API key**. Then, run: .. code-block:: shell - + pip install "runpod>=1.6.1" runpod config @@ -485,21 +496,8 @@ Next, get your `Account ID `_ or reach out to us on `Slack `_. -Kubernetes -~~~~~~~~~~ - -SkyPilot can also run tasks on on-prem or cloud hosted Kubernetes clusters (e.g., EKS, GKE). The only requirement is a valid kubeconfig at :code:`~/.kube/config`. - -.. code-block:: shell - - # Place your kubeconfig at ~/.kube/config - mkdir -p ~/.kube - cp /path/to/kubeconfig ~/.kube/config - -See :ref:`SkyPilot on Kubernetes ` for more. - -Requesting quotas for first time users +Request quotas for first time users -------------------------------------- If your cloud account has not been used to launch instances before, the @@ -511,7 +509,7 @@ increases before proceeding. .. _docker-image: -Quick alternative: trying in Docker +Quick alternative: Trying in Docker ------------------------------------------------------ As a **quick alternative to installing SkyPilot on your laptop**, we also @@ -557,7 +555,7 @@ See more details about the dev container image .. _shell-completion: -Enabling shell completion +Enable shell completion ------------------------- SkyPilot supports shell completion for Bash (Version 4.4 and up), Zsh and Fish. This is only available for :code:`click` versions 8.0 and up (use :code:`pip install click==8.0.4` to install). 
From b2d0333f44bf868614f3639c73eb5302016666df Mon Sep 17 00:00:00 2001 From: zpoint Date: Wed, 12 Feb 2025 11:17:18 +0800 Subject: [PATCH 08/18] Buildkite support for the `--kubernetes` flag with EKS/GKE cluster (#4684) * run on gke * support eks test * require eks * replace all require_gke * resolve conflict * rename mark to resource_heavy and support both GKE and EKS * remove mark for test_launch_fast --- .buildkite/generate_pipeline.py | 12 ++++++++++-- tests/smoke_tests/test_cluster_job.py | 12 +++++++++--- tests/smoke_tests/test_managed_job.py | 3 +++ tests/smoke_tests/test_mount_and_storage.py | 1 + tests/smoke_tests/test_sky_serve.py | 14 +++++++++++++- 5 files changed, 36 insertions(+), 6 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 050e129135e..d3041fdad7b 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -39,7 +39,12 @@ QUEUE_GENERIC_CLOUD = 'generic_cloud' QUEUE_GENERIC_CLOUD_SERVE = 'generic_cloud_serve' QUEUE_KUBERNETES = 'kubernetes' +QUEUE_EKS = 'eks' QUEUE_GKE = 'gke' +# We use KUBE_BACKEND to specify the queue for kubernetes tests mark as +# resource_heavy. It can be either EKS or GKE. +QUEUE_KUBE_BACKEND = os.getenv('KUBE_BACKEND', QUEUE_EKS).lower() +assert QUEUE_KUBE_BACKEND in [QUEUE_EKS, QUEUE_GKE] # Only aws, gcp, azure, and kubernetes are supported for now. # Other clouds do not have credentials. CLOUD_QUEUE_MAP = { @@ -174,7 +179,9 @@ def _extract_marked_tests( for function_name, marks in function_name_marks_map.items(): clouds_to_include = [] is_serve_test = 'serve' in marks - run_on_gke = 'requires_gke' in marks + run_on_cloud_kube_backend = ('resource_heavy' in marks and + 'kubernetes' in default_clouds_to_run) + for mark in marks: if mark not in PYTEST_TO_CLOUD_KEYWORD: # This mark does not specify a cloud, so we skip it. 
@@ -210,7 +217,8 @@ def _extract_marked_tests( param_list += [None ] * (len(final_clouds_to_include) - len(param_list)) function_cloud_map[function_name] = (final_clouds_to_include, [ - QUEUE_GKE if run_on_gke else cloud_queue_map[cloud] + QUEUE_KUBE_BACKEND + if run_on_cloud_kube_backend else cloud_queue_map[cloud] for cloud in final_clouds_to_include ], param_list) diff --git a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py index 21dccee2ba7..fcedda5afaf 100644 --- a/tests/smoke_tests/test_cluster_job.py +++ b/tests/smoke_tests/test_cluster_job.py @@ -45,6 +45,7 @@ @pytest.mark.no_scp # SCP does not have T4 gpus. Run test_scp_job_queue instead @pytest.mark.no_paperspace # Paperspace does not have T4 gpus. @pytest.mark.no_oci # OCI does not have T4 gpus +@pytest.mark.resource_heavy @pytest.mark.parametrize('accelerator', [{'do': 'H100'}]) def test_job_queue(generic_cloud: str, accelerator: Dict[str, str]): accelerator = accelerator.get(generic_cloud, 'T4') @@ -267,6 +268,7 @@ def test_job_queue_multinode(generic_cloud: str, accelerator: Dict[str, str]): @pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs @pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs @pytest.mark.no_vast # Vast doesn't guarantee exactly 8 CPUs, only at least. +@pytest.mark.resource_heavy def test_large_job_queue(generic_cloud: str): name = smoke_tests_utils.get_cluster_name() test = smoke_tests_utils.Test( @@ -313,6 +315,7 @@ def test_large_job_queue(generic_cloud: str): @pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs @pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs @pytest.mark.no_vast # No Vast Cloud VM has 8 CPUs +@pytest.mark.resource_heavy def test_fast_large_job_queue(generic_cloud: str): # This is to test the jobs can be scheduled quickly when there are many jobs in the queue. 
name = smoke_tests_utils.get_cluster_name() @@ -401,6 +404,7 @@ def test_docker_preinstalled_package(generic_cloud: str): @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet @pytest.mark.no_oci # OCI Cloud does not have T4 gpus @pytest.mark.no_do # DO does not have T4 gpus +@pytest.mark.resource_heavy def test_multi_echo(generic_cloud: str): name = smoke_tests_utils.get_cluster_name() test = smoke_tests_utils.Test( @@ -444,6 +448,7 @@ def test_multi_echo(generic_cloud: str): @pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus @pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA @pytest.mark.no_scp # SCP does not have V100 (16GB) GPUs. Run test_scp_huggingface instead. +@pytest.mark.resource_heavy @pytest.mark.parametrize('accelerator', [{'do': 'H100'}]) def test_huggingface(generic_cloud: str, accelerator: Dict[str, str]): accelerator = accelerator.get(generic_cloud, 'T4') @@ -575,7 +580,6 @@ def test_tpu_vm_pod(): # ---------- TPU Pod Slice on GKE. ---------- -@pytest.mark.requires_gke @pytest.mark.kubernetes @pytest.mark.skip def test_tpu_pod_slice_gke(): @@ -695,6 +699,7 @@ def test_azure_http_server_with_custom_ports(): # ---------- Web apps with custom ports on Kubernetes. 
---------- @pytest.mark.kubernetes +@pytest.mark.resource_heavy def test_kubernetes_http_server_with_custom_ports(): name = smoke_tests_utils.get_cluster_name() test = smoke_tests_utils.Test( @@ -888,7 +893,7 @@ def test_add_and_remove_pod_annotations_with_autostop(): # ---------- Container logs from task on Kubernetes ---------- -@pytest.mark.requires_gke +@pytest.mark.resource_heavy @pytest.mark.kubernetes def test_container_logs_multinode_kubernetes(): name = smoke_tests_utils.get_cluster_name() @@ -1256,6 +1261,7 @@ def test_cancel_azure(): @pytest.mark.no_paperspace # Paperspace has `gnome-shell` on nvidia-smi @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet @pytest.mark.no_vast # Vast does not support num_nodes > 1 yet +@pytest.mark.resource_heavy @pytest.mark.parametrize('accelerator', [{'do': 'H100'}]) def test_cancel_pytorch(generic_cloud: str, accelerator: Dict[str, str]): accelerator = accelerator.get(generic_cloud, 'T4') @@ -1445,7 +1451,7 @@ def test_aws_custom_image(): smoke_tests_utils.run_one_test(test) -@pytest.mark.requires_gke +@pytest.mark.resource_heavy @pytest.mark.kubernetes @pytest.mark.parametrize( 'image_id', diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index c55931f8da8..0b30bf1f63a 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -42,6 +42,7 @@ # when the controller being on Azure, which takes a long time for launching # step. 
@pytest.mark.managed_jobs +@pytest.mark.resource_heavy def test_managed_jobs_basic(generic_cloud: str): """Test the managed jobs yaml.""" name = smoke_tests_utils.get_cluster_name() @@ -698,6 +699,7 @@ def test_managed_jobs_retry_logs(generic_cloud: str): @pytest.mark.no_do # DO does not support spot instances @pytest.mark.no_vast # Uses other clouds @pytest.mark.managed_jobs +@pytest.mark.resource_heavy def test_managed_jobs_storage(generic_cloud: str): """Test storage with managed job""" name = smoke_tests_utils.get_cluster_name() @@ -884,6 +886,7 @@ def test_managed_jobs_tpu(): # ---------- Testing env for managed jobs ---------- @pytest.mark.no_vast # Uses unsatisfiable machines @pytest.mark.managed_jobs +@pytest.mark.resource_heavy def test_managed_jobs_inline_env(generic_cloud: str): """Test managed jobs env""" name = smoke_tests_utils.get_cluster_name() diff --git a/tests/smoke_tests/test_mount_and_storage.py b/tests/smoke_tests/test_mount_and_storage.py index 2b0ba79cec9..c7f3e356c0d 100644 --- a/tests/smoke_tests/test_mount_and_storage.py +++ b/tests/smoke_tests/test_mount_and_storage.py @@ -321,6 +321,7 @@ def test_kubernetes_context_switch(): @pytest.mark.no_vast # Requires AWS +@pytest.mark.resource_heavy @pytest.mark.parametrize( 'image_id', [ diff --git a/tests/smoke_tests/test_sky_serve.py b/tests/smoke_tests/test_sky_serve.py index fca9f45a576..ddc3ea8a92a 100644 --- a/tests/smoke_tests/test_sky_serve.py +++ b/tests/smoke_tests/test_sky_serve.py @@ -220,7 +220,7 @@ def test_skyserve_azure_http(): @pytest.mark.kubernetes @pytest.mark.serve -@pytest.mark.requires_gke +@pytest.mark.resource_heavy def test_skyserve_kubernetes_http(): """Test skyserve on Kubernetes""" name = _get_service_name() @@ -241,6 +241,7 @@ def test_skyserve_oci_http(): @pytest.mark.no_vast # Vast has low availability of T4 GPUs @pytest.mark.parametrize('accelerator', [{'do': 'H100'}]) @pytest.mark.serve +@pytest.mark.resource_heavy def test_skyserve_llm(generic_cloud: str, 
accelerator: Dict[str, str]): """Test skyserve with real LLM usecase""" accelerator = accelerator.get(generic_cloud, 'T4') @@ -370,6 +371,7 @@ def test_skyserve_dynamic_ondemand_fallback(): @pytest.mark.no_do # DO does not support `--cpus 2` @pytest.mark.serve @pytest.mark.no_vast # Vast doesn't support opening ports +@pytest.mark.resource_heavy def test_skyserve_user_bug_restart(generic_cloud: str): """Tests that we restart the service after user bug.""" # TODO(zhwu): this behavior needs some rethinking. @@ -471,6 +473,7 @@ def test_skyserve_auto_restart(): @pytest.mark.no_vast # Vast doesn't support opening ports @pytest.mark.serve +@pytest.mark.resource_heavy def test_skyserve_cancel(generic_cloud: str): """Test skyserve with cancel""" name = _get_service_name() @@ -497,6 +500,7 @@ def test_skyserve_cancel(generic_cloud: str): @pytest.mark.no_vast # Vast doesn't support opening ports @pytest.mark.serve +@pytest.mark.resource_heavy def test_skyserve_streaming(generic_cloud: str): """Test skyserve with streaming""" name = _get_service_name() @@ -541,6 +545,7 @@ def test_skyserve_readiness_timeout_fail(generic_cloud: str): @pytest.mark.no_vast # Vast doesn't support opening ports @pytest.mark.serve +@pytest.mark.resource_heavy def test_skyserve_large_readiness_timeout(generic_cloud: str): """Test skyserve with customized large readiness timeout""" name = _get_service_name() @@ -563,6 +568,7 @@ def test_skyserve_large_readiness_timeout(generic_cloud: str): @pytest.mark.no_do # DO does not support `--cpus 2` @pytest.mark.no_vast # Vast doesn't support opening ports @pytest.mark.serve +@pytest.mark.resource_heavy def test_skyserve_update(generic_cloud: str): """Test skyserve with update""" name = _get_service_name() @@ -595,6 +601,7 @@ def test_skyserve_update(generic_cloud: str): @pytest.mark.no_do # DO does not support `--cpus 2` @pytest.mark.no_vast # Vast doesn't support opening ports @pytest.mark.serve +@pytest.mark.resource_heavy def 
test_skyserve_rolling_update(generic_cloud: str): """Test skyserve with rolling update""" name = _get_service_name() @@ -633,6 +640,7 @@ def test_skyserve_rolling_update(generic_cloud: str): @pytest.mark.no_fluidstack @pytest.mark.no_vast # Vast doesn't support opening ports @pytest.mark.serve +@pytest.mark.resource_heavy def test_skyserve_fast_update(generic_cloud: str): """Test skyserve with fast update (Increment version of old replicas)""" name = _get_service_name() @@ -675,6 +683,7 @@ def test_skyserve_fast_update(generic_cloud: str): @pytest.mark.no_vast # Vast doesn't support opening ports @pytest.mark.serve +@pytest.mark.resource_heavy def test_skyserve_update_autoscale(generic_cloud: str): """Test skyserve update with autoscale""" name = _get_service_name() @@ -781,6 +790,7 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): @pytest.mark.no_do # DO does not support `--cpus 2` @pytest.mark.no_vast # Vast doesn't support opening ports @pytest.mark.serve +@pytest.mark.resource_heavy def test_skyserve_failures(generic_cloud: str): """Test replica failure statuses""" name = _get_service_name() @@ -828,6 +838,7 @@ def test_skyserve_failures(generic_cloud: str): @pytest.mark.serve +@pytest.mark.resource_heavy def test_skyserve_https(generic_cloud: str): """Test skyserve with https""" name = _get_service_name() @@ -865,6 +876,7 @@ def test_skyserve_https(generic_cloud: str): @pytest.mark.serve +@pytest.mark.resource_heavy def test_skyserve_multi_ports(generic_cloud: str): """Test skyserve with multiple ports""" name = _get_service_name() From 9c7ab0fbbd687b992538f3f9e361945fb33a1ea7 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 11 Feb 2025 22:12:18 -0800 Subject: [PATCH 09/18] [Examples] Fix ray distributed training example (#4697) * Fix worker getting killed * Fix worker getting killed * Add comment --- docs/source/running-jobs/distributed-jobs.rst | 2 ++ examples/distributed_ray_train/ray_train.yaml | 2 ++ 2 files changed, 4 
insertions(+) diff --git a/docs/source/running-jobs/distributed-jobs.rst b/docs/source/running-jobs/distributed-jobs.rst index d39ed6cf571..278880c56eb 100644 --- a/docs/source/running-jobs/distributed-jobs.rst +++ b/docs/source/running-jobs/distributed-jobs.rst @@ -185,6 +185,8 @@ To execute a distributed Ray program on many nodes, you can download the `traini else sleep 5 ps aux | grep ray | grep 6379 &> /dev/null || ray start --address $HEAD_IP:6379 --disable-usage-stats + # Add sleep to after `ray start` to give ray enough time to daemonize + sleep 5 fi .. warning:: diff --git a/examples/distributed_ray_train/ray_train.yaml b/examples/distributed_ray_train/ray_train.yaml index f804438849d..dd92d69d738 100644 --- a/examples/distributed_ray_train/ray_train.yaml +++ b/examples/distributed_ray_train/ray_train.yaml @@ -29,4 +29,6 @@ run: | else sleep 5 ps aux | grep ray | grep 6379 &> /dev/null || ray start --address $head_ip:6379 --disable-usage-stats + # Add sleep to after `ray start` to give ray enough time to daemonize + sleep 5 fi From 39c90c00c531c91824119d490b3ebe40dde37ae2 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 11 Feb 2025 22:20:11 -0800 Subject: [PATCH 10/18] [Tests] Fix pod tag test (#4700) * fix pod tag test * format --- tests/smoke_tests/test_cluster_job.py | 35 +++++++++++++++------------ 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py index fcedda5afaf..e3a2e954dc9 100644 --- a/tests/smoke_tests/test_cluster_job.py +++ b/tests/smoke_tests/test_cluster_job.py @@ -845,14 +845,15 @@ def test_add_pod_annotations_for_autodown_with_launch(): # Autodown is set. f'sky launch -y -c {name} -i 10 --down --num-nodes 2 --cpus=1 --cloud kubernetes', # Get names of the pods containing cluster name. 
- f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', - f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', + f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p) && ' # Describe the first pod and check for annotations. - 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', + 'pod_tag=$(kubectl describe $pod_1); echo "$pod_tag"; echo "$pod_tag" | grep -q skypilot.co/autodown && ' + 'pod_tag=$(kubectl describe $pod_1); echo "$pod_tag"; echo "$pod_tag" | grep -q skypilot.co/idle_minutes_to_autostop', + # Get names of the pods containing cluster name. + f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p) && ' # Describe the second pod and check for annotations. - 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop' + 'pod_tag=$(kubectl describe $pod_2); echo "$pod_tag"; echo "$pod_tag" | grep -q skypilot.co/autodown && ' + 'pod_tag=$(kubectl describe $pod_2); echo "$pod_tag"; echo "$pod_tag" | grep -q skypilot.co/idle_minutes_to_autostop' ], f'sky down -y {name}', ) @@ -870,22 +871,24 @@ def test_add_and_remove_pod_annotations_with_autostop(): # Set autodown on the cluster with 'autostop' command. f'sky autostop -y {name} -i 20 --down', # Get names of the pods containing cluster name. - f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', - f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', + f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p) && ' # Describe the first pod and check for annotations. 
- 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', + 'pod_tag=$(kubectl describe $pod_1); echo "$pod_tag"; echo "$pod_tag" | grep -q skypilot.co/autodown && ' + 'pod_tag=$(kubectl describe $pod_1); echo "$pod_tag"; echo "$pod_tag" | grep -q skypilot.co/idle_minutes_to_autostop', # Describe the second pod and check for annotations. - 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', + f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p) && ' + 'pod_tag=$(kubectl describe $pod_2); echo "$pod_tag"; echo "$pod_tag" | grep -q skypilot.co/autodown && ' + 'pod_tag=$(kubectl describe $pod_2); echo "$pod_tag"; echo "$pod_tag" | grep -q skypilot.co/idle_minutes_to_autostop', # Cancel the set autodown to remove the annotations from the pods. f'sky autostop -y {name} --cancel', # Describe the first pod and check if annotations are removed. - '! kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - '! kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', + f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p) && ' + 'pod_tag=$(kubectl describe $pod_1); echo "$pod_tag"; ! echo "$pod_tag" | grep -q skypilot.co/autodown && ' + 'pod_tag=$(kubectl describe $pod_1); echo "$pod_tag"; ! echo "$pod_tag" | grep -q skypilot.co/idle_minutes_to_autostop', # Describe the second pod and check if annotations are removed. - '! kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - '! kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', + f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p) && ' + 'pod_tag=$(kubectl describe $pod_2); echo "$pod_tag"; ! echo "$pod_tag" | grep -q skypilot.co/autodown && ' + 'pod_tag=$(kubectl describe $pod_2); echo "$pod_tag"; ! 
echo "$pod_tag" | grep -q skypilot.co/idle_minutes_to_autostop', ], f'sky down -y {name}', ) From 1fe3fab0e7a3242f32039d55b456603350dc4196 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 12 Feb 2025 01:14:30 -0800 Subject: [PATCH 11/18] [Tests] Update initial_delay for serve tests to account for slow EKS load balancer provisioning (#4698) Fix worker getting killed --- tests/skyserve/http/kubernetes.yaml | 2 +- tests/skyserve/multi_ports.yaml | 2 +- tests/skyserve/update/bump_version_after.yaml | 2 +- tests/skyserve/update/bump_version_before.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/skyserve/http/kubernetes.yaml b/tests/skyserve/http/kubernetes.yaml index 441d097b12b..88d4465aea4 100644 --- a/tests/skyserve/http/kubernetes.yaml +++ b/tests/skyserve/http/kubernetes.yaml @@ -1,7 +1,7 @@ service: readiness_probe: path: /health - initial_delay_seconds: 20 + initial_delay_seconds: 180 # Use a large delay for EKS LB to be ready replicas: 2 resources: diff --git a/tests/skyserve/multi_ports.yaml b/tests/skyserve/multi_ports.yaml index 5dddf644655..0ed3e05d33a 100644 --- a/tests/skyserve/multi_ports.yaml +++ b/tests/skyserve/multi_ports.yaml @@ -1,7 +1,7 @@ service: readiness_probe: path: /health - initial_delay_seconds: 20 + initial_delay_seconds: 180 # Use a large delay for EKS LB to be ready replicas: 1 ports: 8080 diff --git a/tests/skyserve/update/bump_version_after.yaml b/tests/skyserve/update/bump_version_after.yaml index 6e845f54b9e..a74934ff7ec 100644 --- a/tests/skyserve/update/bump_version_after.yaml +++ b/tests/skyserve/update/bump_version_after.yaml @@ -12,7 +12,7 @@ service: readiness_probe: path: /health - initial_delay_seconds: 20 + initial_delay_seconds: 180 # Use a large delay for EKS LB to be ready replicas: 3 resources: diff --git a/tests/skyserve/update/bump_version_before.yaml b/tests/skyserve/update/bump_version_before.yaml index c9fd957e41a..b3ff49dfeb9 100644 --- 
a/tests/skyserve/update/bump_version_before.yaml +++ b/tests/skyserve/update/bump_version_before.yaml @@ -12,7 +12,7 @@ service: readiness_probe: path: /health - initial_delay_seconds: 20 + initial_delay_seconds: 180 # Use a large delay for EKS LB to be ready replicas: 2 resources: From 21214ce61c7d4257f5643e095da0a4e925f8eb58 Mon Sep 17 00:00:00 2001 From: Cody Brownstein <105375373+cbrownstein-lambda@users.noreply.github.com> Date: Wed, 12 Feb 2025 13:16:20 -0800 Subject: [PATCH 12/18] Add IAD02 (us-east-3) to Lambda Cloud regions (#4703) --- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py b/sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py index 008bfe6abeb..8a599335944 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py @@ -28,6 +28,7 @@ 'asia-northeast-2', 'us-east-1', 'us-east-2', + 'us-east-3', 'us-west-2', 'us-west-1', 'us-south-1', From 57137e4a18d78eafac04caf63b464e7bcd2c2e57 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Wed, 12 Feb 2025 18:35:48 -0800 Subject: [PATCH 13/18] A Workaround for Launching Non-root Customized Docker Images on RunPod (#4683) * refactor: inform that `docker_config` existence conditions * feat: support user-specified ssh username for runpod docker * fix: format and list Resources * fix: do not copy resources if docker_ssh_username not exists * fix: add a space * style: format * refactor: naming, with runpod stressed and ssh understated * docs: also mention this env in task spec * docs: apply suggestions from code review Co-authored-by: Tian Xia * docs: mention this `env` as a note * docs: remove issue --------- Co-authored-by: Tian Xia --- docs/source/examples/docker-containers.rst | 15 ++++++++++++- docs/source/reference/yaml-spec.rst | 4 ++++ sky/clouds/runpod.py | 6 +++++ 
sky/provision/provisioner.py | 7 ++++++ sky/provision/runpod/utils.py | 7 +++--- sky/resources.py | 26 ++++++++++++++++++++++ sky/skylet/constants.py | 2 ++ sky/task.py | 25 +++++++++++++++++++++ sky/templates/runpod-ray.yml.j2 | 2 +- 9 files changed, 89 insertions(+), 5 deletions(-) diff --git a/docs/source/examples/docker-containers.rst b/docs/source/examples/docker-containers.rst index 9e7fd8e9a11..4d83823b8f9 100644 --- a/docs/source/examples/docker-containers.rst +++ b/docs/source/examples/docker-containers.rst @@ -10,7 +10,7 @@ SkyPilot can run a container either as a task, or as the runtime environment of .. note:: - Running docker containers is `not supported on RunPod `_. To use RunPod, either use your docker image (the username should be ``root`` for RunPod) :ref:`as a runtime environment ` or use ``setup`` and ``run`` to configure your environment. See `GitHub issue `_ for more. + Running docker containers is `not supported on RunPod `_. To use RunPod, either use your docker image :ref:`as a runtime environment ` or use ``setup`` and ``run`` to configure your environment. See `GitHub issue `_ for more. .. _docker-containers-as-tasks: @@ -122,6 +122,19 @@ For example, to use the :code:`ubuntu:20.04` image from Docker Hub: run: | # Commands to run inside the container +.. note:: + For **non-root** docker images on RunPod, you must manually set the :code:`SKYPILOT_RUNPOD_DOCKER_USERNAME` environment variable to match the login user of the docker image (set by the last `USER` instruction in the Dockerfile). + + You can set this environment variable in the :code:`envs` section of your task YAML file: + + .. code-block:: yaml + + envs: + SKYPILOT_RUNPOD_DOCKER_USERNAME: + + It's a workaround for RunPod's limitation that we can't get the login user for the created pods, and even `runpodctl` uses a hardcoded `root` for SSH access. + But for other clouds, the login users for the created docker containers are automatically fetched and used. 
+ As another example, here's how to use `NVIDIA's PyTorch NGC Container `_: .. code-block:: yaml diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst index c380c3993d0..0a1ee4762cb 100644 --- a/docs/source/reference/yaml-spec.rst +++ b/docs/source/reference/yaml-spec.rst @@ -285,6 +285,10 @@ Available fields: # Values set here can be overridden by a CLI flag: # `sky launch/exec --env ENV=val` (if ENV is present). # + # For costumized non-root docker image in RunPod, you need to set + # `SKYPILOT_RUNPOD_DOCKER_USERNAME` to specify the login username for the + # docker image. See :ref:`docker-containers-as-runtime-environments` for more. + # # If you want to use a docker image as runtime environment in a private # registry, you can specify your username, password, and registry server as # task environment variable. For example: diff --git a/sky/clouds/runpod.py b/sky/clouds/runpod.py index b1cc016abd9..5f0f83627b0 100644 --- a/sky/clouds/runpod.py +++ b/sky/clouds/runpod.py @@ -177,6 +177,11 @@ def make_deploy_resources_variables( hourly_cost = self.instance_type_to_hourly_cost( instance_type=instance_type, use_spot=use_spot) + # default to root + docker_username_for_runpod = (resources.docker_username_for_runpod + if resources.docker_username_for_runpod + is not None else 'root') + return { 'instance_type': instance_type, 'custom_resources': custom_resources, @@ -184,6 +189,7 @@ def make_deploy_resources_variables( 'image_id': image_id, 'use_spot': use_spot, 'bid_per_gpu': str(hourly_cost), + 'docker_username_for_runpod': docker_username_for_runpod, } def _get_feasible_launchable_resources( diff --git a/sky/provision/provisioner.py b/sky/provision/provisioner.py index 8f2142df273..243b460a469 100644 --- a/sky/provision/provisioner.py +++ b/sky/provision/provisioner.py @@ -450,6 +450,13 @@ def _post_provision_setup( logger.info(f'{indent_str}{colorama.Style.DIM}{vm_str}{plural} {verb} ' f'up.{colorama.Style.RESET_ALL}') + # It's 
promised by the cluster config that docker_config does not + # exist for docker-native clouds, i.e. they provide docker containers + # instead of full VMs, like Kubernetes and RunPod, as it requires some + # special handlings to run docker inside their docker virtualization. + # For their Docker image settings, we do them when provisioning the + # cluster. See provision/{cloud}/instance.py:get_cluster_info for more + # details. if docker_config: status.update( ux_utils.spinner_message( diff --git a/sky/provision/runpod/utils.py b/sky/provision/runpod/utils.py index 6600cfd6198..2c7861abaa7 100644 --- a/sky/provision/runpod/utils.py +++ b/sky/provision/runpod/utils.py @@ -186,7 +186,7 @@ def delete_pod_template(template_name: str) -> None: runpod.runpod.api.graphql.run_graphql_query( f'mutation {{deleteTemplate(templateName: "{template_name}")}}') except runpod.runpod.error.QueryError as e: - logger.warning(f'Failed to delete template {template_name}: {e}' + logger.warning(f'Failed to delete template {template_name}: {e} ' 'Please delete it manually.') @@ -195,8 +195,9 @@ def delete_register_auth(registry_auth_id: str) -> None: try: runpod.runpod.delete_container_registry_auth(registry_auth_id) except runpod.runpod.error.QueryError as e: - logger.warning(f'Failed to delete registry auth {registry_auth_id}: {e}' - 'Please delete it manually.') + logger.warning( + f'Failed to delete registry auth {registry_auth_id}: {e} ' + 'Please delete it manually.') def _create_template_for_docker_login( diff --git a/sky/resources.py b/sky/resources.py index c4f04ea5543..55729894b87 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -67,6 +67,7 @@ def __init__( # Internal use only. 
# pylint: disable=invalid-name _docker_login_config: Optional[docker_utils.DockerLoginConfig] = None, + _docker_username_for_runpod: Optional[str] = None, _is_image_managed: Optional[bool] = None, _requires_fuse: Optional[bool] = None, _cluster_config_overrides: Optional[Dict[str, Any]] = None, @@ -148,6 +149,9 @@ def __init__( _docker_login_config: the docker configuration to use. This includes the docker username, password, and registry server. If None, skip docker login. + _docker_username_for_runpod: the login username for the docker + containers. This is used by RunPod to set the ssh user for the + docker containers. _requires_fuse: whether the task requires FUSE mounting support. This is used internally by certain cloud implementations to do additional setup for FUSE mounting. This flag also safeguards against using @@ -234,6 +238,12 @@ def __init__( self._docker_login_config = _docker_login_config + # TODO(andyl): This ctor param seems to be unused. + # We always use `Task.set_resources` and `Resources.copy` to set the + # `docker_username_for_runpod`. But to keep the consistency with + # `_docker_login_config`, we keep it here. 
+ self._docker_username_for_runpod = _docker_username_for_runpod + self._requires_fuse = _requires_fuse self._cluster_config_overrides = _cluster_config_overrides @@ -479,6 +489,10 @@ def cluster_config_overrides(self) -> Dict[str, Any]: def requires_fuse(self, value: Optional[bool]) -> None: self._requires_fuse = value + @property + def docker_username_for_runpod(self) -> Optional[str]: + return self._docker_username_for_runpod + def _set_cpus( self, cpus: Union[None, int, float, str], @@ -1065,6 +1079,10 @@ def make_deploy_variables(self, cluster_name: resources_utils.ClusterName, cloud_specific_variables = self.cloud.make_deploy_resources_variables( self, cluster_name, region, zones, num_nodes, dryrun) + # TODO(andyl): Should we print some warnings if users' envs share + # same names with the cloud specific variables, but not enabled + # since it's not on the particular cloud? + # Docker run options docker_run_options = skypilot_config.get_nested( ('docker', 'run_options'), @@ -1277,6 +1295,9 @@ def copy(self, **override) -> 'Resources': labels=override.pop('labels', self.labels), _docker_login_config=override.pop('_docker_login_config', self._docker_login_config), + _docker_username_for_runpod=override.pop( + '_docker_username_for_runpod', + self._docker_username_for_runpod), _is_image_managed=override.pop('_is_image_managed', self._is_image_managed), _requires_fuse=override.pop('_requires_fuse', self._requires_fuse), @@ -1438,6 +1459,8 @@ def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources': resources_fields['labels'] = config.pop('labels', None) resources_fields['_docker_login_config'] = config.pop( '_docker_login_config', None) + resources_fields['_docker_username_for_runpod'] = config.pop( + '_docker_username_for_runpod', None) resources_fields['_is_image_managed'] = config.pop( '_is_image_managed', None) resources_fields['_requires_fuse'] = config.pop('_requires_fuse', None) @@ -1486,6 +1509,9 @@ def add_if_not_none(key, value): if 
self._docker_login_config is not None: config['_docker_login_config'] = dataclasses.asdict( self._docker_login_config) + if self._docker_username_for_runpod is not None: + config['_docker_username_for_runpod'] = ( + self._docker_username_for_runpod) add_if_not_none('_cluster_config_overrides', self._cluster_config_overrides) if self._is_image_managed is not None: diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 4d37e9682e2..fe4021c8046 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -110,6 +110,8 @@ DOCKER_SERVER_ENV_VAR, } +RUNPOD_DOCKER_USERNAME_ENV_VAR = 'SKYPILOT_RUNPOD_DOCKER_USERNAME' + # Commands for disable GPU ECC, which can improve the performance of the GPU # for some workloads by 30%. This will only be applied when a user specify # `nvidia_gpus.disable_ecc: true` in ~/.sky/config.yaml. diff --git a/sky/task.py b/sky/task.py index bbf6d59b2ae..09a6c402869 100644 --- a/sky/task.py +++ b/sky/task.py @@ -121,6 +121,9 @@ def _check_docker_login_config(task_envs: Dict[str, str]) -> bool: If any of the docker login env vars is set, all of them must be set. + Returns: + True if there is a valid docker login config in task_envs. + False otherwise. Raises: ValueError: if any of the docker login env vars is set, but not all of them are set. @@ -168,6 +171,23 @@ def _add_docker_login_config(resources: 'resources_lib.Resources'): return type(resources)(new_resources) +def _with_docker_username_for_runpod( + resources: Union[Set['resources_lib.Resources'], + List['resources_lib.Resources']], + task_envs: Dict[str, str], +) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]: + docker_username_for_runpod = task_envs.get( + constants.RUNPOD_DOCKER_USERNAME_ENV_VAR) + + # We should not call r.copy() if docker_username_for_runpod is None, + # to prevent `DummyResources` instance becoming a `Resources` instance. 
+ if docker_username_for_runpod is None: + return resources + return (type(resources)( + r.copy(_docker_username_for_runpod=docker_username_for_runpod) + for r in resources)) + + class Task: """Task: a computation to be run on the cloud.""" @@ -582,6 +602,8 @@ def update_envs( if _check_docker_login_config(self._envs): self.resources = _with_docker_login_config(self.resources, self._envs) + self.resources = _with_docker_username_for_runpod( + self.resources, self._envs) return self @property @@ -647,6 +669,9 @@ def set_resources( resources = {resources} # TODO(woosuk): Check if the resources are None. self.resources = _with_docker_login_config(resources, self.envs) + # Only have effect on RunPod. + self.resources = _with_docker_username_for_runpod( + self.resources, self.envs) # Evaluate if the task requires FUSE and set the requires_fuse flag for _, storage_obj in self.storage_mounts.items(): diff --git a/sky/templates/runpod-ray.yml.j2 b/sky/templates/runpod-ray.yml.j2 index ea57c9ac808..29d007196a4 100644 --- a/sky/templates/runpod-ray.yml.j2 +++ b/sky/templates/runpod-ray.yml.j2 @@ -25,7 +25,7 @@ provider: {%- endif %} auth: - ssh_user: root + ssh_user: {{docker_username_for_runpod}} ssh_private_key: {{ssh_private_key}} available_node_types: From c49961417a83b049b3f3435a252c8ec5ea0fb5e6 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Wed, 12 Feb 2025 20:26:42 -0800 Subject: [PATCH 14/18] [k8s] make sure ray num-cpus is at least 1 (#4707) --- sky/clouds/kubernetes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 3bab7687f48..18c7128d045 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -464,7 +464,9 @@ def make_deploy_resources_variables( # CPU resources on the node instead within the pod. 
custom_ray_options = { 'object-store-memory': 500000000, - 'num-cpus': str(int(cpus)), + # 'num-cpus' must be an integer, but we should not set it to 0 if + # cpus is <1. + 'num-cpus': str(max(int(cpus), 1)), } deploy_vars = { 'instance_type': resources.instance_type, From 5e136c250d7a2fc7d33876aa75da208198975f76 Mon Sep 17 00:00:00 2001 From: Kaiyuan Eric Chen Date: Thu, 13 Feb 2025 16:10:35 -0800 Subject: [PATCH 15/18] [Docs] Add vector database tutorial to documentation (#4713) * add vdb * add vdb to ai gallery * fix warning --- README.md | 3 ++- .../_gallery_original/applications/vector_database.md | 1 + docs/source/_gallery_original/index.rst | 1 + docs/source/docs/index.rst | 2 +- examples/vector_database/README.md | 10 +++++----- 5 files changed, 10 insertions(+), 7 deletions(-) create mode 120000 docs/source/_gallery_original/applications/vector_database.md diff --git a/README.md b/README.md index 8a3361f9f41..ce3ccac8606 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ ---- :fire: *News* :fire: +- [Jan 2025] Prepare and Serve Large-Scale Image Search with **Vector Database**: [**blog post**](https://blog.skypilot.co/large-scale-vector-database/) [**example**](./examples/vector_database/) - [Jan 2025] Launch and Serve **[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)** and **[Janus](https://github.com/deepseek-ai/DeepSeek-Janus)** on Kubernetes or Any Cloud: [**R1 example**](./llm/deepseek-r1/) and [**Janus example**](./llm/deepseek-janus/) - [Oct 2024] :tada: **SkyPilot crossed 1M+ downloads** :tada:: Thank you to our community! [**Twitter/X**](https://x.com/skypilot_org/status/1844770841718067638) - [Sep 2024] Point, Launch and Serve **Llama 3.2** on Kubernetes or Any Cloud: [**example**](./llm/llama-3_2/) @@ -187,7 +188,7 @@ Runnable examples: - [LocalGPT](./llm/localgpt) - [Falcon](./llm/falcon) - Add yours here & see more in [`llm/`](./llm)! 
-- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2), [Airflow](./examples/airflow/training_workflow) and [many more (`examples/`)](./examples). 
+- Framework examples: [Vector Database](./examples/vector_database/), [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2), [Airflow](./examples/airflow/training_workflow) and [many more (`examples/`)](./examples). 
Case Studies and Integrations: [Community Spotlights](https://blog.skypilot.co/community/) diff --git a/docs/source/_gallery_original/applications/vector_database.md b/docs/source/_gallery_original/applications/vector_database.md new file mode 120000 index 00000000000..ebcd50df736 --- /dev/null +++ b/docs/source/_gallery_original/applications/vector_database.md @@ -0,0 +1 @@ +../../../../examples/vector_database/README.md \ No newline at end of file diff --git a/docs/source/_gallery_original/index.rst b/docs/source/_gallery_original/index.rst index e049a4ad322..8e0d0b16c35 100644 --- a/docs/source/_gallery_original/index.rst +++ b/docs/source/_gallery_original/index.rst @@ -50,6 +50,7 @@ Contents :maxdepth: 1 :caption: Applications + Image Vector Database Tabby: Coding Assistant LocalGPT: Chat with PDF diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst index 2e9ca6859c6..ea5d6c6c18e 100644 --- a/docs/source/docs/index.rst +++ b/docs/source/docs/index.rst @@ -108,7 +108,7 @@ Runnable examples: * `LocalGPT `_ * Add yours here & see more in `llm/ `_! -* Framework examples: `PyTorch DDP `_, `DeepSpeed `_, `JAX/Flax on TPU `_, `Stable Diffusion `_, `Detectron2 `_, `Distributed `_ `TensorFlow `_, `NeMo `_, `programmatic grid search `_, `Docker `_, `Cog `_, `Unsloth `_, `Ollama `_, `llm.c `__, `Airflow `_ and `many more `_. +* Framework examples: `Vector Database `_, `PyTorch DDP `_, `DeepSpeed `_, `JAX/Flax on TPU `_, `Stable Diffusion `_, `Detectron2 `_, `Distributed `_ `TensorFlow `_, `NeMo `_, `programmatic grid search `_, `Docker `_, `Cog `_, `Unsloth `_, `Ollama `_, `llm.c `__, `Airflow `_ and `many more `_. Case Studies and Integrations: `Community Spotlights `_ diff --git a/examples/vector_database/README.md b/examples/vector_database/README.md index f127d2c176e..20581cc421c 100644 --- a/examples/vector_database/README.md +++ b/examples/vector_database/README.md @@ -4,7 +4,7 @@ VectorDB with SkyPilot

-### Large-Scale Image Search +## Large-Scale Image Search As the volume of image data grows, the need for efficient and powerful search methods becomes critical. Traditional keyword-based or metadata-based search often fails to capture the full semantic meaning in images. A vector database enables semantic search: you can find images that conceptually match a query (e.g., "a photo of a cloud") rather than relying on textual tags. In particular: @@ -17,7 +17,7 @@ SkyPilot streamlines the process of running such large-scale jobs in the cloud. Please find the complete blog post [here](https://blog.skypilot.co/large-scale-vector-database/) -### Step 0: Set Up The Environment +## Step 0: Set Up The Environment Install the following Prerequisites: * SkyPilot: Make sure you have SkyPilot installed and `sky check` should succeed. Refer to [SkyPilot’s documentation](https://docs.skypilot.co/en/latest/getting-started/installation.html) for instructions. * Hugging Face Token: To download dataset from Hugging Face Hub, you will need your token. Follow the steps below to configure your token. @@ -28,7 +28,7 @@ HF_TOKEN=hf_xxxxx ``` or set up the environment variable `HF_TOKEN`. -### Step 1: Compute Vectors from Image Data with OpenAI CLIP +## Step 1: Compute Vectors from Image Data with OpenAI CLIP You need to convert images into vector representations (embeddings) so they can be stored in a vector database. Models like [CLIP by OpenAI](https://openai.com/index/clip/) learn powerful representations that map images and text into the same embedding space. This allows for semantic similarity calculations, making queries like “a photo of a cloud” match relevant images. Use the following command to launch a job that processes your image dataset and computes the CLIP embeddings: @@ -51,7 +51,7 @@ You can also use `sky jobs queue` and `sky jobs dashboard` to see the status of SkyPilot Dashboard

-### Step 2: Construct the Vector Database from Computed Embeddings +## Step 2: Construct the Vector Database from Computed Embeddings Once you have the image embeddings, you need a specialized engine to perform rapid similarity searches at scale. In this example, we use [ChromaDB](https://docs.trychroma.com/getting-started) to store and query the embeddings. This step ingests the embeddings from Step 1 into a vector database to enable real-time or near real-time search over millions of vectors. To construct the database from embeddings: @@ -68,7 +68,7 @@ Processing batches: 100%|██████████| 1/1 [00:02<00:00, 2.39 Processing files: 100%|██████████| 12/12 [00:05<00:00, 2.04it/s]/1 [00:00 Date: Thu, 13 Feb 2025 17:39:06 -0800 Subject: [PATCH 16/18] [Storage] Azure bucket sub directory is ignored if the bucket previously exists (#4706) * Set sub path for azure storage * Full path for the ux * fix variable reference * minor * enforce azure region for bucket * Delete storage even when failure happens --- sky/data/storage.py | 95 ++++++++++++--------- tests/smoke_tests/test_mount_and_storage.py | 93 +++++++++++++------- 2 files changed, 115 insertions(+), 73 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index 5dce3f0a0d8..c3ccb3dfc67 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -354,7 +354,8 @@ def from_metadata(cls, metadata: StoreMetadata, **override_args): metadata.is_sky_managed), sync_on_reconstruction=override_args.get('sync_on_reconstruction', True), - # backward compatibility + # Backward compatibility + # TODO: remove the hasattr check after v0.11.0 _bucket_sub_path=override_args.get( '_bucket_sub_path', metadata._bucket_sub_path # pylint: disable=protected-access @@ -1462,6 +1463,8 @@ def batch_aws_rsync(self, set to True, the directory is created in the bucket root and contents are uploaded to it. 
""" + sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') def get_file_sync_command(base_dir_path, file_names): includes = ' '.join([ @@ -1469,8 +1472,6 @@ def get_file_sync_command(base_dir_path, file_names): for file_name in file_names ]) base_dir_path = shlex.quote(base_dir_path) - sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') sync_command = ('aws s3 sync --no-follow-symlinks --exclude="*" ' f'{includes} {base_dir_path} ' f's3://{self.name}{sub_path}') @@ -1485,8 +1486,6 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): for file_name in excluded_list ]) src_dir_path = shlex.quote(src_dir_path) - sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') sync_command = (f'aws s3 sync --no-follow-symlinks {excludes} ' f'{src_dir_path} ' f's3://{self.name}{sub_path}/{dest_dir_name}') @@ -1500,7 +1499,7 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): log_path = sky_logging.generate_tmp_logging_file_path( _STORAGE_LOG_FILE_NAME) - sync_path = f'{source_message} -> s3://{self.name}/' + sync_path = f'{source_message} -> s3://{self.name}{sub_path}/' with rich_utils.safe_status( ux_utils.spinner_message(f'Syncing {sync_path}', log_path=log_path)): @@ -1959,11 +1958,13 @@ def batch_gsutil_cp(self, copy_list = '\n'.join( os.path.abspath(os.path.expanduser(p)) for p in source_path_list) gsutil_alias, alias_gen = data_utils.get_gsutil_command() + sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') sync_command = (f'{alias_gen}; echo "{copy_list}" | {gsutil_alias} ' - f'cp -e -n -r -I gs://{self.name}') + f'cp -e -n -r -I gs://{self.name}{sub_path}') log_path = sky_logging.generate_tmp_logging_file_path( _STORAGE_LOG_FILE_NAME) - sync_path = f'{source_message} -> gs://{self.name}/' + sync_path = f'{source_message} -> gs://{self.name}{sub_path}/' with rich_utils.safe_status( ux_utils.spinner_message(f'Syncing {sync_path}', log_path=log_path)): @@ -1995,13 +1996,13 
@@ def batch_gsutil_rsync(self, set to True, the directory is created in the bucket root and contents are uploaded to it. """ + sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') def get_file_sync_command(base_dir_path, file_names): sync_format = '|'.join(file_names) gsutil_alias, alias_gen = data_utils.get_gsutil_command() base_dir_path = shlex.quote(base_dir_path) - sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') sync_command = (f'{alias_gen}; {gsutil_alias} ' f'rsync -e -x \'^(?!{sync_format}$).*\' ' f'{base_dir_path} gs://{self.name}{sub_path}') @@ -2014,8 +2015,6 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): excludes = '|'.join(excluded_list) gsutil_alias, alias_gen = data_utils.get_gsutil_command() src_dir_path = shlex.quote(src_dir_path) - sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') sync_command = (f'{alias_gen}; {gsutil_alias} ' f'rsync -e -r -x \'({excludes})\' {src_dir_path} ' f'gs://{self.name}{sub_path}/{dest_dir_name}') @@ -2029,7 +2028,7 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): log_path = sky_logging.generate_tmp_logging_file_path( _STORAGE_LOG_FILE_NAME) - sync_path = f'{source_message} -> gs://{self.name}/' + sync_path = f'{source_message} -> gs://{self.name}{sub_path}/' with rich_utils.safe_status( ux_utils.spinner_message(f'Syncing {sync_path}', log_path=log_path)): @@ -2307,15 +2306,24 @@ def from_metadata(cls, metadata: AbstractStore.StoreMetadata, An instance of AzureBlobStore. 
""" assert isinstance(metadata, AzureBlobStore.AzureBlobStoreMetadata) - return cls(name=override_args.get('name', metadata.name), - storage_account_name=override_args.get( - 'storage_account', metadata.storage_account_name), - source=override_args.get('source', metadata.source), - region=override_args.get('region', metadata.region), - is_sky_managed=override_args.get('is_sky_managed', - metadata.is_sky_managed), - sync_on_reconstruction=override_args.get( - 'sync_on_reconstruction', True)) + # TODO: this needs to be kept in sync with the abstract + # AbstractStore.from_metadata. + return cls( + name=override_args.get('name', metadata.name), + storage_account_name=override_args.get( + 'storage_account', metadata.storage_account_name), + source=override_args.get('source', metadata.source), + region=override_args.get('region', metadata.region), + is_sky_managed=override_args.get('is_sky_managed', + metadata.is_sky_managed), + sync_on_reconstruction=override_args.get('sync_on_reconstruction', + True), + # Backward compatibility + # TODO: remove the hasattr check after v0.11.0 + _bucket_sub_path=override_args.get( + '_bucket_sub_path', + metadata._bucket_sub_path # pylint: disable=protected-access + ) if hasattr(metadata, '_bucket_sub_path') else None) def get_metadata(self) -> AzureBlobStoreMetadata: return self.AzureBlobStoreMetadata( @@ -2795,6 +2803,8 @@ def batch_az_blob_sync(self, set to True, the directory is created in the bucket root and contents are uploaded to it. 
""" + container_path = (f'{self.container_name}/{self._bucket_sub_path}' + if self._bucket_sub_path else self.container_name) def get_file_sync_command(base_dir_path, file_names) -> str: # shlex.quote is not used for file_names as 'az storage blob sync' @@ -2803,8 +2813,6 @@ def get_file_sync_command(base_dir_path, file_names) -> str: includes_list = ';'.join(file_names) includes = f'--include-pattern "{includes_list}"' base_dir_path = shlex.quote(base_dir_path) - container_path = (f'{self.container_name}/{self._bucket_sub_path}' - if self._bucket_sub_path else self.container_name) sync_command = (f'az storage blob sync ' f'--account-name {self.storage_account_name} ' f'--account-key {self.storage_account_key} ' @@ -2822,18 +2830,17 @@ def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: [file_name.rstrip('*') for file_name in excluded_list]) excludes = f'--exclude-path "{excludes_list}"' src_dir_path = shlex.quote(src_dir_path) - container_path = (f'{self.container_name}/{self._bucket_sub_path}' - if self._bucket_sub_path else - f'{self.container_name}') if dest_dir_name: - container_path = f'{container_path}/{dest_dir_name}' + dest_dir_name = f'/{dest_dir_name}' + else: + dest_dir_name = '' sync_command = (f'az storage blob sync ' f'--account-name {self.storage_account_name} ' f'--account-key {self.storage_account_key} ' f'{excludes} ' '--delete-destination false ' f'--source {src_dir_path} ' - f'--container {container_path}') + f'--container {container_path}{dest_dir_name}') return sync_command # Generate message for upload @@ -2844,7 +2851,7 @@ def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: source_message = source_path_list[0] container_endpoint = data_utils.AZURE_CONTAINER_URL.format( storage_account_name=self.storage_account_name, - container_name=self.name) + container_name=container_path) log_path = sky_logging.generate_tmp_logging_file_path( _STORAGE_LOG_FILE_NAME) sync_path = f'{source_message} -> {container_endpoint}/' @@ -3238,6 
+3245,8 @@ def batch_aws_rsync(self, set to True, the directory is created in the bucket root and contents are uploaded to it. """ + sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') def get_file_sync_command(base_dir_path, file_names): includes = ' '.join([ @@ -3246,8 +3255,6 @@ def get_file_sync_command(base_dir_path, file_names): ]) endpoint_url = cloudflare.create_endpoint() base_dir_path = shlex.quote(base_dir_path) - sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') sync_command = ('AWS_SHARED_CREDENTIALS_FILE=' f'{cloudflare.R2_CREDENTIALS_PATH} ' 'aws s3 sync --no-follow-symlinks --exclude="*" ' @@ -3267,8 +3274,6 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): ]) endpoint_url = cloudflare.create_endpoint() src_dir_path = shlex.quote(src_dir_path) - sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') sync_command = ('AWS_SHARED_CREDENTIALS_FILE=' f'{cloudflare.R2_CREDENTIALS_PATH} ' f'aws s3 sync --no-follow-symlinks {excludes} ' @@ -3286,7 +3291,7 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): log_path = sky_logging.generate_tmp_logging_file_path( _STORAGE_LOG_FILE_NAME) - sync_path = f'{source_message} -> r2://{self.name}/' + sync_path = f'{source_message} -> r2://{self.name}{sub_path}/' with rich_utils.safe_status( ux_utils.spinner_message(f'Syncing {sync_path}', log_path=log_path)): @@ -3710,6 +3715,8 @@ def batch_ibm_rsync(self, set to True, the directory is created in the bucket root and contents are uploaded to it. 
""" + sub_path = (f'/{self._bucket_sub_path}' + if self._bucket_sub_path else '') def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: """returns an rclone command that copies a complete folder @@ -3731,8 +3738,6 @@ def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: # .git directory is excluded from the sync # wrapping src_dir_path with "" to support path with spaces src_dir_path = shlex.quote(src_dir_path) - sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') sync_command = ( 'rclone copy --exclude ".git/*" ' f'{src_dir_path} ' @@ -3763,8 +3768,6 @@ def get_file_sync_command(base_dir_path, file_names) -> str: for file_name in file_names ]) base_dir_path = shlex.quote(base_dir_path) - sub_path = (f'/{self._bucket_sub_path}' - if self._bucket_sub_path else '') sync_command = ( 'rclone copy ' f'{includes} {base_dir_path} ' @@ -3779,7 +3782,8 @@ def get_file_sync_command(base_dir_path, file_names) -> str: log_path = sky_logging.generate_tmp_logging_file_path( _STORAGE_LOG_FILE_NAME) - sync_path = f'{source_message} -> cos://{self.region}/{self.name}/' + sync_path = ( + f'{source_message} -> cos://{self.region}/{self.name}{sub_path}/') with rich_utils.safe_status( ux_utils.spinner_message(f'Syncing {sync_path}', log_path=log_path)): @@ -4178,15 +4182,21 @@ def batch_oci_rsync(self, set to True, the directory is created in the bucket root and contents are uploaded to it. 
""" + sub_path = (f'{self._bucket_sub_path}/' + if self._bucket_sub_path else '') @oci.with_oci_env def get_file_sync_command(base_dir_path, file_names): includes = ' '.join( [f'--include "{file_name}"' for file_name in file_names]) + prefix_arg = '' + if sub_path: + prefix_arg = f'--object-prefix "{sub_path.strip("/")}"' sync_command = ( 'oci os object bulk-upload --no-follow-symlinks --overwrite ' f'--bucket-name {self.name} --namespace-name {self.namespace} ' f'--region {self.region} --src-dir "{base_dir_path}" ' + f'{prefix_arg} ' f'{includes}') return sync_command @@ -4207,7 +4217,8 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): sync_command = ( 'oci os object bulk-upload --no-follow-symlinks --overwrite ' f'--bucket-name {self.name} --namespace-name {self.namespace} ' - f'--region {self.region} --object-prefix "{dest_dir_name}" ' + f'--region {self.region} ' + f'--object-prefix "{sub_path}{dest_dir_name}" ' f'--src-dir "{src_dir_path}" {excludes}') return sync_command @@ -4220,7 +4231,7 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): log_path = sky_logging.generate_tmp_logging_file_path( _STORAGE_LOG_FILE_NAME) - sync_path = f'{source_message} -> oci://{self.name}/' + sync_path = f'{source_message} -> oci://{self.name}/{sub_path}' with rich_utils.safe_status( ux_utils.spinner_message(f'Syncing {sync_path}', log_path=log_path)): diff --git a/tests/smoke_tests/test_mount_and_storage.py b/tests/smoke_tests/test_mount_and_storage.py index c7f3e356c0d..13c3e118f75 100644 --- a/tests/smoke_tests/test_mount_and_storage.py +++ b/tests/smoke_tests/test_mount_and_storage.py @@ -854,16 +854,18 @@ def yield_storage_object( persistent=persistent, mode=mode, _bucket_sub_path=_bucket_sub_path) - yield storage_obj - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - # If handle exists, delete manually - # TODO(romilb): This is potentially risky - if the delete method has - # bugs, this can cause resource leaks. 
Ideally we should manually - # eject storage from global_user_state and delete the bucket using - # boto3 directly. - storage_obj.delete() + try: + yield storage_obj + finally: + handle = global_user_state.get_handle_from_storage_name( + storage_obj.name) + if handle: + # If handle exists, delete manually + # TODO(romilb): This is potentially risky - if the delete method has + # bugs, this can cause resource leaks. Ideally we should manually + # eject storage from global_user_state and delete the bucket using + # boto3 directly. + storage_obj.delete() @pytest.fixture def tmp_scratch_storage_obj(self, tmp_bucket_name): @@ -881,17 +883,19 @@ def tmp_multiple_scratch_storage_obj(self): timestamp = str(time.time()).replace('.', '') store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}') storage_mult_obj.append(store_obj) - yield storage_mult_obj - for storage_obj in storage_mult_obj: - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - # If handle exists, delete manually - # TODO(romilb): This is potentially risky - if the delete method has - # bugs, this can cause resource leaks. Ideally we should manually - # eject storage from global_user_state and delete the bucket using - # boto3 directly. - storage_obj.delete() + try: + yield storage_mult_obj + finally: + for storage_obj in storage_mult_obj: + handle = global_user_state.get_handle_from_storage_name( + storage_obj.name) + if handle: + # If handle exists, delete manually + # TODO(romilb): This is potentially risky - if the delete method has + # bugs, this can cause resource leaks. Ideally we should manually + # eject storage from global_user_state and delete the bucket using + # boto3 directly. 
+ storage_obj.delete() @pytest.fixture def tmp_multiple_custom_source_storage_obj(self): @@ -907,12 +911,14 @@ def tmp_multiple_custom_source_storage_obj(self): store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}', source=src_path) storage_mult_obj.append(store_obj) - yield storage_mult_obj - for storage_obj in storage_mult_obj: - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - storage_obj.delete() + try: + yield storage_mult_obj + finally: + for storage_obj in storage_mult_obj: + handle = global_user_state.get_handle_from_storage_name( + storage_obj.name) + if handle: + storage_obj.delete() @pytest.fixture def tmp_local_storage_obj(self, tmp_bucket_name, tmp_source): @@ -1099,7 +1105,14 @@ def test_bucket_sub_path(self, tmp_local_storage_obj_with_sub_path, store_type): # Creates a new bucket with a local source, uploads files to it # and deletes it. - tmp_local_storage_obj_with_sub_path.add_store(store_type) + region_kwargs = {} + if store_type == storage_lib.StoreType.AZURE: + # We have to specify the region for Azure storage, as the default + # Azure storage account is in centralus region. + region_kwargs['region'] = 'centralus' + + tmp_local_storage_obj_with_sub_path.add_store(store_type, + **region_kwargs) # Check files under bucket and filter by prefix files = self.list_all_files(store_type, @@ -1412,7 +1425,13 @@ def test_upload_to_existing_bucket(self, ext_bucket_fixture, request, # sky) and verifies that files are written. bucket_name, _ = request.getfixturevalue(ext_bucket_fixture) storage_obj = storage_lib.Storage(name=bucket_name, source=tmp_source) - storage_obj.add_store(store_type) + region_kwargs = {} + if store_type == storage_lib.StoreType.AZURE: + # We have to specify the region for Azure storage, as the default + # Azure storage account is in centralus region. 
+ region_kwargs['region'] = 'centralus' + + storage_obj.add_store(store_type, **region_kwargs) # Check if tmp_source/tmp-file exists in the bucket using aws cli out = subprocess.check_output(self.cli_ls_cmd(store_type, bucket_name), @@ -1458,7 +1477,13 @@ def test_copy_mount_existing_storage(self, def test_list_source(self, tmp_local_list_storage_obj, store_type): # Uses a list in the source field to specify a file and a directory to # be uploaded to the storage object. - tmp_local_list_storage_obj.add_store(store_type) + region_kwargs = {} + if store_type == storage_lib.StoreType.AZURE: + # We have to specify the region for Azure storage, as the default + # Azure storage account is in centralus region. + region_kwargs['region'] = 'centralus' + + tmp_local_list_storage_obj.add_store(store_type, **region_kwargs) # Check if tmp-file exists in the bucket root using cli out = subprocess.check_output(self.cli_ls_cmd( @@ -1513,7 +1538,13 @@ def test_excluded_file_cloud_storage_upload_copy(self, gitignore_structure, tmp_gitignore_storage_obj): # tests if files included in .gitignore and .git/info/exclude are # excluded from being transferred to Storage - tmp_gitignore_storage_obj.add_store(store_type) + region_kwargs = {} + if store_type == storage_lib.StoreType.AZURE: + # We have to specify the region for Azure storage, as the default + # Azure storage account is in centralus region. 
+ region_kwargs['region'] = 'centralus' + + tmp_gitignore_storage_obj.add_store(store_type, **region_kwargs) upload_file_name = 'included' # Count the number of files with the given file name up_cmd = self.cli_count_name_in_bucket(store_type, \ From bbfdb05cb17f94fd2938e273bc956b99c733489f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 13 Feb 2025 22:22:26 -0800 Subject: [PATCH 17/18] [LLM] Deepseek r1-671B with SGLang (#4715) * yaml for multi-node deepseek r1 * fix IP and --tp * rename * Add example to gallery * Add example for query * fix to be H100 * fix names * fix * separate readme * update readme * switch to uv for faster installation * fix doc page * Fix docs * refine title * update numbers * update readme * update cover * Update * gvnic comment * Add serve section * Shorter title * title --- README.md | 3 +- docs/source/_gallery_original/index.rst | 2 + .../llms/deepseek-r1-distilled.md | 1 + .../_gallery_original/llms/deepseek-r1.md | 1 + llm/deepseek-r1-distilled/README.md | 262 ++++++++++++++++ .../deepseek-r1-vllm.yaml | 4 +- llm/deepseek-r1/README.md | 283 ++++++------------ llm/deepseek-r1/deepseek-r1-671B.yaml | 49 +++ 8 files changed, 404 insertions(+), 201 deletions(-) create mode 120000 docs/source/_gallery_original/llms/deepseek-r1-distilled.md create mode 120000 docs/source/_gallery_original/llms/deepseek-r1.md create mode 100644 llm/deepseek-r1-distilled/README.md rename llm/{deepseek-r1 => deepseek-r1-distilled}/deepseek-r1-vllm.yaml (89%) create mode 100644 llm/deepseek-r1/deepseek-r1-671B.yaml diff --git a/README.md b/README.md index ce3ccac8606..9c9b1d1e974 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,9 @@ ---- :fire: *News* :fire: +- [Feb 2025] Run and Serve DeepSeek-R1 671B using SkyPilot and SGLang with high throughput: [**example**](./llm/deepseek-r1/) - [Jan 2025] Prepare and Serve Large-Scale Image Search with **Vector Database**: [**blog post**](https://blog.skypilot.co/large-scale-vector-database/) 
[**example**](./examples/vector_database/) -- [Jan 2025] Launch and Serve **[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)** and **[Janus](https://github.com/deepseek-ai/DeepSeek-Janus)** on Kubernetes or Any Cloud: [**R1 example**](./llm/deepseek-r1/) and [**Janus example**](./llm/deepseek-janus/) +- [Jan 2025] Launch and Serve distilled models from **[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)** and **[Janus](https://github.com/deepseek-ai/DeepSeek-Janus)** on Kubernetes or Any Cloud: [**R1 example**](./llm/deepseek-r1-distilled/) and [**Janus example**](./llm/deepseek-janus/) - [Oct 2024] :tada: **SkyPilot crossed 1M+ downloads** :tada:: Thank you to our community! [**Twitter/X**](https://x.com/skypilot_org/status/1844770841718067638) - [Sep 2024] Point, Launch and Serve **Llama 3.2** on Kubernetes or Any Cloud: [**example**](./llm/llama-3_2/) - [Sep 2024] Run and deploy [**Pixtral**](./llm/pixtral), the first open-source multimodal model from Mistral AI. diff --git a/docs/source/_gallery_original/index.rst b/docs/source/_gallery_original/index.rst index 8e0d0b16c35..9f9f60daa6d 100644 --- a/docs/source/_gallery_original/index.rst +++ b/docs/source/_gallery_original/index.rst @@ -34,6 +34,8 @@ Contents :maxdepth: 1 :caption: LLM Models + DeepSeek-R1 + DeepSeek-R1 Distilled Vision Llama 3.2 (Meta) Llama 3.1 (Meta) Llama 3 (Meta) diff --git a/docs/source/_gallery_original/llms/deepseek-r1-distilled.md b/docs/source/_gallery_original/llms/deepseek-r1-distilled.md new file mode 120000 index 00000000000..18f53ddcc9d --- /dev/null +++ b/docs/source/_gallery_original/llms/deepseek-r1-distilled.md @@ -0,0 +1 @@ +../../../../llm/deepseek-r1-distilled/README.md \ No newline at end of file diff --git a/docs/source/_gallery_original/llms/deepseek-r1.md b/docs/source/_gallery_original/llms/deepseek-r1.md new file mode 120000 index 00000000000..2e4e24ef08d --- /dev/null +++ b/docs/source/_gallery_original/llms/deepseek-r1.md @@ -0,0 +1 @@ 
+../../../../llm/deepseek-r1/README.md \ No newline at end of file diff --git a/llm/deepseek-r1-distilled/README.md b/llm/deepseek-r1-distilled/README.md new file mode 100644 index 00000000000..e6ab10d06f1 --- /dev/null +++ b/llm/deepseek-r1-distilled/README.md @@ -0,0 +1,262 @@ +# Run and Serve DeepSeek-R1 Distilled Models with SkyPilot and vLLM + +> SkyPilot is a framework for running AI and batch workloads on any infra, offering unified execution, high cost savings, and high GPU availability. + +

+DeepSeek-R1 on SkyPilot +

+
+On Jan 20, 2025, DeepSeek AI released the [DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1), including a family of models up to 671B parameters.
+
+DeepSeek-R1 naturally emerged with numerous powerful and interesting reasoning behaviors. It outperforms **state-of-the-art proprietary models** such as OpenAI-o1-mini and becomes **the first time** an open LLM closely rivals proprietary models like OpenAI-o1.
+
+This guide walks through how to run and host DeepSeek-R1 models **on any infrastructure**, ranging from local GPU workstations and Kubernetes clusters to public clouds ([15+ clouds supported](https://docs.skypilot.co/en/latest/getting-started/installation.html)).
+
+SkyPilot supports a variety of LLM frameworks and models. In this guide, we use [vLLM](https://github.com/vllm-project/vllm), an open-source library for fast LLM inference and serving, as an example.
+
+**New**: We added a new SkyPilot YAML for running [DeepSeek-R1 671B with SGLang](https://github.com/skypilot-org/skypilot/tree/master/llm/deepseek-r1).
+
+## Step 0: Bring any infra
+
+Install SkyPilot on your local machine:
+
+```bash
+pip install 'skypilot-nightly[all]'
+```
+
+Pick one of the following depending on what infra you want to run DeepSeek-R1 on:
+
+**If your local machine/cluster has GPU**: you can run SkyPilot [directly on existing machines](https://docs.skypilot.co/en/latest/reservations/existing-machines.html) with
+
+```bash
+sky local up
+```
+
+**If you want to use Clouds** (15+ clouds are supported):
+
+```bash
+sky check
+```
+See [docs](https://docs.skypilot.co/en/latest/getting-started/installation.html) for details.
+
+
+## Step 1: Run it with SkyPilot
+
+Now it's time to run DeepSeek with SkyPilot. The instructions depend on your existing hardware.
+ +8B: +``` +sky launch deepseek-r1-vllm.yaml \ + -c deepseek \ + --env HF_TOKEN=YOUR_HUGGING_FACE_API_TOKEN \ + --env MODEL_NAME=deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ + --gpus L4:1 +``` + +70B: +``` +sky launch deepseek-r1-vllm.yaml \ + -c deepseek \ + --env HF_TOKEN=YOUR_HUGGING_FACE_API_TOKEN \ + --env MODEL_NAME=deepseek-ai/DeepSeek-R1-Distill-Llama-70B \ + --gpus A100-80GB:2 +``` + +replace the command with your own huggingface token and the GPU that you wish to use. You may run `sky show-gpus` to know what GPU that you have access to. As a reference, here is the model-GPU compatibility matrix: + +| **GPU** | **DeepSeek-R1-Distill-Qwen-7B** | **DeepSeek-R1-Distill-Llama-70B** | **DeepSeek-R1** | +|----------------- |------------------------------ |------------------------ |------------------------------ | +| **L4:1** | ✅, with `--max-model-len 4096` | ❌ | ❌ | +| **L4:8** | ✅ | ❌ | ❌ | +| **A100:8** | ✅ | ✅ | ❌ | +| **A100-80GB:12** | ✅ | ✅ | ✅, with `--max-model-len 4096` | + +## Step 2: Get Results +Get a single endpoint that load-balances across replicas: + +``` +ENDPOINT=$(sky status --ip deepseek) +``` + +Query the endpoint in a terminal: +8B: +``` +curl http://$ENDPOINT:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ] + }' | jq . +``` + +70B: +``` +curl http://$ENDPOINT:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "how many rs are in strawberry" + } + ] + }' | jq . +``` + + +You will get both the chain of thoughts within `` tags and the final results. + +
+ Who are you? I'm DeepSeek-R1. + +Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have. + +```console +{ + "id": "chatcmpl-507f467863344f31b98d8bf36b9a3c1c", + "object": "chat.completion", + "created": 1737503962, + "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "\n\n\n\nGreetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have.", + "tool_calls": [] + }, + "logprobs": null, + "finish_reason": "stop", + "stop_reason": null + } + ], + "usage": { + "prompt_tokens": 13, + "total_tokens": 57, + "completion_tokens": 44, + "prompt_tokens_details": null + }, + "prompt_logprobs": null +} +``` +
+ + +
+ How many Rs are in strawberry: There are 3 Rs in strawberry. + +\ +Okay, so I need to figure out how many times the letter 'R' appears in the word "strawberry." Hmm, let me think about this step by step. First, I should probably write out the word to visualize it better. The word is S-T-R-A-W-B-E-R-R-Y. Wait, is that right? Let me double-check. S-T-R-A-W-B-E-R-R-Y, yes, that's how it's spelled. + +Now, I need to go through each letter one by one and count the Rs. Starting with the first letter, it's an S. Not an R, so move on. The second letter is T, still not an R. The third letter is R. Okay, that's the first R. I'll note that down. + +Next letters: A, W, B, E. None of those are Rs. So far, only one R. Then comes R again after E, right? So that's the second R. But wait, I think there's another R after that. Let me make sure. After the second R, there's another R, making it the third R. Wait, no, let me check the spelling again. It's S-T-R-A-W-B-E-R-R-Y. So after E, it's R, then another R, so that's two Rs there. So total, how many? + +Let me recount: first R is the third letter, then after E, there's another R, making it two in total, or three? Wait, no. Let me look at each position: + +1. S +2. T +3. R (1st R) +4. A +5. W +6. B +7. E +8. R (2nd R) +9. R (3rd R) +10. Y + +Wait, so after E, there are two Rs in a row, which would make it the 8th and 9th letters. So that's two more Rs after the first one. So total, it's three Rs? Or is that correct? Let me make sure I'm not overcounting. Let's write it out: + +S T R A W B E R R Y + +So, positions: + +1: S + +2: T + +3: R (1) + +4: A + +5: W + +6: B + +7: E + +8: R (2) + +9: R (3) + +10: Y + +So that's three Rs. Wait, but when I think about the word "strawberry," I thought it had two Rs, but maybe it's three. Wait, maybe I'm wrong. Let me check a dictionary or something, but since I can't do that, I'll have to rely on my memory. Hmm, maybe I was mistaken earlier. Let me think again. 
Strawberries have a double R, I believe. But in the spelling, is it R-A-W-B-E-R-R-Y? So after the E, it's R-R-Y. So that's two Rs at the end. Plus the one after the T, so that's three Rs total. + +Wait, no. Let me think about how the word is pronounced. It's "straw" plus "berry," right? So "straw" has one R, and "berry" has two Rs? No, "berry" only has one R. Wait, no, "berry" is B-E-R-R-Y, so there are two Rs there. So when you put it together, "strawberry" would have the R from "straw" and two Rs from "berry," making three Rs. Hmm, but I'm not sure. Some people might think it's only two Rs, but based on the spelling, it's three. + +Wait, no, actually, let me break it down. The word is S-T-R-A-W-B-E-R-R-Y. So after the T, there's an R, then later after the B and E, there's another R, and then another R before Y. So that's three Rs. So the answer should be three. But I'm a bit confused because sometimes people might miscount, thinking it's two. But according to the spelling, it's three. I think that's correct. +\ + +The word "strawberry" contains three Rs. + +Step-by-step breakdown: +- The first R is the third letter. +- The second R is the eighth letter. +- The third R is the ninth letter. + +Counting each occurrence: 1 (position 3), 2 (position 8), and 3 (position 9). + +Answer: There are 3 Rs in "strawberry." + +```console +{ + "id": "chatcmpl-d532bd1c1738493ab9c8c906550044bf", + "object": "chat.completion", + "created": 1737507945, + "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "\nOkay, so I need to figure out how many times the letter 'R' appears in the word \"strawberry.\" Hmm, let me think about this step by step. First, I should probably write out the word to visualize it better. The word is S-T-R-A-W-B-E-R-R-Y. Wait, is that right? Let me double-check. 
S-T-R-A-W-B-E-R-R-Y, yes, that's how it's spelled.\n\nNow, I need to go through each letter one by one and count the Rs. Starting with the first letter, it's an S. Not an R, so move on. The second letter is T, still not an R. The third letter is R. Okay, that's the first R. I'll note that down.\n\nNext letters: A, W, B, E. None of those are Rs. So far, only one R. Then comes R again after E, right? So that's the second R. But wait, I think there's another R after that. Let me make sure. After the second R, there's another R, making it the third R. Wait, no, let me check the spelling again. It's S-T-R-A-W-B-E-R-R-Y. So after E, it's R, then another R, so that's two Rs there. So total, how many?\n\nLet me recount: first R is the third letter, then after E, there's another R, making it two in total, or three? Wait, no. Let me look at each position:\n\n1. S\n2. T\n3. R (1st R)\n4. A\n5. W\n6. B\n7. E\n8. R (2nd R)\n9. R (3rd R)\n10. Y\n\nWait, so after E, there are two Rs in a row, which would make it the 8th and 9th letters. So that's two more Rs after the first one. So total, it's three Rs? Or is that correct? Let me make sure I'm not overcounting. Let's write it out:\n\nS T R A W B E R R Y\n\nSo, positions:\n\n1: S\n\n2: T\n\n3: R (1)\n\n4: A\n\n5: W\n\n6: B\n\n7: E\n\n8: R (2)\n\n9: R (3)\n\n10: Y\n\nSo that's three Rs. Wait, but when I think about the word \"strawberry,\" I thought it had two Rs, but maybe it's three. Wait, maybe I'm wrong. Let me check a dictionary or something, but since I can't do that, I'll have to rely on my memory. Hmm, maybe I was mistaken earlier. Let me think again. Strawberries have a double R, I believe. But in the spelling, is it R-A-W-B-E-R-R-Y? So after the E, it's R-R-Y. So that's two Rs at the end. Plus the one after the T, so that's three Rs total.\n\nWait, no. Let me think about how the word is pronounced. It's \"straw\" plus \"berry,\" right? So \"straw\" has one R, and \"berry\" has two Rs? No, \"berry\" only has one R. 
Wait, no, \"berry\" is B-E-R-R-Y, so there are two Rs there. So when you put it together, \"strawberry\" would have the R from \"straw\" and two Rs from \"berry,\" making three Rs. Hmm, but I'm not sure. Some people might think it's only two Rs, but based on the spelling, it's three.\n\nWait, no, actually, let me break it down. The word is S-T-R-A-W-B-E-R-R-Y. So after the T, there's an R, then later after the B and E, there's another R, and then another R before Y. So that's three Rs. So the answer should be three. But I'm a bit confused because sometimes people might miscount, thinking it's two. But according to the spelling, it's three. I think that's correct.\n\n\nThe word \"strawberry\" contains three Rs. \n\nStep-by-step breakdown:\n- The first R is the third letter.\n- The second R is the eighth letter.\n- The third R is the ninth letter.\n\nCounting each occurrence: 1 (position 3), 2 (position 8), and 3 (position 9).\n\nAnswer: There are 3 Rs in \"strawberry.\"", + "tool_calls": [] + }, + "logprobs": null, + "finish_reason": "stop", + "stop_reason": null + } + ], + "usage": { + "prompt_tokens": 15, + "total_tokens": 985, + "completion_tokens": 970, + "prompt_tokens_details": null + }, + "prompt_logprobs": null +} +``` + +
+ + +## Shutdown +To shutdown, run +``` +sky down deepseek +``` diff --git a/llm/deepseek-r1/deepseek-r1-vllm.yaml b/llm/deepseek-r1-distilled/deepseek-r1-vllm.yaml similarity index 89% rename from llm/deepseek-r1/deepseek-r1-vllm.yaml rename to llm/deepseek-r1-distilled/deepseek-r1-vllm.yaml index 46959aa9914..2bc074e21f2 100644 --- a/llm/deepseek-r1/deepseek-r1-vllm.yaml +++ b/llm/deepseek-r1-distilled/deepseek-r1-vllm.yaml @@ -10,8 +10,8 @@ resources: disk_tier: best setup: | - pip install transformers==4.48.1 - pip install vllm==0.6.6.post1 + uv pip install transformers==4.48.1 + uv pip install vllm==0.6.6.post1 python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')" diff --git a/llm/deepseek-r1/README.md b/llm/deepseek-r1/README.md index b1f24ca7119..4d6ab6a6eaa 100644 --- a/llm/deepseek-r1/README.md +++ b/llm/deepseek-r1/README.md @@ -1,108 +1,91 @@ -# Run and Serve DeepSeek-R1 with SkyPilot +# Distributed DeepSeek-R1 Serving with high throughput using SGLang and SkyPilot -> SkyPilot is a framework for running AI and batch workloads on any infra, offering unified execution, high cost savings, and high GPU availability.

-DeepSeek-R1 on SkyPilot +DeepSeek-R1 on SkyPilot

On Jan 20, 2025, DeepSeek AI released the [DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1), including a family of models up to 671B parameters. DeepSeek-R1 naturally emerged with numerous powerful and interesting reasoning behaviors. It outperforms **state-of-the-art proprietary models** such as OpenAI-o1-mini and becomes **the first time** an open LLM closely rivals like OpenAI-o1. -This guide walks through how to run and host DeepSeek-R1 models **on any infrastructure** from ranging from Local GPU workstation, Kubernetes cluster and public Clouds ([15+ clouds supported](https://docs.skypilot.co/en/latest/getting-started/installation.html)). +We use [SGLang](https://github.com/sgl-project/sglang) to serve the model distributedly with high throughput in this example, and according to HuggingFace's [report](https://huggingface.co/blog/open-r1/update-2), SGLang offers 2x performance than vLLM for serving the large DeepSeek-R1. -Skypilot supports a variety of LLM frameworks and models. In this guide, we use [vLLM](https://github.com/vllm-project/vllm), an open-source library for fast LLM inference and serving, as an example. +**Note**: This example is for the original DeepSeek-R1 671B model. For smaller size distilled models, please refer to [deepseek-r1-distilled](https://github.com/skypilot-org/skypilot/tree/master/llm/deepseek-r1-distilled/). -### Step 0: Bring any infra +## Run 671B DeepSeek-R1 on Kubernetes or any Cloud -Install SkyPilot on your local machine: +SkyPilot allows you to run the model distributedly with a single command with the framework [SGLang](https://github.com/sgl-project/sglang). 
-```bash -pip install 'skypilot-nightly[all]' -``` +The SkyPilot YAML for DeepSeek-R1 671B, or see [here](https://github.com/skypilot-org/skypilot/blob/master/llm/deepseek-r1/deepseek-r1-671B.yaml): +```yaml +name: deepseek-r1 -Pick one of the following depending on what infra you want to run DeepSeek-R1 on: +resources: + accelerators: {H200:8, H100:8, A100-80GB:8} + disk_size: 1024 # Large disk for model weights + disk_tier: best + ports: 30000 + any_of: + - use_spot: true + - use_spot: false -**If your local machine/cluster has GPU**: you can run SkyPilot [directly on existing machines](https://docs.skypilot.co/en/latest/reservations/existing-machines.html) with +num_nodes: 2 # Specify number of nodes to launch -```bash -sky local up -``` +setup: | + # Install sglang with all dependencies using uv + uv pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer -**If you want to use Clouds** (15+ clouds are supported): + # Set up shared memory for better performance + sudo bash -c "echo 'vm.max_map_count=655300' >> /etc/sysctl.conf" + sudo sysctl -p -```bash -sky check -``` -See [docs](https://docs.skypilot.co/en/latest/getting-started/installation.html) for details. +run: | + # Launch the server with appropriate configuration + MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) + # TP should be number of GPUs per node times number of nodes + TP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES)) + python -m sglang.launch_server \ + --model deepseek-ai/DeepSeek-R1 \ + --tp $TP \ + --dist-init-addr ${MASTER_ADDR}:5000 \ + --nnodes ${SKYPILOT_NUM_NODES} \ + --node-rank ${SKYPILOT_NODE_RANK} \ + --trust-remote-code \ + --enable-dp-attention \ + --enable-torch-compile \ + --torch-compile-max-bs 8 \ + --host 0.0.0.0 \ + --port 30000 -### Step 1: Run it with SkyPilot +``` -Now it's time to run deepseek with SkyPilot. The instruction can be dependent on your existing hardware. 
-8B: -``` -sky launch deepseek-r1-vllm.yaml \ - -c deepseek \ - --env HF_TOKEN=YOUR_HUGGING_FACE_API_TOKEN \ - --env MODEL_NAME=deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ - --gpus L4:1 +```bash +sky launch -c r1 llm/deepseek-r1/deepseek-r1-671B.yaml --retry-until-up ``` -70B: -``` -sky launch deepseek-r1-vllm.yaml \ - -c deepseek \ - --env HF_TOKEN=YOUR_HUGGING_FACE_API_TOKEN \ - --env MODEL_NAME=deepseek-ai/DeepSeek-R1-Distill-Llama-70B \ - --gpus A100-80GB:2 -``` +![Find any cheapest candidate resources](https://i.imgur.com/FpAV6Ok.png) -replace the command with your own huggingface token and the GPU that you wish to use. You may run `sky show-gpus` to know what GPU that you have access to. As a reference, here is the model-GPU compatibility matrix: +SkyPilot finds the cheapest candidate resources for you, and automatically failover through different regions, +clouds, or Kubernetes clusters to find the resources to launch the model. -| **GPU** | **DeepSeek-R1-Distill-Qwen-7B** | **DeepSeek-R1-Distill-Llama-70B** | **DeepSeek-R1** | -|----------------- |------------------------------ |------------------------ |------------------------------ | -| **L4:1** | ✅, with `--max-model-len 4096` | ❌ | ❌ | -| **L4:8** | ✅ | ❌ | ❌ | -| **A100:8** | ✅ | ✅ | ❌ | -| **A100-80GB:12** | ✅ | ✅ | ✅, with `--max-model-len 4096` | +It may take a while (30-40 minutes) for SGLang to download the model weights, compile, and start the server. -### Step 2: Get Results -Get a single endpoint that load-balances across replicas: +![DeepSeek-R1 on SkyPilot](https://i.imgur.com/N51TU86.gif) -``` -ENDPOINT=$(sky status --ip deepseek) -``` +## Query the endpoint -Query the endpoint in a terminal: -8B: -``` -curl http://$ENDPOINT:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Who are you?" 
- } - ] - }' | jq . +After the initialization, you can access the model with the endpoint: ``` +ENDPOINT=$(sky status --endpoint 30000 deepseek) -70B: -``` -curl http://$ENDPOINT:8000/v1/chat/completions \ +curl http://$ENDPOINT/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + "model": "deepseek-ai/DeepSeek-R1-671B", "messages": [ { "role": "system", @@ -114,148 +97,52 @@ curl http://$ENDPOINT:8000/v1/chat/completions \ } ] }' | jq . -``` - - -You will get both the chain of thoughts within `` tags and the final results. - -
- Who are you? I'm DeepSeek-R1. - -Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have. - -```console -{ - "id": "chatcmpl-507f467863344f31b98d8bf36b9a3c1c", - "object": "chat.completion", - "created": 1737503962, - "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": "\n\n\n\nGreetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have.", - "tool_calls": [] - }, - "logprobs": null, - "finish_reason": "stop", - "stop_reason": null - } - ], - "usage": { - "prompt_tokens": 13, - "total_tokens": 57, - "completion_tokens": 44, - "prompt_tokens_details": null - }, - "prompt_logprobs": null -} ``` -
-
+
+You will get the following answer, which interestingly does not trigger any chain of thought.
- How many Rs are in strawberry: There are 3 Rs in strawberry. - -\ -Okay, so I need to figure out how many times the letter 'R' appears in the word "strawberry." Hmm, let me think about this step by step. First, I should probably write out the word to visualize it better. The word is S-T-R-A-W-B-E-R-R-Y. Wait, is that right? Let me double-check. S-T-R-A-W-B-E-R-R-Y, yes, that's how it's spelled. - -Now, I need to go through each letter one by one and count the Rs. Starting with the first letter, it's an S. Not an R, so move on. The second letter is T, still not an R. The third letter is R. Okay, that's the first R. I'll note that down. - -Next letters: A, W, B, E. None of those are Rs. So far, only one R. Then comes R again after E, right? So that's the second R. But wait, I think there's another R after that. Let me make sure. After the second R, there's another R, making it the third R. Wait, no, let me check the spelling again. It's S-T-R-A-W-B-E-R-R-Y. So after E, it's R, then another R, so that's two Rs there. So total, how many? - -Let me recount: first R is the third letter, then after E, there's another R, making it two in total, or three? Wait, no. Let me look at each position: - -1. S -2. T -3. R (1st R) -4. A -5. W -6. B -7. E -8. R (2nd R) -9. R (3rd R) -10. Y - -Wait, so after E, there are two Rs in a row, which would make it the 8th and 9th letters. So that's two more Rs after the first one. So total, it's three Rs? Or is that correct? Let me make sure I'm not overcounting. Let's write it out: - -S T R A W B E R R Y - -So, positions: + How many Rs are in strawberry: So, the answer is **3**. 🍓 -1: S +Okay, let's figure out how many times the letter \"r\" appears in the word \"strawberry.\" First, I need to make sure I'm spelling \"strawberry\" correctly. Sometimes people might miss letters or add extra ones. Let me write it out: S-T-R-A-W-B-E-R-R-Y. Wait, is that right? Let's double-check. Strawberry is spelled S-T-R-A-W-B-E-R-R-Y. 
Yes, that's correct. Now, I need to go through each letter one by one and count the number of \"r\"s.\n\nStarting with the first letter: S (no), T (no), R (yes, that's one). Then A (no), W (no), B (no), E (no), R (that's two), R (that's three), Y (no). Wait, wait, hold on. Let me write out the letters with their positions to be precise.\n\nBreaking down \"strawberry\" letter by letter:\n1. S\n2. T\n3. R\n4. A\n5. W\n6. B\n7. E\n8. R\n9. R\n10. Y\n\nSo, looking at positions 3, 8, and 9: that's three \"r\"s. But wait, does that match the actual spelling? Let me confirm again. The word is strawberry. Sometimes people might think it's \"strawberry\" with two \"r\"s, but actually, according to correct spelling, it's S-T-R-A-W-B-E-R-R-Y. So after the B and E, there are two R's, right? Let me check a dictionary or maybe think of the pronunciation. Straw-ber-ry. The \"ber\" part is one R, but the correct spelling includes two R's after the E. So yes, that makes three R's in total. Hmm, but let me make sure I'm not miscounting. So positions 3, 8, 9: R, then two R's at the end before Y. That's three R's. Wait, actually, in the breakdown above, position 3 is R, then positions 8 and 9 are the two R's. So total three. Yes, that's right. So the answer should be three. Let me see if I can find any source that confirms this. Alternatively, I can write the word again and count: S T R A W B E R R Y. So R appears once at the beginning (third letter) and then twice towards the end (8th and 9th letters). So total of three times. Therefore, the correct answer is three.\n\n\nThe word \"strawberry\" contains **3** instances of the letter \"r\". Here's the breakdown:\n\n1. **S** \n2. **T** \n3. **R** (1st \"r\") \n4. **A** \n5. **W** \n6. **B** \n7. **E** \n8. **R** (2nd \"r\") \n9. **R** (3rd \"r\") \n10. **Y** \n\nSo, the answer is **3**. 
🍓 -2: T + ```console + {"id":"01add72820794f5c884c4d5c126d2a62","object":"chat.completion","created":1739493784,"model":"deepseek-ai/DeepSeek-R1-671B","choices":[{"index":0,"message":{"role":"assistant","content":"Okay, let's figure out how many times the letter \"r\" appears in the word \"strawberry.\" First, I need to make sure I'm spelling \"strawberry\" correctly. Sometimes people might miss letters or add extra ones. Let me write it out: S-T-R-A-W-B-E-R-R-Y. Wait, is that right? Let's double-check. Strawberry is spelled S-T-R-A-W-B-E-R-R-Y. Yes, that's correct. Now, I need to go through each letter one by one and count the number of \"r\"s.\n\nStarting with the first letter: S (no), T (no), R (yes, that's one). Then A (no), W (no), B (no), E (no), R (that's two), R (that's three), Y (no). Wait, wait, hold on. Let me write out the letters with their positions to be precise.\n\nBreaking down \"strawberry\" letter by letter:\n1. S\n2. T\n3. R\n4. A\n5. W\n6. B\n7. E\n8. R\n9. R\n10. Y\n\nSo, looking at positions 3, 8, and 9: that's three \"r\"s. But wait, does that match the actual spelling? Let me confirm again. The word is strawberry. Sometimes people might think it's \"strawberry\" with two \"r\"s, but actually, according to correct spelling, it's S-T-R-A-W-B-E-R-R-Y. So after the B and E, there are two R's, right? Let me check a dictionary or maybe think of the pronunciation. Straw-ber-ry. The \"ber\" part is one R, but the correct spelling includes two R's after the E. So yes, that makes three R's in total. Hmm, but let me make sure I'm not miscounting. So positions 3, 8, 9: R, then two R's at the end before Y. That's three R's. Wait, actually, in the breakdown above, position 3 is R, then positions 8 and 9 are the two R's. So total three. Yes, that's right. So the answer should be three. Let me see if I can find any source that confirms this. Alternatively, I can write the word again and count: S T R A W B E R R Y. 
So R appears once at the beginning (third letter) and then twice towards the end (8th and 9th letters). So total of three times. Therefore, the correct answer is three.\n\n\nThe word \"strawberry\" contains **3** instances of the letter \"r\". Here's the breakdown:\n\n1. **S** \n2. **T** \n3. **R** (1st \"r\") \n4. **A** \n5. **W** \n6. **B** \n7. **E** \n8. **R** (2nd \"r\") \n9. **R** (3rd \"r\") \n10. **Y** \n\nSo, the answer is **3**. 🍓","tool_calls":null},"logprobs":null,"finish_reason":"stop","matched_stop":1}],"usage":{"prompt_tokens":17,"total_tokens":688,"completion_tokens":671,"prompt_tokens_details":null}} + ``` -3: R (1) - -4: A - -5: W - -6: B - -7: E - -8: R (2) - -9: R (3) +
-10: Y -So that's three Rs. Wait, but when I think about the word "strawberry," I thought it had two Rs, but maybe it's three. Wait, maybe I'm wrong. Let me check a dictionary or something, but since I can't do that, I'll have to rely on my memory. Hmm, maybe I was mistaken earlier. Let me think again. Strawberries have a double R, I believe. But in the spelling, is it R-A-W-B-E-R-R-Y? So after the E, it's R-R-Y. So that's two Rs at the end. Plus the one after the T, so that's three Rs total. -Wait, no. Let me think about how the word is pronounced. It's "straw" plus "berry," right? So "straw" has one R, and "berry" has two Rs? No, "berry" only has one R. Wait, no, "berry" is B-E-R-R-Y, so there are two Rs there. So when you put it together, "strawberry" would have the R from "straw" and two Rs from "berry," making three Rs. Hmm, but I'm not sure. Some people might think it's only two Rs, but based on the spelling, it's three. +## Speed for Generation +You can find the generation speed in the log of the server. -Wait, no, actually, let me break it down. The word is S-T-R-A-W-B-E-R-R-Y. So after the T, there's an R, then later after the B and E, there's another R, and then another R before Y. So that's three Rs. So the answer should be three. But I'm a bit confused because sometimes people might miscount, thinking it's two. But according to the spelling, it's three. I think that's correct. -\ +Example speed for 2 H100:8 nodes on GCP with a single request (you may get better performance with gvnic enabled): +``` +(head, rank=0, pid=18260) [2025-02-14 00:42:22 DP2 TP2] Decode batch. #running-req: 1, #token: 210, token usage: 0.00, gen throughput (token/s): 11.45, #queue-req: 0 +(head, rank=0, pid=18260) [2025-02-14 00:42:25 DP2 TP2] Decode batch. #running-req: 1, #token: 250, token usage: 0.00, gen throughput (token/s): 11.53, #queue-req: 0 +(head, rank=0, pid=18260) [2025-02-14 00:42:29 DP2 TP2] Decode batch. 
#running-req: 1, #token: 290, token usage: 0.00, gen throughput (token/s): 11.42, #queue-req: 0 +``` -The word "strawberry" contains three Rs. -Step-by-step breakdown: -- The first R is the third letter. -- The second R is the eighth letter. -- The third R is the ninth letter. +## Deploy the Service with Multiple Replicas -Counting each occurrence: 1 (position 3), 2 (position 8), and 3 (position 9). +The lauching command above only starts a single replica (with 2 nodes) for the service. SkyServe helps deploy the service with multiple replicas with out-of-the-box load balancing, autoscaling and automatic recovering. +Importantly, it also enables serving on spot instances resulting in 30\% lower cost. -Answer: There are 3 Rs in "strawberry." +The only difference you have to do is to add a service section for serving specific configuration: -```console -{ - "id": "chatcmpl-d532bd1c1738493ab9c8c906550044bf", - "object": "chat.completion", - "created": 1737507945, - "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": "\nOkay, so I need to figure out how many times the letter 'R' appears in the word \"strawberry.\" Hmm, let me think about this step by step. First, I should probably write out the word to visualize it better. The word is S-T-R-A-W-B-E-R-R-Y. Wait, is that right? Let me double-check. S-T-R-A-W-B-E-R-R-Y, yes, that's how it's spelled.\n\nNow, I need to go through each letter one by one and count the Rs. Starting with the first letter, it's an S. Not an R, so move on. The second letter is T, still not an R. The third letter is R. Okay, that's the first R. I'll note that down.\n\nNext letters: A, W, B, E. None of those are Rs. So far, only one R. Then comes R again after E, right? So that's the second R. But wait, I think there's another R after that. Let me make sure. After the second R, there's another R, making it the third R. Wait, no, let me check the spelling again. 
It's S-T-R-A-W-B-E-R-R-Y. So after E, it's R, then another R, so that's two Rs there. So total, how many?\n\nLet me recount: first R is the third letter, then after E, there's another R, making it two in total, or three? Wait, no. Let me look at each position:\n\n1. S\n2. T\n3. R (1st R)\n4. A\n5. W\n6. B\n7. E\n8. R (2nd R)\n9. R (3rd R)\n10. Y\n\nWait, so after E, there are two Rs in a row, which would make it the 8th and 9th letters. So that's two more Rs after the first one. So total, it's three Rs? Or is that correct? Let me make sure I'm not overcounting. Let's write it out:\n\nS T R A W B E R R Y\n\nSo, positions:\n\n1: S\n\n2: T\n\n3: R (1)\n\n4: A\n\n5: W\n\n6: B\n\n7: E\n\n8: R (2)\n\n9: R (3)\n\n10: Y\n\nSo that's three Rs. Wait, but when I think about the word \"strawberry,\" I thought it had two Rs, but maybe it's three. Wait, maybe I'm wrong. Let me check a dictionary or something, but since I can't do that, I'll have to rely on my memory. Hmm, maybe I was mistaken earlier. Let me think again. Strawberries have a double R, I believe. But in the spelling, is it R-A-W-B-E-R-R-Y? So after the E, it's R-R-Y. So that's two Rs at the end. Plus the one after the T, so that's three Rs total.\n\nWait, no. Let me think about how the word is pronounced. It's \"straw\" plus \"berry,\" right? So \"straw\" has one R, and \"berry\" has two Rs? No, \"berry\" only has one R. Wait, no, \"berry\" is B-E-R-R-Y, so there are two Rs there. So when you put it together, \"strawberry\" would have the R from \"straw\" and two Rs from \"berry,\" making three Rs. Hmm, but I'm not sure. Some people might think it's only two Rs, but based on the spelling, it's three.\n\nWait, no, actually, let me break it down. The word is S-T-R-A-W-B-E-R-R-Y. So after the T, there's an R, then later after the B and E, there's another R, and then another R before Y. So that's three Rs. So the answer should be three. But I'm a bit confused because sometimes people might miscount, thinking it's two. 
But according to the spelling, it's three. I think that's correct.\n\n\nThe word \"strawberry\" contains three Rs. \n\nStep-by-step breakdown:\n- The first R is the third letter.\n- The second R is the eighth letter.\n- The third R is the ninth letter.\n\nCounting each occurrence: 1 (position 3), 2 (position 8), and 3 (position 9).\n\nAnswer: There are 3 Rs in \"strawberry.\"", - "tool_calls": [] - }, - "logprobs": null, - "finish_reason": "stop", - "stop_reason": null - } - ], - "usage": { - "prompt_tokens": 15, - "total_tokens": 985, - "completion_tokens": 970, - "prompt_tokens_details": null - }, - "prompt_logprobs": null -} +```yaml +service: + # Specifying the path to the endpoint to check the readiness of the service. + readiness_probe: /health + # Allow 1 hour for code start. + initial_delay_seconds: 3600 + # Autoscaling from 0 to 2 replicas + replica_policy: + min_replicas: 0 + max_replicas: 2 ``` - - -### Shutdown -To shutdown, run -``` -sky down deepseek -``` \ No newline at end of file + diff --git a/llm/deepseek-r1/deepseek-r1-671B.yaml b/llm/deepseek-r1/deepseek-r1-671B.yaml new file mode 100644 index 00000000000..11512b9aee6 --- /dev/null +++ b/llm/deepseek-r1/deepseek-r1-671B.yaml @@ -0,0 +1,49 @@ +name: deepseek-r1 + +service: + # Specifying the path to the endpoint to check the readiness of the service. + readiness_probe: /health + # Allow 1 hour for code start. 
+ initial_delay_seconds: 3600 + # Autoscaling from 0 to 2 replicas + replica_policy: + min_replicas: 0 + max_replicas: 2 + +resources: + accelerators: {H200:8, H100:8, A100-80GB:8} + disk_size: 1024 # Large disk for model weights + disk_tier: best + ports: 30000 + any_of: + - use_spot: true + - use_spot: false + +num_nodes: 2 # Specify number of nodes to launch + +setup: | + # Install sglang with all dependencies using uv + uv pip install "sglang[all]==0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer + + # Set up shared memory for better performance + sudo bash -c "echo 'vm.max_map_count=655300' >> /etc/sysctl.conf" + sudo sysctl -p + +run: | + # Launch the server with appropriate configuration + MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) + # TP should be number of GPUs per node times number of nodes + TP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES)) + + python -m sglang.launch_server \ + --model deepseek-ai/DeepSeek-R1 \ + --tp $TP \ + --dist-init-addr ${MASTER_ADDR}:5000 \ + --nnodes ${SKYPILOT_NUM_NODES} \ + --node-rank ${SKYPILOT_NODE_RANK} \ + --trust-remote-code \ + --enable-dp-attention \ + --enable-torch-compile \ + --torch-compile-max-bs 8 \ + --host 0.0.0.0 \ + --port 30000 From 7170a91ca2605e15290689b675bc7640999f4aa1 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 14 Feb 2025 00:27:03 -0800 Subject: [PATCH 18/18] [LLM] Update Deepseek r1 README.md (#4718) --- llm/deepseek-r1/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llm/deepseek-r1/README.md b/llm/deepseek-r1/README.md index 4d6ab6a6eaa..faa6bae7454 100644 --- a/llm/deepseek-r1/README.md +++ b/llm/deepseek-r1/README.md @@ -144,5 +144,10 @@ service: max_replicas: 2 ``` +And run the [SkyPilot YAML](https://github.com/skypilot-org/skypilot/blob/master/llm/deepseek-r1/deepseek-r1-671B.yaml) with a single command: +```bash +sky serve up -n r1-serve deepseek-r1-671B.yaml +``` +