diff --git a/.buildkite/core.rayci.yml b/.buildkite/core.rayci.yml index 09005593c319..9c474bb66276 100644 --- a/.buildkite/core.rayci.yml +++ b/.buildkite/core.rayci.yml @@ -321,7 +321,7 @@ steps: commands: - bazel run //ci/ray_ci:test_in_docker -- //... core --run-flaky-tests --build-type clang - --parallelism-per-worker 2 --gpus 2 + --gpus 4 --build-name coregpubuild --only-tags multi_gpu depends_on: coregpubuild diff --git a/.buildkite/others.rayci.yml b/.buildkite/others.rayci.yml index 16508b6f7864..790266fbf4d2 100644 --- a/.buildkite/others.rayci.yml +++ b/.buildkite/others.rayci.yml @@ -1,12 +1,8 @@ group: others depends_on: - forge - - oss-ci-base_build steps: - #build - - name: doctestbuild - wanda: ci/docker/doctest.build.wanda.yaml - + # dependencies - label: ":tapioca: build: pip-compile dependencies" key: pip_compile_dependencies instance_type: small @@ -19,10 +15,13 @@ steps: - cp -f ./python/requirements_compiled.txt /artifact-mount/ soft_fail: true job_env: oss-ci-base_test-py3.11 - depends_on: - - oss-ci-base_test-multipy + depends_on: oss-ci-base_test-multipy + + # docs + - name: doctestbuild + wanda: ci/docker/doctest.build.wanda.yaml + depends_on: oss-ci-base_build - # test - label: doc tests instance_type: large commands: @@ -40,6 +39,7 @@ steps: --skip-ray-installation depends_on: doctestbuild + # java - label: ":java: java tests" tags: java instance_type: medium @@ -48,7 +48,7 @@ steps: - docker run -i --rm --volume /tmp/artifacts:/artifact-mount --shm-size=2.5gb "$${RAYCI_WORK_REPO}":"$${RAYCI_BUILD_ID}"-corebuild /bin/bash -iecuo pipefail "./java/test.sh" - depends_on: [ "corebuild", "forge" ] + depends_on: corebuild # bot - label: ":robot_face: CI weekly green metric" diff --git a/.vale/styles/config/vocabularies/Data/accept.txt b/.vale/styles/config/vocabularies/Data/accept.txt index 8ec78bd70bce..1104d6f3cd41 100644 --- a/.vale/styles/config/vocabularies/Data/accept.txt +++ b/.vale/styles/config/vocabularies/Data/accept.txt @@ -7,6 +7,7 @@ Data('s)? [Dd]iscretizer(s)? dtype [Gg]roupby +[Hh]udi [Ii]ndexable [Ii]ngest [Ii]nqueue(s)? 
diff --git a/BUILD.bazel b/BUILD.bazel index 03c002fc1256..f30037472d39 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1627,7 +1627,7 @@ ray_cc_test( deps = [ ":gcs_server_lib", ":gcs_test_util_lib", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", ], ) @@ -1649,7 +1649,7 @@ ray_cc_test( deps = [ ":gcs_server_lib", ":gcs_test_util_lib", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", ], ) @@ -1883,7 +1883,7 @@ ray_cc_test( ":gcs_table_storage_test_lib", ":gcs_test_util_lib", ":store_client_test_lib", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", ], ) @@ -2403,11 +2403,43 @@ ray_cc_test( ) ray_cc_test( - name = "gcs_export_event_test", + name = "gcs_job_manager_export_event_test", size = "small", - srcs = glob([ - "src/ray/gcs/gcs_server/test/export_api/*.cc", - ]), + srcs = ["src/ray/gcs/gcs_server/test/export_api/gcs_job_manager_export_event_test.cc"], + tags = [ + "no_windows", + "team:core" + ], + deps = [ + ":gcs_server_lib", + ":gcs_server_test_util", + ":gcs_test_util_lib", + ":ray_mock", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "gcs_actor_manager_export_event_test", + size = "small", + srcs = ["src/ray/gcs/gcs_server/test/export_api/gcs_actor_manager_export_event_test.cc"], + tags = [ + "no_windows", + "team:core" + ], + deps = [ + ":gcs_server_lib", + ":gcs_server_test_util", + ":gcs_test_util_lib", + ":ray_mock", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "gcs_node_manager_export_event_test", + size = "small", + srcs = ["src/ray/gcs/gcs_server/test/export_api/gcs_node_manager_export_event_test.cc"], tags = [ "no_windows", "team:core" diff --git a/ci/docker/ray-ml.cpu.base.wanda.yaml b/ci/docker/ray-ml.cpu.base.wanda.yaml index 53dc0700a973..29838a2a3c98 100644 --- a/ci/docker/ray-ml.cpu.base.wanda.yaml +++ b/ci/docker/ray-ml.cpu.base.wanda.yaml @@ -3,7 +3,6 @@ froms: ["cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cpu-base"] dockerfile: docker/ray-ml/Dockerfile srcs: - python/requirements.txt - - python/requirements_compiled.txt - python/requirements/ml/dl-cpu-requirements.txt - python/requirements/ml/dl-gpu-requirements.txt - python/requirements/ml/core-requirements.txt diff --git a/ci/docker/ray-ml.cuda.base.wanda.yaml b/ci/docker/ray-ml.cuda.base.wanda.yaml index 723374e90210..b3aa908c4b5f 100644 --- a/ci/docker/ray-ml.cuda.base.wanda.yaml +++ b/ci/docker/ray-ml.cuda.base.wanda.yaml @@ -3,7 +3,6 @@ froms: ["cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base"] dockerfile: docker/ray-ml/Dockerfile srcs: - python/requirements.txt - - python/requirements_compiled.txt - python/requirements/ml/dl-cpu-requirements.txt - python/requirements/ml/dl-gpu-requirements.txt - python/requirements/ml/core-requirements.txt diff --git a/ci/docker/ray.cpu.base.aarch64.wanda.yaml b/ci/docker/ray.cpu.base.aarch64.wanda.yaml index 43321ccb7ba5..1726fb261825 100644 --- a/ci/docker/ray.cpu.base.aarch64.wanda.yaml +++ b/ci/docker/ray.cpu.base.aarch64.wanda.yaml @@ -1,6 +1,8 @@ name: "ray-py$PYTHON_VERSION-cpu-base-aarch64" froms: ["ubuntu:22.04"] dockerfile: docker/base-deps/Dockerfile +srcs: + - python/requirements_compiled.txt build_args: - PYTHON_VERSION - BASE_IMAGE=ubuntu:22.04 diff --git a/ci/docker/ray.cpu.base.wanda.yaml b/ci/docker/ray.cpu.base.wanda.yaml index 4310a1820957..895605ed8f71 100644 --- a/ci/docker/ray.cpu.base.wanda.yaml +++ b/ci/docker/ray.cpu.base.wanda.yaml @@ -1,6 +1,8 @@ name: "ray-py$PYTHON_VERSION-cpu-base" froms: 
["ubuntu:22.04"] dockerfile: docker/base-deps/Dockerfile +srcs: + - python/requirements_compiled.txt build_args: - PYTHON_VERSION - BASE_IMAGE=ubuntu:22.04 diff --git a/ci/docker/ray.cuda.base.aarch64.wanda.yaml b/ci/docker/ray.cuda.base.aarch64.wanda.yaml index 51fe8a870814..1d1d6df12787 100644 --- a/ci/docker/ray.cuda.base.aarch64.wanda.yaml +++ b/ci/docker/ray.cuda.base.aarch64.wanda.yaml @@ -1,6 +1,8 @@ name: "ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base-aarch64" froms: ["nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04"] dockerfile: docker/base-deps/Dockerfile +srcs: + - python/requirements_compiled.txt build_args: - PYTHON_VERSION - BASE_IMAGE=nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04 diff --git a/ci/docker/ray.cuda.base.wanda.yaml b/ci/docker/ray.cuda.base.wanda.yaml index 3b2cbf4c3081..0bcd7611c921 100644 --- a/ci/docker/ray.cuda.base.wanda.yaml +++ b/ci/docker/ray.cuda.base.wanda.yaml @@ -1,6 +1,8 @@ name: "ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base" froms: ["nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04"] dockerfile: docker/base-deps/Dockerfile +srcs: + - python/requirements_compiled.txt build_args: - PYTHON_VERSION - BASE_IMAGE=nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04 diff --git a/ci/env/install-core-prerelease-dependencies.sh b/ci/env/install-core-prerelease-dependencies.sh index 498ecd024581..55ba3b1e55f9 100755 --- a/ci/env/install-core-prerelease-dependencies.sh +++ b/ci/env/install-core-prerelease-dependencies.sh @@ -5,7 +5,5 @@ set -e # install all unbounded dependencies in setup.py for ray core # TOOD(scv119) reenable grpcio once https://github.com/grpc/grpc/issues/31885 is fixed. # TOOD(scv119) reenable jsonschema once https://github.com/ray-project/ray/issues/33411 is fixed. -for dependency in aiosignal frozenlist requests protobuf -do - python -m pip install -U --pre --upgrade-strategy=eager $dependency -done +DEPS=(aiosignal frozenlist requests protobuf) +python -m pip install -U --pre --upgrade-strategy=eager "${DEPS[@]}" diff --git a/doc/source/cluster/configure-manage-dashboard.md b/doc/source/cluster/configure-manage-dashboard.md index ce8eb9c9e941..885357ce38b2 100644 --- a/doc/source/cluster/configure-manage-dashboard.md +++ b/doc/source/cluster/configure-manage-dashboard.md @@ -5,7 +5,7 @@ Dashboard configurations may differ depending on how you launch Ray Clusters (e.g., local Ray Cluster v.s. KubeRay). Integrations with Prometheus and Grafana are optional for enhanced Dashboard experience. :::{note} -Ray Dashboard is only intended for interactive development and debugging because the Dashboard UI and the underlying data are not accessible after Clusters are terminated. For production monitoring and debugging, users should rely on [persisted logs](../cluster/kubernetes/user-guides/logging.md), [persisted metrics](./metrics.md), [persisted Ray states](../ray-observability/user-guides/cli-sdk.rst), and other observability tools. +Ray Dashboard is useful for interactive development and debugging because when clusters terminate, the dashboard UI and the underlying data are no longer accessible. For production monitoring and debugging, you should rely on [persisted logs](../cluster/kubernetes/user-guides/persist-kuberay-custom-resource-logs.md), [persisted metrics](./metrics.md), [persisted Ray states](../ray-observability/user-guides/cli-sdk.rst), and other observability tools. 
::: ## Changing the Ray Dashboard port diff --git a/doc/source/cluster/kubernetes/configs/loki.log.yaml b/doc/source/cluster/kubernetes/configs/loki.log.yaml new file mode 100644 index 000000000000..07ab28d13344 --- /dev/null +++ b/doc/source/cluster/kubernetes/configs/loki.log.yaml @@ -0,0 +1,46 @@ +# Fluent Bit Config +config: + inputs: | + [INPUT] + Name tail + Path /var/log/containers/*.log + multiline.parser docker, cri + Tag kube.* + Mem_Buf_Limit 5MB + Skip_Long_Lines On + + filters: | + [FILTER] + Name kubernetes + Match kube.* + Merge_Log On + Keep_Log Off + K8S-Logging.Parser On + K8S-Logging.Exclude On + + outputs: | + [OUTPUT] + Name loki + Match * + Host loki-gateway + Port 80 + Labels job=fluent-bit,namespace=$kubernetes['namespace_name'],pod=$kubernetes['pod_name'],container=$kubernetes['container_name'] + Auto_Kubernetes_Labels Off + tenant_id test +--- +# Grafana Datasource Config +datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Loki + type: loki + access: proxy + editable: true + url: http://loki-gateway.default + jsonData: + timeout: 60 + maxLines: 1000 + httpHeaderName1: "X-Scope-OrgID" + secureJsonData: + httpHeaderValue1: "test" diff --git a/doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml b/doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml index 5a2d01839e9b..b42a7cf10a06 100644 --- a/doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml +++ b/doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml @@ -12,7 +12,7 @@ spec: ######################headGroupSpec################################# # head group template and specs, (perhaps 'group' is not needed in the name) headGroupSpec: - # logical group name, for this called head-group, also can be functional + # logical group name, for this called headgroup, also can be functional # pod type head or worker # rayNodeType: head # Not needed since it is under the headgroup # the following params are used to complete the ray start: ray start --head --block ... diff --git a/doc/source/cluster/kubernetes/getting-started/raycluster-quick-start.md b/doc/source/cluster/kubernetes/getting-started/raycluster-quick-start.md index 1915081b0717..6275564a9ea7 100644 --- a/doc/source/cluster/kubernetes/getting-started/raycluster-quick-start.md +++ b/doc/source/cluster/kubernetes/getting-started/raycluster-quick-start.md @@ -35,11 +35,12 @@ kubectl get pods # kuberay-operator-7fbdbf8c89-pt8bk 1/1 Running 0 27s ``` -KubeRay offers multiple options for operator installations, such as Helm, Kustomize, and a single-namespaced operator. For further information, please refer to [the installation instructions in the KubeRay documentation](https://ray-project.github.io/kuberay/deploy/installation/). +KubeRay offers multiple options for operator installations, such as Helm, Kustomize, and a single-namespaced operator. For further information, see [the installation instructions in the KubeRay documentation](https://ray-project.github.io/kuberay/deploy/installation/). +(raycluster-deploy)= ## Step 3: Deploy a RayCluster custom resource -Once the KubeRay operator is running, we are ready to deploy a RayCluster. To do so, we create a RayCluster Custom Resource (CR) in the `default` namespace. +Once the KubeRay operator is running, you're ready to deploy a RayCluster. Create a RayCluster Custom Resource (CR) in the `default` namespace. 
::::{tab-set} diff --git a/doc/source/cluster/kubernetes/user-guides.md b/doc/source/cluster/kubernetes/user-guides.md index bb8713a51822..0b2b49639949 100644 --- a/doc/source/cluster/kubernetes/user-guides.md +++ b/doc/source/cluster/kubernetes/user-guides.md @@ -15,7 +15,8 @@ user-guides/config user-guides/configuring-autoscaling user-guides/kuberay-gcs-ft user-guides/gke-gcs-bucket -user-guides/logging +user-guides/persist-kuberay-custom-resource-logs +user-guides/persist-kuberay-operator-logs user-guides/gpu user-guides/tpu user-guides/rayserve-dev-doc @@ -45,7 +46,8 @@ at the {ref}`introductory guide ` first. * {ref}`kuberay-gpu` * {ref}`kuberay-tpu` * {ref}`kuberay-gcs-ft` -* {ref}`kuberay-logging` +* {ref}`persist-kuberay-custom-resource-logs` +* {ref}`persist-kuberay-operator-logs` * {ref}`kuberay-dev-serve` * {ref}`kuberay-pod-command` * {ref}`kuberay-pod-security` diff --git a/doc/source/cluster/kubernetes/user-guides/config.md b/doc/source/cluster/kubernetes/user-guides/config.md index 5ca8df8f1c73..e4b18aaa4dea 100644 --- a/doc/source/cluster/kubernetes/user-guides/config.md +++ b/doc/source/cluster/kubernetes/user-guides/config.md @@ -126,7 +126,7 @@ Here are some of the subfields of the pod `template` to pay attention to: #### containers A Ray pod template specifies at minimum one container, namely the container that runs the Ray processes. A Ray pod template may also specify additional sidecar -containers, for purposes such as {ref}`log processing `. However, the KubeRay operator assumes that +containers, for purposes such as {ref}`log processing `. However, the KubeRay operator assumes that the first container in the containers list is the main Ray container. Therefore, make sure to specify any sidecar containers **after** the main Ray container. In other words, the Ray container should be the **first** diff --git a/doc/source/cluster/kubernetes/user-guides/images/loki-logs.png b/doc/source/cluster/kubernetes/user-guides/images/loki-logs.png new file mode 100644 index 000000000000..2419cf7ca8f0 Binary files /dev/null and b/doc/source/cluster/kubernetes/user-guides/images/loki-logs.png differ diff --git a/doc/source/cluster/kubernetes/user-guides/kuberay-gcs-ft.md b/doc/source/cluster/kubernetes/user-guides/kuberay-gcs-ft.md index dd0000049140..a54161faf82c 100644 --- a/doc/source/cluster/kubernetes/user-guides/kuberay-gcs-ft.md +++ b/doc/source/cluster/kubernetes/user-guides/kuberay-gcs-ft.md @@ -27,7 +27,7 @@ See {ref}`Ray Serve end-to-end fault tolerance documentation ` where `<PID>` is the PID of the Prometheus process that was printed out when you ran the command. To find the PID, you can also run `ps aux | grep prometheus`. +To stop Prometheus, run the following commands: + +```sh +# case 1: Ray > 2.40 +ray metrics shutdown-prometheus + +# case 2: Otherwise +# Run `ps aux | grep prometheus` to find the PID of the Prometheus process. Then, kill the process. +kill <PID> +``` + ### [Optional] Manual: Running Prometheus locally diff --git a/doc/source/conf.py b/doc/source/conf.py index 98bed502ee8c..d8ae19629647 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -74,6 +74,8 @@ ] # Configuration for algolia +# Note: This API key grants read access to our indexes and is intended to be public. +# See https://www.algolia.com/doc/guides/security/api-keys/ for more information.
docsearch_app_id = "LBHF0PABBL" docsearch_api_key = "6c42f30d9669d8e42f6fc92f44028596" docsearch_index_name = "docs-ray" diff --git a/doc/source/custom_directives.py b/doc/source/custom_directives.py index 6e81d401c833..2683160332d7 100644 --- a/doc/source/custom_directives.py +++ b/doc/source/custom_directives.py @@ -481,6 +481,7 @@ def key(cls: type) -> str: class Framework(ExampleEnum): """Framework type for example metadata.""" + AWSNEURON = "AWS Neuron" PYTORCH = "PyTorch" LIGHTNING = "Lightning" TRANSFORMERS = "Transformers" diff --git a/doc/source/data/api/input_output.rst b/doc/source/data/api/input_output.rst index bb8d791d98b2..51bd7ecedb13 100644 --- a/doc/source/data/api/input_output.rst +++ b/doc/source/data/api/input_output.rst @@ -186,6 +186,15 @@ Delta Sharing read_delta_sharing_tables +Hudi +---- + +.. autosummary:: + :nosignatures: + :toctree: doc/ + + read_hudi + Iceberg ------- diff --git a/doc/source/ray-more-libs/dask-on-ray.rst b/doc/source/ray-more-libs/dask-on-ray.rst index 3e130bfcaa35..1e6ae2f39129 100644 --- a/doc/source/ray-more-libs/dask-on-ray.rst +++ b/doc/source/ray-more-libs/dask-on-ray.rst @@ -31,7 +31,10 @@ workload. Using the Dask-on-Ray scheduler, the entire Dask ecosystem can be exec * - Ray Version - Dask Version - * - ``2.8.0`` or above + * - ``2.34.0`` or above + - | ``2022.10.1 (Python version < 3.12)`` + | ``2024.6.0 (Python version >= 3.12)`` + * - ``2.8.0`` to ``2.33.x`` - ``2022.10.1`` * - ``2.5.0`` to ``2.7.x`` - | ``2022.2.0 (Python version < 3.8)`` diff --git a/doc/source/ray-observability/user-guides/configure-logging.md b/doc/source/ray-observability/user-guides/configure-logging.md index 3be1af34cbff..358691ed5584 100644 --- a/doc/source/ray-observability/user-guides/configure-logging.md +++ b/doc/source/ray-observability/user-guides/configure-logging.md @@ -28,7 +28,7 @@ A new Ray session creates a new folder to the temp directory. The latest session Usually, temp directories are cleared up whenever the machines reboot. As a result, log files may get lost whenever your cluster or some of the nodes are stopped or terminated. -If you need to inspect logs after the clusters are stopped or terminated, you need to store and persist the logs. View the instructions for how to process and export logs for {ref}`clusters on VMs ` and {ref}`KubeRay Clusters `. +If you need to inspect logs after the clusters stop or terminate, you need to store and persist the logs. See the instructions for how to process and export logs for {ref}`Log persistence ` and {ref}`KubeRay Clusters `. (logging-directory-structure)= ## Log files in logging directory @@ -131,12 +131,12 @@ ray.get([task.remote() for _ in range(100)]) The output is as follows: ```bash -2023-03-27 15:08:34,195 INFO worker.py:1603 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 +2023-03-27 15:08:34,195 INFO worker.py:1603 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 (task pid=534172) Hello there, I am a task 0.20583517821231412 (task pid=534174) Hello there, I am a task 0.17536720316370757 [repeated 99x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication) ``` -This feature is useful when importing libraries such as `tensorflow` or `numpy`, which may emit many verbose warning messages when you import them. +This feature is useful when importing libraries such as `tensorflow` or `numpy`, which may emit many verbose warning messages when you import them. 
Configure the following environment variables on the driver process **before importing Ray** to customize log deduplication: @@ -247,8 +247,8 @@ ray_tune_logger.addHandler(logging.FileHandler("extra_ray_tune_log.log")) Implement structured logging to enable downstream users and applications to consume the logs efficiently. ### Application logs -A Ray applications include both driver and worker processes. For Python applications, use Python loggers to format and structure your logs. -As a result, Python loggers need to be set up for both driver and worker processes. +A Ray app includes both driver and worker processes. For Python apps, use Python loggers to format and structure your logs. +As a result, you need to set up Python loggers for both driver and worker processes. ::::{tab-set} @@ -472,4 +472,4 @@ The max size of a log file, including its backup, is `RAY_ROTATION_MAX_BYTES * R ## Log persistence -To process and export logs to external stroage or management systems, view {ref}`log persistence on Kubernetes ` and {ref}`log persistence on VMs ` for more details. +To process and export logs to external storage or management systems, see {ref}`log persistence on Kubernetes ` and {ref}`log persistence on VMs ` for more details. diff --git a/doc/source/ray-overview/installation.rst b/doc/source/ray-overview/installation.rst index 74fde96e48e6..97ff6a53e85a 100644 --- a/doc/source/ray-overview/installation.rst +++ b/doc/source/ray-overview/installation.rst @@ -441,8 +441,8 @@ Install Ray Java with Maven --------------------------- .. note:: - - All Ray Java APIs are experimental and only supported by the community. + + All Ray Java APIs are experimental and only supported by the community. Before installing Ray Java with Maven, you should install Ray Python with `pip install -U ray` . Note that the versions of Ray Java and Ray Python must match. Note that nightly Ray python wheels are also required if you want to install Ray Java snapshot version. @@ -506,7 +506,7 @@ Install Ray C++ .. note:: - All Ray C++ APIs are experimental and only supported by the community. + All Ray C++ APIs are experimental and only supported by the community. You can install and use Ray C++ API as follows. diff --git a/doc/source/serve/getting_started.md b/doc/source/serve/getting_started.md index ff2620cc8052..0bbe4084f3e5 100644 --- a/doc/source/serve/getting_started.md +++ b/doc/source/serve/getting_started.md @@ -101,6 +101,7 @@ parameters in the `@serve.deployment` decorator. The example configures a few co * `ray_actor_options`: a dictionary containing configuration options for each replica. * `num_cpus`: a float representing the logical number of CPUs each replica should reserve. You can make this a fraction to pack multiple replicas together on a machine with fewer CPUs than replicas. * `num_gpus`: a float representing the logical number of GPUs each replica should reserve. You can make this a fraction to pack multiple replicas together on a machine with fewer GPUs than replicas. + * `resources`: a dictionary containing other resource requirements for each replica, such as non-GPU accelerators like HPUs or TPUs (see the sketch after this list).
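For illustration, a minimal sketch (not part of this diff) combining the options documented above; the `"HPU"` resource key is a placeholder for whatever custom resource a cluster actually advertises:

```python
from ray import serve


# Hypothetical deployment combining the documented options.
@serve.deployment(
    ray_actor_options={
        "num_cpus": 0.2,          # a fraction of a logical CPU per replica
        "num_gpus": 0,            # reserve no GPUs
        "resources": {"HPU": 1},  # placeholder custom accelerator resource
    }
)
class Translator:
    def __call__(self, text: str) -> str:
        return text.lower()
```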
All these parameters are optional, so feel free to omit them: diff --git a/doc/source/serve/production-guide/kubernetes.md b/doc/source/serve/production-guide/kubernetes.md index f8b55a1f116f..5a4914699772 100644 --- a/doc/source/serve/production-guide/kubernetes.md +++ b/doc/source/serve/production-guide/kubernetes.md @@ -72,7 +72,7 @@ rayservice-sample 7s $ kubectl get pods NAME READY STATUS RESTARTS AGE -ervice-sample-raycluster-454c4-worker-small-group-b6mmg 1/1 Running 0 XXs +service-sample-raycluster-454c4-worker-small-group-b6mmg 1/1 Running 0 XXs kuberay-operator-7fbdbf8c89-4lrnr 1/1 Running 0 XXs rayservice-sample-raycluster-454c4-head-krk9d 1/1 Running 0 XXs @@ -238,7 +238,7 @@ Monitor your Serve application using the Ray Dashboard. - Learn more about how to configure and manage Dashboard [here](observability-configure-manage-dashboard). - Learn about the Ray Serve Dashboard [here](serve-monitoring). - Learn how to set up [Prometheus](prometheus-setup) and [Grafana](grafana) for Dashboard. -- Learn about the [Ray Serve logs](serve-logging) and how to [persistent logs](kuberay-logging) on Kubernetes. +- Learn about the [Ray Serve logs](serve-logging) and how to [persistent logs](persist-kuberay-custom-resource-logs) on Kubernetes. :::{note} - To troubleshoot application deployment failures in Serve, you can check the KubeRay operator logs by running `kubectl logs -f ` (e.g., `kubectl logs -f kuberay-operator-7447d85d58-lv7pf`). The KubeRay operator logs contain information about the Serve application deployment event and Serve application health checks. diff --git a/doc/source/serve/resource-allocation.md b/doc/source/serve/resource-allocation.md index 57f580f2c370..18df5a8181a4 100644 --- a/doc/source/serve/resource-allocation.md +++ b/doc/source/serve/resource-allocation.md @@ -6,14 +6,14 @@ This guide helps you configure Ray Serve to: - Scale your deployments horizontally by specifying a number of replicas - Scale up and down automatically to react to changing traffic -- Allocate hardware resources (CPUs, GPUs, etc) for each deployment +- Allocate hardware resources (CPUs, GPUs, other accelerators, etc) for each deployment (serve-cpus-gpus)= -## Resource management (CPUs, GPUs) +## Resource management (CPUs, GPUs, accelerators) -You may want to specify a deployment's resource requirements to reserve cluster resources like GPUs. To assign hardware resources per replica, you can pass resource requirements to +You may want to specify a deployment's resource requirements to reserve cluster resources like GPUs or other accelerators. To assign hardware resources per replica, you can pass resource requirements to `ray_actor_options`. By default, each replica reserves one CPU. To learn about options to pass in, take a look at the [Resources with Actors guide](actor-resource-guide). 
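As a sketch of the default noted above (one CPU per replica), the reservation can be overridden through the same `ray_actor_options` mechanism; this example mirrors the placeholder style of the GPU and HPU snippets in this hunk and is not part of the diff:

```python
from ray import serve


# Each replica of this deployment reserves two logical CPUs
# instead of the default one.
@serve.deployment(ray_actor_options={"num_cpus": 2})
def func(*args):
    return do_something_with_my_cpus()
```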
@@ -27,6 +27,14 @@ def func(*args): return do_something_with_my_gpu() ``` +Or if you want to create a deployment where each replica uses another type of accelerator such as an HPU, follow the example below: + +```python +@serve.deployment(ray_actor_options={"resources": {"HPU": 1}}) +def func(*args): + return do_something_with_my_hpu() +``` + (serve-fractional-resources-guide)= ### Fractional CPUs and fractional GPUs diff --git a/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v1.yaml b/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v1.yaml index 4d553edf1500..d82552cb5c15 100644 --- a/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v1.yaml +++ b/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v1.yaml @@ -4,7 +4,7 @@ head_node_type: name: head_node_type instance_type: g5.48xlarge resources: - custom_resources: + custom_resources: large_cpu_mem: 1 worker_node_types: @@ -14,7 +14,7 @@ worker_node_types: max_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v2.yaml b/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v2.yaml index 975c81fe5f3a..985939a018d5 100644 --- a/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v2.yaml +++ b/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v2.yaml @@ -4,7 +4,7 @@ head_node_type: name: head_node_type instance_type: g5.48xlarge resources: - custom_resources: + custom_resources: large_cpu_mem: 1 worker_node_types: @@ -20,7 +20,7 @@ worker_node_types: max_workers: 2 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_7b_or_13b.yaml b/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_7b_or_13b.yaml index a8fee017a51d..0231e4aa53bc 100644 --- a/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_7b_or_13b.yaml +++ b/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_7b_or_13b.yaml @@ -12,7 +12,7 @@ worker_node_types: max_workers: 16 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py b/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py index 6c7e1e52aa48..bba46f34208b 100644 --- a/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py +++ b/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py @@ -88,7 +88,9 @@ def transform_image( transform_image, fn_kwargs={"output_column_name": "instance_image"} ) .drop_columns(["image"]) - .add_column("instance_prompt_ids", lambda df: [instance_prompt_ids] * len(df)) + .add_column( + "instance_prompt_ids", lambda df: pd.Series([instance_prompt_ids] * len(df)) + ) ) # END: Apply preprocessing steps as Ray Dataset operations @@ -97,7 +99,9 @@ def transform_image( transform_image, fn_kwargs={"output_column_name": "class_image"} ) .drop_columns(["image"]) - .add_column("class_prompt_ids", lambda df: [class_prompt_ids] * len(df)) + .add_column( + "class_prompt_ids", lambda df: pd.Series([class_prompt_ids] * len(df)) + ) ) # --- Ray Data diff --git 
a/doc/source/templates/README.md b/doc/source/templates/README.md index 912d3174c75f..306b31bc3dc8 100644 --- a/doc/source/templates/README.md +++ b/doc/source/templates/README.md @@ -32,7 +32,7 @@ To add a template: Your template does not need to be a Jupyter notebook. It can also be presented as a Python script with `README` instructions of how to run. -2. Add a release test for the template in `release/release_tests.yaml` (for both AWS and GCE). +2. Add a release test for the template in `release/release_tests.yaml` (for both AWS and GCE). For Data tests, use `release/release_data_tests.yaml` instead. See the section on workspace templates for an example. Note that the cluster env and compute config are a little different for release tests. Use the files in the diff --git a/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v1.yaml b/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v1.yaml index 97441ceca4f7..57fa332f53c7 100644 --- a/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v1.yaml +++ b/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v1.yaml @@ -5,7 +5,7 @@ head_node_type: name: head_node_type instance_type: g5.48xlarge resources: - custom_resources: + custom_resources: large_cpu_mem: 1 worker_node_types: @@ -15,7 +15,7 @@ worker_node_types: max_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v2.yaml b/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v2.yaml index 9ff1c7d09aae..d176e25d9051 100644 --- a/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v2.yaml +++ b/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v2.yaml @@ -5,7 +5,7 @@ head_node_type: name: head_node_type instance_type: g5.48xlarge resources: - custom_resources: + custom_resources: large_cpu_mem: 1 worker_node_types: @@ -21,7 +21,7 @@ worker_node_types: max_workers: 2 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_7b.yaml b/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_7b.yaml index d8923e7ccad0..af1d8e1fa02c 100644 --- a/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_7b.yaml +++ b/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_7b.yaml @@ -13,7 +13,7 @@ worker_node_types: max_workers: 16 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/testing/compute_configs/cpu/aws.yaml b/doc/source/templates/testing/compute_configs/cpu/aws.yaml index 28b9115d2755..251368c99d42 100644 --- a/doc/source/templates/testing/compute_configs/cpu/aws.yaml +++ b/doc/source/templates/testing/compute_configs/cpu/aws.yaml @@ -13,7 +13,7 @@ worker_node_types: max_workers: 7 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/testing/compute_configs/gpu/aws.yaml b/doc/source/templates/testing/compute_configs/gpu/aws.yaml index 240dbea0e19e..d27020b4af2e 100644 --- 
a/doc/source/templates/testing/compute_configs/gpu/aws.yaml +++ b/doc/source/templates/testing/compute_configs/gpu/aws.yaml @@ -13,7 +13,7 @@ worker_node_types: max_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/train/examples.yml b/doc/source/train/examples.yml index 8b4f1c7cf8f2..0e7f6725e100 100644 --- a/doc/source/train/examples.yml +++ b/doc/source/train/examples.yml @@ -119,7 +119,17 @@ examples: contributor: community link: examples/intel_gaudi/llama_pretrain - - title: Fine-tune a Llama-2 text generation models with DeepSpeed and Hugging Face Accelerate + - title: Fine-tune Llama3.1 with AWS Trainium + frameworks: + - pytorch + - aws neuron + skill_level: advanced + use_cases: + - natural language processing + - large language models + contributor: community + link: examples/aws-trainium/llama3 + - title: Fine-tune a Llama-2 text generation model with DeepSpeed and Hugging Face Accelerate frameworks: - accelerate - deepspeed diff --git a/doc/source/train/examples/aws-trainium/llama3.rst b/doc/source/train/examples/aws-trainium/llama3.rst new file mode 100644 index 000000000000..ee7b89faf39e --- /dev/null +++ b/doc/source/train/examples/aws-trainium/llama3.rst @@ -0,0 +1,103 @@ +:orphan: + +Distributed fine-tuning of Llama 3.1 8B on AWS Trainium with Ray and PyTorch Lightning +====================================================================================== + + +This example demonstrates how to fine-tune the `Llama 3.1 8B `__ model on `AWS +Trainium `__ instances using Ray Train, PyTorch Lightning, and AWS Neuron SDK. + +AWS Trainium is the machine learning (ML) chip that AWS built for deep +learning (DL) training of 100B+ parameter models. `AWS Neuron +SDK `__ helps +developers train models on Trainium accelerators. + +Prepare the environment +----------------------- + +See `Setup EKS cluster and tools `__ for setting up an Amazon EKS cluster leveraging AWS Trainium instances. + +Create a Docker image +--------------------- +When the EKS cluster is ready, create an Amazon ECR repository for building and uploading the Docker image containing artifacts for fine-tuning a Llama3.1 8B model: + +1. Clone the repo. + +:: + + git clone https://github.com/aws-neuron/aws-neuron-eks-samples.git + +2. Go to the ``llama3.1_8B_finetune_ray_ptl_neuron`` directory. + +:: + + cd aws-neuron-eks-samples/llama3.1_8B_finetune_ray_ptl_neuron + +3. Trigger the script. + +:: + + chmod +x 0-kuberay-trn1-llama3-finetune-build-image.sh + ./0-kuberay-trn1-llama3-finetune-build-image.sh + +4. Enter the zone your cluster is running in, for example: us-east-2. + +5. Verify in the AWS console that the Amazon ECR service has the newly + created ``kuberay_trn1_llama3.1_pytorch2`` repository. + +6. Update the ECR image ARN in the manifest file used for creating the Ray cluster. + +Replace the ``<AWS_ACCOUNT_ID>`` and ``<REGION>`` placeholders with actual values in the ``1-llama3-finetune-trn1-create-raycluster.yaml`` file using commands below to reflect the ECR image ARN created above: + +:: + + export AWS_ACCOUNT_ID= # for ex: 111222333444 + export REGION= # for ex: us-east-2 + sed -i "s/<AWS_ACCOUNT_ID>/$AWS_ACCOUNT_ID/g" 1-llama3-finetune-trn1-create-raycluster.yaml + sed -i "s/<REGION>/$REGION/g" 1-llama3-finetune-trn1-create-raycluster.yaml + +Configuring Ray Cluster +----------------------- + +The ``llama3.1_8B_finetune_ray_ptl_neuron`` directory in the AWS Neuron samples repository simplifies the +Ray configuration.
KubeRay provides a manifest that you can apply +to the cluster to set up the head and worker pods. + +Run the following command to set up the Ray cluster: + +:: + + kubectl apply -f 1-llama3-finetune-trn1-create-raycluster.yaml + + +Accessing Ray Dashboard +----------------------- +Port forward from the cluster to see the state of the Ray dashboard and +then view it on `http://localhost:8265 <http://localhost:8265>`__. +Run it in the background with the following command: + +:: + + kubectl port-forward service/kuberay-trn1-head-svc 8265:8265 & + +Launching Ray Jobs +------------------ + +The Ray cluster is now ready to handle workloads. Initiate the data preparation and fine-tuning Ray jobs: + +1. Launch the Ray job for downloading the dolly-15k dataset and the Llama3.1 8B model artifacts: + +:: + + kubectl apply -f 2-llama3-finetune-trn1-rayjob-create-data.yaml + +2. When the job has executed successfully, run the following fine-tuning job: + +:: + + kubectl apply -f 3-llama3-finetune-trn1-rayjob-submit-finetuning-job.yaml + +3. Monitor the jobs via the Ray Dashboard. + + +For detailed information on each of the steps above, see the `AWS documentation link `__. \ No newline at end of file diff --git a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile index eec1e564d8c0..ee7d7872dfb0 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -1,3 +1,5 @@ +# syntax=docker/dockerfile:1.3-labs + # The base-deps Docker image installs main libraries needed to run Ray # The GPU options are NVIDIA CUDA developer images. @@ -13,82 +15,115 @@ ENV LANG=C.UTF-8 # TODO(ilr) $HOME seems to point to result in "" instead of "/home/ray" ENV PATH "/home/ray/anaconda3/bin:$PATH" ARG DEBIAN_FRONTEND=noninteractive -ARG PYTHON_VERSION=3.8.16 +ARG PYTHON_VERSION=3.9 ARG HOSTTYPE=${HOSTTYPE:-x86_64} ARG RAY_UID=1000 ARG RAY_GID=100 -RUN apt-get update -y \ - && apt-get install -y sudo tzdata \ - && useradd -ms /bin/bash -d /home/ray ray --uid $RAY_UID --gid $RAY_GID \ - && usermod -aG sudo ray \ - && echo 'ray ALL=NOPASSWD: ALL' >> /etc/sudoers \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean +RUN <<EOF +#!/bin/bash + +apt-get update -y +apt-get install -y sudo tzdata +rm -rf /var/lib/apt/lists/* +apt-get clean + +useradd -ms /bin/bash -d /home/ray ray --uid $RAY_UID --gid $RAY_GID +usermod -aG sudo ray +echo 'ray ALL=NOPASSWD: ALL' >> /etc/sudoers + +EOF USER $RAY_UID ENV HOME=/home/ray +COPY python/requirements_compiled.txt /home/ray/requirements_compiled.txt + SHELL ["/bin/bash", "-c"] -RUN sudo apt-get update -y && sudo apt-get upgrade -y \ - && sudo apt-get install -y \ - git \ - libjemalloc-dev \ - wget \ - cmake \ - g++ \ - zlib1g-dev \ - $(if [ "$AUTOSCALER" = "autoscaler" ]; then echo \ - tmux \ - screen \ - rsync \ - netbase \ - openssh-client \ - gnupg; fi) \ - && wget --quiet \ - "https://repo.anaconda.com/miniconda/Miniconda3-py311_24.4.0-0-Linux-${HOSTTYPE}.sh" \ - -O /tmp/miniconda.sh \ - && /bin/bash /tmp/miniconda.sh -b -u -p $HOME/anaconda3 \ - && $HOME/anaconda3/bin/conda init \ - && echo 'export PATH=$HOME/anaconda3/bin:$PATH' >> /home/ray/.bashrc \ - && rm /tmp/miniconda.sh \ - && $HOME/anaconda3/bin/conda install -y libgcc-ng python=$PYTHON_VERSION \ - && $HOME/anaconda3/bin/conda install -y -c conda-forge libffi=3.4.2 \ - && $HOME/anaconda3/bin/conda clean -y --all \ - && $HOME/anaconda3/bin/pip install --no-cache-dir \ - flatbuffers \ - cython==0.29.37 \ - # Necessary for Dataset to work properly. - numpy\>=1.20 \ - psutil \ - # Required a recent version of setuptools to be compatible with python 3.12+.
- setuptools==71.1.0 \ - # To avoid the following error on Jenkins: - # AttributeError: 'numpy.ufunc' object has no attribute '__module__' - && $HOME/anaconda3/bin/pip uninstall -y dask \ - # We install cmake temporarily to get psutil - && sudo apt-get autoremove -y cmake zlib1g-dev \ - # We keep g++ on GPU images, because uninstalling removes CUDA Devel tooling - $(if [[ "$BASE_IMAGE" == "ubuntu:22.04" && "$HOSTTYPE" == "x86_64" ]]; then echo \ - g++; fi) \ - && sudo rm -rf /var/lib/apt/lists/* \ - && sudo apt-get clean \ - && (if [ "$AUTOSCALER" = "autoscaler" ]; \ then $HOME/anaconda3/bin/pip --no-cache-dir install \ "redis>=3.5.0,<4.0.0" \ "six==1.13.0" \ "boto3==1.26.76" \ "pyOpenSSL==22.1.0" \ "cryptography==38.0.1" \ "google-api-python-client==1.7.8" \ "google-oauth" \ "azure-cli-core==2.40.0" \ "azure-identity==1.10.0" \ "azure-mgmt-compute==23.1.0" \ "azure-mgmt-network==19.0.0" \ "azure-mgmt-resource==20.0.0" \ "msrestazure==0.6.4"; \ fi;) + +RUN <<EOF +#!/bin/bash + +APT_PKGS=( + git + libjemalloc-dev + wget + cmake + g++ + zlib1g-dev +) +if [[ "$AUTOSCALER" == "autoscaler" ]]; then + APT_PKGS+=( + tmux + screen + rsync + netbase + openssh-client + gnupg + ) +fi + +sudo apt-get update -y +sudo apt-get upgrade -y +sudo apt-get install -y "${APT_PKGS[@]}" + +wget --quiet "https://repo.anaconda.com/miniconda/Miniconda3-py311_24.4.0-0-Linux-${HOSTTYPE}.sh" -O /tmp/miniconda.sh +/bin/bash /tmp/miniconda.sh -b -u -p $HOME/anaconda3 +$HOME/anaconda3/bin/conda init +echo 'export PATH=$HOME/anaconda3/bin:$PATH' >> /home/ray/.bashrc +rm /tmp/miniconda.sh +$HOME/anaconda3/bin/conda install -y libgcc-ng python=$PYTHON_VERSION +$HOME/anaconda3/bin/conda install -y -c conda-forge libffi=3.4.2 +$HOME/anaconda3/bin/conda clean -y --all + +PIP_PKGS=( + # Required a recent version of setuptools to be compatible with python 3.12+. + setuptools==71.1.0 + + flatbuffers + cython + numpy # Necessary for Dataset to work properly. + psutil +) +if [[ "$AUTOSCALER" == "autoscaler" ]]; then + PIP_PKGS+=( + redis + six + boto3 + pyopenssl + cryptography + google-api-python-client + google-oauth + ) +fi + +$HOME/anaconda3/bin/pip install --no-cache-dir \ + -c $HOME/requirements_compiled.txt \ + "${PIP_PKGS[@]}" + +# To avoid the following error on Jenkins: +# AttributeError: 'numpy.ufunc' object has no attribute '__module__' +$HOME/anaconda3/bin/pip uninstall -y dask + +# We install cmake temporarily to get psutil +sudo apt-get autoremove -y cmake zlib1g-dev + +# We keep g++ on GPU images, because uninstalling removes CUDA Devel tooling +if [[ "$BASE_IMAGE" == "ubuntu:22.04" && "$HOSTTYPE" == "x86_64" ]]; then + sudo apt-get autoremove -y g++ +fi + +sudo rm -rf /var/lib/apt/lists/* +sudo apt-get clean + +EOF WORKDIR $HOME diff --git a/docker/ray-ml/Dockerfile b/docker/ray-ml/Dockerfile index 67ee790389a6..42743924a118 100644 --- a/docker/ray-ml/Dockerfile +++ b/docker/ray-ml/Dockerfile @@ -1,19 +1,13 @@ +# syntax=docker/dockerfile:1.3-labs + ARG BASE_IMAGE ARG FULL_BASE_IMAGE=rayproject/ray:nightly"$BASE_IMAGE" FROM "$FULL_BASE_IMAGE" -# The python/* paths only exist in civ2, so we put them as non-first arguments. Docker -# will ignore non-existent paths if they are non-first arguments. -# -# TODO(can): simplify this once civ1 is completely deprecated.
-COPY *requirements.txt \ - python/*requirements.txt \ +COPY python/*requirements.txt \ python/requirements/ml/*requirements.txt \ python/requirements/docker/*requirements.txt ./ -COPY *requirements_compiled.txt \ - python/*requirements_compiled.txt ./ -COPY *install-ml-docker-requirements.sh \ - docker/ray-ml/*install-ml-docker-requirements.sh ./ +COPY docker/ray-ml/install-ml-docker-requirements.sh ./ RUN sudo chmod +x install-ml-docker-requirements.sh \ && ./install-ml-docker-requirements.sh diff --git a/docker/ray-ml/install-ml-docker-requirements.sh b/docker/ray-ml/install-ml-docker-requirements.sh index d6744a13d2aa..0763b4d9589e 100755 --- a/docker/ray-ml/install-ml-docker-requirements.sh +++ b/docker/ray-ml/install-ml-docker-requirements.sh @@ -52,7 +52,8 @@ pip --no-cache-dir install \ sudo apt-get clean -sudo rm ./*requirements*.txt +# requirements_compiled.txt will be kept. +sudo rm ./*requirements.txt requirements_compiled_gpu.txt # MuJoCo Installation. export MUJOCO_GL=osmesa diff --git a/docker/ray/Dockerfile b/docker/ray/Dockerfile index 888183223609..9e54302603c9 100644 --- a/docker/ray/Dockerfile +++ b/docker/ray/Dockerfile @@ -1,3 +1,5 @@ +# syntax=docker/dockerfile:1.3-labs + ARG BASE_IMAGE ARG FULL_BASE_IMAGE=rayproject/ray-deps:nightly"$BASE_IMAGE" FROM $FULL_BASE_IMAGE @@ -6,7 +8,6 @@ ARG WHEEL_PATH ARG FIND_LINKS_PATH=".whl" ARG CONSTRAINTS_FILE="requirements_compiled.txt" -COPY requirements_compiled.txt ./ COPY $WHEEL_PATH . COPY $FIND_LINKS_PATH $FIND_LINKS_PATH diff --git a/python/ray/_private/ray_logging/constants.py b/python/ray/_private/ray_logging/constants.py index de84d510c16c..54552bdfe1d7 100644 --- a/python/ray/_private/ray_logging/constants.py +++ b/python/ray/_private/ray_logging/constants.py @@ -41,6 +41,9 @@ class LogKey(str, Enum): NODE_ID = "node_id" ACTOR_ID = "actor_id" TASK_ID = "task_id" + ACTOR_NAME = "actor_name" + TASK_NAME = "task_name" + TASK_FUNCTION_NAME = "task_func_name" # Logger built-in context ASCTIME = "asctime" diff --git a/python/ray/_private/ray_logging/filters.py b/python/ray/_private/ray_logging/filters.py index e7003022040a..91233a2b11c6 100644 --- a/python/ray/_private/ray_logging/filters.py +++ b/python/ray/_private/ray_logging/filters.py @@ -20,4 +20,13 @@ def filter(self, record): task_id = runtime_context.get_task_id() if task_id is not None: setattr(record, LogKey.TASK_ID.value, task_id) + task_name = runtime_context.get_task_name() + if task_name is not None: + setattr(record, LogKey.TASK_NAME.value, task_name) + task_function_name = runtime_context.get_task_function_name() + if task_function_name is not None: + setattr(record, LogKey.TASK_FUNCTION_NAME.value, task_function_name) + actor_name = runtime_context.get_actor_name() + if actor_name is not None: + setattr(record, LogKey.ACTOR_NAME.value, actor_name) return True diff --git a/python/ray/_private/ray_option_utils.py b/python/ray/_private/ray_option_utils.py index 91345e536446..61c898aff8c4 100644 --- a/python/ray/_private/ray_option_utils.py +++ b/python/ray/_private/ray_option_utils.py @@ -147,6 +147,7 @@ def _validate_resources(resources: Optional[Dict[str, float]]) -> Optional[str]: ), "_metadata": Option((dict, type(None))), "enable_task_events": Option(bool, default_value=True), + "_labels": Option((dict, type(None))), } diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index 0de0ba78405f..23461e950286 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -834,8 +834,8 @@ def start_ray_process( 
use_valgrind_profiler: bool = False, use_perftools_profiler: bool = False, use_tmux: bool = False, - stdout_file: Optional[str] = None, - stderr_file: Optional[str] = None, + stdout_file: Optional[IO[AnyStr]] = None, + stderr_file: Optional[IO[AnyStr]] = None, pipe_stdin: bool = False, ): """Start one of the Ray processes. @@ -1443,8 +1443,8 @@ def start_gcs_server( redis_address: str, log_dir: str, session_name: str, - stdout_file: Optional[str] = None, - stderr_file: Optional[str] = None, + stdout_file: Optional[IO[AnyStr]] = None, + stderr_file: Optional[IO[AnyStr]] = None, redis_password: Optional[str] = None, config: Optional[dict] = None, fate_share: Optional[bool] = None, diff --git a/python/ray/_private/state.py b/python/ray/_private/state.py index bebca03c0044..b8af96491b08 100644 --- a/python/ray/_private/state.py +++ b/python/ray/_private/state.py @@ -525,7 +525,7 @@ def chrome_tracing_object_transfer_dump(self, filename=None): """Return a list of transfer events that can viewed as a timeline. To view this information as a timeline, simply dump it as a json file - by passing in "filename" or using using json.dump, and then load go to + by passing in "filename" or using json.dump, and then go to chrome://tracing in the Chrome web browser and load the dumped file. Make sure to enable "Flow events" in the "View Options" menu. @@ -748,7 +748,7 @@ def _live_node_ids(self): return set(self.total_resources_per_node().keys()) def available_resources_per_node(self): - """Returns a dictionary mapping node id to avaiable resources.""" + """Returns a dictionary mapping node id to available resources.""" self._check_connected() available_resources_by_id = {} @@ -952,7 +952,7 @@ def timeline(filename=None): variable prior to starting Ray, and set RAY_task_events_report_interval_ms=0 To view this information as a timeline, simply dump it as a json file by - passing in "filename" or using using json.dump, and then load go to + passing in "filename" or using json.dump, and then go to chrome://tracing in the Chrome web browser and load the dumped file. Args: @@ -970,7 +970,7 @@ def object_transfer_timeline(filename=None): """Return a list of transfer events that can viewed as a timeline. To view this information as a timeline, simply dump it as a json file by - passing in "filename" or using using json.dump, and then load go to + passing in "filename" or using json.dump, and then go to chrome://tracing in the Chrome web browser and load the dumped file. Make sure to enable "Flow events" in the "View Options" menu. diff --git a/python/ray/_private/test_utils.py b/python/ray/_private/test_utils.py index 1eb26e0fad25..7bf0de943269 100644 --- a/python/ray/_private/test_utils.py +++ b/python/ray/_private/test_utils.py @@ -98,6 +98,12 @@ def redis_replicas(): return int(os.environ.get("TEST_EXTERNAL_REDIS_REPLICAS", "1")) +def redis_sentinel_replicas(): + import os + + return int(os.environ.get("TEST_EXTERNAL_REDIS_SENTINEL_REPLICAS", "2")) + + def get_redis_cli(port, enable_tls): try: # If there is no redis libs installed, skip the check.
@@ -122,6 +128,63 @@ def get_redis_cli(port, enable_tls): return redis.Redis("localhost", str(port), **params) +def start_redis_sentinel_instance( + session_dir_path: str, + port: int, + redis_master_port: int, + password: Optional[str] = None, + enable_tls: bool = False, + db_dir=None, + free_port=0, +): + config_file = os.path.join( + session_dir_path, "redis-sentinel-" + uuid.uuid4().hex + ".conf" + ) + config_lines = [] + # Port for this Sentinel instance + if enable_tls: + config_lines.append(f"port {free_port}") + else: + config_lines.append(f"port {port}") + + # Monitor the Redis master + config_lines.append(f"sentinel monitor redis-test 127.0.0.1 {redis_master_port} 1") + config_lines.append( + "sentinel down-after-milliseconds redis-test 1000" + ) # failover after 1 second + config_lines.append("sentinel failover-timeout redis-test 5000") # + config_lines.append("sentinel parallel-syncs redis-test 1") + + if password: + config_lines.append(f"sentinel auth-pass redis-test {password}") + + if enable_tls: + config_lines.append(f"tls-port {port}") + if Config.REDIS_CA_CERT(): + config_lines.append(f"tls-ca-cert-file {Config.REDIS_CA_CERT()}") + # Check and add TLS client certificate file + if Config.REDIS_CLIENT_CERT(): + config_lines.append(f"tls-cert-file {Config.REDIS_CLIENT_CERT()}") + # Check and add TLS client key file + if Config.REDIS_CLIENT_KEY(): + config_lines.append(f"tls-key-file {Config.REDIS_CLIENT_KEY()}") + config_lines.append("tls-auth-clients no") + config_lines.append("sentinel tls-auth-clients redis-test no") + if db_dir: + config_lines.append(f"dir {db_dir}") + + with open(config_file, "w") as f: + f.write("\n".join(config_lines)) + + command = [REDIS_EXECUTABLE, config_file, "--sentinel"] + process_info = ray._private.services.start_ray_process( + command, + ray_constants.PROCESS_TYPE_REDIS_SERVER, + fate_share=False, + ) + return process_info + + def start_redis_instance( session_dir_path: str, port: int, diff --git a/python/ray/_private/usage/usage_lib.py b/python/ray/_private/usage/usage_lib.py index e980703ed3eb..558f56c602ef 100644 --- a/python/ray/_private/usage/usage_lib.py +++ b/python/ray/_private/usage/usage_lib.py @@ -634,8 +634,8 @@ def _get_cluster_status_to_report_v2(gcs_client) -> ClusterStatusToReport: try: cluster_status = get_cluster_status(gcs_client.address) total_resources = cluster_status.total_resources() - result.total_num_cpus = total_resources.get("CPU", 0) - result.total_num_gpus = total_resources.get("GPU", 0) + result.total_num_cpus = int(total_resources.get("CPU", 0)) + result.total_num_gpus = int(total_resources.get("GPU", 0)) to_GiB = 1 / 2**30 result.total_memory_gb = total_resources.get("memory", 0) * to_GiB diff --git a/python/ray/_private/utils.py b/python/ray/_private/utils.py index 0eb5bf09c997..bd23131bebdf 100644 --- a/python/ray/_private/utils.py +++ b/python/ray/_private/utils.py @@ -1608,7 +1608,8 @@ def get_runtime_env_info( In the user interface, the argument `runtime_env` contains some fields which not contained in `ProtoRuntimeEnv` but in `ProtoRuntimeEnvInfo`, such as `eager_install`. This function will extract those fields from - `RuntimeEnv` and create a new `ProtoRuntimeEnvInfo`, and serialize it. + `RuntimeEnv` and create a new `ProtoRuntimeEnvInfo`, and serialize it + into json format. 
""" from ray.runtime_env import RuntimeEnvConfig diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index 118c556ec966..d2b0cf3b013d 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -531,6 +531,14 @@ def actor_name(self): def current_task_id(self): return self.core_worker.get_current_task_id() + @property + def current_task_name(self): + return self.core_worker.get_current_task_name() + + @property + def current_task_function_name(self): + return self.core_worker.get_current_task_function_name() + @property def current_node_id(self): return self.core_worker.get_current_node_id() @@ -3549,7 +3557,7 @@ def method(self): for more details. _metadata: Extended options for Ray libraries. For example, _metadata={"workflows.io/options": } for Ray workflows. - + _labels: The key-value labels of a task or actor. """ # "callable" returns true for both function and class. if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 3ddf101189dc..f3d93dce33ba 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -261,6 +261,9 @@ cdef optional[ObjectIDIndexType] NULL_PUT_INDEX = nullopt # https://docs.python.org/3/library/contextvars.html#contextvars.ContextVar # It is thread-safe. async_task_id = contextvars.ContextVar('async_task_id', default=None) +async_task_name = contextvars.ContextVar('async_task_name', default=None) +async_task_function_name = contextvars.ContextVar('async_task_function_name', + default=None) class DynamicObjectRefGenerator: @@ -737,11 +740,26 @@ cdef class Language: JAVA = Language.from_native(LANGUAGE_JAVA) +cdef int prepare_labels( + dict label_dict, + unordered_map[c_string, c_string] *label_map) except -1: + + if label_dict is None: + return 0 + + for key, value in label_dict.items(): + if not isinstance(key, str): + raise ValueError(f"Label key must be string, but got {type(key)}") + if not isinstance(value, str): + raise ValueError(f"Label value must be string, but got {type(value)}") + label_map[0][key.encode("utf-8")] = value.encode("utf-8") + + return 0 + cdef int prepare_resources( dict resource_dict, unordered_map[c_string, double] *resource_map) except -1: cdef: - unordered_map[c_string, double] out c_string resource_name list unit_resources @@ -1800,7 +1818,8 @@ cdef void execute_task( return core_worker.run_async_func_or_coro_in_event_loop( async_function, function_descriptor, name_of_concurrency_group_to_execute, task_id=task_id, - func_args=(actor, *arguments), func_kwargs=kwarguments) + task_name=task_name, func_args=(actor, *arguments), + func_kwargs=kwarguments) return function(actor, *arguments, **kwarguments) @@ -1912,7 +1931,8 @@ cdef void execute_task( execute_streaming_generator_async(context), function_descriptor, name_of_concurrency_group_to_execute, - task_id=task_id) + task_id=task_id, + task_name=task_name) else: execute_streaming_generator_sync(context) @@ -3400,6 +3420,48 @@ cdef class CoreWorker: with nogil: CCoreWorkerProcess.GetCoreWorker().Exit(c_exit_type, detail, null_ptr) + def get_current_task_name(self) -> str: + """Return the current task name. + + If it is a normal task, it returns the task name from the main thread. + If it is a threaded actor, it returns the task name for the current thread. + If it is async actor, it returns the task name stored in contextVar for + the current asyncio task. + """ + # We can only obtain the correct task name within asyncio task + # via async_task_name contextvar. 
We try this first. + # It is needed because the core worker's GetCurrentTask API + # doesn't have asyncio context, thus it cannot return the + # correct task name. + task_name = async_task_name.get() + if task_name is None: + # if it is not within asyncio context, fallback to TaskName + # obtainable from core worker. + task_name = CCoreWorkerProcess.GetCoreWorker().GetCurrentTaskName() \ + .decode("utf-8") + return task_name + + def get_current_task_function_name(self) -> str: + """Return the current task function. + + If it is a normal task, it returns the task function from the main thread. + If it is a threaded actor, it returns the task function for the current thread. + If it is async actor, it returns the task function stored in contextVar for + the current asyncio task. + """ + # We can only obtain the correct task function within asyncio task + # via async_task_function_name contextvar. We try this first. + # It is needed because the core Worker's GetCurrentTask API + # doesn't have asyncio context, thus it cannot return the + # correct task function. + task_function_name = async_task_function_name.get() + if task_function_name is None: + # if it is not within asyncio context, fallback to TaskName + # obtainable from core worker. + task_function_name = CCoreWorkerProcess.GetCoreWorker() \ + .GetCurrentTaskFunctionName().decode("utf-8") + return task_function_name + def get_current_task_id(self) -> TaskID: """Return the current task ID. @@ -4009,10 +4071,12 @@ cdef class CoreWorker: c_string debugger_breakpoint, c_string serialized_runtime_env_info, int64_t generator_backpressure_num_objects, - c_bool enable_task_events + c_bool enable_task_events, + labels, ): cdef: unordered_map[c_string, double] c_resources + unordered_map[c_string, c_string] c_labels CRayFunction ray_function CTaskOptions task_options c_vector[unique_ptr[CTaskArg]] args_vector @@ -4032,6 +4096,7 @@ cdef class CoreWorker: with self.profile_event(b"submit_task"): prepare_resources(resources, &c_resources) + prepare_labels(labels, &c_labels) ray_function = CRayFunction( language.lang, function_descriptor.descriptor) prepare_args_and_increment_put_refs( @@ -4043,7 +4108,9 @@ cdef class CoreWorker: b"", generator_backpressure_num_objects, serialized_runtime_env_info, - enable_task_events) + enable_task_events, + c_labels, + ) current_c_task_id = current_task.native() @@ -4089,6 +4156,7 @@ cdef class CoreWorker: int32_t max_pending_calls, scheduling_strategy, c_bool enable_task_events, + labels, ): cdef: CRayFunction ray_function @@ -4101,6 +4169,7 @@ cdef class CoreWorker: CSchedulingStrategy c_scheduling_strategy c_vector[CObjectID] incremented_put_arg_ids optional[c_bool] is_detached_optional = nullopt + unordered_map[c_string, c_string] c_labels self.python_scheduling_strategy_to_c( scheduling_strategy, &c_scheduling_strategy) @@ -4108,6 +4177,7 @@ cdef class CoreWorker: with self.profile_event(b"submit_task"): prepare_resources(resources, &c_resources) prepare_resources(placement_resources, &c_placement_resources) + prepare_labels(labels, &c_labels) ray_function = CRayFunction( language.lang, function_descriptor.descriptor) prepare_args_and_increment_put_refs( @@ -4136,7 +4206,8 @@ cdef class CoreWorker: # async or threaded actors. 
is_asyncio or max_concurrency > 1, max_pending_calls, - enable_task_events), + enable_task_events, + c_labels), extension_data, &c_actor_id) @@ -4247,6 +4318,7 @@ cdef class CoreWorker: TaskID current_task = self.get_current_task_id() c_string serialized_retry_exception_allowlist c_string serialized_runtime_env = b"{}" + unordered_map[c_string, c_string] c_labels serialized_retry_exception_allowlist = serialize_retry_exception_allowlist( retry_exception_allowlist, @@ -4275,7 +4347,8 @@ cdef class CoreWorker: concurrency_group_name, generator_backpressure_num_objects, serialized_runtime_env, - enable_task_events), + enable_task_events, + c_labels), max_retries, retry_exceptions, serialized_retry_exception_allowlist, @@ -4796,6 +4869,7 @@ cdef class CoreWorker: specified_cgname: str, *, task_id: Optional[TaskID] = None, + task_name: Optional[str] = None, func_args: Optional[Tuple] = None, func_kwargs: Optional[Dict] = None, ): @@ -4842,6 +4916,9 @@ cdef class CoreWorker: try: if task_id: async_task_id.set(task_id) + if task_name is not None: + async_task_name.set(task_name) + async_task_function_name.set(function_descriptor.repr) if inspect.isawaitable(func_or_coro): coroutine = func_or_coro diff --git a/python/ray/actor.py b/python/ray/actor.py index 222f52c24b5f..824de9efad73 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -924,6 +924,7 @@ def _remote(self, args=None, kwargs=None, **actor_options): scheduling_strategy: Strategy about how to schedule this actor. enable_task_events: True if tracing is enabled, i.e., task events from the actor should be reported. Defaults to True. + _labels: The key-value labels of the actor. Returns: A handle to the newly created actor. @@ -1197,6 +1198,7 @@ def _remote(self, args=None, kwargs=None, **actor_options): max_pending_calls=max_pending_calls, scheduling_strategy=scheduling_strategy, enable_task_events=enable_task_events, + labels=actor_options.get("_labels"), ) if _actor_launch_hook: diff --git a/python/ray/air/BUILD b/python/ray/air/BUILD index 0799ef871078..58951ee92889 100644 --- a/python/ray/air/BUILD +++ b/python/ray/air/BUILD @@ -46,6 +46,14 @@ py_test( deps = [":ml_lib"] ) +py_test( + name = "test_arrow", + size = "small", + srcs = ["tests/test_arrow.py"], + tags = ["team:ml", "team:data", "ray_data", "exclusive"], + deps = [":ml_lib"] +) + py_test( name = "test_air_usage", size = "small", diff --git a/python/ray/air/data_batch_type.py b/python/ray/air/data_batch_type.py index a6fad4591d35..5d5d09b3218e 100644 --- a/python/ray/air/data_batch_type.py +++ b/python/ray/air/data_batch_type.py @@ -2,6 +2,10 @@ if TYPE_CHECKING: import numpy - import pandas + import pandas # noqa: F401 + import pyarrow -DataBatchType = Union["numpy.ndarray", "pandas.DataFrame", Dict[str, "numpy.ndarray"]] +# TODO de-dup with ray.data.block.DataBatch +DataBatchType = Union[ + "numpy.ndarray", "pyarrow.Table" "pandas.DataFrame", Dict[str, "numpy.ndarray"] +] diff --git a/python/ray/air/tests/test_arrow.py b/python/ray/air/tests/test_arrow.py new file mode 100644 index 000000000000..efe68937836b --- /dev/null +++ b/python/ray/air/tests/test_arrow.py @@ -0,0 +1,71 @@ +from dataclasses import dataclass, field + +import pyarrow as pa +import pytest + +from ray.air.util.tensor_extensions.arrow import ( + ArrowConversionError, + _convert_to_pyarrow_native_array, + _infer_pyarrow_type, + convert_to_pyarrow_array, +) +from ray.air.util.tensor_extensions.utils import create_ragged_ndarray + + +@dataclass +class UserObj: + i: int = field() + + +def 
test_pa_infer_type_failing_to_infer(): + # Represent a single column that will be using `ArrowPythonObjectExtension` type + # to ser/de native Python objects into bytes + column_vals = create_ragged_ndarray( + [ + "hi", + 1, + None, + [[[[]]]], + {"a": [[{"b": 2, "c": UserObj(i=123)}]]}, + UserObj(i=456), + ] + ) + + inferred_dtype = _infer_pyarrow_type(column_vals) + + # Arrow (17.0) seem to fallback to assume the dtype of the first element + assert pa.string().equals(inferred_dtype) + + +def test_convert_to_pyarrow_array_object_ext_type_fallback(): + column_values = create_ragged_ndarray( + [ + "hi", + 1, + None, + [[[[]]]], + {"a": [[{"b": 2, "c": UserObj(i=123)}]]}, + UserObj(i=456), + ] + ) + column_name = "py_object_column" + + # First, assert that straightforward conversion into Arrow native types fails + with pytest.raises(ArrowConversionError) as exc_info: + _convert_to_pyarrow_native_array(column_values, column_name) + + assert ( + str(exc_info.value) + == "Error converting data to Arrow: ['hi' 1 None list([[[[]]]]) {'a': [[{'b': 2, 'c': UserObj(i=123)}]]}\n UserObj(i=456)]" # noqa: E501 + ) + + # Subsequently, assert that fallback to `ArrowObjectExtensionType` succeeds + pa_array = convert_to_pyarrow_array(column_values, column_name) + + assert pa_array.to_pylist() == column_values.tolist() + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/air/tests/test_object_extension.py b/python/ray/air/tests/test_object_extension.py index b1479dbc4ac0..64600bafc69c 100644 --- a/python/ray/air/tests/test_object_extension.py +++ b/python/ray/air/tests/test_object_extension.py @@ -7,13 +7,13 @@ from ray.air.util.object_extensions.arrow import ( ArrowPythonObjectArray, ArrowPythonObjectType, - object_extension_type_allowed, + _object_extension_type_allowed, ) from ray.air.util.object_extensions.pandas import PythonObjectArray @pytest.mark.skipif( - not object_extension_type_allowed(), reason="Object extension not supported." + not _object_extension_type_allowed(), reason="Object extension not supported." ) def test_object_array_validation(): # Test unknown input type raises TypeError. @@ -25,7 +25,7 @@ def test_object_array_validation(): @pytest.mark.skipif( - not object_extension_type_allowed(), reason="Object extension not supported." + not _object_extension_type_allowed(), reason="Object extension not supported." ) def test_arrow_scalar_object_array_roundtrip(): arr = np.array( @@ -41,7 +41,7 @@ def test_arrow_scalar_object_array_roundtrip(): @pytest.mark.skipif( - not object_extension_type_allowed(), reason="Object extension not supported." + not _object_extension_type_allowed(), reason="Object extension not supported." ) def test_arrow_python_object_array_slice(): arr = np.array(["test", 20, "test2", 40, "test3", 60], dtype=object) @@ -51,7 +51,7 @@ def test_arrow_python_object_array_slice(): @pytest.mark.skipif( - not object_extension_type_allowed(), reason="Object extension not supported." + not _object_extension_type_allowed(), reason="Object extension not supported." 
) def test_arrow_pandas_roundtrip(): obj = types.SimpleNamespace(a=1, b="test") diff --git a/python/ray/air/util/data_batch_conversion.py b/python/ray/air/util/data_batch_conversion.py index 4fe7a8ab2ea9..1bf69b4b9398 100644 --- a/python/ray/air/util/data_batch_conversion.py +++ b/python/ray/air/util/data_batch_conversion.py @@ -6,9 +6,6 @@ from ray.air.constants import TENSOR_COLUMN_NAME from ray.air.data_batch_type import DataBatchType -from ray.air.util.tensor_extensions.arrow import ( - get_arrow_extension_fixed_shape_tensor_types, -) from ray.util.annotations import Deprecated, DeveloperAPI if TYPE_CHECKING: @@ -220,37 +217,31 @@ def _convert_batch_type_to_numpy( ) return data elif pyarrow is not None and isinstance(data, pyarrow.Table): - from ray.air.util.transform_pyarrow import ( - _concatenate_extension_column, - _is_column_extension_type, + from ray.air.util.tensor_extensions.arrow import ( + get_arrow_extension_fixed_shape_tensor_types, ) + from ray.data._internal.arrow_ops import transform_pyarrow - if data.column_names == [TENSOR_COLUMN_NAME] and ( - isinstance( - data.schema.types[0], get_arrow_extension_fixed_shape_tensor_types() + column_values_ndarrays = [] + + for col in data.columns: + # Combine columnar values arrays to make these contiguous + # (making them compatible with numpy format) + combined_array = transform_pyarrow.combine_chunked_array(col) + + column_values_ndarrays.append( + transform_pyarrow.to_numpy(combined_array, zero_copy_only=False) ) + + arrow_fixed_shape_tensor_types = get_arrow_extension_fixed_shape_tensor_types() + + # NOTE: This branch is here for backwards-compatibility + if data.column_names == [TENSOR_COLUMN_NAME] and ( + isinstance(data.schema.types[0], arrow_fixed_shape_tensor_types) ): - # If representing a tensor dataset, return as a single numpy array. 
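The rewritten branch above funnels every column through the same combine-then-convert path. A self-contained sketch of the idea using only public pyarrow calls (the real code goes through the internal transform_pyarrow helpers, which additionally handle extension types):

import pyarrow as pa


def table_to_numpy_columns(table: pa.Table) -> dict:
    out = {}
    for name in table.column_names:
        col = table[name]
        # Chunked columns must be made contiguous before NumPy conversion.
        if col.num_chunks == 0:
            combined = pa.array([], type=col.type)
        else:
            combined = col.combine_chunks()
        out[name] = combined.to_numpy(zero_copy_only=False)
    return out


t = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
print(table_to_numpy_columns(t))  # {'a': array([1, 2, 3]), 'b': array(...)}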
- # Example: ray.data.from_numpy(np.arange(12).reshape((3, 2, 2))) - # Arrow’s incorrect concatenation of extension arrays: - # https://issues.apache.org/jira/browse/ARROW-16503 - return _concatenate_extension_column(data[TENSOR_COLUMN_NAME]).to_numpy( - zero_copy_only=False - ) - else: - output_dict = {} - for col_name in data.column_names: - col = data[col_name] - if col.num_chunks == 0: - col = pyarrow.array([], type=col.type) - elif _is_column_extension_type(col): - # Arrow’s incorrect concatenation of extension arrays: - # https://issues.apache.org/jira/browse/ARROW-16503 - col = _concatenate_extension_column(col) - else: - col = col.combine_chunks() - output_dict[col_name] = col.to_numpy(zero_copy_only=False) - return output_dict + return column_values_ndarrays[0] + + return dict(zip(data.column_names, column_values_ndarrays)) elif isinstance(data, pd.DataFrame): return _convert_pandas_to_batch_type(data, BatchFormat.NUMPY) else: diff --git a/python/ray/air/util/object_extensions/arrow.py b/python/ray/air/util/object_extensions/arrow.py index c3158bbff68b..a56a04869855 100644 --- a/python/ray/air/util/object_extensions/arrow.py +++ b/python/ray/air/util/object_extensions/arrow.py @@ -16,7 +16,7 @@ PYARROW_VERSION = None if _VER is None else parse_version(_VER) -def object_extension_type_allowed() -> bool: +def _object_extension_type_allowed() -> bool: return ( PYARROW_VERSION is not None and PYARROW_VERSION >= MIN_PYARROW_VERSION_SCALAR_SUBCLASS @@ -104,7 +104,9 @@ def from_objects( arr = pa.array(all_dumped_bytes, type=type_.storage_type) return ArrowPythonObjectArray.from_storage(type_, arr) - def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: + def to_numpy( + self, zero_copy_only: bool = False, writable: bool = False + ) -> np.ndarray: arr = np.empty(len(self), dtype=object) arr[:] = self.to_pylist() return arr diff --git a/python/ray/air/util/tensor_extensions/arrow.py b/python/ray/air/util/tensor_extensions/arrow.py index bef940c136f9..ebe01c792458 100644 --- a/python/ray/air/util/tensor_extensions/arrow.py +++ b/python/ray/air/util/tensor_extensions/arrow.py @@ -10,10 +10,14 @@ from packaging.version import parse as parse_version from ray._private.utils import _get_pyarrow_version +from ray.air.constants import TENSOR_COLUMN_NAME from ray.air.util.tensor_extensions.utils import ( + _is_ndarray_tensor, _is_ndarray_variable_shaped_tensor, create_ragged_ndarray, ) +from ray.data._internal.util import GiB +from ray.util import log_once from ray.util.annotations import DeveloperAPI, PublicAPI PYARROW_VERSION = _get_pyarrow_version() @@ -25,9 +29,15 @@ # Minimum version of Arrow that supports subclassable ExtensionScalars. # TODO(Clark): Remove conditional definition once we only support Arrow 9.0.0+. 
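The renamed `_object_extension_type_allowed` gate reduces to a version comparison against the constant defined just below. A standalone equivalent (assuming pyarrow is importable; the real helper also tolerates pyarrow being absent):

import pyarrow
from packaging.version import parse as parse_version

# Subclassable ExtensionScalars, needed by ArrowPythonObjectType, require Arrow 9+.
MIN_PYARROW_VERSION_SCALAR_SUBCLASS = parse_version("9.0.0")


def object_extension_allowed() -> bool:
    return parse_version(pyarrow.__version__) >= MIN_PYARROW_VERSION_SCALAR_SUBCLASS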
MIN_PYARROW_VERSION_SCALAR_SUBCLASS = parse_version("9.0.0") +# Minimum version supporting `zero_copy_only` flag in `ChunkedArray.to_numpy` +MIN_PYARROW_VERSION_CHUNKED_ARRAY_TO_NUMPY_ZERO_COPY_ONLY = parse_version("13.0.0") NUM_BYTES_PER_UNICODE_CHAR = 4 +# NOTE: Overflow threshold in bytes for most Arrow types using int32 as +# its offsets +INT32_OVERFLOW_THRESHOLD = 2 * GiB + logger = logging.getLogger(__name__) @@ -85,14 +95,152 @@ def pyarrow_table_from_pydict( raise ArrowConversionError(str(pydict)) from e -@DeveloperAPI -def convert_list_to_pyarrow_array( - val: List[Any], enclosing_dict: Dict[str, Any] +@DeveloperAPI(stability="alpha") +def convert_to_pyarrow_array(column_values: np.ndarray, column_name: str) -> pa.Array: + """Converts provided NumPy `ndarray` into PyArrow's `array` while utilizing + both Arrow's natively supported types as well as custom extension types: + + - ArrowTensorArray (for tensors) + - ArrowPythonObjectArray (for user-defined python class objects, as well as + any python object that aren't represented by a corresponding Arrow's native + scalar type) + """ + + try: + # Since Arrow does NOT support tensors (aka multidimensional arrays) natively, + # we have to make sure that we handle this case utilizing `ArrowTensorArray` + # extension type + if column_name == TENSOR_COLUMN_NAME or _is_ndarray_tensor(column_values): + from ray.data.extensions.tensor_extension import ArrowTensorArray + + return ArrowTensorArray.from_numpy(column_values, column_name) + else: + return _convert_to_pyarrow_native_array(column_values, column_name) + + except ArrowConversionError as ace: + from ray.data.extensions.object_extension import ( + ArrowPythonObjectArray, + _object_extension_type_allowed, + ) + + if not _object_extension_type_allowed(): + should_serialize_as_object_ext_type = False + object_ext_type_detail = ( + "skipping fallback to serialize as pickled python" + f" objects (due to unsupported Arrow version {PYARROW_VERSION}, " + f"min required version is {MIN_PYARROW_VERSION_SCALAR_SUBCLASS})" + ) + else: + from ray.data import DataContext + + if not DataContext.get_current().enable_fallback_to_arrow_object_ext_type: + should_serialize_as_object_ext_type = False + object_ext_type_detail = ( + "skipping fallback to serialize as pickled python objects " + "(due to DataContext.enable_fallback_to_arrow_object_ext_type " + "= False)" + ) + else: + should_serialize_as_object_ext_type = True + object_ext_type_detail = ( + "falling back to serialize as pickled python objects" + ) + + # NOTE: To avoid logging following warning for every block it's + # only going to be logged in following cases + # - When fallback is disabled, or + # - It's being logged for the first time + if not should_serialize_as_object_ext_type or log_once( + "_fallback_to_arrow_object_extension_type_warning" + ): + logger.warning( + f"Failed to convert column '{column_name}' into pyarrow " + f"array due to: {ace}; {object_ext_type_detail}", + exc_info=ace, + ) + + # If `ArrowPythonObjectType` is not supported raise original exception + if not should_serialize_as_object_ext_type: + raise + + # Otherwise, attempt to fall back to serialize as python objects + return ArrowPythonObjectArray.from_objects(column_values) + + +def _convert_to_pyarrow_native_array( + column_values: np.ndarray, column_name: str ) -> pa.Array: + """Converts provided NumPy `ndarray` into PyArrow's `array` while only utilizing + Arrow's natively supported types (ie no custom extension types)""" + try: - return pa.array(val) + # NOTE: 
We explicitly infer PyArrow `DataType` so that + # we can perform upcasting to be able to accommodate + # blocks that are larger than 2Gb in size (limited + # by int32 offsets used by Arrow internally) + dtype = _infer_pyarrow_type(column_values) + + logger.log( + logging.getLevelName("TRACE"), + f"Inferred dtype of '{dtype}' for column '{column_name}'", + ) + + return pa.array(column_values, type=dtype) except Exception as e: - raise ArrowConversionError(str(enclosing_dict)) from e + raise ArrowConversionError(str(column_values)) from e + + +def _infer_pyarrow_type(column_values: np.ndarray) -> Optional[pa.DataType]: + """Infers target Pyarrow `DataType` based on the provided + columnar values. + + NOTE: This is a wrapper on top of `pa.infer_type(...)` utility + performing up-casting of `binary` and `string` types to + corresponding `large_binary` and `large_string` types in case + any of the array elements exceeds 2Gb in size therefore + making it impossible for original types to accommodate such + values. + + Unfortunately, for unknown reasons PA doesn't perform + that upcasting itself henceforth we have to do perform + it manually + + Args: + column_values: List of columnar values + + Returns: + Instance of PyArrow's `DataType` based on the provided + column values + """ + + if len(column_values) == 0: + return None + + inferred_pa_dtype = pa.infer_type(column_values) + + def _len_gt_overflow_threshold(obj: Any) -> bool: + # NOTE: This utility could be seeing objects other than strings or bytes in + # cases when column contains non-scalar non-homogeneous object types as + # column values, therefore making Arrow unable to infer corresponding + # column type appropriately, therefore falling back to assume the type + # of the first element in the list. + # + # Check out test cases for this method for an additional context. + if isinstance(obj, (str, bytes)): + return len(obj) > INT32_OVERFLOW_THRESHOLD + + return False + + if pa.types.is_binary(inferred_pa_dtype) and any( + [_len_gt_overflow_threshold(v) for v in column_values] + ): + return pa.large_binary() + elif pa.types.is_string(inferred_pa_dtype) and any( + [_len_gt_overflow_threshold(v) for v in column_values] + ): + return pa.large_string() + + return inferred_pa_dtype @DeveloperAPI @@ -427,7 +575,13 @@ def from_numpy( # Stack ndarrays and pass through to ndarray handling logic below. try: arr = np.stack(arr, axis=0) - except ValueError: + except ValueError as ve: + logger.warning( + f"Failed to stack lists due to: {ve}; " + f"falling back to using np.array(..., dtype=object)", + exc_info=ve, + ) + # ndarray stacking may fail if the arrays are heterogeneously-shaped. arr = np.array(arr, dtype=object) if not isinstance(arr, np.ndarray): diff --git a/python/ray/air/util/tensor_extensions/utils.py b/python/ray/air/util/tensor_extensions/utils.py index be250d89a04d..dfaa95a0acab 100644 --- a/python/ray/air/util/tensor_extensions/utils.py +++ b/python/ray/air/util/tensor_extensions/utils.py @@ -9,9 +9,28 @@ from pandas.core.dtypes.generic import ABCSeries +def _is_ndarray_tensor(arr: np.ndarray) -> bool: + """Return whether the provided NumPy ndarray is comprised of tensors. 
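The upcasting performed by `_infer_pyarrow_type` above can be exercised with a toy threshold; the real cutoff is `INT32_OVERFLOW_THRESHOLD` (2 GiB), which is impractical to demonstrate, so the constant here is shrunk for illustration:

import pyarrow as pa

TOY_THRESHOLD = 8  # stands in for INT32_OVERFLOW_THRESHOLD


def infer_with_upcast(values):
    inferred = pa.infer_type(values)
    oversized = any(
        isinstance(v, (str, bytes)) and len(v) > TOY_THRESHOLD for v in values
    )
    if pa.types.is_binary(inferred) and oversized:
        return pa.large_binary()
    if pa.types.is_string(inferred) and oversized:
        return pa.large_string()
    return inferred


print(infer_with_upcast(["short", "x" * 16]))  # large_string
print(infer_with_upcast(["short", "tiny"]))    # string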
+ + NOTE: Tensor is defined as a NumPy array such that `len(arr.shape) > 1` + """ + + # Case of uniform-shaped (ie non-ragged) tensor + if arr.ndim > 1: + return True + + # Case of ragged tensor (as produced by `create_ragged_ndarray` utility) + elif ( + arr.dtype.type is np.object_ and len(arr) > 0 and isinstance(arr[0], np.ndarray) + ): + return True + + return False + + def _is_ndarray_variable_shaped_tensor(arr: np.ndarray) -> bool: - """Return whether the provided NumPy ndarray is representing a variable-shaped - tensor. + """Return whether the provided NumPy ndarray is comprised of variable-shaped + tensors. NOTE: This is an O(rows) check. """ @@ -69,7 +88,7 @@ def _create_possibly_ragged_ndarray( @PublicAPI(stability="alpha") -def create_ragged_ndarray(values: Sequence[np.ndarray]) -> np.ndarray: +def create_ragged_ndarray(values: Sequence[Any]) -> np.ndarray: """Create an array that contains arrays of different length If you're working with variable-length arrays like images, use this function to diff --git a/python/ray/autoscaler/_private/cli_logger.py b/python/ray/autoscaler/_private/cli_logger.py index 01083be23eff..5172891d3119 100644 --- a/python/ray/autoscaler/_private/cli_logger.py +++ b/python/ray/autoscaler/_private/cli_logger.py @@ -113,39 +113,6 @@ def __getattr__(self, name): colorama.init(strip=False) -def _patched_makeRecord( - self, name, level, fn, lno, msg, args, exc_info, func=None, extra=None, sinfo=None -): - """Monkey-patched version of logging.Logger.makeRecord - We have to patch default loggers so they use the proper frame for - line numbers and function names (otherwise everything shows up as - e.g. cli_logger:info() instead of as where it was called from). - - In Python 3.8 we could just use stacklevel=2, but we have to support - Python 3.6 and 3.7 as well. - - The solution is this Python magic superhack. - - The default makeRecord will deliberately check that we don't override - any existing property on the LogRecord using `extra`, - so we remove that check. - - This patched version is otherwise identical to the one in the standard - library. - - TODO: Remove this magic superhack. Find a more responsible workaround. - """ - rv = logging._logRecordFactory( - name, level, fn, lno, msg, args, exc_info, func, sinfo - ) - if extra is not None: - rv.__dict__.update(extra) - return rv - - -logging.Logger.makeRecord = _patched_makeRecord - - def _external_caller_info(): """Get the info from the caller frame. 
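A quick check of the two `_is_ndarray_tensor` branches defined above, a uniform ndarray with `ndim > 1` versus a ragged object array of ndarrays (both helpers live in a private module, so this is illustrative only):

import numpy as np

from ray.air.util.tensor_extensions.utils import (
    _is_ndarray_tensor,
    create_ragged_ndarray,
)

uniform = np.zeros((3, 2, 2))  # ndim > 1: uniform tensor column
ragged = create_ragged_ndarray([np.zeros((2, 2)), np.zeros((3, 3))])
scalars = np.array([1, 2, 3])  # 1-D scalars: not a tensor column

assert _is_ndarray_tensor(uniform)
assert _is_ndarray_tensor(ragged)   # object dtype whose first element is an ndarray
assert not _is_ndarray_tensor(scalars)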
diff --git a/python/ray/autoscaler/_private/commands.py b/python/ray/autoscaler/_private/commands.py index 3c03738854f7..9a9b9d91cc2f 100644 --- a/python/ray/autoscaler/_private/commands.py +++ b/python/ray/autoscaler/_private/commands.py @@ -1153,16 +1153,15 @@ def exec_cluster( }, docker_config=config.get("docker"), ) - shutdown_after_run = False if cmd and stop: cmd = "; ".join( [ cmd, "ray stop", "ray teardown ~/ray_bootstrap_config.yaml --yes --workers-only", + "sudo shutdown -h now", ] ) - shutdown_after_run = True result = _exec( updater, @@ -1172,7 +1171,7 @@ def exec_cluster( port_forward=port_forward, with_output=with_output, run_env=run_env, - shutdown_after_run=shutdown_after_run, + shutdown_after_run=False, extra_screen_args=extra_screen_args, ) if tmux or screen: diff --git a/python/ray/autoscaler/_private/kuberay/autoscaling_config.py b/python/ray/autoscaler/_private/kuberay/autoscaling_config.py index d74bb253560a..0bf61b311128 100644 --- a/python/ray/autoscaler/_private/kuberay/autoscaling_config.py +++ b/python/ray/autoscaler/_private/kuberay/autoscaling_config.py @@ -30,7 +30,7 @@ # Logical group name for the KubeRay head group. # Used as the name of the "head node type" by the autoscaler. -_HEAD_GROUP_NAME = "head-group" +_HEAD_GROUP_NAME = "headgroup" class AutoscalingConfigProducer: @@ -219,7 +219,7 @@ def _node_type_from_group_spec( resources = _get_ray_resources_from_group_spec(group_spec, is_head) - return { + node_type = { "min_workers": min_workers, "max_workers": max_workers, # `node_config` is a legacy field required for compatibility. @@ -228,6 +228,12 @@ def _node_type_from_group_spec( "resources": resources, } + idle_timeout_s = group_spec.get(IDLE_SECONDS_KEY) + if idle_timeout_s is not None: + node_type["idle_timeout_s"] = float(idle_timeout_s) + + return node_type + def _get_ray_resources_from_group_spec( group_spec: Dict[str, Any], is_head: bool diff --git a/python/ray/autoscaler/_private/kuberay/node_provider.py b/python/ray/autoscaler/_private/kuberay/node_provider.py index 060e4794867d..5378347ba78a 100644 --- a/python/ray/autoscaler/_private/kuberay/node_provider.py +++ b/python/ray/autoscaler/_private/kuberay/node_provider.py @@ -38,8 +38,6 @@ # Kind label value indicating the pod is the worker. KUBERAY_KIND_WORKER = "worker" -# Group name (node type) to use for the head. -KUBERAY_TYPE_HEAD = "head-group" # KubeRay CRD version KUBERAY_CRD_VER = os.getenv("KUBERAY_CRD_VER", "v1alpha1") @@ -104,12 +102,12 @@ def kind_and_type(pod: Dict[str, Any]) -> Tuple[NodeKind, NodeType]: from a Ray pod's labels. """ labels = pod["metadata"]["labels"] - if labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_HEAD: - kind = NODE_KIND_HEAD - type = KUBERAY_TYPE_HEAD - else: - kind = NODE_KIND_WORKER - type = labels[KUBERAY_LABEL_KEY_TYPE] + kind = ( + NODE_KIND_HEAD + if labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_HEAD + else NODE_KIND_WORKER + ) + type = labels[KUBERAY_LABEL_KEY_TYPE] return kind, type diff --git a/python/ray/autoscaler/_private/kuberay/run_autoscaler.py b/python/ray/autoscaler/_private/kuberay/run_autoscaler.py index 452bfef66c3e..efafac6d8f37 100644 --- a/python/ray/autoscaler/_private/kuberay/run_autoscaler.py +++ b/python/ray/autoscaler/_private/kuberay/run_autoscaler.py @@ -44,12 +44,12 @@ def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str): "--skip-version-check", ] ) - # Logging is not ready yet. Print to stdout for now. - print("The Ray head is ready. Starting the autoscaler.") + logger.info("The Ray head is ready. 
Starting the autoscaler.") break except subprocess.CalledProcessError: - print("The Ray head is not yet ready.") - print(f"Will check again in {BACKOFF_S} seconds.") + logger.warning( + f"The Ray head is not ready. Will check again in {BACKOFF_S} seconds." + ) time.sleep(BACKOFF_S) # The Ray head container sets up the log directory. Thus, we set up logging diff --git a/python/ray/autoscaler/_private/monitor.py b/python/ray/autoscaler/_private/monitor.py index a641652615df..90b5610f59ae 100644 --- a/python/ray/autoscaler/_private/monitor.py +++ b/python/ray/autoscaler/_private/monitor.py @@ -604,16 +604,6 @@ def log_resource_batch_data_if_desired( parser.add_argument( "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS." ) - parser.add_argument( - "--redis-address", required=False, type=str, help="This is deprecated" - ) - parser.add_argument( - "--redis-password", - required=False, - type=str, - default=None, - help="This is deprecated", - ) parser.add_argument( "--autoscaling-config", required=False, diff --git a/python/ray/autoscaler/aws/tests/aws_compute.yaml b/python/ray/autoscaler/aws/tests/aws_compute.yaml index 1ef4e02ba1e8..8bf740d8eeed 100644 --- a/python/ray/autoscaler/aws/tests/aws_compute.yaml +++ b/python/ray/autoscaler/aws/tests/aws_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: IamInstanceProfile: {"Name": "ray-autoscaler-v1"} head_node_type: diff --git a/python/ray/autoscaler/azure/example-full.yaml b/python/ray/autoscaler/azure/example-full.yaml index a043603bfaaa..41d7fbfd60d1 100644 --- a/python/ray/autoscaler/azure/example-full.yaml +++ b/python/ray/autoscaler/azure/example-full.yaml @@ -164,9 +164,8 @@ setup_commands: [] # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl" # Custom commands that will be run on the head node after common setup. -# NOTE: rayproject/ray-ml:latest has azure packages bundled -head_setup_commands: [] - # - pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0 +head_setup_commands: + - pip install -U azure-cli-core==2.29.1 azure-identity==1.7.0 azure-mgmt-compute==23.1.0 azure-mgmt-network==19.0.0 azure-mgmt-resource==20.0.0 msrestazure==0.6.4 # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] diff --git a/python/ray/autoscaler/azure/example-gpu-docker.yaml b/python/ray/autoscaler/azure/example-gpu-docker.yaml index 6faaed48fb64..3ebc763e7d26 100644 --- a/python/ray/autoscaler/azure/example-gpu-docker.yaml +++ b/python/ray/autoscaler/azure/example-gpu-docker.yaml @@ -117,9 +117,8 @@ setup_commands: [] # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl" # Custom commands that will be run on the head node after common setup. -# NOTE: rayproject/ray-ml:latest has azure packages bundled -head_setup_commands: [] - # - pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0 +head_setup_commands: + - pip install -U azure-cli-core==2.29.1 azure-identity==1.7.0 azure-mgmt-compute==23.1.0 azure-mgmt-network==19.0.0 azure-mgmt-resource==20.0.0 msrestazure==0.6.4 # Custom commands that will be run on worker nodes after common setup. 
worker_setup_commands: [] diff --git a/python/ray/autoscaler/gcp/example-full.yaml b/python/ray/autoscaler/gcp/example-full.yaml index f5b30613aed9..2fa4c1211752 100644 --- a/python/ray/autoscaler/gcp/example-full.yaml +++ b/python/ray/autoscaler/gcp/example-full.yaml @@ -70,7 +70,7 @@ available_node_types: initializeParams: diskSizeGb: 50 # See https://cloud.google.com/compute/docs/images for more images - sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu + sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922 # Additional options can be found in in the compute docs at # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert @@ -105,7 +105,7 @@ available_node_types: initializeParams: diskSizeGb: 50 # See https://cloud.google.com/compute/docs/images for more images - sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu + sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922 # Run workers on preemtible instance by default. # Comment this out to use on-demand. scheduling: diff --git a/python/ray/autoscaler/gcp/example-minimal-pinned.yaml b/python/ray/autoscaler/gcp/example-minimal-pinned.yaml new file mode 100644 index 000000000000..ce42e6705854 --- /dev/null +++ b/python/ray/autoscaler/gcp/example-minimal-pinned.yaml @@ -0,0 +1,36 @@ +auth: + ssh_user: ubuntu +cluster_name: minimal +provider: + availability_zone: us-west1-a + project_id: null # TODO: set your GCP project ID here + region: us-west1 + type: gcp + +# Needs to pin the VM images for stability.. +available_node_types: + ray_head_default: + resources: {"CPU": 2} + node_config: + machineType: n1-standard-2 + disks: + - boot: true + autoDelete: true + type: PERSISTENT + initializeParams: + diskSizeGb: 50 + sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922 + ray_worker_small: + min_workers: 0 + resources: {"CPU": 2} + node_config: + machineType: n1-standard-2 + disks: + - boot: true + autoDelete: true + type: PERSISTENT + initializeParams: + diskSizeGb: 50 + sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922 + scheduling: + - preemptible: true diff --git a/python/ray/autoscaler/gcp/tests/single_node_32_cpu_gce.yaml b/python/ray/autoscaler/gcp/tests/single_node_32_cpu_gce.yaml index c6d1a6729fa0..466d7fe8602c 100644 --- a/python/ray/autoscaler/gcp/tests/single_node_32_cpu_gce.yaml +++ b/python/ray/autoscaler/gcp/tests/single_node_32_cpu_gce.yaml @@ -19,7 +19,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/python/ray/autoscaler/kuberay/ray-cluster.complete.yaml b/python/ray/autoscaler/kuberay/ray-cluster.complete.yaml index f9e1a6cef375..d57f5d6f23b9 100644 --- a/python/ray/autoscaler/kuberay/ray-cluster.complete.yaml +++ b/python/ray/autoscaler/kuberay/ray-cluster.complete.yaml @@ -18,7 +18,7 @@ spec: serviceType: ClusterIP # the pod replicas in this group typed head (assuming there could be more than 1 in the future) replicas: 1 - # logical group name, for this called head-group, also can be functional + # logical group name, for this called headgroup, also can be functional # pod type head or worker # rayNodeType: head # Not needed since it is under the headgroup # the following params are used to complete the ray start: ray start --head --block --port=6379 ... 
@@ -108,7 +108,7 @@ spec: workerGroupSpecs: # the pod replicas in this group typed worker - replicas: 1 - minReplicas: 1 + minReplicas: 0 maxReplicas: 300 # logical group name, for this called small-group, also can be functional groupName: small-group diff --git a/python/ray/autoscaler/ray-schema.json b/python/ray/autoscaler/ray-schema.json index ad5da68ea2a0..2e07dadac912 100644 --- a/python/ray/autoscaler/ray-schema.json +++ b/python/ray/autoscaler/ray-schema.json @@ -345,6 +345,7 @@ }, "min_workers": {"type": "integer"}, "max_workers": {"type": "integer"}, + "idle_timeout_s": {"type": "number", "nullable": true}, "resources": { "type": "object", "patternProperties": { diff --git a/python/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/cloud_provider.py b/python/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/cloud_provider.py index c685be58cf60..c1b8ddc2a31b 100644 --- a/python/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/cloud_provider.py +++ b/python/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/cloud_provider.py @@ -55,13 +55,12 @@ def __init__( """ Args: cluster_name: The name of the RayCluster resource. - namespace: The namespace of the RayCluster resource. + provider_config: The namespace of the RayCluster. k8s_api_client: The client to the Kubernetes API server. This could be used to mock the Kubernetes API server for testing. """ self._cluster_name = cluster_name self._namespace = provider_config["namespace"] - self._head_node_type = provider_config["head_node_type"] self._k8s_api_client = k8s_api_client or KubernetesHttpApiClient( namespace=self._namespace @@ -210,21 +209,25 @@ def _initialize_scale_request( cur_instances = self.instances # Get the worker groups that have pending deletes and the worker groups that - # have finished deletes. + # have finished deletes, and the set of workers included in the workersToDelete + # field of any worker group. ( worker_groups_with_pending_deletes, worker_groups_without_pending_deletes, - ) = self._get_workers_groups_with_deletes( - ray_cluster, set(cur_instances.keys()) - ) + worker_to_delete_set, + ) = self._get_workers_delete_info(ray_cluster, set(cur_instances.keys())) # Calculate the desired number of workers by type. num_workers_dict = defaultdict(int) - for _, cur_instance in cur_instances.items(): - if cur_instance.node_kind == NodeKind.HEAD: - # Only track workers. - continue - num_workers_dict[cur_instance.node_type] += 1 + worker_groups = ray_cluster["spec"].get("workerGroupSpecs", []) + for worker_group in worker_groups: + node_type = worker_group["groupName"] + # Handle the case where users manually increase `minReplicas` + # to scale up the number of worker Pods. In this scenario, + # `replicas` will be smaller than `minReplicas`. + num_workers_dict[node_type] = max( + worker_group["replicas"], worker_group["minReplicas"] + ) # Add to launch nodes. for node_type, count in to_launch.items(): @@ -243,6 +246,11 @@ def _initialize_scale_request( # Not possible to delete head node. continue + if to_delete_instance.cloud_instance_id in worker_to_delete_set: + # If the instance is already in the workersToDelete field of + # any worker group, skip it. + continue + num_workers_dict[to_delete_instance.node_type] -= 1 assert num_workers_dict[to_delete_instance.node_type] >= 0 to_delete_instances_by_type[to_delete_instance.node_type].append( @@ -322,6 +330,7 @@ def _submit_scale_request( # No patch required. 
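The `max(replicas, minReplicas)` accounting in `_initialize_scale_request` above is what makes a manual `minReplicas` bump scale the cluster up. A minimal sketch with a hypothetical group spec:

# A worker group where a user raised minReplicas while replicas lags behind.
ray_cluster = {
    "spec": {
        "workerGroupSpecs": [
            {"groupName": "small-group", "replicas": 0, "minReplicas": 3},
        ]
    }
}

num_workers = {
    g["groupName"]: max(g["replicas"], g["minReplicas"])
    for g in ray_cluster["spec"].get("workerGroupSpecs", [])
}
assert num_workers == {"small-group": 3}  # KubeRay will still run 3 Pods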
return + logger.info(f"Submitting a scale request: {scale_request}") self._patch(f"rayclusters/{self._cluster_name}", patch_payload) def _add_launch_errors( @@ -393,9 +402,9 @@ def instances(self) -> Dict[CloudInstanceId, CloudInstance]: return copy.deepcopy(self._cached_instances) @staticmethod - def _get_workers_groups_with_deletes( + def _get_workers_delete_info( ray_cluster_spec: Dict[str, Any], node_set: Set[CloudInstanceId] - ) -> Tuple[Set[NodeType], Set[NodeType]]: + ) -> Tuple[Set[NodeType], Set[NodeType], Set[CloudInstanceId]]: """ Gets the worker groups that have pending deletes and the worker groups that have finished deletes. @@ -405,10 +414,13 @@ def _get_workers_groups_with_deletes( deletes. worker_groups_with_finished_deletes: The worker groups that have finished deletes. + worker_to_delete_set: A set of Pods that are included in the workersToDelete + field of any worker group. """ worker_groups_with_pending_deletes = set() worker_groups_with_deletes = set() + worker_to_delete_set = set() worker_groups = ray_cluster_spec["spec"].get("workerGroupSpecs", []) for worker_group in worker_groups: @@ -423,6 +435,7 @@ def _get_workers_groups_with_deletes( worker_groups_with_deletes.add(node_type) for worker in workersToDelete: + worker_to_delete_set.add(worker) if worker in node_set: worker_groups_with_pending_deletes.add(node_type) break @@ -430,7 +443,11 @@ def _get_workers_groups_with_deletes( worker_groups_with_finished_deletes = ( worker_groups_with_deletes - worker_groups_with_pending_deletes ) - return worker_groups_with_pending_deletes, worker_groups_with_finished_deletes + return ( + worker_groups_with_pending_deletes, + worker_groups_with_finished_deletes, + worker_to_delete_set, + ) def _fetch_instances(self) -> Dict[CloudInstanceId, CloudInstance]: """ @@ -478,26 +495,23 @@ def _fetch_instances(self) -> Dict[CloudInstanceId, CloudInstance]: # Ignore pods marked for termination. continue pod_name = pod["metadata"]["name"] - cloud_instance = self._cloud_instance_from_pod(pod, self._head_node_type) + cloud_instance = self._cloud_instance_from_pod(pod) if cloud_instance: cloud_instances[pod_name] = cloud_instance return cloud_instances @staticmethod - def _cloud_instance_from_pod( - pod: Dict[str, Any], head_node_type: NodeType - ) -> Optional[CloudInstance]: + def _cloud_instance_from_pod(pod: Dict[str, Any]) -> Optional[CloudInstance]: """ Convert a pod to a Ray CloudInstance. Args: pod: The pod resource dict. - head_node_type: The node type of the head node. """ labels = pod["metadata"]["labels"] if labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_HEAD: kind = NodeKind.HEAD - type = head_node_type + type = labels[KUBERAY_LABEL_KEY_TYPE] elif labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_WORKER: kind = NodeKind.WORKER type = labels[KUBERAY_LABEL_KEY_TYPE] diff --git a/python/ray/autoscaler/v2/instance_manager/config.py b/python/ray/autoscaler/v2/instance_manager/config.py index c9597eef6c73..a7e582eacdbc 100644 --- a/python/ray/autoscaler/v2/instance_manager/config.py +++ b/python/ray/autoscaler/v2/instance_manager/config.py @@ -128,6 +128,8 @@ class NodeTypeConfig: min_worker_nodes: int # The maximal number of worker nodes can be launched for this node type. max_worker_nodes: int + # Idle timeout seconds for worker nodes of this node type. + idle_timeout_s: Optional[float] = None # The total resources on the node. resources: Dict[str, float] = field(default_factory=dict) # The labels on the node. 
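The new `idle_timeout_s` knob on `NodeTypeConfig` overrides the cluster-wide idle timeout, as the scheduler change below shows. A sketch of the precedence (field names follow the dataclass above; unspecified fields keep their defaults):

from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig

cfg = NodeTypeConfig(
    name="gpu-group",
    min_worker_nodes=0,
    max_worker_nodes=5,
    idle_timeout_s=120.0,  # reclaim idle nodes of this type after 2 minutes
    resources={"GPU": 1},
)

cluster_wide_idle_timeout_s = 600.0  # what ctx.get_idle_timeout_s() returns
effective = (
    cfg.idle_timeout_s
    if cfg.idle_timeout_s is not None
    else cluster_wide_idle_timeout_s
)
assert effective == 120.0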
@@ -346,6 +348,7 @@ def get_node_type_configs(self) -> Dict[NodeType, NodeTypeConfig]: name=node_type, min_worker_nodes=node_config.get("min_workers", 0), max_worker_nodes=max_workers_nodes, + idle_timeout_s=node_config.get("idle_timeout_s", None), resources=node_config.get("resources", {}), labels=node_config.get("labels", {}), launch_config_hash=launch_config_hash, diff --git a/python/ray/autoscaler/v2/instance_manager/instance_manager.py b/python/ray/autoscaler/v2/instance_manager/instance_manager.py index aa926ba92747..6a1f6e207408 100644 --- a/python/ray/autoscaler/v2/instance_manager/instance_manager.py +++ b/python/ray/autoscaler/v2/instance_manager/instance_manager.py @@ -58,7 +58,7 @@ def update_instance_manager_state( """ Updates the instance manager state. - If there's a any failure, no updates would be made and the reply + If there's any failure, no updates would be made and the reply would contain the latest version of the instance manager state, and the error info. @@ -80,7 +80,7 @@ def update_instance_manager_state( f"Version mismatch: expected: {request.expected_version}, " f"actual: {version}" ) - logger.warn(err_str) + logger.warning(err_str) return self._get_update_im_state_reply( StatusCode.VERSION_MISMATCH, version, @@ -110,7 +110,7 @@ def update_instance_manager_state( err_str = ( f"Version mismatch: expected: {version}, actual: {result.version}" ) - logger.warn(err_str) + logger.warning(err_str) return self._get_update_im_state_reply( StatusCode.VERSION_MISMATCH, result.version, err_str ) diff --git a/python/ray/autoscaler/v2/monitor.py b/python/ray/autoscaler/v2/monitor.py index 8277addc0017..558725f3e78f 100644 --- a/python/ray/autoscaler/v2/monitor.py +++ b/python/ray/autoscaler/v2/monitor.py @@ -17,6 +17,7 @@ from ray._private.event.event_logger import get_event_logger from ray._private.ray_logging import setup_component_logger from ray._private.usage.usage_lib import record_extra_usage_tag +from ray._private.worker import SCRIPT_MODE from ray._raylet import GcsClient from ray.autoscaler._private.constants import ( AUTOSCALER_METRIC_PORT, @@ -77,7 +78,7 @@ def __init__( ) self._session_name = self._get_session_name(self.gcs_client) logger.info(f"session_name: {self._session_name}") - worker.mode = 0 + worker.set_mode(SCRIPT_MODE) head_node_ip = self.gcs_address.split(":")[0] self.autoscaler = None @@ -197,16 +198,6 @@ def record_autoscaler_v2_usage(gcs_client: GcsClient) -> None: parser.add_argument( "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS." ) - parser.add_argument( - "--redis-address", required=False, type=str, help="This is deprecated" - ) - parser.add_argument( - "--redis-password", - required=False, - type=str, - default=None, - help="This is deprecated", - ) parser.add_argument( "--autoscaling-config", required=False, diff --git a/python/ray/autoscaler/v2/scheduler.py b/python/ray/autoscaler/v2/scheduler.py index 3732a6282632..2d5a70065066 100644 --- a/python/ray/autoscaler/v2/scheduler.py +++ b/python/ray/autoscaler/v2/scheduler.py @@ -1584,6 +1584,11 @@ def _enforce_idle_termination( continue idle_timeout_s = ctx.get_idle_timeout_s() + # Override the scheduler idle_timeout_s if set for this node_type. + node_type = node.node_type + if node_type in node_type_configs: + if node_type_configs[node_type].idle_timeout_s is not None: + idle_timeout_s = node_type_configs[node_type].idle_timeout_s if idle_timeout_s is None: # No idle timeout is set, skip the idle termination. 
continue @@ -1606,7 +1611,6 @@ def _enforce_idle_termination( # Honor the min_worker_nodes setting for the node type. min_count = 0 - node_type = node.node_type if node_type in node_type_configs: min_count = node_type_configs[node_type].min_worker_nodes if ( diff --git a/python/ray/autoscaler/v2/tests/test_node_provider.py b/python/ray/autoscaler/v2/tests/test_node_provider.py index 5141891c0a36..47483d3f61fa 100644 --- a/python/ray/autoscaler/v2/tests/test_node_provider.py +++ b/python/ray/autoscaler/v2/tests/test_node_provider.py @@ -19,10 +19,7 @@ AUTOSCALER_MAX_LAUNCH_BATCH, ) from ray.autoscaler._private.fake_multi_node.node_provider import FakeMultiNodeProvider -from ray.autoscaler._private.kuberay.node_provider import ( - KUBERAY_TYPE_HEAD, - IKubernetesHttpApiClient, -) +from ray.autoscaler._private.kuberay.node_provider import IKubernetesHttpApiClient from ray.autoscaler.v2.instance_manager.cloud_providers.kuberay.cloud_provider import ( KubeRayProvider, ) @@ -372,7 +369,7 @@ def setUp(self): cluster_name="test", provider_config={ "namespace": "default", - "head_node_type": KUBERAY_TYPE_HEAD, + "head_node_type": "headgroup", }, k8s_api_client=self.mock_client, ) @@ -389,7 +386,7 @@ def test_get_nodes(self): "raycluster-autoscaler-head-8zsc8": CloudInstance( cloud_instance_id="raycluster-autoscaler-head-8zsc8", node_kind=NodeKind.HEAD, - node_type="head-group", + node_type="headgroup", is_running=True, ), # up-to-date status because the Ray container is in running status "raycluster-autoscaler-worker-small-group-dkz2r": CloudInstance( @@ -495,6 +492,124 @@ def test_pending_deletes(self): }, ] + def test_increase_min_replicas_to_scale_up(self): + # Simulate the case where users manually increase the `minReplicas` field + # from 0 to $num_pods. KubeRay will create $num_pods worker Pods to meet the new + # `minReplicas`, even though the `replicas` field is still 0. + small_group = "small-group" + num_pods = 0 + assert ( + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0]["groupName"] + == small_group + ) + for pod in self.mock_client._pod_list["items"]: + if pod["metadata"]["labels"]["ray.io/group"] == small_group: + num_pods += 1 + assert num_pods > 0 + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0]["replicas"] = 0 + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0][ + "minReplicas" + ] = num_pods + + # Launching a new node and `replicas` should be + # `max(replicas, minReplicas) + 1`. + self.provider.launch(shape={small_group: 1}, request_id="launch-1") + patches = self.mock_client.get_patches( + f"rayclusters/{self.provider._cluster_name}" + ) + assert len(patches) == 1 + assert patches[0] == { + "op": "replace", + "path": "/spec/workerGroupSpecs/0/replicas", + "value": num_pods + 1, + } + + def test_inconsistent_pods_raycr_scale_up(self): + """ + Test the case where the cluster state has not yet reached the desired state. + Specifically, the replicas field in the RayCluster CR does not match the actual + number of Pods. + """ + # Check the assumptions of the test + small_group = "small-group" + num_pods = 0 + for pod in self.mock_client._pod_list["items"]: + if pod["metadata"]["labels"]["ray.io/group"] == small_group: + num_pods += 1 + + assert ( + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0]["groupName"] + == small_group + ) + desired_replicas = num_pods + 1 + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0][ + "replicas" + ] = desired_replicas + + # Launch a new node. 
The replicas field should be incremented by 1, even though + # the cluster state has not yet reached the goal state. + launch_request = {"small-group": 1} + self.provider.launch(shape=launch_request, request_id="launch-1") + + patches = self.mock_client.get_patches( + f"rayclusters/{self.provider._cluster_name}" + ) + assert len(patches) == 1 + assert patches[0] == { + "op": "replace", + "path": "/spec/workerGroupSpecs/0/replicas", + "value": desired_replicas + 1, + } + + def test_inconsistent_pods_raycr_scale_down(self): + """ + Test the case where the cluster state has not yet reached the desired state. + Specifically, the replicas field in the RayCluster CR does not match the actual + number of Pods. + """ + # Check the assumptions of the test + small_group = "small-group" + num_pods = 0 + pod_to_delete = None + for pod in self.mock_client._pod_list["items"]: + if pod["metadata"]["labels"]["ray.io/group"] == small_group: + num_pods += 1 + pod_to_delete = pod["metadata"]["name"] + assert pod_to_delete is not None + + assert ( + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0]["groupName"] + == small_group + ) + desired_replicas = num_pods + 1 + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0][ + "replicas" + ] = desired_replicas + + # Terminate a node. The replicas field should be decremented by 1, even though + # the cluster state has not yet reached the goal state. + self.provider.terminate(ids=[pod_to_delete], request_id="term-1") + patches = self.mock_client.get_patches( + f"rayclusters/{self.provider._cluster_name}" + ) + assert len(patches) == 2 + assert patches == [ + { + "op": "replace", + "path": "/spec/workerGroupSpecs/0/replicas", + "value": desired_replicas - 1, + }, + { + "op": "replace", + "path": "/spec/workerGroupSpecs/0/scaleStrategy", + "value": { + "workersToDelete": [ + pod_to_delete, + ] + }, + }, + ] + if __name__ == "__main__": if os.environ.get("PARALLEL_CI"): diff --git a/python/ray/autoscaler/v2/tests/test_scheduler.py b/python/ray/autoscaler/v2/tests/test_scheduler.py index e6d6cb71978d..3a188bdaf2ce 100644 --- a/python/ray/autoscaler/v2/tests/test_scheduler.py +++ b/python/ray/autoscaler/v2/tests/test_scheduler.py @@ -1434,6 +1434,82 @@ def test_idle_termination_with_min_worker(min_workers): assert len(to_terminate) == 0 +@pytest.mark.parametrize("node_type_idle_timeout_s", [1, 2, 10]) +def test_idle_termination_with_node_type_idle_timeout(node_type_idle_timeout_s): + """ + Test that idle nodes are terminated when idle_timeout_s is set for node type. 
+ """ + scheduler = ResourceDemandScheduler(event_logger) + + node_type_configs = { + "type_cpu_with_idle_timeout": NodeTypeConfig( + name="type_cpu", + resources={"CPU": 1}, + min_worker_nodes=0, + max_worker_nodes=5, + idle_timeout_s=node_type_idle_timeout_s, + launch_config_hash="hash1", + ), + } + + idle_time_s = 5 + constraints = [] + + request = sched_request( + node_type_configs=node_type_configs, + instances=[ + make_autoscaler_instance( + im_instance=Instance( + instance_type="type_cpu_with_idle_timeout", + status=Instance.RAY_RUNNING, + launch_config_hash="hash1", + instance_id="i-1", + node_id="r-1", + ), + ray_node=NodeState( + node_id=b"r-1", + ray_node_type_name="type_cpu_with_idle_timeout", + available_resources={"CPU": 0}, + total_resources={"CPU": 1}, + idle_duration_ms=0, # Non idle + status=NodeStatus.RUNNING, + ), + cloud_instance_id="c-1", + ), + make_autoscaler_instance( + im_instance=Instance( + instance_id="i-2", + instance_type="type_cpu_with_idle_timeout", + status=Instance.RAY_RUNNING, + launch_config_hash="hash1", + node_id="r-2", + ), + ray_node=NodeState( + ray_node_type_name="type_cpu_with_idle_timeout", + node_id=b"r-2", + available_resources={"CPU": 1}, + total_resources={"CPU": 1}, + idle_duration_ms=idle_time_s * 1000, + status=NodeStatus.IDLE, + ), + cloud_instance_id="c-2", + ), + ], + # Set autoscaler idle_timeout_s to a value greater than + # node_type_idle_timeout_s and idle_time_s. + idle_timeout_s=idle_time_s * 1000, + cluster_resource_constraints=constraints, + ) + + reply = scheduler.schedule(request) + _, to_terminate = _launch_and_terminate(reply) + if node_type_idle_timeout_s <= idle_time_s: + assert len(to_terminate) == 1 + assert to_terminate == [("i-2", "r-2", TerminationRequest.Cause.IDLE)] + else: + assert len(to_terminate) == 0 + + def test_gang_scheduling(): """ Test that gang scheduling works. diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index acec6c2672cd..7c6160d8937d 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -185,7 +185,7 @@ def do_profile_tasks( """ try: for task in tasks: - task.prepare() + task.prepare(overlap_gpu_communication=overlap_gpu_communication) if not hasattr(self, "__ray_adag_events"): self.__ray_adag_events = [] @@ -1880,7 +1880,7 @@ def wait_teardown(self, kill_actors: bool = False): from ray.dag import DAGContext ctx = DAGContext.get_current() - teardown_timeout = ctx.retrieval_timeout + teardown_timeout = ctx.teardown_timeout for actor, ref in outer.worker_task_refs.items(): timeout = False @@ -2443,7 +2443,14 @@ def teardown(self, kill_actors: bool = False): monitor = getattr(self, "_monitor", None) if monitor is not None: + from ray.dag import DAGContext + + ctx = DAGContext.get_current() monitor.teardown(kill_actors=kill_actors) + monitor.join(timeout=ctx.teardown_timeout) + # We do not log a warning here if the thread is still alive because + # wait_teardown already logs upon teardown_timeout. + self._is_teardown = True def __del__(self): diff --git a/python/ray/dag/context.py b/python/ray/dag/context.py index 29e1d5bf2c78..531785c50262 100644 --- a/python/ray/dag/context.py +++ b/python/ray/dag/context.py @@ -10,6 +10,7 @@ DEFAULT_EXECUTION_TIMEOUT_S = int(os.environ.get("RAY_DAG_execution_timeout", 10)) DEFAULT_RETRIEVAL_TIMEOUT_S = int(os.environ.get("RAY_DAG_retrieval_timeout", 10)) +DEFAULT_TEARDOWN_TIMEOUT_S = int(os.environ.get("RAY_DAG_teardown_timeout", 30)) # Default buffer size is 1MB. 
DEFAULT_BUFFER_SIZE_BYTES = int(os.environ.get("RAY_DAG_buffer_size_bytes", 1e6)) # Default asyncio_max_queue_size is 0, which means no limit. @@ -51,6 +52,8 @@ class DAGContext: calls. retrieval_timeout: The maximum time in seconds to wait to retrieve a result from the DAG. + teardown_timeout: The maximum time in seconds to wait for the DAG to + cleanly shut down. buffer_size_bytes: The maximum size of messages that can be passed between tasks in the DAG. asyncio_max_queue_size: The max queue size for the async execution. @@ -72,6 +75,7 @@ class DAGContext: execution_timeout: int = DEFAULT_EXECUTION_TIMEOUT_S retrieval_timeout: int = DEFAULT_RETRIEVAL_TIMEOUT_S + teardown_timeout: int = DEFAULT_TEARDOWN_TIMEOUT_S buffer_size_bytes: int = DEFAULT_BUFFER_SIZE_BYTES asyncio_max_queue_size: int = DEFAULT_ASYNCIO_MAX_QUEUE_SIZE max_buffered_results: int = DEFAULT_MAX_BUFFERED_RESULTS diff --git a/python/ray/dag/tests/experimental/test_accelerated_dag.py b/python/ray/dag/tests/experimental/test_accelerated_dag.py index d6176fd57dc6..cbec80a871c4 100644 --- a/python/ray/dag/tests/experimental/test_accelerated_dag.py +++ b/python/ray/dag/tests/experimental/test_accelerated_dag.py @@ -1078,6 +1078,12 @@ def test_dag_exception_chained(ray_start_regular, capsys): # Can use the DAG after exceptions are thrown. assert ray.get(compiled_dag.execute(1)) == 2 + # Note: somehow the auto triggered teardown() from ray.shutdown() + # does not finish in time for this test, leading to a segfault + # of the following test (likely due to a dangling monitor thread + # upon the new Ray init). + compiled_dag.teardown() + @pytest.mark.parametrize("single_fetch", [True, False]) def test_dag_exception_multi_output(ray_start_regular, single_fetch, capsys): diff --git a/python/ray/dag/tests/experimental/test_torch_tensor_dag.py b/python/ray/dag/tests/experimental/test_torch_tensor_dag.py index d1ac1c68063f..1797068e7e2d 100644 --- a/python/ray/dag/tests/experimental/test_torch_tensor_dag.py +++ b/python/ray/dag/tests/experimental/test_torch_tensor_dag.py @@ -182,7 +182,11 @@ def test_torch_tensor_as_dag_input(ray_start_regular): @pytest.mark.parametrize("ray_start_regular", [{"num_cpus": 4}], indirect=True) -def test_torch_tensor_nccl(ray_start_regular): +@pytest.mark.parametrize("enable_profiling", [False, True]) +@pytest.mark.parametrize("overlap_gpu_communication", [False, True]) +def test_torch_tensor_nccl( + ray_start_regular, monkeypatch, enable_profiling, overlap_gpu_communication +): if not USE_GPU: pytest.skip("NCCL tests require GPUs") @@ -190,6 +194,10 @@ def test_torch_tensor_nccl(ray_start_regular): sum(node["Resources"].get("GPU", 0) for node in ray.nodes()) > 1 ), "This test requires at least 2 GPUs" + monkeypatch.setattr( + ray.dag.constants, "RAY_ADAG_ENABLE_PROFILING", enable_profiling + ) + actor_cls = TorchTensorWorker.options(num_cpus=0, num_gpus=1) sender = actor_cls.remote() @@ -204,7 +212,9 @@ def test_torch_tensor_nccl(ray_start_regular): dag = dag.with_type_hint(TorchTensorType(transport="nccl")) dag = receiver.recv.bind(dag) - compiled_dag = dag.experimental_compile() + compiled_dag = dag.experimental_compile( + _overlap_gpu_communication=overlap_gpu_communication + ) # Test that we can pass different shapes and data. 
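A condensed sketch of what the new parametrization exercises: compiling an NCCL-backed DAG with overlap enabled. The `Worker` actor here is a hypothetical stand-in for `TorchTensorWorker`, the import paths are assumed from this era of Ray, and running it needs two NCCL-capable GPUs:

import ray
import torch
from ray.dag import InputNode
from ray.experimental.channel.torch_tensor_type import TorchTensorType


@ray.remote(num_gpus=1)
class Worker:
    def send(self, shape):
        return torch.ones(shape, device="cuda")

    def recv(self, t):
        return t.sum().item()


sender, receiver = Worker.remote(), Worker.remote()
with InputNode() as inp:
    dag = sender.send.bind(inp)
    dag = dag.with_type_hint(TorchTensorType(transport="nccl"))
    dag = receiver.recv.bind(dag)

# Overlap of GPU communication with compute is opt-in (defaults to False).
compiled_dag = dag.experimental_compile(_overlap_gpu_communication=True)
print(ray.get(compiled_dag.execute((2, 2))))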
for i in range(3): diff --git a/python/ray/dashboard/client/src/App.tsx b/python/ray/dashboard/client/src/App.tsx index bc30e06ccc0c..84b4a2c88e7b 100644 --- a/python/ray/dashboard/client/src/App.tsx +++ b/python/ray/dashboard/client/src/App.tsx @@ -23,7 +23,12 @@ import { StateApiLogViewerPage, } from "./pages/log/Logs"; import { Metrics } from "./pages/metrics"; -import { DashboardUids, getMetricsInfo } from "./pages/metrics/utils"; +import { + DashboardUids, + getMetricsInfo, + getTimeZoneInfo, + TimezoneInfo, +} from "./pages/metrics/utils"; import Nodes, { ClusterMainPageLayout } from "./pages/node"; import { ClusterDetailInfoPage } from "./pages/node/ClusterDetailInfoPage"; import { ClusterLayout } from "./pages/node/ClusterLayout"; @@ -91,6 +96,14 @@ export type GlobalContextType = { * The name of the current selected datasource. */ dashboardDatasource: string | undefined; + /** + * The timezone set on the ray cluster. + */ + serverTimeZone: TimezoneInfo | null | undefined; + /** + * The globally selected current time zone. + */ + currentTimeZone: string | undefined; }; export const GlobalContext = React.createContext({ nodeMap: {}, @@ -102,10 +115,15 @@ export const GlobalContext = React.createContext({ prometheusHealth: undefined, sessionName: undefined, dashboardDatasource: undefined, + serverTimeZone: undefined, + currentTimeZone: undefined, }); const App = () => { - const [context, setContext] = useState({ + const [currentTimeZone, setCurrentTimeZone] = useState(); + const [context, setContext] = useState< + Omit + >({ nodeMap: {}, nodeMapByIp: {}, namespaceMap: {}, @@ -115,6 +133,7 @@ const App = () => { prometheusHealth: undefined, sessionName: undefined, dashboardDatasource: undefined, + serverTimeZone: undefined, }); useEffect(() => { getNodeList().then((res) => { @@ -158,11 +177,36 @@ const App = () => { doEffect(); }, []); + useEffect(() => { + const updateTimezone = async () => { + // Sets the intial timezone to localStorage value if it exists + const storedTimeZone = localStorage.getItem("timezone"); + if (storedTimeZone) { + setCurrentTimeZone(storedTimeZone); + } + + // Fetch the server time zone. 
+ const tzInfo = await getTimeZoneInfo(); + + const timeZone = + storedTimeZone || + tzInfo?.value || + Intl.DateTimeFormat().resolvedOptions().timeZone; + + setCurrentTimeZone(timeZone); + setContext((existingContext) => ({ + ...existingContext, + serverTimeZone: tzInfo, + })); + }; + updateTimezone(); + }, []); + return ( - + diff --git a/python/ray/dashboard/client/src/common/formatUtils.ts b/python/ray/dashboard/client/src/common/formatUtils.ts index 03610fd82553..4b0c9a24a7d1 100644 --- a/python/ray/dashboard/client/src/common/formatUtils.ts +++ b/python/ray/dashboard/client/src/common/formatUtils.ts @@ -1,4 +1,9 @@ import dayjs from "dayjs"; +import timezone from "dayjs/plugin/timezone"; +import utc from "dayjs/plugin/utc"; + +dayjs.extend(utc); +dayjs.extend(timezone); export const formatByteAmount = ( amount: number, @@ -60,5 +65,10 @@ export const formatValue = (rawFloat: number) => { } }; -export const formatDateFromTimeMs = (time: number) => - dayjs(time).format("YYYY/MM/DD HH:mm:ss"); +export const formatTimeZone = (UTC: string) => { + dayjs.tz.setDefault(UTC); +}; + +export const formatDateFromTimeMs = (time: number) => { + return dayjs.utc(time).tz().format("YYYY/MM/DD HH:mm:ss"); +}; diff --git a/python/ray/dashboard/client/src/common/timezone.ts b/python/ray/dashboard/client/src/common/timezone.ts new file mode 100644 index 000000000000..d02b785550fe --- /dev/null +++ b/python/ray/dashboard/client/src/common/timezone.ts @@ -0,0 +1,794 @@ +export const timezones = [ + { + utc: "GMT-12:00", + value: "Etc/GMT+12", + group: "Pacific", + country: "International Date Line West", + }, + { + utc: "GMT-11:00", + value: "Pacific/Pago_Pago", + group: "Pacific", + country: "American Samoa", + }, + { + utc: "GMT-11:00", + value: "Pacific/Midway", + group: "Pacific", + country: "Midway Island", + }, + { + utc: "GMT-10:00", + value: "Pacific/Honolulu", + group: "Pacific", + country: "Hawaii", + }, + { + utc: "GMT-09:00", + value: "America/Anchorage", + group: "America", + country: "Alaska", + }, + { + utc: "GMT-08:00", + value: "America/Los_Angeles", + group: "America", + country: "Pacific Time (US & Canada)", + }, + { + utc: "GMT-08:00", + value: "America/Tijuana", + group: "America", + country: "Tijuana", + }, + { + utc: "GMT-07:00", + value: "America/Phoenix", + group: "America", + country: "Arizona", + }, + { + utc: "GMT-07:00", + value: "America/Mazatlan", + group: "America", + country: "Mazatlan", + }, + { + utc: "GMT-07:00", + value: "America/Denver", + group: "America", + country: "Mountain Time (US & Canada)", + }, + { + utc: "GMT-06:00", + value: "America/Guatemala", + group: "America", + country: "Central America", + }, + { + utc: "GMT-06:00", + value: "America/Chicago", + group: "America", + country: "Central Time (US & Canada)", + }, + { + utc: "GMT-06:00", + value: "America/Chihuahua", + group: "America", + country: "Chihuahua", + }, + { + utc: "GMT-06:00", + value: "America/Guadalajara", + group: "America", + country: "Guadalajara", + }, + { + utc: "GMT-06:00", + value: "America/Mexico_City", + group: "America", + country: "Mexico City", + }, + { + utc: "GMT-06:00", + value: "America/Monterrey", + group: "America", + country: "Monterrey", + }, + { + utc: "GMT-06:00", + value: "America/Regina", + group: "America", + country: "Saskatchewan", + }, + { + utc: "GMT-05:00", + value: "America/Bogota", + group: "America", + country: "Bogota", + }, + { + utc: "GMT-05:00", + value: "America/New_York", + group: "America", + country: "Eastern Time (US & Canada)", + }, + { + utc: 
"GMT-05:00", + value: "America/Indiana/Indianapolis", + group: "America", + country: "Indiana (East)", + }, + { + utc: "GMT-05:00", + value: "America/Lima", + group: "America", + country: "Lima", + }, + { + utc: "GMT-05:00", + value: "America/Guayaquil", + group: "America", + country: "Quito", + }, + { + utc: "GMT-04:00", + value: "America/Halifax", + group: "America", + country: "Atlantic Time (Canada)", + }, + { + utc: "GMT-04:00", + value: "America/Caracas", + group: "America", + country: "Caracas", + }, + { + utc: "GMT-04:00", + value: "America/Guyana", + group: "America", + country: "Georgetown", + }, + { + utc: "GMT-04:00", + value: "America/La_Paz", + group: "America", + country: "La Paz", + }, + { + utc: "GMT-04:00", + value: "America/Puerto_Rico", + group: "America", + country: "Puerto Rico", + }, + { + utc: "GMT-04:00", + value: "America/Santiago", + group: "America", + country: "Santiago", + }, + { + utc: "GMT-03:30", + value: "America/St_Johns", + group: "America", + country: "Newfoundland", + }, + { + utc: "GMT-03:00", + value: "America/Sao_Paulo", + group: "America", + country: "Brasilia", + }, + { + utc: "GMT-03:00", + value: "America/Argentina/Buenos_Aires", + group: "America", + country: "Buenos Aires", + }, + { + utc: "GMT-03:00", + value: "America/Montevideo", + group: "America", + country: "Montevideo", + }, + { + utc: "GMT-02:00", + value: "America/Godthab", + group: "America", + country: "Greenland", + }, + { + utc: "GMT-02:00", + value: "Etc/GMT+2", + group: "Atlantic", + country: "Mid-Atlantic", + }, + { + utc: "GMT-01:00", + value: "Atlantic/Azores", + group: "Atlantic", + country: "Azores", + }, + { + utc: "GMT-01:00", + value: "Atlantic/Cape_Verde", + group: "Atlantic", + country: "Cape Verde Is.", + }, + { + utc: "GMT+00:00", + value: "Europe/London", + group: "Europe", + country: "Edinburgh", + }, + { + utc: "GMT+00:00", + value: "Europe/Lisbon", + group: "Europe", + country: "Lisbon", + }, + { + utc: "GMT+00:00", + value: "Europe/London", + group: "Europe", + country: "London", + }, + { + utc: "GMT+00:00", + value: "Africa/Monrovia", + group: "Africa", + country: "Monrovia", + }, + { + utc: "GMT+01:00", + value: "Europe/Amsterdam", + group: "Europe", + country: "Amsterdam", + }, + { + utc: "GMT+01:00", + value: "Europe/Belgrade", + group: "Europe", + country: "Belgrade", + }, + { + utc: "GMT+01:00", + value: "Europe/Berlin", + group: "Europe", + country: "Berlin", + }, + { + utc: "GMT+01:00", + value: "Europe/Brussels", + group: "Europe", + country: "Brussels", + }, + { + utc: "GMT+01:00", + value: "Europe/Budapest", + group: "Europe", + country: "Budapest", + }, + { + utc: "GMT+01:00", + value: "Europe/Copenhagen", + group: "Europe", + country: "Copenhagen", + }, + { + utc: "GMT+01:00", + value: "Europe/Madrid", + group: "Europe", + country: "Madrid", + }, + { + utc: "GMT+01:00", + value: "Europe/Paris", + group: "Europe", + country: "Paris", + }, + { + utc: "GMT+01:00", + value: "Europe/Prague", + group: "Europe", + country: "Prague", + }, + { + utc: "GMT+01:00", + value: "Europe/Rome", + group: "Europe", + country: "Rome", + }, + { + utc: "GMT+01:00", + value: "Europe/Sarajevo", + group: "Europe", + country: "Sarajevo", + }, + { + utc: "GMT+01:00", + value: "Europe/Stockholm", + group: "Europe", + country: "Stockholm", + }, + { + utc: "GMT+01:00", + value: "Europe/Vienna", + group: "Europe", + country: "Vienna", + }, + { + utc: "GMT+01:00", + value: "Europe/Warsaw", + group: "Europe", + country: "Warsaw", + }, + { + utc: "GMT+01:00", + value: 
"Africa/Lagos", + group: "Africa", + country: "West Central Africa", + }, + { + utc: "GMT+02:00", + value: "Asia/Amman", + group: "Asia", + country: "Amman", + }, + { + utc: "GMT+02:00", + value: "Europe/Athens", + group: "Europe", + country: "Athens", + }, + { + utc: "GMT+02:00", + value: "Asia/Beirut", + group: "Asia", + country: "Beirut", + }, + { + utc: "GMT+02:00", + value: "Europe/Bucharest", + group: "Europe", + country: "Bucharest", + }, + { + utc: "GMT+02:00", + value: "Africa/Cairo", + group: "Africa", + country: "Cairo", + }, + { + utc: "GMT+02:00", + value: "Africa/Harare", + group: "Africa", + country: "Harare", + }, + { + utc: "GMT+02:00", + value: "Europe/Helsinki", + group: "Europe", + country: "Helsinki", + }, + { + utc: "GMT+02:00", + value: "Europe/Istanbul", + group: "Europe", + country: "Istanbul", + }, + { + utc: "GMT+02:00", + value: "Asia/Jerusalem", + group: "Asia", + country: "Jerusalem", + }, + { + utc: "GMT+02:00", + value: "Europe/Kiev", + group: "Europe", + country: "Kyiv", + }, + { + utc: "GMT+02:00", + value: "Europe/Minsk", + group: "Europe", + country: "Minsk", + }, + { + utc: "GMT+02:00", + value: "Europe/Riga", + group: "Europe", + country: "Riga", + }, + { + utc: "GMT+02:00", + value: "Europe/Sofia", + group: "Europe", + country: "Sofia", + }, + { + utc: "GMT+02:00", + value: "Europe/Tallinn", + group: "Europe", + country: "Tallinn", + }, + { + utc: "GMT+02:00", + value: "Europe/Vilnius", + group: "Europe", + country: "Vilnius", + }, + { + utc: "GMT+03:00", + value: "Asia/Baghdad", + group: "Asia", + country: "Baghdad", + }, + { + utc: "GMT+03:00", + value: "Asia/Kuwait", + group: "Asia", + country: "Kuwait", + }, + { + utc: "GMT+03:00", + value: "Europe/Moscow", + group: "Europe", + country: "Moscow", + }, + { + utc: "GMT+03:00", + value: "Africa/Nairobi", + group: "Africa", + country: "Nairobi", + }, + { + utc: "GMT+03:00", + value: "Asia/Riyadh", + group: "Asia", + country: "Riyadh", + }, + { + utc: "GMT+03:30", + value: "Asia/Tehran", + group: "Asia", + country: "Tehran", + }, + { + utc: "GMT+04:00", + value: "Asia/Dubai", + group: "Asia", + country: "Abu Dhabi", + }, + { + utc: "GMT+04:00", + value: "Asia/Baku", + group: "Asia", + country: "Baku", + }, + { + utc: "GMT+04:00", + value: "Asia/Muscat", + group: "Asia", + country: "Muscat", + }, + { + utc: "GMT+04:00", + value: "Asia/Tbilisi", + group: "Asia", + country: "Tbilisi", + }, + { + utc: "GMT+04:00", + value: "Asia/Yerevan", + group: "Asia", + country: "Yerevan", + }, + { + utc: "GMT+04:30", + value: "Asia/Kabul", + group: "Asia", + country: "Kabul", + }, + { + utc: "GMT+05:00", + value: "Asia/Karachi", + group: "Asia", + country: "Islamabad", + }, + { + utc: "GMT+05:00", + value: "Asia/Tashkent", + group: "Asia", + country: "Tashkent", + }, + { + utc: "GMT+05:30", + value: "Asia/Kolkata", + group: "Asia", + country: "Chennai", + }, + { + utc: "GMT+05:30", + value: "Asia/Kolkata", + group: "Asia", + country: "Kolkata", + }, + { + utc: "GMT+05:30", + value: "Asia/Kolkata", + group: "Asia", + country: "Mumbai", + }, + { + utc: "GMT+05:30", + value: "Asia/Kolkata", + group: "Asia", + country: "New Delhi", + }, + { + utc: "GMT+05:45", + value: "Asia/Kathmandu", + group: "Asia", + country: "Kathmandu", + }, + { + utc: "GMT+06:00", + value: "Asia/Almaty", + group: "Asia", + country: "Almaty", + }, + { + utc: "GMT+06:00", + value: "Asia/Dhaka", + group: "Asia", + country: "Dhaka", + }, + { + utc: "GMT+06:00", + value: "Asia/Yekaterinburg", + group: "Asia", + country: "Yekaterinburg", + }, + { + utc: 
"GMT+06:30", + value: "Asia/Yangon", + group: "Asia", + country: "Yangon (Rangoon)", + }, + { + utc: "GMT+07:00", + value: "Asia/Bangkok", + group: "Asia", + country: "Bangkok", + }, + { + utc: "GMT+07:00", + value: "Asia/Hanoi", + group: "Asia", + country: "Hanoi", + }, + { + utc: "GMT+07:00", + value: "Asia/Jakarta", + group: "Asia", + country: "Jakarta", + }, + { + utc: "GMT+07:00", + value: "Asia/Novosibirsk", + group: "Asia", + country: "Novosibirsk", + }, + { + utc: "GMT+08:00", + value: "Asia/Shanghai", + group: "Asia", + country: "Beijing", + }, + { + utc: "GMT+08:00", + value: "Asia/Chongqing", + group: "Asia", + country: "Chongqing", + }, + { + utc: "GMT+08:00", + value: "Asia/Hong_Kong", + group: "Asia", + country: "Hong Kong", + }, + { + utc: "GMT+08:00", + value: "Asia/Krasnoyarsk", + group: "Asia", + country: "Krasnoyarsk", + }, + { + utc: "GMT+08:00", + value: "Asia/Kuala_Lumpur", + group: "Asia", + country: "Kuala Lumpur", + }, + { + utc: "GMT+08:00", + value: "Australia/Perth", + group: "Australia", + country: "Perth", + }, + { + utc: "GMT+08:00", + value: "Asia/Singapore", + group: "Asia", + country: "Singapore", + }, + { + utc: "GMT+08:00", + value: "Asia/Taipei", + group: "Asia", + country: "Taipei", + }, + { + utc: "GMT+08:00", + value: "Asia/Ulaanbaatar", + group: "Asia", + country: "Ulaan Bataar", + }, + { + utc: "GMT+08:00", + value: "Asia/Urumqi", + group: "Asia", + country: "Urumqi", + }, + { + utc: "GMT+09:00", + value: "Asia/Irkutsk", + group: "Asia", + country: "Irkutsk", + }, + { + utc: "GMT+09:00", + value: "Asia/Tokyo", + group: "Asia", + country: "Osaka", + }, + { + utc: "GMT+09:00", + value: "Asia/Tokyo", + group: "Asia", + country: "Sapporo", + }, + { + utc: "GMT+09:00", + value: "Asia/Seoul", + group: "Asia", + country: "Seoul", + }, + { + utc: "GMT+09:00", + value: "Asia/Tokyo", + group: "Asia", + country: "Tokyo", + }, + { + utc: "GMT+09:30", + value: "Australia/Adelaide", + group: "Australia", + country: "Adelaide", + }, + { + utc: "GMT+09:30", + value: "Australia/Darwin", + group: "Australia", + country: "Darwin", + }, + { + utc: "GMT+10:00", + value: "Australia/Brisbane", + group: "Australia", + country: "Brisbane", + }, + { + utc: "GMT+10:00", + value: "Australia/Sydney", + group: "Australia", + country: "Canberra", + }, + { + utc: "GMT+10:00", + value: "Pacific/Guam", + group: "Pacific", + country: "Guam", + }, + { + utc: "GMT+10:00", + value: "Australia/Hobart", + group: "Australia", + country: "Hobart", + }, + { + utc: "GMT+10:00", + value: "Australia/Melbourne", + group: "Australia", + country: "Melbourne", + }, + { + utc: "GMT+10:00", + value: "Pacific/Port_Moresby", + group: "Pacific", + country: "Port Moresby", + }, + { + utc: "GMT+10:00", + value: "Australia/Sydney", + group: "Australia", + country: "Sydney", + }, + { + utc: "GMT+11:00", + value: "Asia/Magadan", + group: "Asia", + country: "Magadan", + }, + { + utc: "GMT+11:00", + value: "Pacific/Noumea", + group: "Pacific", + country: "New Caledonia", + }, + { + utc: "GMT+11:00", + value: "Pacific/Guadalcanal", + group: "Pacific", + country: "Solomon Is.", + }, + { + utc: "GMT+12:00", + value: "Pacific/Auckland", + group: "Pacific", + country: "Auckland", + }, + { + utc: "GMT+12:00", + value: "Pacific/Fiji", + group: "Pacific", + country: "Fiji", + }, + { + utc: "GMT+12:00", + value: "Asia/Kamchatka", + group: "Asia", + country: "Kamchatka", + }, + { + utc: "GMT+12:00", + value: "Pacific/Majuro", + group: "Pacific", + country: "Marshall Is.", + }, + { + utc: "GMT+12:00", + value: 
"Pacific/Auckland", + group: "Pacific", + country: "Wellington", + }, + { + utc: "GMT+13:00", + value: "Pacific/Tongatapu", + group: "Pacific", + country: "Nuku'alofa", + }, + { + utc: "GMT+13:00", + value: "Pacific/Apia", + group: "Pacific", + country: "Samoa", + }, + { + utc: "GMT+13:00", + value: "Pacific/Fakaofo", + group: "Pacific", + country: "Tokelau Is.", + }, +]; diff --git a/python/ray/dashboard/client/src/components/DataOverviewTable.tsx b/python/ray/dashboard/client/src/components/DataOverviewTable.tsx index c52bdee0c28b..c3538ba9ccd4 100644 --- a/python/ray/dashboard/client/src/components/DataOverviewTable.tsx +++ b/python/ray/dashboard/client/src/components/DataOverviewTable.tsx @@ -193,7 +193,7 @@ const DataRow = ({ {isDatasetRow && datasetMetrics.dataset} - {isOperatorRow && operatorMetrics.operator} + {isOperatorRow && operatorMetrics.name} ); }; + +export const SearchTimezone = ({ + serverTimeZone, + currentTimeZone, +}: { + serverTimeZone?: TimezoneInfo | null; + currentTimeZone?: string; +}) => { + const [timezone, setTimezone] = useState(""); + + useEffect(() => { + if (currentTimeZone !== undefined) { + formatTimeZone(currentTimeZone); + setTimezone(currentTimeZone); + } + }, [currentTimeZone]); + + const handleTimezoneChange = (value: string) => { + localStorage.setItem("timezone", value); + window.location.reload(); + }; + + const options = timezones + .map((x) => x) // Create a copy + .sort((a, b) => a.group.localeCompare(b.group)); + options.unshift({ + value: "Etc/UTC", + utc: "GMT+00:00", + group: "System", + country: "Coordinated Universal Time", + }); + + const browserTimezone = Intl.DateTimeFormat().resolvedOptions().timeZone; + + const browserOffset = (() => { + const offset = new Date().getTimezoneOffset(); + const sign = offset < 0 ? "+" : "-"; + const hours = Math.abs(Math.floor(offset / 60)) + .toString() + .padStart(2, "0"); + const minutes = Math.abs(offset % 60) + .toString() + .padStart(2, "0"); + return `GMT${sign}${hours}:${minutes}`; + })(); + + if (browserOffset) { + options.unshift({ + value: browserTimezone, + utc: browserOffset, + group: "System", + country: "Browser Time", + }); + } + + const serverUtc = + serverTimeZone?.value && + timezones.find((t) => t.value === serverTimeZone.value)?.utc; + if (serverUtc) { + options.unshift({ + value: serverTimeZone.value, + utc: serverUtc, + group: "System", + country: "Dashboard Server Timezone", + }); + } + + const curUtc = timezones.find((t) => t.value === timezone)?.utc; + return ( + { + if (newValue) { + handleTimezoneChange(newValue.value); + } + }} + options={options} + getOptionLabel={(option) => option.value} + groupBy={(option) => option.group} + filterOptions={(options, { inputValue }) => + options.filter( + (item) => + item.value.includes(inputValue) || + item.utc.includes(inputValue) || + item.country.toLowerCase().includes(inputValue.toLowerCase()) || + item.group.toLowerCase().includes(inputValue.toLowerCase()), + ) + } + renderOption={(props, option) => ( + + + {option.country} + + + {option.value} + + + + {option.utc} + + + )} + renderInput={(params) => ( + + )} + renderGroup={(params) => ( +
+ + {params.group} + + + {params.children} + + +
  • + )} + slotProps={{ + paper: { + style: { + width: "400px", + }, + }, + popper: { + placement: "bottom-end", + style: { + width: "fit-content", + }, + }, + }} + /> + ); +}; diff --git a/python/ray/dashboard/client/src/pages/actor/ActorDetail.tsx b/python/ray/dashboard/client/src/pages/actor/ActorDetail.tsx index 60e8fc8ec8ac..22cb1e030351 100644 --- a/python/ray/dashboard/client/src/pages/actor/ActorDetail.tsx +++ b/python/ray/dashboard/client/src/pages/actor/ActorDetail.tsx @@ -133,6 +133,15 @@ const ActorDetailPage = () => { } : { value: "-" }, }, + { + label: "PID", + content: actorDetail.pid + ? { + value: `${actorDetail.pid}`, + copyableValue: `${actorDetail.pid}`, + } + : { value: "-" }, + }, { label: "Started at", content: { diff --git a/python/ray/dashboard/client/src/pages/data/DataOverview.component.test.tsx b/python/ray/dashboard/client/src/pages/data/DataOverview.component.test.tsx index 12cd19792928..107cd2037cfc 100644 --- a/python/ray/dashboard/client/src/pages/data/DataOverview.component.test.tsx +++ b/python/ray/dashboard/client/src/pages/data/DataOverview.component.test.tsx @@ -36,6 +36,7 @@ describe("DataOverview", () => { operators: [ { operator: "test_ds1_op1", + name: "test_ds1_op", state: "RUNNING", progress: 99, total: 101, @@ -104,11 +105,11 @@ describe("DataOverview", () => { expect(screen.getByText("70/80")).toBeVisible(); // Operator dropdown - expect(screen.queryByText("test_ds1_op1")).toBeNull(); + expect(screen.queryByText("test_ds1_op")).toBeNull(); await user.click(screen.getByTitle("Expand Dataset test_ds1")); - expect(screen.getByText("test_ds1_op1")).toBeVisible(); + expect(screen.getByText("test_ds1_op")).toBeVisible(); await user.click(screen.getByTitle("Collapse Dataset test_ds1")); - expect(screen.queryByText("test_ds1_op1")).toBeNull(); + expect(screen.queryByText("test_ds1_op")).toBeNull(); // Second Dataset expect(screen.getByText("test_ds2")).toBeVisible(); diff --git a/python/ray/dashboard/client/src/pages/layout/MainNavLayout.tsx b/python/ray/dashboard/client/src/pages/layout/MainNavLayout.tsx index 256c830cd29f..ea261a454b75 100644 --- a/python/ray/dashboard/client/src/pages/layout/MainNavLayout.tsx +++ b/python/ray/dashboard/client/src/pages/layout/MainNavLayout.tsx @@ -3,6 +3,7 @@ import React, { useContext } from "react"; import { RiBookMarkLine, RiFeedbackLine } from "react-icons/ri/"; import { Outlet, Link as RouterLink } from "react-router-dom"; import { GlobalContext } from "../../App"; +import { SearchTimezone } from "../../components/SearchComponent"; import Logo from "../../logo.svg"; import { MainNavContext, useMainNavState } from "./mainNavContext"; @@ -106,7 +107,8 @@ const NAV_ITEMS = [ const MainNavBar = () => { const { mainNavPageHierarchy } = useContext(MainNavContext); const rootRouteId = mainNavPageHierarchy[0]?.id; - const { metricsContextLoaded, grafanaHost } = useContext(GlobalContext); + const { metricsContextLoaded, grafanaHost, serverTimeZone, currentTimeZone } = + useContext(GlobalContext); let navItems = NAV_ITEMS; if (!metricsContextLoaded || grafanaHost === "DISABLED") { @@ -179,6 +181,17 @@ const MainNavBar = () => {
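A note on the sign flip in the browserOffset helper of the SearchTimezone component above: JavaScript's Date.getTimezoneOffset() returns minutes behind UTC (west-positive), so a negative value means the zone is ahead of GMT. Python's time.timezone uses the same west-positive convention, which makes for a compact sketch of the equivalent label computation (illustrative only):

```python
import time

# time.timezone is the non-DST offset in seconds *west* of UTC, so the
# display sign has to be inverted, exactly as the TSX helper does.
west_seconds = time.timezone
sign = "+" if west_seconds <= 0 else "-"
hours, rem = divmod(abs(west_seconds), 3600)
print(f"GMT{sign}{hours:02d}:{rem // 60:02d}")
```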
    + + + + + ); }; diff --git a/python/ray/dashboard/client/src/pages/metrics/Metrics.component.test.tsx b/python/ray/dashboard/client/src/pages/metrics/Metrics.component.test.tsx index c2cf646b743d..a7e3e06f0cea 100644 --- a/python/ray/dashboard/client/src/pages/metrics/Metrics.component.test.tsx +++ b/python/ray/dashboard/client/src/pages/metrics/Metrics.component.test.tsx @@ -22,6 +22,8 @@ const Wrapper = ({ children }: PropsWithChildren<{}>) => { nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} @@ -47,6 +49,8 @@ const MetricsDisabledWrapper = ({ children }: PropsWithChildren<{}>) => { nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} diff --git a/python/ray/dashboard/client/src/pages/metrics/Metrics.tsx b/python/ray/dashboard/client/src/pages/metrics/Metrics.tsx index fd542e534cd1..99d459a4fe54 100644 --- a/python/ray/dashboard/client/src/pages/metrics/Metrics.tsx +++ b/python/ray/dashboard/client/src/pages/metrics/Metrics.tsx @@ -571,8 +571,8 @@ const MetricsSection = ({ dashboardUid, dashboardDatasource, }: MetricsSectionProps) => { - const { grafanaHost, sessionName } = useContext(GlobalContext); - + const { grafanaHost, sessionName, currentTimeZone } = + useContext(GlobalContext); return ( { const path = `/d-solo/${dashboardUid}?${pathParams}` + - `&${refreshParams}${timeRangeParams}&var-SessionName=${sessionName}&var-datasource=${dashboardDatasource}`; + `&${refreshParams}&timezone=${currentTimeZone}${timeRangeParams}&var-SessionName=${sessionName}&var-datasource=${dashboardDatasource}`; return ( { return await get(GRAFANA_HEALTHCHECK_URL); }; @@ -68,3 +74,21 @@ export const getMetricsInfo = async () => { return info; }; + +export type TimezoneInfo = { + offset: string; + value: string; +}; + +export const getTimeZoneInfo = async () => { + try { + const resp = await get(TIMEZONE_URL); + if (resp.data) { + return { + offset: resp.data.offset, + value: resp.data.value, + }; + } + } catch (e) {} + return null; +}; diff --git a/python/ray/dashboard/client/src/pages/overview/OverviewPage.component.test.tsx b/python/ray/dashboard/client/src/pages/overview/OverviewPage.component.test.tsx index a92ebb6f4701..9b0d432b9f92 100644 --- a/python/ray/dashboard/client/src/pages/overview/OverviewPage.component.test.tsx +++ b/python/ray/dashboard/client/src/pages/overview/OverviewPage.component.test.tsx @@ -84,6 +84,8 @@ const Wrapper = nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} diff --git a/python/ray/dashboard/client/src/pages/overview/cards/ClusterUtilizationCard.tsx b/python/ray/dashboard/client/src/pages/overview/cards/ClusterUtilizationCard.tsx index ab76f014f22f..a1c04c8047ab 100644 --- a/python/ray/dashboard/client/src/pages/overview/cards/ClusterUtilizationCard.tsx +++ b/python/ray/dashboard/client/src/pages/overview/cards/ClusterUtilizationCard.tsx @@ -20,11 +20,12 @@ export const ClusterUtilizationCard = ({ sessionName, dashboardUids, dashboardDatasource, + currentTimeZone, } = useContext(GlobalContext); const grafanaDefaultDashboardUid = dashboardUids?.default ?? 
"rayDefaultDashboard"; const path = `/d-solo/${grafanaDefaultDashboardUid}/default-dashboard?orgId=1&theme=light&panelId=41&var-datasource=${dashboardDatasource}`; - const timeRangeParams = "&from=now-30m&to=now"; + const timeRangeParams = "&from=now-1h&to=now"; if (!metricsContextLoaded || grafanaHost === "DISABLED") { return null; @@ -51,7 +52,7 @@ export const ClusterUtilizationCard = ({ component="iframe" title="Cluster Utilization" sx={{ flex: 1 }} - src={`${grafanaHost}${path}&refresh${timeRangeParams}&var-SessionName=${sessionName}`} + src={`${grafanaHost}${path}&refresh&timezone=${currentTimeZone}${timeRangeParams}&var-SessionName=${sessionName}`} frameBorder="0" /> { sessionName, dashboardUids, dashboardDatasource, + currentTimeZone, } = useContext(GlobalContext); const grafanaDefaultDashboardUid = dashboardUids?.default ?? "rayDefaultDashboard"; const path = `/d-solo/${grafanaDefaultDashboardUid}/default-dashboard?orgId=1&theme=light&panelId=24&var-datasource=${dashboardDatasource}`; - const timeRangeParams = "&from=now-30m&to=now"; + const timeRangeParams = "&from=now-1h&to=now"; if (!metricsContextLoaded || grafanaHost === "DISABLED") { return null; @@ -45,7 +46,7 @@ export const NodeCountCard = ({ className, sx }: NodeCountCardProps) => { component="iframe" title="Node Count" sx={{ flex: 1 }} - src={`${grafanaHost}${path}&refresh${timeRangeParams}&var-SessionName=${sessionName}`} + src={`${grafanaHost}${path}&refresh&timezone=${currentTimeZone}${timeRangeParams}&var-SessionName=${sessionName}`} frameBorder="0" /> )} diff --git a/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.component.test.tsx b/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.component.test.tsx index d0629ed7832c..a1f552514043 100644 --- a/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.component.test.tsx +++ b/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.component.test.tsx @@ -22,6 +22,8 @@ const Wrapper = ({ children }: PropsWithChildren<{}>) => { nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} @@ -47,6 +49,8 @@ const MetricsDisabledWrapper = ({ children }: PropsWithChildren<{}>) => { nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} diff --git a/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.tsx b/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.tsx index 33f62879a1d4..921f804e7fb6 100644 --- a/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.tsx +++ b/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.tsx @@ -51,8 +51,13 @@ export const ServeReplicaMetricsSection = ({ className, sx, }: ServeDeploymentMetricsSectionProps) => { - const { grafanaHost, prometheusHealth, dashboardUids, dashboardDatasource } = - useContext(GlobalContext); + const { + grafanaHost, + prometheusHealth, + dashboardUids, + dashboardDatasource, + currentTimeZone, + } = useContext(GlobalContext); const grafanaServeDashboardUid = dashboardUids?.serveDeployment ?? 
"rayServeDashboard"; @@ -179,7 +184,7 @@ export const ServeReplicaMetricsSection = ({ {METRICS_CONFIG.map(({ title, pathParams }) => { const path = `/d-solo/${grafanaServeDashboardUid}?${pathParams}` + - `${refreshParams}${timeRangeParams}&var-Deployment=${encodeURIComponent( + `${refreshParams}&timezone=${currentTimeZone}${timeRangeParams}&var-Deployment=${encodeURIComponent( deploymentName, )}&var-Replica=${encodeURIComponent( replicaId, diff --git a/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.component.test.tsx b/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.component.test.tsx index 6f5826087a63..c276d0fde417 100644 --- a/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.component.test.tsx +++ b/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.component.test.tsx @@ -26,6 +26,8 @@ const Wrapper = ({ children }: PropsWithChildren<{}>) => { nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} @@ -51,6 +53,8 @@ const MetricsDisabledWrapper = ({ children }: PropsWithChildren<{}>) => { nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} diff --git a/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.tsx b/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.tsx index efb3be529314..b03cfbcc56f8 100644 --- a/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.tsx +++ b/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.tsx @@ -77,8 +77,13 @@ export const ServeMetricsSection = ({ metricsConfig, sx, }: ServeMetricsSectionProps) => { - const { grafanaHost, prometheusHealth, dashboardUids, dashboardDatasource } = - useContext(GlobalContext); + const { + grafanaHost, + prometheusHealth, + dashboardUids, + dashboardDatasource, + currentTimeZone, + } = useContext(GlobalContext); const grafanaServeDashboardUid = dashboardUids?.serve ?? 
"rayServeDashboard"; const [refreshOption, setRefreshOption] = useState( RefreshOptions.FIVE_SECONDS, @@ -196,7 +201,7 @@ export const ServeMetricsSection = ({ {metricsConfig.map(({ title, pathParams }) => { const path = `/d-solo/${grafanaServeDashboardUid}?${pathParams}` + - `${refreshParams}${timeRangeParams}&var-datasource=${dashboardDatasource}`; + `${refreshParams}&timezone=${currentTimeZone}${timeRangeParams}&var-datasource=${dashboardDatasource}`; return ( ) => { prometheusHealth: true, sessionName: "session-name", dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }; return ( diff --git a/python/ray/dashboard/http_server_head.py b/python/ray/dashboard/http_server_head.py index e1c427b1b288..078d4a97dd38 100644 --- a/python/ray/dashboard/http_server_head.py +++ b/python/ray/dashboard/http_server_head.py @@ -12,6 +12,7 @@ import ray import ray.dashboard.optional_utils as dashboard_optional_utils +import ray.dashboard.timezone_utils as timezone_utils import ray.dashboard.utils as dashboard_utils from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag from ray._private.utils import get_or_create_event_loop @@ -139,6 +140,18 @@ async def get_favicon(self, req) -> aiohttp.web.FileResponse: ) ) + @routes.get("/timezone") + async def get_timezone(self, req) -> aiohttp.web.Response: + try: + current_timezone = timezone_utils.get_current_timezone_info() + return aiohttp.web.json_response(current_timezone) + + except Exception as e: + logger.error(f"Error getting timezone: {e}") + return aiohttp.web.Response( + status=500, text="Internal Server Error:" + str(e) + ) + def get_address(self): assert self.http_host and self.http_port return self.http_host, self.http_port diff --git a/python/ray/dashboard/modules/data/tests/test_data_head.py b/python/ray/dashboard/modules/data/tests/test_data_head.py index 650079360a8b..c94a50878c95 100644 --- a/python/ray/dashboard/modules/data/tests/test_data_head.py +++ b/python/ray/dashboard/modules/data/tests/test_data_head.py @@ -32,6 +32,7 @@ ] + DATA_SCHEMA OPERATOR_SCHEMA = [ + "name", "operator", ] + DATA_SCHEMA @@ -64,12 +65,23 @@ def test_get_datasets(): operators = dataset["operators"] assert len(operators) == 2 op0 = operators[0] + op1 = operators[1] assert sorted(op0.keys()) == sorted(OPERATOR_SCHEMA) - assert op0["operator"] == "Input0" - assert op0["progress"] == 20 - assert op0["total"] == 20 - assert op0["state"] == "FINISHED" - assert operators[1]["operator"] == "ReadRange->MapBatches()1" + assert sorted(op1.keys()) == sorted(OPERATOR_SCHEMA) + assert { + "operator": "Input0", + "name": "Input", + "state": "FINISHED", + "progress": 20, + "total": 20, + }.items() <= op0.items() + assert { + "operator": "ReadRange->MapBatches()1", + "name": "ReadRange->MapBatches()", + "state": "FINISHED", + "progress": 20, + "total": 20, + }.items() <= op1.items() ds.map_batches(lambda x: x).materialize() data = requests.get(DATA_HEAD_URLS["GET"].format(job_id=job_id)).json() @@ -83,4 +95,4 @@ def test_get_datasets(): if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) + sys.exit(pytest.main(["-vv", __file__])) diff --git a/python/ray/dashboard/modules/metrics/install_and_start_prometheus.py b/python/ray/dashboard/modules/metrics/install_and_start_prometheus.py index a65050212950..ea0ff2459f65 100644 --- a/python/ray/dashboard/modules/metrics/install_and_start_prometheus.py +++ b/python/ray/dashboard/modules/metrics/install_and_start_prometheus.py @@ -26,6 +26,9 @@ def get_system_info(): 
if architecture == "x86_64": # In the Prometheus filename, it's called amd64 architecture = "amd64" + elif architecture == "aarch64": + # In the Prometheus filename, it's called arm64 + architecture = "arm64" return os_type, architecture @@ -90,6 +93,7 @@ def start_prometheus(prometheus_dir): f"{prometheus_dir}/prometheus", "--config.file", str(config_file), + "--web.enable-lifecycle", ] try: process = subprocess.Popen(prometheus_cmd) @@ -104,6 +108,7 @@ def print_shutdown_message(process_id): message = ( f"Prometheus is running with PID {process_id}.\n" "To stop Prometheus, use the command: " + "`ray metrics shutdown-prometheus`, " f"'kill {process_id}', or if you need to force stop, " f"use 'kill -9 {process_id}'." ) diff --git a/python/ray/dashboard/modules/tests/test_metrics_integration.py b/python/ray/dashboard/modules/tests/test_metrics_integration.py index 7974b4a43cf9..0b125ca2dd5b 100644 --- a/python/ray/dashboard/modules/tests/test_metrics_integration.py +++ b/python/ray/dashboard/modules/tests/test_metrics_integration.py @@ -1,11 +1,14 @@ import subprocess import sys +import time import pytest +from click.testing import CliRunner from ray.dashboard.consts import PROMETHEUS_CONFIG_INPUT_PATH from ray.dashboard.modules.metrics import install_and_start_prometheus from ray.dashboard.modules.metrics.templates import PROMETHEUS_YML_TEMPLATE +from ray.scripts.scripts import metrics_group @pytest.mark.parametrize( @@ -40,6 +43,16 @@ def test_e2e(capsys): subprocess.run(["kill", str(pid)]) +def test_shutdown_prometheus(): + install_and_start_prometheus.main() + runner = CliRunner() + # Sleep for a few seconds to make sure Prometheus is running + # before we try to shut it down. + time.sleep(5) + result = runner.invoke(metrics_group, ["shutdown-prometheus"]) + assert result.exit_code == 0 + + def test_prometheus_config_content(): # Test to make sure the content in the hardcoded file # (python/ray/dashboard/modules/metrics/export/prometheus/prometheus.yml) will diff --git a/python/ray/dashboard/modules/tests/test_utils.py b/python/ray/dashboard/modules/tests/test_utils.py index fc4fa85dfaed..38b24da02504 100644 --- a/python/ray/dashboard/modules/tests/test_utils.py +++ b/python/ray/dashboard/modules/tests/test_utils.py @@ -6,7 +6,7 @@ async def http_get(http_session, url, timeout_seconds=60): - with async_timeout.timeout(timeout_seconds): + async with async_timeout.timeout(timeout_seconds): async with http_session.get(url) as response: return await response.json() diff --git a/python/ray/dashboard/tests/test_dashboard.py b/python/ray/dashboard/tests/test_dashboard.py index 4b61accc53c3..19571d9f76b7 100644 --- a/python/ray/dashboard/tests/test_dashboard.py +++ b/python/ray/dashboard/tests/test_dashboard.py @@ -11,6 +11,7 @@ import time import warnings from unittest.mock import MagicMock +from urllib.parse import quote_plus import pytest import requests @@ -370,7 +371,9 @@ def test_http_get(enable_test_module, ray_start_with_dashboard): while True: time.sleep(3) try: - response = requests.get(webui_url + "/test/http_get?url=" + target_url) + response = requests.get( + webui_url + "/test/http_get?url=" + quote_plus(target_url) + ) response.raise_for_status() try: dump_info = response.json() @@ -385,7 +388,8 @@ def test_http_get(enable_test_module, ray_start_with_dashboard): http_port, grpc_port = ports response = requests.get( - f"http://{ip}:{http_port}" f"/test/http_get_from_agent?url={target_url}" + f"http://{ip}:{http_port}" + f"/test/http_get_from_agent?url={quote_plus(target_url)}" ) 
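Two small notes on the Prometheus changes above: the architecture branch maps platform.machine() names to the "amd64"/"arm64" spelling Prometheus uses in its release artifacts, and --web.enable-lifecycle turns on Prometheus's lifecycle HTTP endpoints (such as /-/quit), which is what a programmatic shutdown command can target. A sketch of the name normalization (the release version in the file name is illustrative):

```python
import platform

# platform.machine() reports "x86_64"/"aarch64"; Prometheus tarballs use
# "amd64"/"arm64" instead, so map and fall back to the raw value.
ARCH_ALIASES = {"x86_64": "amd64", "aarch64": "arm64"}

machine = platform.machine()
arch = ARCH_ALIASES.get(machine, machine)
os_type = platform.system().lower()
print(f"prometheus-2.48.0.{os_type}-{arch}.tar.gz")
```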
response.raise_for_status() try: diff --git a/python/ray/dashboard/timezone_utils.py b/python/ray/dashboard/timezone_utils.py new file mode 100644 index 000000000000..6a0d68b9c1a9 --- /dev/null +++ b/python/ray/dashboard/timezone_utils.py @@ -0,0 +1,56 @@ +import logging +from datetime import datetime + +logger = logging.getLogger(__name__) + +timezones = [ + {"offset": "-12:00", "value": "Etc/+12"}, + {"offset": "-11:00", "value": "Pacific/Pago_Pago"}, + {"offset": "-10:00", "value": "Pacific/Honolulu"}, + {"offset": "-09:00", "value": "America/Anchorage"}, + {"offset": "-08:00", "value": "America/Los_Angeles"}, + {"offset": "-07:00", "value": "America/Phoenix"}, + {"offset": "-06:00", "value": "America/Guatemala"}, + {"offset": "-05:00", "value": "America/Bogota"}, + {"offset": "-04:00", "value": "America/Halifax"}, + {"offset": "-03:30", "value": "America/St_Johns"}, + {"offset": "-03:00", "value": "America/Sao_Paulo"}, + {"offset": "-02:00", "value": "America/Godthab"}, + {"offset": "-01:00", "value": "Atlantic/Azores"}, + {"offset": "+00:00", "value": "Europe/London"}, + {"offset": "+01:00", "value": "Europe/Amsterdam"}, + {"offset": "+02:00", "value": "Asia/Amman"}, + {"offset": "+03:00", "value": "Asia/Baghdad"}, + {"offset": "+03:30", "value": "Asia/Tehran"}, + {"offset": "+04:00", "value": "Asia/Dubai"}, + {"offset": "+04:30", "value": "Asia/Kabul"}, + {"offset": "+05:00", "value": "Asia/Karachi"}, + {"offset": "+05:30", "value": "Asia/Kolkata"}, + {"offset": "+05:45", "value": "Asia/Kathmandu"}, + {"offset": "+06:00", "value": "Asia/Almaty"}, + {"offset": "+06:30", "value": "Asia/Yangon"}, + {"offset": "+07:00", "value": "Asia/Bangkok"}, + {"offset": "+08:00", "value": "Asia/Shanghai"}, + {"offset": "+09:00", "value": "Asia/Irkutsk"}, + {"offset": "+09:30", "value": "Australia/Adelaide"}, + {"offset": "+10:00", "value": "Australia/Brisbane"}, + {"offset": "+11:00", "value": "Asia/Magadan"}, + {"offset": "+12:00", "value": "Pacific/Auckland"}, + {"offset": "+13:00", "value": "Pacific/Tongatapu"}, +] + + +def get_current_timezone_info(): + current_tz = datetime.now().astimezone().tzinfo + offset = current_tz.utcoffset(None) + hours, remainder = divmod(offset.total_seconds(), 3600) + minutes = remainder // 60 + sign = "+" if hours >= 0 else "-" + current_offset = f"{sign}{abs(int(hours)):02d}:{abs(int(minutes)):02d}" + + current_timezone = next( + (tz for tz in timezones if tz["offset"] == current_offset), + {"offset": None, "value": None}, + ) + + return current_timezone diff --git a/python/ray/data/BUILD b/python/ray/data/BUILD index d232ab352ba0..d46db0940c6e 100644 --- a/python/ray/data/BUILD +++ b/python/ray/data/BUILD @@ -99,7 +99,7 @@ py_test( py_test( name = "test_arrow_block", - size = "small", + size = "medium", srcs = ["tests/test_arrow_block.py"], tags = ["team:data", "exclusive"], deps = ["//:ray_lib", ":conftest"], @@ -225,6 +225,14 @@ py_test( deps = ["//:ray_lib", ":conftest"], ) +py_test( + name = "test_hudi", + size = "small", + srcs = ["tests/test_hudi.py"], + tags = ["team:data", "exclusive"], + deps = ["//:ray_lib", ":conftest"], +) + py_test( name = "test_image", size = "small", diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py index 89d531aa2ee5..5883ae6c542c 100644 --- a/python/ray/data/__init__.py +++ b/python/ray/data/__init__.py @@ -48,6 +48,7 @@ read_databricks_tables, read_datasource, read_delta_sharing_tables, + read_hudi, read_iceberg, read_images, read_json, @@ -139,6 +140,7 @@ "read_csv", "read_datasource", 
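Two observations on timezone_utils.py above. First, "Etc/+12" is not a valid IANA identifier; the matching entry in the frontend's timezone.ts uses "Etc/GMT+12". Second, divmod floors toward negative infinity, so an offset of -03:30 would be labeled -04:30 (divmod(-12600, 3600) == (-4, 1800)). A sketch that formats the absolute value instead avoids this (the helper name is hypothetical):

```python
from datetime import datetime

def format_utc_offset(total_seconds: float) -> str:
    # Work on the absolute value so floor division can't shift the hour
    # for negative half-hour offsets.
    sign = "+" if total_seconds >= 0 else "-"
    hours, remainder = divmod(abs(int(total_seconds)), 3600)
    return f"{sign}{hours:02d}:{remainder // 60:02d}"

offset = datetime.now().astimezone().utcoffset()
print(format_utc_offset(offset.total_seconds()))
assert format_utc_offset(-12600) == "-03:30"
assert format_utc_offset(19800) == "+05:30"
```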
"read_delta_sharing_tables", + "read_hudi", "read_iceberg", "read_images", "read_json", diff --git a/python/ray/data/_internal/arrow_block.py b/python/ray/data/_internal/arrow_block.py index f12f89d8cceb..1473b8fb6e3b 100644 --- a/python/ray/data/_internal/arrow_block.py +++ b/python/ray/data/_internal/arrow_block.py @@ -21,15 +21,11 @@ from ray._private.utils import _get_pyarrow_version from ray.air.constants import TENSOR_COLUMN_NAME from ray.air.util.tensor_extensions.arrow import ( - ArrowConversionError, - convert_list_to_pyarrow_array, + convert_to_pyarrow_array, pyarrow_table_from_pydict, ) from ray.data._internal.arrow_ops import transform_polars, transform_pyarrow -from ray.data._internal.numpy_support import ( - convert_udf_returns_to_numpy, - validate_numpy_batch, -) +from ray.data._internal.numpy_support import convert_to_numpy from ray.data._internal.row import TableRow from ray.data._internal.table_block import TableBlockAccessor, TableBlockBuilder from ray.data._internal.util import NULL_SENTINEL, find_partitions @@ -43,7 +39,6 @@ U, ) from ray.data.context import DataContext -from ray.util.debug import log_once try: import pyarrow @@ -61,17 +56,6 @@ T = TypeVar("T") logger = logging.getLogger(__name__) -ARROW_OBJECT_FIXABLE_ERRORS = ( - pyarrow.lib.ArrowTypeError, - pyarrow.lib.ArrowNotImplementedError, - pyarrow.lib.ArrowInvalid, -) - - -def is_object_fixable_error(e: ArrowConversionError) -> bool: - """Returns whether this error can be fixed by using an ArrowPythonObjectArray""" - return isinstance(e.__cause__, ARROW_OBJECT_FIXABLE_ERRORS) - # We offload some transformations to polars for performance. def get_sort_transform(context: DataContext) -> Callable: @@ -151,27 +135,14 @@ def __init__(self): @staticmethod def _table_from_pydict(columns: Dict[str, List[Any]]) -> Block: - for col_name, col in columns.items(): - try: - if col_name == TENSOR_COLUMN_NAME or isinstance( - next(iter(col), None), np.ndarray - ): - from ray.data.extensions.tensor_extension import ArrowTensorArray - - columns[col_name] = ArrowTensorArray.from_numpy(col, col_name) - else: - columns[col_name] = convert_list_to_pyarrow_array(col, columns) - except ArrowConversionError as e: - from ray.data.extensions.object_extension import ( - ArrowPythonObjectArray, - object_extension_type_allowed, - ) + pa_cols: Dict[str, pyarrow.Array] = dict() - if object_extension_type_allowed() and is_object_fixable_error(e): - columns[col_name] = ArrowPythonObjectArray.from_objects(col) - else: - raise - return pyarrow_table_from_pydict(columns) + for col_name, col_vals in columns.items(): + np_col_vals = convert_to_numpy(col_vals) + + pa_cols[col_name] = convert_to_pyarrow_array(np_col_vals, col_name) + + return pyarrow_table_from_pydict(pa_cols) @staticmethod def _concat_tables(tables: List[Block]) -> Block: @@ -216,40 +187,6 @@ def from_bytes(cls, data: bytes) -> "ArrowBlockAccessor": reader = pyarrow.ipc.open_stream(data) return cls(reader.read_all()) - @staticmethod - def numpy_to_block( - batch: Union[Dict[str, np.ndarray], Dict[str, list]], - ) -> "pyarrow.Table": - from ray.data.extensions.object_extension import ( - ArrowPythonObjectArray, - object_extension_type_allowed, - ) - from ray.data.extensions.tensor_extension import ArrowTensorArray - - validate_numpy_batch(batch) - - new_batch = {} - for col_name, col in batch.items(): - # Coerce to np.ndarray format if possible. - col = convert_udf_returns_to_numpy(col) - # Use Arrow's native *List types for 1-dimensional ndarrays. 
- if col.dtype.type is np.object_ or col.ndim > 1: - try: - col = ArrowTensorArray.from_numpy(col, col_name) - except ArrowConversionError as e: - if object_extension_type_allowed() and is_object_fixable_error(e): - if log_once(f"arrow_object_pickle_{col_name}"): - logger.debug( - f"Failed to interpret {col_name} as " - "multi-dimensional arrays. It will be pickled." - ) - col = ArrowPythonObjectArray.from_objects(col) - else: - raise - - new_batch[col_name] = col - return pyarrow_table_from_pydict(new_batch) - @staticmethod def _build_tensor_row( row: ArrowRow, col_name: str = TENSOR_COLUMN_NAME @@ -282,7 +219,7 @@ def _build_tensor_row( def slice(self, start: int, end: int, copy: bool = False) -> "pyarrow.Table": view = self._table.slice(start, end - start) if copy: - view = _copy_table(view) + view = transform_pyarrow.combine_chunks(view) return view def random_shuffle(self, random_seed: Optional[int]) -> "pyarrow.Table": @@ -308,11 +245,6 @@ def to_pandas(self) -> "pandas.DataFrame": def to_numpy( self, columns: Optional[Union[str, List[str]]] = None ) -> Union[np.ndarray, Dict[str, np.ndarray]]: - from ray.air.util.transform_pyarrow import ( - _concatenate_extension_column, - _is_column_extension_type, - ) - if columns is None: columns = self._table.column_names should_be_single_ndarray = False @@ -330,23 +262,24 @@ def to_numpy( f"{column_names_set}" ) - arrays = [] - for column in columns: - array = self._table[column] - if _is_column_extension_type(array): - array = _concatenate_extension_column(array) - elif array.num_chunks == 0: - array = pyarrow.array([], type=array.type) - else: - array = array.combine_chunks() - arrays.append(array.to_numpy(zero_copy_only=False)) + column_values_ndarrays = [] + + for col_name in columns: + col = self._table[col_name] + + # Combine columnar values arrays to make these contiguous + # (making them compatible with numpy format) + combined_array = transform_pyarrow.combine_chunked_array(col) + + column_values_ndarrays.append( + transform_pyarrow.to_numpy(combined_array, zero_copy_only=False) + ) if should_be_single_ndarray: assert len(columns) == 1 - arrays = arrays[0] + return column_values_ndarrays[0] else: - arrays = dict(zip(columns, arrays)) - return arrays + return dict(zip(columns, column_values_ndarrays)) def to_arrow(self) -> "pyarrow.Table": return self._table @@ -715,8 +648,3 @@ def gen(): def block_type(self) -> BlockType: return BlockType.ARROW - - -def _copy_table(table: "pyarrow.Table") -> "pyarrow.Table": - """Copy the provided Arrow table.""" - return transform_pyarrow.combine_chunks(table) diff --git a/python/ray/data/_internal/arrow_ops/transform_pyarrow.py b/python/ray/data/_internal/arrow_ops/transform_pyarrow.py index 093588ca8f34..a71a1eae6f61 100644 --- a/python/ray/data/_internal/arrow_ops/transform_pyarrow.py +++ b/python/ray/data/_internal/arrow_ops/transform_pyarrow.py @@ -1,8 +1,14 @@ from typing import TYPE_CHECKING, List, Union +import numpy as np from packaging.version import parse as parse_version from ray._private.utils import _get_pyarrow_version +from ray.air.util.tensor_extensions.arrow import ( + INT32_OVERFLOW_THRESHOLD, + MIN_PYARROW_VERSION_CHUNKED_ARRAY_TO_NUMPY_ZERO_COPY_ONLY, + PYARROW_VERSION, +) try: import pyarrow @@ -236,6 +242,7 @@ def concat(blocks: List["pyarrow.Table"]) -> "pyarrow.Table": schema = unify_schemas(schemas_to_unify) except Exception as e: raise ArrowConversionError(str(blocks)) from e + if ( any(isinstance(type_, pa.ExtensionType) for type_ in schema.types) or cols_with_null_list @@ 
-246,6 +253,7 @@ def concat(blocks: List["pyarrow.Table"]) -> "pyarrow.Table": col_chunked_arrays = [] for block in blocks: col_chunked_arrays.append(block.column(col_name)) + if isinstance(schema.field(col_name).type, tensor_types): # For our tensor extension types, manually construct a chunked array # containing chunks from all blocks. This is to handle @@ -326,24 +334,164 @@ def concat_and_sort( return take_table(ret, indices) +def to_numpy( + array: Union["pyarrow.Array", "pyarrow.ChunkedArray"], + *, + zero_copy_only: bool = True, +) -> np.ndarray: + """Wrapper for `Array`s and `ChunkedArray`s `to_numpy` API, + handling API divergence b/w Arrow versions""" + + import pyarrow as pa + + if isinstance(array, pa.Array): + return array.to_numpy(zero_copy_only=zero_copy_only) + elif isinstance(array, pa.ChunkedArray): + if PYARROW_VERSION >= MIN_PYARROW_VERSION_CHUNKED_ARRAY_TO_NUMPY_ZERO_COPY_ONLY: + return array.to_numpy(zero_copy_only=zero_copy_only) + else: + return array.to_numpy() + else: + raise ValueError( + f"Either of `Array` or `ChunkedArray` was expected, got {type(array)}" + ) + + def combine_chunks(table: "pyarrow.Table") -> "pyarrow.Table": - """This is pyarrow.Table.combine_chunks() - with support for extension types. + """This is counterpart for Pyarrow's `Table.combine_chunks` that's using + extended `ChunkedArray` combination protocol. - This will create a new table by combining the chunks the input table has. + For more details check out `combine_chunked_array` py-doc """ + + new_column_values_arrays = [] + + for col in table.columns: + new_column_values_arrays.append(combine_chunked_array(col)) + + return pyarrow.Table.from_arrays(new_column_values_arrays, schema=table.schema) + + +def combine_chunked_array( + array: "pyarrow.ChunkedArray", +) -> Union["pyarrow.Array", "pyarrow.ChunkedArray"]: + """This is counterpart for Pyarrow's `ChunkedArray.combine_chunks` that additionally + + 1. Handles `ExtensionType`s (like ArrowTensorType, ArrowTensorTypeV2, + ArrowPythonObjectType, etc) + + 2. Making sure `ChunkedArray`s comprising provided `Table` are combined + safely, ie avoiding overflows of Arrow's internal offsets (using int32 for + most of its native types, other than "large" kind). + + For more details check py-doc of `_try_combine_chunks_safe` method. + """ + + import pyarrow as pa + from ray.air.util.transform_pyarrow import ( _concatenate_extension_column, _is_column_extension_type, ) - cols = table.columns - new_cols = [] - for col in cols: - if _is_column_extension_type(col): - # Extension arrays don't support concatenation. 
- arr = _concatenate_extension_column(col) - else: - arr = col.combine_chunks() - new_cols.append(arr) - return pyarrow.Table.from_arrays(new_cols, schema=table.schema) + assert isinstance( + array, pa.ChunkedArray + ), f"Expected `ChunkedArray`, got {type(array)}" + + if _is_column_extension_type(array): + # Arrow `ExtensionArray`s can't be concatenated via `combine_chunks`, + # hence require manual concatenation + return _concatenate_extension_column(array) + elif len(array.chunks) == 0: + # NOTE: In case there's no chunks, we need to explicitly create + # an empty array since calling into `combine_chunks` would fail + # due to it expecting at least 1 chunk to be present + return pa.array([], type=array.type) + else: + return _try_combine_chunks_safe(array) + + +def _try_combine_chunks_safe( + array: "pyarrow.ChunkedArray", max_chunk_size=INT32_OVERFLOW_THRESHOLD +) -> Union["pyarrow.Array", "pyarrow.ChunkedArray"]: + """This method provides a safe way of combining `ChunkedArray`s exceeding 2 GiB + in size, which aren't using "large_*" types (and therefore relying on int32 + offsets). + + When handling provided `ChunkedArray` this method will be either + + - Relying on PyArrow's default `combine_chunks` (therefore returning single + contiguous `Array`) in cases when + - Array's total size is < 2 GiB + - Array's underlying type is of "large" kind (ie using one of the + `large_*` type family) + - Safely combining subsets of tasks such that resulting `Array`s to not + exceed 2 GiB in size (therefore returning another `ChunkedArray` albeit + with potentially smaller number of chunks that have resulted from clumping + the original ones) + + Returns: + - pa.Array if it's possible to combine provided pa.ChunkedArray into single + contiguous array + - pa.ChunkedArray (albeit with chunks re-combined) if it's not possible to + produce single pa.Array + """ + + import pyarrow as pa + + from ray.air.util.transform_pyarrow import _is_column_extension_type + + assert not _is_column_extension_type( + array + ), f"Arrow `ExtensionType`s are not accepted (got {array.type})" + + int64_type_predicates = [ + pa.types.is_large_list, + pa.types.is_large_string, + pa.types.is_large_binary, + pa.types.is_large_unicode, + ] + + if array.nbytes < max_chunk_size or any( + p(array.type) for p in int64_type_predicates + ): + # It's safe to combine provided `ChunkedArray` in either of 2 cases: + # - It's cumulative size is < 2 GiB + # - It's of 'large' kind (ie one using int64 offsets internally) + return array.combine_chunks() + + # In this case it's actually *NOT* safe to try to directly combine + # Arrow's `ChunkedArray` and is impossible to produce single, contiguous + # `Array` since + # - It's estimated to hold > 2 GiB + # - Its type is not of the "large" kind (and hence is using int32 + # offsets internally, which would overflow) + # + # In this case instead of combining into single contiguous array, we + # instead just "clump" existing chunks into bigger ones, but no bigger + # than 2 GiB each. 
+ # + # NOTE: This branch actually returns `ChunkedArray` and not an `Array` + + # To stay under 2 GiB limit we are slicing provided list of chunks into + # slices no larger than 2 GiB (as compared to just directly using `concat_arrays`) + slices = [] + + cur_slice_start = 0 + cur_slice_size_bytes = 0 + + for i, chunk in enumerate(array.chunks): + chunk_size = chunk.nbytes + + if cur_slice_size_bytes + chunk_size > max_chunk_size: + slices.append(array.chunks[cur_slice_start:i]) + + cur_slice_start = i + cur_slice_size_bytes = 0 + + cur_slice_size_bytes += chunk_size + + # Add remaining chunks as last slice + slices.append(array.chunks[cur_slice_start:]) + + return pa.chunked_array([pa.concat_arrays(s) for s in slices]) diff --git a/python/ray/data/_internal/batcher.py b/python/ray/data/_internal/batcher.py index 104e3c7ae51d..d27ed089f03f 100644 --- a/python/ray/data/_internal/batcher.py +++ b/python/ray/data/_internal/batcher.py @@ -11,7 +11,7 @@ # See https://github.com/ray-project/ray/issues/31108 for more details. # TODO(jjyao): remove this once # https://github.com/apache/arrow/issues/35126 is resolved. -MIN_NUM_CHUNKS_TO_TRIGGER_COMBINE_CHUNKS = 2 +MIN_NUM_CHUNKS_TO_TRIGGER_COMBINE_CHUNKS = 10 # Delay compaction until the shuffle buffer has reached this ratio over the min # shuffle buffer size. Setting this to 1 minimizes memory usage, at the cost of @@ -130,10 +130,7 @@ def next_batch(self) -> Block: # the leftovers. leftover.append(block) elif accessor.num_rows() <= needed: - # We need this entire block to fill out a batch. - # We need to call `accessor.slice()` to ensure - # the subsequent block's type are the same. - output.add_block(accessor.slice(0, accessor.num_rows(), copy=False)) + output.add_block(accessor.to_block()) needed -= accessor.num_rows() else: if ( diff --git a/python/ray/data/_internal/datasource/hudi_datasource.py b/python/ray/data/_internal/datasource/hudi_datasource.py new file mode 100644 index 000000000000..828d9baada7f --- /dev/null +++ b/python/ray/data/_internal/datasource/hudi_datasource.py @@ -0,0 +1,91 @@ +import logging +import os +from typing import Dict, Iterator, List, Optional + +from ray.data._internal.util import _check_import +from ray.data.block import BlockMetadata +from ray.data.datasource.datasource import Datasource, ReadTask + +logger = logging.getLogger(__name__) + + +class HudiDatasource(Datasource): + """Hudi datasource, for reading Apache Hudi table.""" + + def __init__( + self, + table_uri: str, + storage_options: Optional[Dict[str, str]] = None, + ): + _check_import(self, module="hudi", package="hudi-python") + + self._table_uri = table_uri + self._storage_options = storage_options + + def get_read_tasks(self, parallelism: int) -> List["ReadTask"]: + import pyarrow + from hudi import HudiTable + + def _perform_read( + table_uri: str, + base_file_paths: List[str], + options: Dict[str, str], + ) -> Iterator["pyarrow.Table"]: + from hudi import HudiFileGroupReader + + for p in base_file_paths: + file_group_reader = HudiFileGroupReader(table_uri, options) + batch = file_group_reader.read_file_slice_by_base_file_path(p) + yield pyarrow.Table.from_batches([batch]) + + hudi_table = HudiTable(self._table_uri, self._storage_options) + + reader_options = { + **hudi_table.storage_options(), + **hudi_table.hudi_options(), + } + + schema = hudi_table.get_schema() + read_tasks = [] + for file_slices_split in hudi_table.split_file_slices(parallelism): + if len(file_slices_split) == 0: + # when the table is empty, this will be an empty split + 
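The clumping loop above never materializes a combined array past the int32 offset threshold; it packs consecutive chunks into slices that fit the byte budget and concatenates each slice on its own. A toy pyarrow illustration with the budget shrunk from 2 GiB to a couple dozen bytes (the helper is a simplified stand-in, not the production code):

```python
import pyarrow as pa

def clump(chunked: pa.ChunkedArray, max_bytes: int) -> pa.ChunkedArray:
    # Group consecutive chunks into slices whose combined size stays under
    # max_bytes, then concatenate each slice into one larger chunk.
    slices, start, size = [], 0, 0
    for i, chunk in enumerate(chunked.chunks):
        if size + chunk.nbytes > max_bytes and i > start:
            slices.append(chunked.chunks[start:i])
            start, size = i, 0
        size += chunk.nbytes
    slices.append(chunked.chunks[start:])
    return pa.chunked_array([pa.concat_arrays(s) for s in slices])

arr = pa.chunked_array([pa.array([1, 2]), pa.array([3]), pa.array([4, 5])])
print(clump(arr, max_bytes=24).num_chunks)  # 2: the first two chunks fit one slice
```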
continue + + num_rows = 0 + relative_paths = [] + input_files = [] + size_bytes = 0 + for file_slice in file_slices_split: + # A file slice in a Hudi table is a logical group of data files + # within a physical partition. Records stored in a file slice + # are associated with a commit on the Hudi table's timeline. + # For more info, see https://hudi.apache.org/docs/file_layouts + num_rows += file_slice.num_records + relative_path = file_slice.base_file_relative_path() + relative_paths.append(relative_path) + full_path = os.path.join(self._table_uri, relative_path) + input_files.append(full_path) + size_bytes += file_slice.base_file_size + + metadata = BlockMetadata( + num_rows=num_rows, + schema=schema, + input_files=input_files, + size_bytes=size_bytes, + exec_stats=None, + ) + + read_task = ReadTask( + read_fn=lambda paths=relative_paths: _perform_read( + self._table_uri, paths, reader_options + ), + metadata=metadata, + ) + read_tasks.append(read_task) + + return read_tasks + + def estimate_inmemory_data_size(self) -> Optional[int]: + # TODO(xushiyan) add APIs to provide estimated in-memory size + return None diff --git a/python/ray/data/_internal/execution/streaming_executor.py b/python/ray/data/_internal/execution/streaming_executor.py index 238f6f9421cc..a4276e2bafe6 100644 --- a/python/ray/data/_internal/execution/streaming_executor.py +++ b/python/ray/data/_internal/execution/streaming_executor.py @@ -188,11 +188,9 @@ def shutdown(self, execution_completed: bool = True): state="FINISHED" if execution_completed else "FAILED", force_update=True, ) - # Clears metrics for this dataset so that they do - # not persist in the grafana dashboard after execution - StatsManager.clear_execution_metrics( - self._dataset_tag, self._get_operator_tags() - ) + # Once Dataset execution completes, mark it as complete + # and remove last cached execution stats. + StatsManager.clear_last_execution_stats(self._dataset_tag) # Freeze the stats and save it. self._final_stats = self._generate_stats() stats_summary_string = self._final_stats.to_summary().to_string( @@ -401,6 +399,7 @@ def _get_state_dict(self, state): "end_time": time.time() if state != "RUNNING" else None, "operators": { f"{op.name}{i}": { + "name": op.name, "progress": op_state.num_completed_tasks, "total": op.num_outputs_total(), "state": state, diff --git a/python/ray/data/_internal/numpy_support.py b/python/ray/data/_internal/numpy_support.py index 9e6a7c305dfb..d04060fc831e 100644 --- a/python/ray/data/_internal/numpy_support.py +++ b/python/ray/data/_internal/numpy_support.py @@ -1,4 +1,5 @@ import collections +import logging from datetime import datetime from typing import Any, Dict, List, Union @@ -7,6 +8,8 @@ from ray.air.util.tensor_extensions.utils import create_ragged_ndarray from ray.data._internal.util import _truncated_repr +logger = logging.getLogger(__name__) + def is_array_like(value: Any) -> bool: """Checks whether objects are array-like, excluding numpy scalars.""" @@ -66,7 +69,7 @@ def _convert_datetime_list_to_array(datetime_list: List[datetime]) -> np.ndarray ) -def convert_udf_returns_to_numpy(udf_return_col: Any) -> Any: +def convert_to_numpy(column_values: Any) -> np.ndarray: """Convert UDF columns (output of map_batches) to numpy, if possible. This includes lists of scalars, objects supporting the array protocol, and lists @@ -80,36 +83,31 @@ def convert_udf_returns_to_numpy(udf_return_col: Any) -> Any: ValueError if an input was array-like but we failed to convert it to an array. 
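With the HudiDatasource above wired up to the read_hudi API exported earlier in this diff, reading a table is a one-liner. A hedged usage sketch (the URI is a placeholder, and the hudi-python package must be installed):

```python
import ray

# Placeholder table URI; object-store credentials would go through the
# datasource's storage_options argument.
ds = ray.data.read_hudi("s3://my-bucket/path/to/hudi_table")
print(ds.schema())
print(ds.take(3))
```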
""" - if isinstance(udf_return_col, np.ndarray): + if isinstance(column_values, np.ndarray): # No copy/conversion needed, just keep it verbatim. - return udf_return_col + return column_values - if isinstance(udf_return_col, list): - if len(udf_return_col) == 1 and isinstance(udf_return_col[0], np.ndarray): + elif isinstance(column_values, list): + if len(column_values) == 1 and isinstance(column_values[0], np.ndarray): # Optimization to avoid conversion overhead from list to np.array. - udf_return_col = np.expand_dims(udf_return_col[0], axis=0) - return udf_return_col + return np.expand_dims(column_values[0], axis=0) - if all(isinstance(elem, datetime) for elem in udf_return_col): - return _convert_datetime_list_to_array(udf_return_col) + if all(isinstance(elem, datetime) for elem in column_values): + return _convert_datetime_list_to_array(column_values) # Try to convert list values into an numpy array via # np.array(), so users don't need to manually cast. # NOTE: we don't cast generic iterables, since types like # `str` are also Iterable. try: - # Try to cast the inner scalars to numpy as well, to avoid unnecessarily - # creating an inefficient array of array of object dtype. - # But don't convert if the list is nested. Because if sub-lists have - # heterogeneous shapes, we need to create a ragged ndarray. - if not is_nested_list(udf_return_col) and all( - is_valid_udf_return(e) for e in udf_return_col - ): + # Convert array-like objects (like torch.Tensor) to `np.ndarray`s + if all(is_array_like(e) for e in column_values): # Use np.asarray() instead of np.array() to avoid copying if possible. - udf_return_col = [np.asarray(e) for e in udf_return_col] + column_values = [np.asarray(e) for e in column_values] + shapes = set() has_object = False - for e in udf_return_col: + for e in column_values: if isinstance(e, np.ndarray): shapes.add((e.dtype, e.shape)) elif isinstance(e, bytes): @@ -122,24 +120,48 @@ def convert_udf_returns_to_numpy(udf_return_col: Any) -> Any: has_object = True elif not np.isscalar(e): has_object = True + + # When column values are + # - Arrays of heterogeneous shapes + # - Byte-strings (viewed as arrays of heterogeneous shapes) + # - Non-scalar objects (tuples, lists, arbitrary object types) + # + # Custom "ragged ndarray" is created, represented as an array of + # references (ie ndarray with dtype=object) if has_object or len(shapes) > 1: # This util works around some limitations of np.array(dtype=object). - udf_return_col = create_ragged_ndarray(udf_return_col) + return create_ragged_ndarray(column_values) else: - udf_return_col = np.array(udf_return_col) + return np.array(column_values) + except Exception as e: + logger.error( + f"Failed to convert column values to numpy array: " + f"{_truncated_repr(column_values)}", + exc_info=e, + ) + raise ValueError( "Failed to convert column values to numpy array: " - f"({_truncated_repr(udf_return_col)}): {e}." - ) - elif hasattr(udf_return_col, "__array__"): + f"({_truncated_repr(column_values)}): {e}." + ) from e + + elif is_array_like(column_values): # Converts other array-like objects such as torch.Tensor. try: - udf_return_col = np.array(udf_return_col) + # Use np.asarray() instead of np.array() to avoid copying if possible. 
+ return np.asarray(column_values) except Exception as e: + logger.error( + f"Failed to convert column values to numpy array: " + f"{_truncated_repr(column_values)}", + exc_info=e, + ) + raise ValueError( "Failed to convert column values to numpy array: " - f"({_truncated_repr(udf_return_col)}): {e}." - ) + f"({_truncated_repr(column_values)}): {e}." + ) from e - return udf_return_col + else: + return column_values diff --git a/python/ray/data/_internal/pandas_block.py b/python/ray/data/_internal/pandas_block.py index 04ff4a35a7e0..119469b46c1b 100644 --- a/python/ray/data/_internal/pandas_block.py +++ b/python/ray/data/_internal/pandas_block.py @@ -17,10 +17,8 @@ import numpy as np from ray.air.constants import TENSOR_COLUMN_NAME -from ray.data._internal.numpy_support import ( - convert_udf_returns_to_numpy, - validate_numpy_batch, -) +from ray.air.util.tensor_extensions.utils import _is_ndarray_tensor +from ray.data._internal.numpy_support import convert_to_numpy, validate_numpy_batch from ray.data._internal.row import TableRow from ray.data._internal.table_block import TableBlockAccessor, TableBlockBuilder from ray.data._internal.util import find_partitions @@ -114,14 +112,20 @@ def __init__(self): @staticmethod def _table_from_pydict(columns: Dict[str, List[Any]]) -> "pandas.DataFrame": pandas = lazy_import_pandas() - for key, value in columns.items(): - if key == TENSOR_COLUMN_NAME or isinstance( - next(iter(value), None), np.ndarray - ): + + pd_columns: Dict[str, Any] = {} + + for col_name, col_vals in columns.items(): + np_col_vals = convert_to_numpy(col_vals) + + if col_name == TENSOR_COLUMN_NAME or _is_ndarray_tensor(np_col_vals): from ray.data.extensions.tensor_extension import TensorArray - columns[key] = TensorArray(value) - return pandas.DataFrame(columns) + pd_columns[col_name] = TensorArray(np_col_vals) + else: + pd_columns[col_name] = np_col_vals + + return pandas.DataFrame(pd_columns) @staticmethod def _concat_tables(tables: List["pandas.DataFrame"]) -> "pandas.DataFrame": @@ -283,10 +287,6 @@ def numpy_to_block( ) -> "pandas.DataFrame": validate_numpy_batch(batch) - batch = { - column_name: convert_udf_returns_to_numpy(column) - for column_name, column in batch.items() - } block = PandasBlockBuilder._table_from_pydict(batch) return block diff --git a/python/ray/data/_internal/planner/exchange/sort_task_spec.py b/python/ray/data/_internal/planner/exchange/sort_task_spec.py index 827c4a2c7a51..7c67b3dbdefe 100644 --- a/python/ray/data/_internal/planner/exchange/sort_task_spec.py +++ b/python/ray/data/_internal/planner/exchange/sort_task_spec.py @@ -81,8 +81,9 @@ def validate_schema(self, schema: Optional[Union[type, "pyarrow.lib.Schema"]]): for column in self._columns: if column not in schema_names_set: raise ValueError( - "The column '{}' does not exist in the " - "schema '{}'.".format(column, schema) + f"You specified the column '{column}', but there's no such " + "column in the dataset. The dataset has columns: " + f"{schema_names_set}" ) @property diff --git a/python/ray/data/_internal/planner/plan_udf_map_op.py b/python/ray/data/_internal/planner/plan_udf_map_op.py index 9cd81c0d5f3b..605efe7c95cc 100644 --- a/python/ray/data/_internal/planner/plan_udf_map_op.py +++ b/python/ray/data/_internal/planner/plan_udf_map_op.py @@ -352,6 +352,8 @@ def transform_fn( # generators, and in the main event loop, yield them from # the queue as they become available. output_batch_queue = queue.Queue() + # Sentinel object to signal the end of the async generator. 
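+        # The sentinel is enqueued in a `finally` block once all batch tasks
+        # have been awaited, so the consumer loop below always terminates,
+        # even if a batch task raises.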
+        sentinel = object()

         async def process_batch(batch: DataBatch):
             try:
@@ -366,29 +368,33 @@ async def process_batch(batch: DataBatch):
                 )  # Put the exception into the queue to signal an error

         async def process_all_batches():
-            loop = ray.data._map_actor_context.udf_map_asyncio_loop
-            tasks = [loop.create_task(process_batch(x)) for x in input_iterable]
+            try:
+                loop = ray.data._map_actor_context.udf_map_asyncio_loop
+                tasks = [loop.create_task(process_batch(x)) for x in input_iterable]

-            ctx = ray.data.DataContext.get_current()
-            if ctx.execution_options.preserve_order:
-                for task in tasks:
-                    await task()
-            else:
-                for task in asyncio.as_completed(tasks):
-                    await task
+                ctx = ray.data.DataContext.get_current()
+                if ctx.execution_options.preserve_order:
+                    for task in tasks:
+                        await task
+                else:
+                    for task in asyncio.as_completed(tasks):
+                        await task
+            finally:
+                output_batch_queue.put(sentinel)

         # Use the existing event loop to create and run Tasks to process each batch
         loop = ray.data._map_actor_context.udf_map_asyncio_loop
-        future = asyncio.run_coroutine_threadsafe(process_all_batches(), loop)
+        asyncio.run_coroutine_threadsafe(process_all_batches(), loop)

         # Yield results as they become available.
-        # After all futures are completed, drain the queue to
-        # yield any remaining results.
-        while not future.done() or not output_batch_queue.empty():
+        while True:
             # Here, `out_batch` is a one-row output batch
             # from the async generator, corresponding to a
             # single row from the input batch.
             out_batch = output_batch_queue.get()
+            if out_batch is sentinel:
+                # Break out of the loop when the sentinel is received.
+                break
             if isinstance(out_batch, Exception):
                 raise out_batch
             _validate_batch_output(out_batch)
diff --git a/python/ray/data/_internal/stats.py b/python/ray/data/_internal/stats.py
index 46435ec9ceb4..fc6903cd92e2 100644
--- a/python/ray/data/_internal/stats.py
+++ b/python/ray/data/_internal/stats.py
@@ -378,33 +378,6 @@ def update_iteration_metrics(
         self.iter_user_s.set(stats.iter_user_s.get(), tags)
         self.iter_initialize_s.set(stats.iter_initialize_s.get(), tags)

-    def clear_execution_metrics(self, dataset_tag: str, operator_tags: List[str]):
-        for operator_tag in operator_tags:
-            tags = self._create_tags(dataset_tag, operator_tag)
-            self.spilled_bytes.set(0, tags)
-            self.allocated_bytes.set(0, tags)
-            self.freed_bytes.set(0, tags)
-            self.current_bytes.set(0, tags)
-            self.output_bytes.set(0, tags)
-            self.output_rows.set(0, tags)
-            self.cpu_usage_cores.set(0, tags)
-            self.gpu_usage_cores.set(0, tags)
-
-        for prom_metric in self.execution_metrics_inputs.values():
-            prom_metric.set(0, tags)
-
-        for prom_metric in self.execution_metrics_outputs.values():
-            prom_metric.set(0, tags)
-
-        for prom_metric in self.execution_metrics_tasks.values():
-            prom_metric.set(0, tags)
-
-        for prom_metric in self.execution_metrics_obj_store_memory.values():
-            prom_metric.set(0, tags)
-
-        for prom_metric in self.execution_metrics_misc.values():
-            prom_metric.set(0, tags)
-
     def register_dataset(self, job_id: str, dataset_tag: str, operator_tags: List[str]):
         self.datasets[dataset_tag] = {
             "job_id": job_id,
@@ -593,19 +566,13 @@ def update_execution_metrics(
             self._last_execution_stats[dataset_tag] = args
         self._start_thread_if_not_running()

-    def clear_execution_metrics(self, dataset_tag: str, operator_tags: List[str]):
+    def clear_last_execution_stats(self, dataset_tag: str):
+        # After the dataset completes execution, remove its cached execution stats.
+        # Marks the dataset as finished on the job page's Ray Data Overview.
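+        # NOTE: unlike the removed clear_execution_metrics(), this doesn't
+        # zero out the per-operator Prometheus gauges; it only drops the
+        # locally cached stats for this dataset.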
with self._stats_lock: if dataset_tag in self._last_execution_stats: del self._last_execution_stats[dataset_tag] - try: - self._stats_actor( - create_if_not_exists=False - ).clear_execution_metrics.remote(dataset_tag, operator_tags) - except Exception: - # Cluster may be shut down. - pass - # Iteration methods def update_iteration_metrics(self, stats: "DatasetStats", dataset_tag: str): diff --git a/python/ray/data/_internal/table_block.py b/python/ray/data/_internal/table_block.py index 55ea4fcc553a..a8995fc6703b 100644 --- a/python/ray/data/_internal/table_block.py +++ b/python/ray/data/_internal/table_block.py @@ -15,9 +15,10 @@ from ray.air.constants import TENSOR_COLUMN_NAME from ray.data._internal.block_builder import BlockBuilder -from ray.data._internal.numpy_support import convert_udf_returns_to_numpy, is_array_like +from ray.data._internal.numpy_support import is_array_like from ray.data._internal.row import TableRow from ray.data._internal.size_estimator import SizeEstimator +from ray.data._internal.util import MiB from ray.data.block import Block, BlockAccessor if TYPE_CHECKING: @@ -28,7 +29,7 @@ # The max size of Python tuples to buffer before compacting them into a # table in the BlockBuilder. -MAX_UNCOMPACTED_SIZE_BYTES = 50 * 1024 * 1024 +MAX_UNCOMPACTED_SIZE_BYTES = 50 * MiB class TableBlockBuilder(BlockBuilder): @@ -121,14 +122,13 @@ def will_build_yield_copy(self) -> bool: return self._concat_would_copy() and len(self._tables) > 1 def build(self) -> Block: - columns = { - key: convert_udf_returns_to_numpy(col) for key, col in self._columns.items() - } - if columns: - tables = [self._table_from_pydict(columns)] + if self._columns: + tables = [self._table_from_pydict(self._columns)] else: tables = [] + tables.extend(self._tables) + if len(tables) > 0: return self._concat_tables(tables) else: @@ -149,10 +149,7 @@ def _compact_if_needed(self) -> None: assert self._columns if self._uncompacted_size.size_bytes() < MAX_UNCOMPACTED_SIZE_BYTES: return - columns = { - key: convert_udf_returns_to_numpy(col) for key, col in self._columns.items() - } - block = self._table_from_pydict(columns) + block = self._table_from_pydict(self._columns) self.add_block(block) self._uncompacted_size = SizeEstimator() self._columns.clear() diff --git a/python/ray/data/_internal/util.py b/python/ray/data/_internal/util.py index 5e8c921c3733..1d0b70cf6a6c 100644 --- a/python/ray/data/_internal/util.py +++ b/python/ray/data/_internal/util.py @@ -26,7 +26,6 @@ import ray from ray._private.utils import _get_pyarrow_version -from ray.data._internal.arrow_ops.transform_pyarrow import unify_schemas from ray.data.context import DEFAULT_READ_OP_MIN_NUM_BLOCKS, WARN_PREFIX, DataContext if TYPE_CHECKING: @@ -41,6 +40,12 @@ logger = logging.getLogger(__name__) + +KiB = 1024 # bytes +MiB = 1024 * KiB +GiB = 1024 * MiB + + # NOTE: Make sure that these lower and upper bounds stay in sync with version # constraints given in python/setup.py. # Inclusive minimum pyarrow version. @@ -707,6 +712,7 @@ def unify_block_metadata_schema( """ # Some blocks could be empty, in which case we cannot get their schema. # TODO(ekl) validate schema is the same across different blocks. + from ray.data._internal.arrow_ops.transform_pyarrow import unify_schemas # First check if there are blocks with computed schemas, then unify # valid schemas from all such blocks. 
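# A minimal sketch (illustrative only, not part of the patch) of the byte-size
# constants introduced in python/ray/data/_internal/util.py above:
from ray.data._internal.util import KiB, MiB, GiB

assert KiB == 1024
assert 50 * MiB == 50 * 1024 * 1024  # e.g., MAX_UNCOMPACTED_SIZE_BYTES above
assert GiB == 1024 * MiB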
diff --git a/python/ray/data/block.py b/python/ray/data/block.py
index 15cf6b68b20c..fcab3feb67eb 100644
--- a/python/ray/data/block.py
+++ b/python/ray/data/block.py
@@ -1,4 +1,5 @@
 import collections
+import logging
 import os
 import time
 from dataclasses import dataclass
@@ -25,6 +26,7 @@
 from ray.air.util.tensor_extensions.arrow import ArrowConversionError
 from ray.data._internal.util import _check_pyarrow_version, _truncated_repr
 from ray.types import ObjectRef
+from ray.util import log_once
 from ray.util.annotations import DeveloperAPI

 import psutil
@@ -57,6 +59,9 @@
 Block = Union["pyarrow.Table", "pandas.DataFrame"]

+logger = logging.getLogger(__name__)
+
+
 @DeveloperAPI
 class BlockType(Enum):
     ARROW = "arrow"
@@ -67,6 +72,12 @@ class BlockType(Enum):
 # returned from batch UDFs.
 DataBatch = Union["pyarrow.Table", "pandas.DataFrame", Dict[str, np.ndarray]]

+# User-facing data column type. This is the data type for data that is supplied to and
+# returned from column UDFs.
+DataBatchColumn = Union[
+    "pyarrow.ChunkedArray", "pyarrow.Array", "pandas.Series", np.ndarray
+]
+
 # A class type that implements __call__.
 CallableClass = type

@@ -374,6 +385,12 @@ def batch_to_block(
         try:
             return cls.batch_to_arrow_block(batch)
         except ArrowConversionError as e:
+            if log_once("_fallback_to_pandas_block_warning"):
+                logger.warning(
+                    f"Failed to convert batch to Arrow due to: {e}; "
+                    f"falling back to Pandas block"
+                )
+
             if block_type is None:
                 return cls.batch_to_pandas_block(batch)
             else:
@@ -386,9 +403,9 @@ def batch_to_block(
     @classmethod
     def batch_to_arrow_block(cls, batch: Dict[str, Any]) -> Block:
         """Create an Arrow block from user-facing data formats."""
-        from ray.data._internal.arrow_block import ArrowBlockAccessor
+        from ray.data._internal.arrow_block import ArrowBlockBuilder

-        return ArrowBlockAccessor.numpy_to_block(batch)
+        return ArrowBlockBuilder._table_from_pydict(batch)

     @classmethod
     def batch_to_pandas_block(cls, batch: Dict[str, Any]) -> Block:
diff --git a/python/ray/data/context.py b/python/ray/data/context.py
index 5ed9b4fe68ef..347d3da68372 100644
--- a/python/ray/data/context.py
+++ b/python/ray/data/context.py
@@ -80,6 +80,8 @@
 # V2 in turn relies on int64 offsets, therefore having a limit of ~9 EB (exabytes)
 DEFAULT_USE_ARROW_TENSOR_V2 = env_bool("RAY_DATA_USE_ARROW_TENSOR_V2", True)

+DEFAULT_ENABLE_FALLBACK_TO_ARROW_OBJECT_EXT_TYPE = True
+
 DEFAULT_AUTO_LOG_STATS = False

 DEFAULT_VERBOSE_STATS_LOG = False
@@ -222,6 +224,12 @@ class DataContext:
         read_op_min_num_blocks: Minimum number of read output blocks for a dataset.
         enable_tensor_extension_casting: Whether to automatically cast NumPy ndarray
             columns in Pandas DataFrames to tensor extension columns.
+        use_arrow_tensor_v2: Config enabling the V2 version of ArrowTensorArray, which
+            supports tensors > 2 GiB in size (on by default).
+        enable_fallback_to_arrow_object_ext_type: Enables fallback to serialize column
+            values not supported by Arrow natively (for example, user-defined custom
+            Python classes) using `ArrowPythonObjectType` (simply serializing
+            these as bytes).
         enable_auto_log_stats: Whether to automatically log stats after execution. If
             disabled, you can still manually print stats with ``Dataset.stats()``.
         verbose_stats_logs: Whether stats logs should be verbose.
This includes fields
@@ -293,6 +301,9 @@ class DataContext:
     read_op_min_num_blocks: int = DEFAULT_READ_OP_MIN_NUM_BLOCKS
     enable_tensor_extension_casting: bool = DEFAULT_ENABLE_TENSOR_EXTENSION_CASTING
     use_arrow_tensor_v2: bool = DEFAULT_USE_ARROW_TENSOR_V2
+    enable_fallback_to_arrow_object_ext_type: bool = (
+        DEFAULT_ENABLE_FALLBACK_TO_ARROW_OBJECT_EXT_TYPE
+    )
     enable_auto_log_stats: bool = DEFAULT_AUTO_LOG_STATS
     verbose_stats_logs: bool = DEFAULT_VERBOSE_STATS_LOG
     trace_allocations: bool = DEFAULT_TRACE_ALLOCATIONS
diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py
index d576b8eb2ea7..496e29a8dea4 100644
--- a/python/ray/data/dataset.py
+++ b/python/ray/data/dataset.py
@@ -5,6 +5,7 @@
 import logging
 import time
 import warnings
+from collections.abc import Sequence
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -87,6 +88,7 @@
     Block,
     BlockAccessor,
     DataBatch,
+    DataBatchColumn,
     T,
     U,
     UserDefinedFunction,
@@ -529,7 +531,8 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
             compute: This argument is deprecated. Use ``concurrency`` argument.
             batch_format: If ``"default"`` or ``"numpy"``, batches are
                 ``Dict[str, numpy.ndarray]``. If ``"pandas"``, batches are
-                ``pandas.DataFrame``.
+                ``pandas.DataFrame``. If ``"pyarrow"``, batches are
+                ``pyarrow.Table``.
             zero_copy_batch: Whether ``fn`` should be provided zero-copy, read-only
                 batches. If this is ``True`` and no copy is required for the
                 ``batch_format`` conversion, the batch is a zero-copy, read-only
@@ -700,16 +703,21 @@ def _map_batches_without_batch_size_validation(
     def add_column(
         self,
         col: str,
-        fn: Callable[["pandas.DataFrame"], "pandas.Series"],
+        fn: Callable[
+            [DataBatch],
+            DataBatchColumn,
+        ],
         *,
+        batch_format: Optional[str] = "pandas",
         compute: Optional[str] = None,
         concurrency: Optional[Union[int, Tuple[int, int]]] = None,
         **ray_remote_args,
     ) -> "Dataset":
         """Add the given column to the dataset.

-        A function generating the new column values given the batch in pandas
-        format must be specified.
+        A function generating the new column values must be specified. It
+        receives a batch in the given ``batch_format`` (pandas, pyarrow, or
+        numpy) and returns the values for the new column.

         Examples:

@@ -729,11 +737,6 @@ def add_column(
             id     int64
             new_id int64

-        Overwrite the existing values with zeros.
-
-        >>> ds.add_column("id", lambda df: 0).take(3)
-        [{'id': 0}, {'id': 0}, {'id': 0}]
-
         Time complexity: O(dataset size / parallelism)

         Args:
@@ -741,6 +744,11 @@ def add_column(
                 column is not overwritten; instead, a ``ValueError`` is raised.
             fn: Map function generating the column values given a batch of
                 records in the given ``batch_format``.
+            batch_format: If ``"default"`` or ``"numpy"``, batches are
+                ``Dict[str, numpy.ndarray]``. If ``"pandas"``, batches are
+                ``pandas.DataFrame``. If ``"pyarrow"``, batches are
+                ``pyarrow.Table``.
             compute: This argument is deprecated. Use ``concurrency`` argument.
             concurrency: The number of Ray workers to use concurrently. For a
                 fixed-sized worker pool of size ``n``, specify ``concurrency=n``. For
@@ -749,17 +757,69 @@ def add_column(
             ray_remote_args: Additional resource requirements to request from
                 ray (e.g., num_gpus=1 to request GPUs for the map tasks).
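+        Example (illustrative sketch; assumes a dataset ``ds`` with an ``id``
+        column):
+
+            >>> import pyarrow.compute as pc  # doctest: +SKIP
+            >>> ds.add_column(  # doctest: +SKIP
+            ...     "id_plus_one",
+            ...     lambda batch: pc.add(batch["id"], 1),
+            ...     batch_format="pyarrow",
+            ... )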
""" + # Check that batch_format + accepted_batch_formats = ["pandas", "pyarrow", "numpy"] + if batch_format not in accepted_batch_formats: + raise ValueError( + f"batch_format argument must be on of {accepted_batch_formats}, " + f"got: {batch_format}" + ) + + def _raise_duplicate_column_error(col: str): + raise ValueError(f"Trying to add an existing column with name {col!r}") - def add_column(batch: "pandas.DataFrame") -> "pandas.DataFrame": - batch.loc[:, col] = fn(batch) - return batch + def add_column(batch: DataBatch) -> DataBatch: + column = fn(batch) + if batch_format == "pandas": + import pandas as pd + + assert isinstance(column, (pd.Series, Sequence)), ( + f"For pandas batch format, the function must return a pandas " + f"Series or sequence, got: {type(column)}" + ) + if col in batch: + _raise_duplicate_column_error(col) + batch.loc[:, col] = column + return batch + elif batch_format == "pyarrow": + import pyarrow as pa + + assert isinstance(column, (pa.Array, pa.ChunkedArray)), ( + f"For pyarrow batch format, the function must return a pyarrow " + f"Array, got: {type(column)}" + ) + # Historically, this method was written for pandas batch format. + # To resolve https://github.com/ray-project/ray/issues/48090, + # we also allow pyarrow batch format which is preferred but would be + # a breaking change to enforce. + + # For pyarrow, the index of the column will be -1 if it is missing in + # which case we'll want to append it + column_idx = batch.schema.get_field_index(col) + if column_idx == -1: + # Append the column to the table + return batch.append_column(col, column) + else: + _raise_duplicate_column_error(col) + + else: + # batch format is assumed to be numpy since we checked at the + # beginning of the add_column function + assert isinstance(column, np.ndarray), ( + f"For numpy batch format, the function must return a " + f"numpy.ndarray, got: {type(column)}" + ) + if col in batch: + _raise_duplicate_column_error(col) + batch[col] = column + return batch if not callable(fn): raise ValueError("`fn` must be callable, got {}".format(fn)) return self.map_batches( add_column, - batch_format="pandas", # TODO(ekl) we should make this configurable. + batch_format=batch_format, compute=compute, concurrency=concurrency, zero_copy_batch=False, @@ -801,7 +861,7 @@ def drop_columns( Args: cols: Names of the columns to drop. If any name does not exist, - an exception is raised. + an exception is raised. Column names must be unique. compute: This argument is deprecated. Use ``concurrency`` argument. concurrency: The number of Ray workers to use concurrently. For a fixed-sized worker pool of size ``n``, specify ``concurrency=n``. For an autoscaling @@ -810,12 +870,15 @@ def drop_columns( ray (e.g., num_gpus=1 to request GPUs for the map tasks). """ # noqa: E501 + if len(cols) != len(set(cols)): + raise ValueError(f"drop_columns expects unique column names, got: {cols}") + def drop_columns(batch): - return batch.drop(columns=cols) + return batch.drop(cols) return self.map_batches( drop_columns, - batch_format="pandas", + batch_format="pyarrow", zero_copy_batch=True, compute=compute, concurrency=concurrency, @@ -4316,7 +4379,8 @@ def to_tf( If your model accepts additional metadata aside from features and label, specify a single additional column or a list of additional columns. A common use case is to include sample weights in the data samples and train a ``tf.keras.Model`` with ``tf.keras.Model.fit``. 
- >>> ds = ds.add_column("sample weights", lambda df: 1) + >>> import pandas as pd + >>> ds = ds.add_column("sample weights", lambda df: pd.Series([1] * len(df))) >>> ds.to_tf(feature_columns="features", label_columns="target", additional_columns="sample weights") <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'), TensorSpec(shape=(None,), dtype=tf.int64, name='sample weights'))> diff --git a/python/ray/data/examples/data/hudi-tables/0.x_cow_partitioned.zip b/python/ray/data/examples/data/hudi-tables/0.x_cow_partitioned.zip new file mode 100644 index 000000000000..9f78c06de945 Binary files /dev/null and b/python/ray/data/examples/data/hudi-tables/0.x_cow_partitioned.zip differ diff --git a/python/ray/data/extensions/__init__.py b/python/ray/data/extensions/__init__.py index bebf3c2b2a5c..517b4fe7a3a2 100644 --- a/python/ray/data/extensions/__init__.py +++ b/python/ray/data/extensions/__init__.py @@ -8,7 +8,7 @@ ArrowPythonObjectType, PythonObjectArray, PythonObjectDtype, - object_extension_type_allowed, + _object_extension_type_allowed, ) from ray.data.extensions.tensor_extension import ( ArrowConversionError, @@ -40,6 +40,6 @@ "ArrowPythonObjectScalar", "PythonObjectArray", "PythonObjectDtype", - "object_extension_type_allowed", + "_object_extension_type_allowed", "get_arrow_extension_tensor_types", ] diff --git a/python/ray/data/extensions/object_extension.py b/python/ray/data/extensions/object_extension.py index a5daf4811a05..42ab20a231c6 100644 --- a/python/ray/data/extensions/object_extension.py +++ b/python/ray/data/extensions/object_extension.py @@ -2,7 +2,7 @@ ArrowPythonObjectArray, ArrowPythonObjectScalar, ArrowPythonObjectType, - object_extension_type_allowed, + _object_extension_type_allowed, ) from ray.air.util.object_extensions.pandas import ( # noqa: F401 PythonObjectArray, diff --git a/python/ray/data/grouped_data.py b/python/ray/data/grouped_data.py index 8f7b7dde118d..427ea18b7bbf 100644 --- a/python/ray/data/grouped_data.py +++ b/python/ray/data/grouped_data.py @@ -1,3 +1,4 @@ +from functools import partial from typing import Any, Dict, Iterable, List, Optional, Tuple, Union from ray.data._internal.aggregate import Count, Max, Mean, Min, Std, Sum @@ -261,7 +262,10 @@ def wrapped_fn(batch, *args, **kwargs): # Change the name of the wrapped function so that users see the name of their # function rather than `wrapped_fn` in the progress bar. - wrapped_fn.__name__ = fn.__name__ + if isinstance(fn, partial): + wrapped_fn.__name__ = fn.func.__name__ + else: + wrapped_fn.__name__ = fn.__name__ # Note we set batch_size=None here, so it will use the entire block as a batch, # which ensures that each group will be contained within a batch in entirety. diff --git a/python/ray/data/iterator.py b/python/ray/data/iterator.py index 58e9a1b7355e..2f19111af80f 100644 --- a/python/ray/data/iterator.py +++ b/python/ray/data/iterator.py @@ -734,7 +734,8 @@ def to_tf( If your model accepts additional metadata aside from features and label, specify a single additional column or a list of additional columns. A common use case is to include sample weights in the data samples and train a ``tf.keras.Model`` with ``tf.keras.Model.fit``. 
- >>> ds = ds.add_column("sample weights", lambda df: 1) + >>> import pandas as pd + >>> ds = ds.add_column("sample weights", lambda df: pd.Series([1] * len(df))) >>> it = ds.iterator() >>> it.to_tf(feature_columns="sepal length (cm)", label_columns="target", additional_columns="sample weights") <_OptionsDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.float64, name='sepal length (cm)'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'), TensorSpec(shape=(None,), dtype=tf.int64, name='sample weights'))> diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index 60eee8571c1d..d60a89858512 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -27,6 +27,7 @@ from ray.data._internal.datasource.delta_sharing_datasource import ( DeltaSharingDatasource, ) +from ray.data._internal.datasource.hudi_datasource import HudiDatasource from ray.data._internal.datasource.iceberg_datasource import IcebergDatasource from ray.data._internal.datasource.image_datasource import ( ImageDatasource, @@ -2312,6 +2313,58 @@ def get_dbutils(): ) +@PublicAPI(stability="alpha") +def read_hudi( + table_uri: str, + *, + storage_options: Optional[Dict[str, str]] = None, + ray_remote_args: Optional[Dict[str, Any]] = None, + concurrency: Optional[int] = None, + override_num_blocks: Optional[int] = None, +) -> Dataset: + """ + Create a :class:`~ray.data.Dataset` from an + `Apache Hudi table `_. + + Examples: + >>> import ray + >>> ds = ray.data.read_hudi( # doctest: +SKIP + ... table_uri="/hudi/trips", + ... ) + + Args: + table_uri: The URI of the Hudi table to read from. Local file paths, S3, and GCS + are supported. + storage_options: Extra options that make sense for a particular storage + connection. This is used to store connection parameters like credentials, + endpoint, etc. See more explanation + `here `_. + ray_remote_args: kwargs passed to :meth:`~ray.remote` in the read tasks. + concurrency: The maximum number of Ray tasks to run concurrently. Set this + to control number of tasks to run concurrently. This doesn't change the + total number of tasks run or the total number of output blocks. By default, + concurrency is dynamically decided based on the available resources. + override_num_blocks: Override the number of output blocks from all read tasks. + By default, the number of output blocks is dynamically decided based on + input data size and available resources. You shouldn't manually set this + value in most cases. + + Returns: + A :class:`~ray.data.Dataset` producing records read from the Hudi table. 
+ """ # noqa: E501 + datasource = HudiDatasource( + table_uri=table_uri, + storage_options=storage_options, + ) + + return read_datasource( + datasource=datasource, + ray_remote_args=ray_remote_args, + concurrency=concurrency, + override_num_blocks=override_num_blocks, + ) + + @PublicAPI def from_dask(df: "dask.dataframe.DataFrame") -> MaterializedDataset: """Create a :class:`~ray.data.Dataset` from a diff --git a/python/ray/data/tests/preprocessors/test_encoder.py b/python/ray/data/tests/preprocessors/test_encoder.py index 46b719ba6e83..bfae00596439 100644 --- a/python/ray/data/tests/preprocessors/test_encoder.py +++ b/python/ray/data/tests/preprocessors/test_encoder.py @@ -298,7 +298,7 @@ def test_one_hot_encoder_with_max_categories(): expected_df = pd.DataFrame( { "A": col_a, - "B": [[0, 1], [1, 0], [0, 0], [1, 0]], + "B": [[0, 0], [1, 0], [0, 1], [1, 0]], "C": [[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]], } ) diff --git a/python/ray/data/tests/test_all_to_all.py b/python/ray/data/tests/test_all_to_all.py index cf0cb8b2b2e7..a6b173383145 100644 --- a/python/ray/data/tests/test_all_to_all.py +++ b/python/ray/data/tests/test_all_to_all.py @@ -1167,7 +1167,6 @@ def test_groupby_map_groups_multicolumn( ray_start_regular_shared, ds_format, num_parts, use_push_based_shuffle ): # Test built-in count aggregation - print(f"Seeding RNG for test_groupby_arrow_count with: {RANDOM_SEED}") random.seed(RANDOM_SEED) xs = list(range(100)) random.shuffle(xs) @@ -1190,6 +1189,33 @@ def test_groupby_map_groups_multicolumn( ] +def test_groupby_map_groups_with_partial(): + """ + The partial function name should show up as + +- Sort + +- MapBatches(func) + """ + from functools import partial + + def func(x, y): + return {f"x_add_{y}": [len(x["id"]) + y]} + + df = pd.DataFrame({"id": list(range(100))}) + df["key"] = df["id"] % 5 + + ds = ray.data.from_pandas(df).groupby("key").map_groups(partial(func, y=5)) + result = ds.take_all() + + assert result == [ + {"x_add_5": 25}, + {"x_add_5": 25}, + {"x_add_5": 25}, + {"x_add_5": 25}, + {"x_add_5": 25}, + ] + assert "MapBatches(func)" in ds.__repr__() + + def test_random_block_order_schema(ray_start_regular_shared): df = pd.DataFrame({"a": np.random.rand(10), "b": np.random.rand(10)}) ds = ray.data.from_pandas(df).randomize_block_order() diff --git a/python/ray/data/tests/test_arrow_block.py b/python/ray/data/tests/test_arrow_block.py index 43888ace8c57..00ed13363f20 100644 --- a/python/ray/data/tests/test_arrow_block.py +++ b/python/ray/data/tests/test_arrow_block.py @@ -1,13 +1,264 @@ +import gc +import os +import sys import types +from tempfile import TemporaryDirectory +from typing import Union import numpy as np import pyarrow as pa import pytest +from pyarrow import parquet as pq import ray from ray._private.test_utils import run_string_as_driver +from ray.air.util.tensor_extensions.arrow import ArrowTensorArray +from ray.data import DataContext from ray.data._internal.arrow_block import ArrowBlockAccessor -from ray.data.extensions.object_extension import object_extension_type_allowed +from ray.data._internal.arrow_ops.transform_pyarrow import combine_chunked_array +from ray.data._internal.util import GiB, MiB +from ray.data.extensions.object_extension import _object_extension_type_allowed + + +@pytest.fixture(scope="module") +def parquet_dataset_single_column_gt_2gb(): + chunk_size = 256 * MiB + num_chunks = 10 + + total_column_size = chunk_size * 10 # ~2.5 GiB + + with TemporaryDirectory() as tmp_dir: + dataset_path = 
f"{tmp_dir}/large_parquet_chunk_{chunk_size}" + + # Create directory + os.mkdir(dataset_path) + + for i in range(num_chunks): + chunk = b"a" * chunk_size + + d = {"id": [i], "bin": [chunk]} + t = pa.Table.from_pydict(d) + + print(f">>> Table schema: {t.schema} (size={sys.getsizeof(t)})") + + filepath = f"{dataset_path}/chunk_{i}.parquet" + pq.write_table(t, filepath) + + print(f">>> Created a chunk #{i}") + + print(f">>> Created dataset at {dataset_path}") + + yield dataset_path, num_chunks, total_column_size + + print(f">>> Cleaning up dataset at {dataset_path}") + + +@pytest.fixture(scope="module") +def binary_dataset_single_file_gt_2gb(): + total_size = int(2.1 * GiB) + chunk_size = 256 * MiB + num_chunks = total_size // chunk_size + remainder = total_size % chunk_size + + with TemporaryDirectory() as tmp_dir: + dataset_path = f"{tmp_dir}/binary_dataset_gt_2gb_single_file" + + # Create directory + os.mkdir(dataset_path) + + with open(f"{dataset_path}/chunk.bin", "wb") as f: + for i in range(num_chunks): + f.write(b"a" * chunk_size) + + print(f">>> Written chunk #{i}") + + if remainder: + f.write(b"a" * remainder) + + print(f">>> Wrote chunked dataset at: {dataset_path}") + + yield dataset_path, total_size + + print(f">>> Cleaning up dataset: {dataset_path}") + + +@pytest.mark.parametrize( + "col_name", + [ + "bytes", + # TODO fix numpy conversion + # "text", + ], +) +def test_single_row_gt_2gb( + ray_start_regular, + restore_data_context, + binary_dataset_single_file_gt_2gb, + col_name, +): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + + dataset_path, target_binary_size = binary_dataset_single_file_gt_2gb + + def _id(row): + bs = row[col_name] + assert round(len(bs) / GiB, 1) == round(target_binary_size / GiB, 1) + return row + + if col_name == "text": + ds = ray.data.read_text(dataset_path) + elif col_name == "bytes": + ds = ray.data.read_binary_files(dataset_path) + + total = ds.map(_id).count() + + assert total == 1 + + +@pytest.mark.parametrize( + "op", + [ + "map", + "map_batches", + ], +) +def test_arrow_batch_gt_2gb( + ray_start_regular, + parquet_dataset_single_column_gt_2gb, + restore_data_context, + op, +): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + + dataset_path, num_rows, total_column_size = parquet_dataset_single_column_gt_2gb + + def _id(x): + return x + + ds = ray.data.read_parquet(dataset_path) + + if op == "map": + ds = ds.map(_id) + elif op == "map_batches": + # Combine all rows into a single batch using `map_batches` coercing to + # numpy format + ds = ds.map_batches( + _id, + batch_format="numpy", + batch_size=num_rows, + zero_copy_batch=False, + ) + + batch = ds.take_batch() + + total_binary_column_size = sum([len(b) for b in batch["bin"]]) + + print( + f">>> Batch:\n" + f"------\n" + "Column: 'id'\n" + f"Values: {batch['id']}\n" + f"------\n" + "Column: 'bin'\n" + f"Total: {total_binary_column_size / GiB} GiB\n" + f"Values: {[str(v)[:3] + ' x ' + str(len(v)) for v in batch['bin']]}\n" + ) + + assert total_binary_column_size == total_column_size + + # Clean up refs + del batch + del ds + # Force GC to free up object store memory + gc.collect() + + +@pytest.mark.parametrize( + "input_,expected_output", + [ + # Empty chunked array + (pa.chunked_array([], type=pa.int8()), pa.array([], type=pa.int8())), + # Fixed-shape tensors + ( + pa.chunked_array( + [ 
+                    ArrowTensorArray.from_numpy(np.arange(3).reshape(3, 1)),
+                    ArrowTensorArray.from_numpy(np.arange(3).reshape(3, 1)),
+                ]
+            ),
+            ArrowTensorArray.from_numpy(
+                np.concatenate(
+                    [
+                        np.arange(3).reshape(3, 1),
+                        np.arange(3).reshape(3, 1),
+                    ]
+                )
+            ),
+        ),
+        # Ragged (variable-shaped) tensors
+        (
+            pa.chunked_array(
+                [
+                    ArrowTensorArray.from_numpy(np.arange(3).reshape(3, 1)),
+                    ArrowTensorArray.from_numpy(np.arange(5).reshape(5, 1)),
+                ]
+            ),
+            ArrowTensorArray.from_numpy(
+                np.concatenate(
+                    [
+                        np.arange(3).reshape(3, 1),
+                        np.arange(5).reshape(5, 1),
+                    ]
+                )
+            ),
+        ),
+        # Small (< 2 GiB) arrays
+        (
+            pa.chunked_array(
+                [
+                    pa.array([1, 2, 3], type=pa.int16()),
+                    pa.array([4, 5, 6], type=pa.int16()),
+                ]
+            ),
+            pa.array([1, 2, 3, 4, 5, 6], type=pa.int16()),
+        ),
+    ],
+)
+def test_combine_chunked_array_small(
+    input_, expected_output: Union[pa.Array, pa.ChunkedArray]
+):
+    result = combine_chunked_array(input_)
+
+    assert expected_output.equals(result)
+
+
+def test_combine_chunked_array_large():
+    """Verifies `combine_chunked_array` on arrays > 2 GiB"""
+
+    # ~137.5 MiB (144.2 MB) per chunk
+    ones_1gb = np.ones(shape=(550, 128, 128, 4), dtype=np.int32()).ravel()
+
+    # Total ~2.15 GiB
+    input_ = pa.chunked_array(
+        [
+            pa.array(ones_1gb),
+        ]
+        * 16
+    )
+
+    assert round(input_.nbytes / GiB, 2) == 2.15
+
+    result = combine_chunked_array(input_)
+
+    assert isinstance(result, pa.ChunkedArray)
+    assert len(result.chunks) == 2
+
+    # Should re-combine the first 14 provided chunks into one chunk
+    assert result.chunks[0].nbytes == sum([c.nbytes for c in input_.chunks[:14]])
+    # The remaining 2 chunks go into the second one
+    assert result.chunks[1].nbytes == sum([c.nbytes for c in input_.chunks[14:]])


 def test_append_column(ray_start_regular_shared):
@@ -46,7 +297,7 @@ def test_register_arrow_types(tmp_path):


 @pytest.mark.skipif(
-    not object_extension_type_allowed(), reason="Object extension type not supported."
+    not _object_extension_type_allowed(), reason="Object extension type not supported."
 )
 def test_dict_doesnt_fallback_to_pandas_block(ray_start_regular_shared):
     # If the UDF returns a column with dict, previously, we would
@@ -81,6 +332,4 @@ def fn2(batch):

 if __name__ == "__main__":
-    import sys
-
     sys.exit(pytest.main(["-v", __file__]))
diff --git a/python/ray/data/tests/test_arrow_serialization.py b/python/ray/data/tests/test_arrow_serialization.py
index b3e8b10c19b1..232ed32cc749 100644
--- a/python/ray/data/tests/test_arrow_serialization.py
+++ b/python/ray/data/tests/test_arrow_serialization.py
@@ -26,7 +26,7 @@
 from ray._private.utils import _get_pyarrow_version
 from ray.data.extensions.object_extension import (
     ArrowPythonObjectArray,
-    object_extension_type_allowed,
+    _object_extension_type_allowed,
 )
 from ray.data.extensions.tensor_extension import (
     ArrowTensorArray,
@@ -423,7 +423,7 @@ def pickled_objects_array():
     (lazy_fixture("complex_nested_array"), 0.1),
 ]

-if object_extension_type_allowed():
+if _object_extension_type_allowed():
     pytest_custom_serialization_arrays.append(
         # Array of pickled objects
         (lazy_fixture("pickled_objects_array"), 0.1),
@@ -550,7 +550,7 @@ def fn(batch: list):


 @pytest.mark.skipif(
-    not object_extension_type_allowed(), reason="Object extension not supported."
+    not _object_extension_type_allowed(), reason="Object extension not supported."
) def test_arrow_object_and_array_support(ray_start_regular_shared): obj = types.SimpleNamespace(some_attribute="test") diff --git a/python/ray/data/tests/test_dynamic_block_split.py b/python/ray/data/tests/test_dynamic_block_split.py index cda00239331f..398831aaea9c 100644 --- a/python/ray/data/tests/test_dynamic_block_split.py +++ b/python/ray/data/tests/test_dynamic_block_split.py @@ -10,7 +10,7 @@ import ray from ray.data import Dataset -from ray.data._internal.arrow_block import ArrowBlockAccessor +from ray.data._internal.arrow_block import ArrowBlockBuilder from ray.data._internal.datasource.csv_datasource import CSVDatasource from ray.data.block import BlockMetadata from ray.data.datasource import Datasource @@ -68,7 +68,7 @@ def _blocks_generator(): (self.num_rows_per_batch, self.row_size), dtype=np.uint8 ) } - block = ArrowBlockAccessor.numpy_to_block(batch) + block = ArrowBlockBuilder._table_from_pydict(batch) yield block else: yield pd.DataFrame( diff --git a/python/ray/data/tests/test_execution_optimizer.py b/python/ray/data/tests/test_execution_optimizer.py index d657ce1c9d98..af7af855b187 100644 --- a/python/ray/data/tests/test_execution_optimizer.py +++ b/python/ray/data/tests/test_execution_optimizer.py @@ -1145,9 +1145,7 @@ def test_sort_validate_keys(ray_start_regular_shared): assert extract_values("id", ds.sort("id").take_all()) == list(range(10)) invalid_col_name = "invalid_column" - with pytest.raises( - ValueError, match=f"The column '{invalid_col_name}' does not exist" - ): + with pytest.raises(ValueError, match="there's no such column in the dataset"): ds.sort(invalid_col_name).take_all() ds_named = ray.data.from_items( @@ -1165,10 +1163,7 @@ def test_sort_validate_keys(ray_start_regular_shared): assert [d["col1"] for d in r1] == [7, 5, 3, 1] assert [d["col2"] for d in r2] == [8, 6, 4, 2] - with pytest.raises( - ValueError, - match=f"The column '{invalid_col_name}' does not exist in the schema", - ): + with pytest.raises(ValueError, match="there's no such column in the dataset"): ds_named.sort(invalid_col_name).take_all() @@ -1279,9 +1274,7 @@ def test_aggregate_e2e(ray_start_regular_shared, use_push_based_shuffle): def test_aggregate_validate_keys(ray_start_regular_shared): ds = ray.data.range(10) invalid_col_name = "invalid_column" - with pytest.raises( - ValueError, match=f"The column '{invalid_col_name}' does not exist" - ): + with pytest.raises(ValueError): ds.groupby(invalid_col_name).count() ds_named = ray.data.from_items( @@ -1308,7 +1301,7 @@ def test_aggregate_validate_keys(ray_start_regular_shared): with pytest.raises( ValueError, - match=f"The column '{invalid_col_name}' does not exist in the schema", + match="there's no such column in the dataset", ): ds_named.groupby(invalid_col_name).count() diff --git a/python/ray/data/tests/test_hudi.py b/python/ray/data/tests/test_hudi.py new file mode 100644 index 000000000000..af8035cc315f --- /dev/null +++ b/python/ray/data/tests/test_hudi.py @@ -0,0 +1,114 @@ +import os +import zipfile + +import pytest +from packaging.version import parse as parse_version +from pytest_lazyfixture import lazy_fixture + +import ray +from ray._private.utils import _get_pyarrow_version +from ray.data.datasource.path_util import ( + _resolve_paths_and_filesystem, + _unwrap_protocol, +) +from ray.data.tests.conftest import * # noqa +from ray.data.tests.mock_http_server import * # noqa +from ray.tests.conftest import * # noqa + +MIN_PYARROW_VERSION_FOR_HUDI = parse_version("11.0.0") +_VER = _get_pyarrow_version() +PYARROW_VERSION = 
parse_version(_VER) if _VER else None +PYARROW_VERSION_MEETS_REQUIREMENT = ( + PYARROW_VERSION and PYARROW_VERSION >= MIN_PYARROW_VERSION_FOR_HUDI +) +PYARROW_HUDI_TEST_SKIP_REASON = ( + f"Hudi only supported if pyarrow >= {MIN_PYARROW_VERSION_FOR_HUDI}" +) + + +def _extract_testing_table(fixture_path: str, table_dir: str, target_dir: str) -> str: + with zipfile.ZipFile(fixture_path, "r") as zip_ref: + zip_ref.extractall(target_dir) + return os.path.join(target_dir, table_dir) + + +@pytest.mark.skipif( + not PYARROW_VERSION_MEETS_REQUIREMENT, + reason=PYARROW_HUDI_TEST_SKIP_REASON, +) +@pytest.mark.parametrize( + "fs,data_path", + [ + (None, lazy_fixture("local_path")), + (lazy_fixture("local_fs"), lazy_fixture("local_path")), + ], +) +def test_read_hudi_simple_cow_table(ray_start_regular_shared, fs, data_path): + setup_data_path = _unwrap_protocol(data_path) + target_testing_dir = os.path.join(setup_data_path, "test_hudi") + fixture_path, _ = _resolve_paths_and_filesystem( + "example://hudi-tables/0.x_cow_partitioned.zip", fs + ) + target_table_path = _extract_testing_table( + fixture_path[0], "trips_table", target_testing_dir + ) + + ds = ray.data.read_hudi(target_table_path) + + assert ds.schema().names == [ + "_hoodie_commit_time", + "_hoodie_commit_seqno", + "_hoodie_record_key", + "_hoodie_partition_path", + "_hoodie_file_name", + "ts", + "uuid", + "rider", + "driver", + "fare", + "city", + ] + assert ds.count() == 5 + rows = ( + ds.select_columns(["_hoodie_commit_time", "ts", "uuid", "fare"]) + .sort("fare") + .take_all() + ) + assert rows == [ + { + "_hoodie_commit_time": "20240402123035233", + "ts": 1695115999911, + "uuid": "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa", + "fare": 17.85, + }, + { + "_hoodie_commit_time": "20240402123035233", + "ts": 1695159649087, + "uuid": "334e26e9-8355-45cc-97c6-c31daf0df330", + "fare": 19.1, + }, + { + "_hoodie_commit_time": "20240402123035233", + "ts": 1695091554788, + "uuid": "e96c4396-3fad-413a-a942-4cb36106d721", + "fare": 27.7, + }, + { + "_hoodie_commit_time": "20240402123035233", + "ts": 1695516137016, + "uuid": "e3cf430c-889d-4015-bc98-59bdce1e530c", + "fare": 34.15, + }, + { + "_hoodie_commit_time": "20240402144910683", + "ts": 1695046462179, + "uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00", + "fare": 339.0, + }, + ] + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/data/tests/test_map.py b/python/ray/data/tests/test_map.py index 9b1a4f8d4575..41100f4b8a2c 100644 --- a/python/ray/data/tests/test_map.py +++ b/python/ray/data/tests/test_map.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd import pyarrow as pa +import pyarrow.compute as pc import pyarrow.parquet as pq import pytest @@ -330,18 +331,101 @@ def map_generator(item: dict) -> Iterator[int]: def test_add_column(ray_start_regular_shared): - ds = ray.data.range(5).add_column("foo", lambda x: 1) + """Tests the add column API.""" + + # Test with pyarrow batch format + ds = ray.data.range(5).add_column( + "foo", lambda x: pa.array([1] * x.num_rows), batch_format="pyarrow" + ) + assert ds.take(1) == [{"id": 0, "foo": 1}] + + # Test with chunked array batch format + ds = ray.data.range(5).add_column( + "foo", lambda x: pa.chunked_array([[1] * x.num_rows]), batch_format="pyarrow" + ) + assert ds.take(1) == [{"id": 0, "foo": 1}] + + ds = ray.data.range(5).add_column( + "foo", lambda x: pc.add(x["id"], 1), batch_format="pyarrow" + ) + assert ds.take(1) == [{"id": 0, "foo": 1}] + + # Adding a column that is already there 
should result in an error
+    with pytest.raises(
+        ray.exceptions.UserCodeException,
+        match="Trying to add an existing column with name 'id'",
+    ):
+        ds = ray.data.range(5).add_column(
+            "id", lambda x: pc.add(x["id"], 1), batch_format="pyarrow"
+        )
+        assert ds.take(2) == [{"id": 1}, {"id": 2}]
+
+    # Adding a column in the wrong format should result in an error
+    with pytest.raises(
+        ray.exceptions.UserCodeException, match="For pyarrow batch format"
+    ):
+        ds = ray.data.range(5).add_column("id", lambda x: [1], batch_format="pyarrow")
+        assert ds.take(2) == [{"id": 1}, {"id": 2}]
+
+    # Test with numpy batch format
+    ds = ray.data.range(5).add_column(
+        "foo", lambda x: np.array([1] * len(list(x.values())[0])), batch_format="numpy"
+    )
+    assert ds.take(1) == [{"id": 0, "foo": 1}]
+
+    ds = ray.data.range(5).add_column(
+        "foo", lambda x: np.add(x["id"], 1), batch_format="numpy"
+    )
+    assert ds.take(1) == [{"id": 0, "foo": 1}]
+
+    # Adding a column that is already there should result in an error
+    with pytest.raises(
+        ray.exceptions.UserCodeException,
+        match="Trying to add an existing column with name 'id'",
+    ):
+        ds = ray.data.range(5).add_column(
+            "id", lambda x: np.add(x["id"], 1), batch_format="numpy"
+        )
+        assert ds.take(2) == [{"id": 1}, {"id": 2}]
+
+    # Adding a column in the wrong format should result in an error
+    with pytest.raises(
+        ray.exceptions.UserCodeException, match="For numpy batch format"
+    ):
+        ds = ray.data.range(5).add_column("id", lambda x: [1], batch_format="numpy")
+        assert ds.take(2) == [{"id": 1}, {"id": 2}]
+
+    # Test with pandas batch format
+    ds = ray.data.range(5).add_column("foo", lambda x: pd.Series([1] * x.shape[0]))
     assert ds.take(1) == [{"id": 0, "foo": 1}]

     ds = ray.data.range(5).add_column("foo", lambda x: x["id"] + 1)
     assert ds.take(1) == [{"id": 0, "foo": 1}]

-    ds = ray.data.range(5).add_column("id", lambda x: x["id"] + 1)
-    assert ds.take(2) == [{"id": 1}, {"id": 2}]
+    # Adding a column that is already there should result in an error
+    with pytest.raises(
+        ray.exceptions.UserCodeException,
+        match="Trying to add an existing column with name 'id'",
+    ):
+        ds = ray.data.range(5).add_column("id", lambda x: x["id"] + 1)
+        assert ds.take(2) == [{"id": 1}, {"id": 2}]
+
+    # Adding a column in the wrong format should result in an error
+    with pytest.raises(
+        ray.exceptions.UserCodeException, match="For pandas batch format"
+    ):
+        ds = ray.data.range(5).add_column(
+            "id", lambda x: np.array([1]), batch_format="pandas"
+        )
+        assert ds.take(2) == [{"id": 1}, {"id": 2}]

     with pytest.raises(ValueError):
         ds = ray.data.range(5).add_column("id", 0)

+    # Test that an invalid batch_format raises an error
+    with pytest.raises(ValueError):
+        ray.data.range(5).add_column("foo", lambda x: x["id"] + 1, batch_format="foo")
+

 @pytest.mark.parametrize("names", (["foo", "bar"], {"spam": "foo", "ham": "bar"}))
 def test_rename_columns(ray_start_regular_shared, names):
@@ -362,14 +446,15 @@ def test_drop_columns(ray_start_regular_shared, tmp_path):
     assert ds.drop_columns(["col2"]).take(1) == [{"col1": 1, "col3": 3}]
     assert ds.drop_columns(["col1", "col3"]).take(1) == [{"col2": 2}]
     assert ds.drop_columns([]).take(1) == [{"col1": 1, "col2": 2, "col3": 3}]
-    assert ds.drop_columns(["col1", "col2", "col3"]).take(1) == [{}]
-    assert ds.drop_columns(["col1", "col1", "col2", "col1"]).take(1) == [
-        {"col3": 3}
-    ]
+    assert ds.drop_columns(["col1", "col2", "col3"]).take(1) == []
+    assert ds.drop_columns(["col1", "col2"]).take(1) == [{"col3": 3}]

     # Test dropping non-existent column
     with
pytest.raises((UserCodeException, KeyError)): ds.drop_columns(["dummy_col", "col1", "col2"]).materialize() + with pytest.raises(ValueError, match="drop_columns expects unique column names"): + ds1.drop_columns(["col1", "col2", "col2"]) + def test_select_columns(ray_start_regular_shared): # Test pandas and arrow diff --git a/python/ray/data/tests/test_mongo.py b/python/ray/data/tests/test_mongo.py index 97828aae6bea..eb03aab39f80 100644 --- a/python/ray/data/tests/test_mongo.py +++ b/python/ray/data/tests/test_mongo.py @@ -93,13 +93,13 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): override_num_blocks=2, ) assert ds._block_num_rows() == [3, 2] - assert str(ds) == ( - "Dataset(\n" - " num_rows=5,\n" - " schema={_id: fixed_size_binary[12], float_field: double, " - "int_field: int32}\n" - ")" - ) + assert ds.count() == 5 + assert ds.schema().names == ["_id", "float_field", "int_field"] + # We are not testing the datatype of _id here, because it varies per platform + assert ds.schema().types[1:] == [ + pa.float64(), + pa.int32(), + ] assert df.equals(ds.drop_columns(["_id"]).to_pandas()) # Read a subset of the collection. @@ -111,13 +111,8 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): override_num_blocks=2, ) assert ds._block_num_rows() == [2, 1] - assert str(ds) == ( - "Dataset(\n" - " num_rows=3,\n" - " schema={_id: fixed_size_binary[12], float_field: double, " - "int_field: int32}\n" - ")" - ) + assert ds.count() == 3 + assert ds.schema().names == ["_id", "float_field", "int_field"] df[df["int_field"] < 3].equals(ds.drop_columns(["_id"]).to_pandas()) # Read with auto-tuned parallelism. @@ -126,13 +121,14 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): database=foo_db, collection=foo_collection, ) - assert str(ds) == ( - "Dataset(\n" - " num_rows=5,\n" - " schema={_id: fixed_size_binary[12], float_field: double, " - "int_field: int32}\n" - ")" - ) + + assert ds.count() == 5 + assert ds.schema().names == ["_id", "float_field", "int_field"] + # We are not testing the datatype of _id here, because it varies per platform + assert ds.schema().types[1:] == [ + pa.float64(), + pa.int32(), + ] assert df.equals(ds.drop_columns(["_id"]).to_pandas()) # Read with a parallelism larger than number of rows. @@ -142,13 +138,14 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): collection=foo_collection, override_num_blocks=1000, ) - assert str(ds) == ( - "Dataset(\n" - " num_rows=5,\n" - " schema={_id: fixed_size_binary[12], float_field: double, " - "int_field: int32}\n" - ")" - ) + + assert ds.count() == 5 + assert ds.schema().names == ["_id", "float_field", "int_field"] + # We are not testing the datatype of _id here, because it varies per platform + assert ds.schema().types[1:] == [ + pa.float64(), + pa.int32(), + ] assert df.equals(ds.drop_columns(["_id"]).to_pandas()) # Add a column and then write back to MongoDB. 
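# Illustrative sketch (not part of the patch): the tests in the next file toggle
# the new DataContext flag from python/ray/data/context.py to exercise both
# behaviors.
import ray

ctx = ray.data.DataContext.get_current()
# With the fallback disabled, column values that Arrow can't represent natively
# raise instead of being wrapped in ArrowPythonObjectType.
ctx.enable_fallback_to_arrow_object_ext_type = False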
diff --git a/python/ray/data/tests/test_numpy_support.py b/python/ray/data/tests/test_numpy_support.py index c14038918c0a..ec67bcf689bb 100644 --- a/python/ray/data/tests/test_numpy_support.py +++ b/python/ray/data/tests/test_numpy_support.py @@ -6,6 +6,7 @@ import ray from ray.air.util.tensor_extensions.utils import create_ragged_ndarray +from ray.data import DataContext from ray.data.tests.conftest import * # noqa from ray.tests.conftest import * # noqa @@ -27,22 +28,31 @@ def assert_structure_equals(a, b): assert a.dtype == b.dtype assert a.shape == b.shape for i in range(len(a)): - assert np.array_equiv(a[i], b[i]), (i, a, b) + assert np.array_equal(a[i], b[i]), (i, a[i], b[i]) -def test_list_of_scalars(ray_start_regular_shared): +def test_list_of_scalars(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [1, 2, 3] output = do_map_batches(data) assert_structure_equals(output, np.array([1, 2, 3], dtype=np.int64)) -def test_list_of_numpy_scalars(ray_start_regular_shared): +def test_list_of_numpy_scalars(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [np.int64(1), np.int64(2), np.int64(3)] output = do_map_batches(data) assert_structure_equals(output, np.array([1, 2, 3], dtype=np.int64)) -def test_list_of_objects(ray_start_regular_shared): +def test_list_of_objects(ray_start_regular_shared, restore_data_context): + # NOTE: Fallback is enabled by default, this is purely for notational purposes + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = True + data = [1, 2, 3, UserObj()] output = do_map_batches(data) assert_structure_equals(output, np.array([1, 2, 3, UserObj()])) @@ -88,34 +98,51 @@ def test_list_of_objects(ray_start_regular_shared): ), ], ) -def test_list_of_datetimes(data, expected_output, ray_start_regular_shared): +def test_list_of_datetimes( + data, expected_output, ray_start_regular_shared, restore_data_context +): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + output = do_map_batches(data) assert_structure_equals(output, expected_output) -def test_array_like(ray_start_regular_shared): +def test_array_like(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = torch.Tensor([1, 2, 3]) output = do_map_batches(data) assert_structure_equals(output, np.array([1.0, 2.0, 3.0], dtype=np.float32)) -def test_list_of_arrays(ray_start_regular_shared): +def test_list_of_arrays(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [np.array([1, 2, 3]), np.array([4, 5, 6])] output = do_map_batches(data) assert_structure_equals(output, np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)) -def test_list_of_array_like(ray_start_regular_shared): +def test_list_of_array_like(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + 
DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [torch.Tensor([1, 2, 3]), torch.Tensor([4, 5, 6])] output = do_map_batches(data) assert_structure_equals(output, np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)) -def test_ragged_array_like(ray_start_regular_shared): +def test_ragged_tensors_map_batches(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [torch.Tensor([1, 2, 3]), torch.Tensor([1, 2])] output = do_map_batches(data) assert_structure_equals( - output, np.array([np.array([1, 2, 3]), np.array([1, 2])], dtype=object) + output, create_ragged_ndarray([np.array([1, 2, 3]), np.array([1, 2])]) ) data = [torch.zeros((3, 5, 10)), torch.zeros((3, 8, 8))] @@ -125,23 +152,42 @@ def test_ragged_array_like(ray_start_regular_shared): ) -def test_scalar_nested_arrays(ray_start_regular_shared): +def test_scalar_nested_arrays(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [[[1]], [[2]]] output = do_map_batches(data) - assert_structure_equals(output, create_ragged_ndarray(data)) + + assert_structure_equals( + output, + create_ragged_ndarray( + [np.array([1], dtype=np.object_), np.array([2], dtype=np.object_)] + ), + ) -def test_scalar_lists_not_converted(ray_start_regular_shared): +def test_scalar_lists_not_converted(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [[1, 2], [1, 2]] output = do_map_batches(data) - assert_structure_equals(output, create_ragged_ndarray([[1, 2], [1, 2]])) + assert_structure_equals( + output, create_ragged_ndarray([np.array([1, 2]), np.array([1, 2])]) + ) data = [[1, 2, 3], [1, 2]] output = do_map_batches(data) - assert_structure_equals(output, create_ragged_ndarray([[1, 2, 3], [1, 2]])) + assert_structure_equals( + output, create_ragged_ndarray([np.array([1, 2, 3]), np.array([1, 2])]) + ) -def test_scalar_numpy(ray_start_regular_shared): +def test_scalar_numpy(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = np.int64(1) ds = ray.data.range(2, override_num_blocks=1) ds = ds.map(lambda x: {"output": data}) @@ -149,7 +195,10 @@ def test_scalar_numpy(ray_start_regular_shared): assert_structure_equals(output, np.array([1, 1], dtype=np.int64)) -def test_scalar_arrays(ray_start_regular_shared): +def test_scalar_arrays(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = np.array([1, 2, 3]) ds = ray.data.range(2, override_num_blocks=1) ds = ds.map(lambda x: {"output": data}) @@ -157,7 +206,10 @@ def test_scalar_arrays(ray_start_regular_shared): assert_structure_equals(output, np.array([[1, 2, 3], [1, 2, 3]], dtype=np.int64)) -def test_bytes(ray_start_regular_shared): +def test_bytes(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + 
DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + """Tests that bytes are converted to object dtype instead of zero-terminated.""" data = b"\x1a\n\x00\n\x1a" ds = ray.data.range(1, override_num_blocks=1) @@ -166,7 +218,10 @@ def test_bytes(ray_start_regular_shared): assert_structure_equals(output, np.array([b"\x1a\n\x00\n\x1a"], dtype=object)) -def test_scalar_array_like(ray_start_regular_shared): +def test_uniform_tensors(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = torch.Tensor([1, 2, 3]) ds = ray.data.range(2, override_num_blocks=1) ds = ds.map(lambda x: {"output": data}) @@ -174,17 +229,24 @@ def test_scalar_array_like(ray_start_regular_shared): assert_structure_equals(output, np.array([[1, 2, 3], [1, 2, 3]], dtype=np.float32)) -def test_scalar_ragged_arrays(ray_start_regular_shared): +def test_scalar_ragged_arrays(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [np.array([1, 2, 3]), np.array([1, 2])] ds = ray.data.range(2, override_num_blocks=1) ds = ds.map(lambda x: {"output": data[x["id"]]}) output = ds.take_batch()["output"] + assert_structure_equals( output, np.array([np.array([1, 2, 3]), np.array([1, 2])], dtype=object) ) -def test_scalar_ragged_array_like(ray_start_regular_shared): +def test_ragged_tensors(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [torch.Tensor([1, 2, 3]), torch.Tensor([1, 2])] ds = ray.data.range(2, override_num_blocks=1) ds = ds.map(lambda x: {"output": data[x["id"]]}) @@ -202,7 +264,10 @@ def test_scalar_ragged_array_like(ray_start_regular_shared): ) -def test_nested_ragged_arrays(ray_start_regular_shared): +def test_nested_ragged_arrays(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [ {"a": [[1], [2, 3]]}, {"a": [[4, 5], [6]]}, @@ -216,10 +281,26 @@ def f(row): # https://github.com/ray-project/ray/issues/35340 -def test_complex_ragged_arrays(ray_start_regular_shared): +def test_complex_ragged_arrays(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [[{"a": 1}, {"a": 2}, {"a": 3}], [{"b": 1}]] output = do_map_batches(data) - assert_structure_equals(output, create_ragged_ndarray(data)) + + # Assert resulting objects are coerced to the appropriate shape, following + # the table's schema + assert_structure_equals( output, create_ragged_ndarray( [ np.array( [{"a": 1, "b": None}, {"a": 2, "b": None}, {"a": 3, "b": None}] ), np.array([{"a": None, "b": 1}]), ] ), ) data = ["hi", 1, None, [[[[]]]], {"a": [[{"b": 2, "c": UserObj()}]]}, UserObj()] output = do_map_batches(data) diff --git a/python/ray/data/tests/test_object_gc.py b/python/ray/data/tests/test_object_gc.py index b56c4542618d..2b1947e0498d 100644 --- a/python/ray/data/tests/test_object_gc.py +++ b/python/ray/data/tests/test_object_gc.py @@ -1,6 +1,7 @@ import sys
import threading +import pandas as pd import pytest import ray @@ -107,7 +108,7 @@ def test_tf_iteration(shutdown_only): # The size of dataset is 500*(80*80*4)*8B, about 100MB. ds = ray.data.range_tensor( 500, shape=(80, 80, 4), override_num_blocks=100 - ).add_column("label", lambda x: 1) + ).add_column("label", lambda df: pd.Series([1] * len(df))) # to_tf check_to_tf_no_spill(ctx, ds.map(lambda x: x)) diff --git a/python/ray/data/tests/test_pandas_block.py b/python/ray/data/tests/test_pandas_block.py index 725d2c8ce1bb..4585d0e2a133 100644 --- a/python/ray/data/tests/test_pandas_block.py +++ b/python/ray/data/tests/test_pandas_block.py @@ -4,7 +4,7 @@ import ray import ray.data from ray.data._internal.pandas_block import PandasBlockAccessor -from ray.data.extensions.object_extension import object_extension_type_allowed +from ray.data.extensions.object_extension import _object_extension_type_allowed def test_append_column(ray_start_regular_shared): @@ -20,7 +20,7 @@ def test_append_column(ray_start_regular_shared): @pytest.mark.skipif( - object_extension_type_allowed(), reason="Objects can be put into Arrow" + _object_extension_type_allowed(), reason="Objects can be put into Arrow" ) def test_dict_fallback_to_pandas_block(ray_start_regular_shared): # If the UDF returns a column with dict, this throws diff --git a/python/ray/data/tests/test_stats.py b/python/ray/data/tests/test_stats.py index d8d85515092c..a41e060fb3f5 100644 --- a/python/ray/data/tests/test_stats.py +++ b/python/ray/data/tests/test_stats.py @@ -1648,6 +1648,7 @@ def test_stats_actor_datasets(ray_start_cluster): assert "Input0" in operators assert "ReadRange->MapBatches()1" in operators for value in operators.values(): + assert value["name"] in ["Input", "ReadRange->MapBatches()"] assert value["progress"] == 20 assert value["total"] == 20 assert value["state"] == "FINISHED" @@ -1663,8 +1664,9 @@ def test_stats_manager(shutdown_only): datasets = [None] * num_threads # Mock clear methods so that _last_execution_stats and _last_iteration_stats # are not cleared. We will assert on them afterwards. 
- with patch.object(StatsManager, "clear_execution_metrics"), patch.object( - StatsManager, "clear_iteration_metrics" + with ( + patch.object(StatsManager, "clear_last_execution_stats"), + patch.object(StatsManager, "clear_iteration_metrics"), ): def update_stats_manager(i): @@ -1689,9 +1691,7 @@ def update_stats_manager(i): dataset_tag = create_dataset_tag(dataset._name, dataset._uuid) assert dataset_tag in StatsManager._last_execution_stats assert dataset_tag in StatsManager._last_iteration_stats - StatsManager.clear_execution_metrics( - dataset_tag, ["Input0", "ReadRange->MapBatches()1"] - ) + StatsManager.clear_last_execution_stats(dataset_tag) StatsManager.clear_iteration_metrics(dataset_tag) wait_for_condition(lambda: not StatsManager._update_thread.is_alive()) diff --git a/python/ray/data/tests/test_strict_mode.py b/python/ray/data/tests/test_strict_mode.py index 49b4b9cc4e37..e34c2f428329 100644 --- a/python/ray/data/tests/test_strict_mode.py +++ b/python/ray/data/tests/test_strict_mode.py @@ -181,7 +181,7 @@ def test_strict_schema(ray_start_regular_shared): from ray.data._internal.pandas_block import PandasBlockSchema from ray.data.extensions.object_extension import ( ArrowPythonObjectType, - object_extension_type_allowed, + _object_extension_type_allowed, ) from ray.data.extensions.tensor_extension import ArrowTensorType @@ -199,7 +199,7 @@ def test_strict_schema(ray_start_regular_shared): ds = ray.data.from_items([{"x": 2, "y": object(), "z": [1, 2]}]) schema = ds.schema() - if object_extension_type_allowed(): + if _object_extension_type_allowed(): assert isinstance(schema.base_schema, pa.lib.Schema) assert schema.names == ["x", "y", "z"] assert schema.types == [ diff --git a/python/ray/data/tests/test_transform_pyarrow.py b/python/ray/data/tests/test_transform_pyarrow.py index 570bd8f6592b..a221bd6c7683 100644 --- a/python/ray/data/tests/test_transform_pyarrow.py +++ b/python/ray/data/tests/test_transform_pyarrow.py @@ -18,7 +18,7 @@ ArrowTensorArray, ArrowTensorType, ArrowVariableShapedTensorType, - object_extension_type_allowed, + _object_extension_type_allowed, ) @@ -199,7 +199,7 @@ def test_arrow_concat_tensor_extension_uniform_but_different(): @pytest.mark.skipif( - not object_extension_type_allowed(), reason="Object extension type not supported." + not _object_extension_type_allowed(), reason="Object extension type not supported." 
) def test_arrow_concat_with_objects(): obj = types.SimpleNamespace(a=1, b="test") @@ -458,9 +458,10 @@ def map(x): assert op == "map_batches" def map_batches(x): + row_id = x["id"][0] return { "id": x["id"], - "my_data": data[x["id"][0]], + "my_data": [data[row_id]], } ds = ds.map_batches(map_batches, batch_size=None) @@ -472,14 +473,14 @@ def map_batches(x): @pytest.mark.skipif( - object_extension_type_allowed(), reason="Arrow table supports pickled objects" + _object_extension_type_allowed(), reason="Arrow table supports pickled objects" ) @pytest.mark.parametrize( "op, data", [ ("map", [UnsupportedType(), 1]), - ("map_batches", [[None], [1]]), - ("map_batches", [[{"a": 1}], [{"a": 2}]]), + ("map_batches", [None, 1]), + ("map_batches", [{"a": 1}, {"a": 2}]), ], ) def test_fallback_to_pandas_on_incompatible_data( @@ -497,34 +498,59 @@ def test_fallback_to_pandas_on_incompatible_data( @pytest.mark.parametrize( - "op, data", + "op, data, should_fail, expected_type", [ - ("map", [1, 2**100]), - ("map_batches", [[1.0], [2**4]]), + # Case A: Upon serializing to Arrow, falls back to `ArrowPythonObjectType` + ("map_batches", [1, 2**100], False, ArrowPythonObjectType()), + ("map_batches", [1.0, 2**100], False, ArrowPythonObjectType()), + ("map_batches", ["1.0", 2**100], False, ArrowPythonObjectType()), + # Case B: No fallback to `ArrowPythonObjectType`, hence Arrow enforces the + # deduced schema + ("map_batches", [1.0, 2**4], True, None), + ("map_batches", ["1.0", 2**4], True, None), ], ) -def test_pyarrow_conversion_error_detailed_info( +def test_pyarrow_conversion_error_handling( ray_start_regular_shared, op, data, + should_fail: bool, + expected_type: pa.DataType, ): # Ray Data infers the block type (arrow or pandas) and the block schema - # based on the first UDF output. - # In one of the following cases, an error will be raised: - # * The first UDF output is compatible with Arrow, but the second is not. - # * Both UDF outputs are compatible with Arrow, but the second has a different - # schema. - # Check that we'll raise an ArrowConversionError with detailed information - # about the incompatible data. + # based on the first *block* produced by the UDF. + # + # These tests simulate the following scenarios: + # 1. (Case A) The type of the first block's value is deduced as an Arrow + # scalar type, but the second block carries a value that overflows the + # pa.int64 representation, so the column is serialized as + # `ArrowPythonObjectExtensionType`, coercing the first block to it as well. + # + # 2.
(Case B) Both blocks carry proper Arrow scalars that nonetheless have + diverging types, so Arrow fails when merging these blocks + into one ds = _create_dataset(op, data) - with pytest.raises(Exception) as e: + if should_fail: + with pytest.raises(Exception) as e: + ds.materialize() + + error_msg = str(e.value) + expected_msg = "ArrowConversionError: Error converting data to Arrow:" + + assert expected_msg in error_msg + assert "my_data" in error_msg + + else: ds.materialize() - error_msg = str(e.value) - expected_msg = "ArrowConversionError: Error converting data to Arrow:" - assert expected_msg in error_msg, error_msg - assert "my_data" in error_msg, error_msg + assert ds.schema().base_schema == pa.schema( + [pa.field("id", pa.int64()), pa.field("my_data", expected_type)] + ) + + assert ds.take_all() == [ + {"id": i, "my_data": data[i]} for i in range(len(data)) + ] if __name__ == "__main__": diff --git a/python/ray/includes/common.pxd b/python/ray/includes/common.pxd index 7d4b6ece9e7a..f5c6d4655ac9 100644 --- a/python/ray/includes/common.pxd +++ b/python/ray/includes/common.pxd @@ -329,7 +329,8 @@ cdef extern from "ray/core_worker/common.h" nogil: unordered_map[c_string, double] &resources, c_string concurrency_group_name, int64_t generator_backpressure_num_objects, - c_string serialized_runtime_env, c_bool enable_task_events) + c_string serialized_runtime_env, c_bool enable_task_events, + const unordered_map[c_string, c_string] &labels) cdef cppclass CActorCreationOptions "ray::core::ActorCreationOptions": CActorCreationOptions() @@ -347,7 +348,8 @@ cdef extern from "ray/core_worker/common.h" nogil: const c_vector[CConcurrencyGroup] &concurrency_groups, c_bool execute_out_of_order, int32_t max_pending_calls, - c_bool enable_task_events) + c_bool enable_task_events, + const unordered_map[c_string, c_string] &labels) cdef cppclass CPlacementGroupCreationOptions \ "ray::core::PlacementGroupCreationOptions": diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 242c5f10dd49..87f5d59a8583 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -191,6 +191,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: CJobID GetCurrentJobId() CTaskID GetCurrentTaskId() + const c_string GetCurrentTaskName() + const c_string GetCurrentTaskFunctionName() void UpdateTaskIsDebuggerPaused( const CTaskID &task_id, const c_bool is_debugger_paused) diff --git a/python/ray/remote_function.py b/python/ray/remote_function.py index 72c07fd63b86..b44eae3d84ce 100644 --- a/python/ray/remote_function.py +++ b/python/ray/remote_function.py @@ -4,6 +4,7 @@ import uuid from functools import wraps from threading import Lock +from typing import Optional import ray._private.signature from ray import Language, cross_language @@ -120,6 +121,22 @@ def __init__( if "runtime_env" in self._default_options: self._default_options["runtime_env"] = self._runtime_env + # Pre-calculate runtime env info to avoid re-calculation at each `remote` + # invocation. When a `remote` call specifies a new `runtime_env` through + # `.options()`, the runtime env is overwritten and re-serialized. + # + # Caveat: To support dynamic runtime envs in + # `func.options(runtime_env={...}).remote()`, we recalculate the serialized + # runtime env info in the `options` call. This is acceptable since the + # pre-calculation here happens only once, at `RemoteFunction` initialization.
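The pre-calculation described above trades one serialization at definition time for a cached string that every `.remote()` call reuses; only `.options(runtime_env=...)` pays the re-serialization cost. A minimal sketch of that caching pattern, with hypothetical stand-ins (`serialize_env_info` for `get_runtime_env_info`, `CachedRemoteFunction` for `RemoteFunction`), not Ray's actual implementation:

import json
from typing import Any, Callable, Optional


def serialize_env_info(env: dict) -> str:
    # Hypothetical stand-in for get_runtime_env_info(..., serialize=True).
    return json.dumps(env, sort_keys=True)


class CachedRemoteFunction:
    def __init__(self, fn: Callable, runtime_env: Optional[dict] = None):
        self._fn = fn
        # Serialize once at definition time, not on every `.remote()` call.
        self._serialized_env = serialize_env_info(runtime_env) if runtime_env else ""

    def remote(self, *args: Any, **kwargs: Any) -> Any:
        # Hot path: reuse the cached serialized runtime env.
        return self._submit(self._serialized_env, *args, **kwargs)

    def options(self, runtime_env: Optional[dict] = None) -> "CachedRemoteFunction":
        # Cold path: re-serialize only when the runtime env is overridden.
        if runtime_env is None:
            return self
        return CachedRemoteFunction(self._fn, runtime_env)

    def _submit(self, serialized_env: str, *args: Any, **kwargs: Any) -> Any:
        print(f"submitting with runtime env info {serialized_env!r}")
        return self._fn(*args, **kwargs)


f = CachedRemoteFunction(lambda x: x + 1, {"pip": ["requests"]})
f.remote(1)  # reuses the cached string
f.options(runtime_env={"pip": ["numpy"]}).remote(2)  # re-serializes once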
+ self._serialized_base_runtime_env_info = "" + if self._runtime_env: + self._serialized_base_runtime_env_info = get_runtime_env_info( + self._runtime_env, + is_job_runtime_env=False, + serialize=True, + ) + self._language = language self._is_generator = inspect.isgeneratorfunction(function) self._function = function @@ -136,7 +153,12 @@ def __init__( # Override task.remote's signature and docstring @wraps(function) def _remote_proxy(*args, **kwargs): - return self._remote(args=args, kwargs=kwargs, **self._default_options) + return self._remote( + serialized_runtime_env_info=self._serialized_base_runtime_env_info, + args=args, + kwargs=kwargs, + **self._default_options, + ) self.remote = _remote_proxy @@ -217,6 +239,7 @@ def options(self, **task_options): _metadata: Extended options for Ray libraries. For example, _metadata={"workflows.io/options": } for Ray workflows. + _labels: The key-value labels of a task. Examples: @@ -239,15 +262,29 @@ def f(): updated_options = ray_option_utils.update_options(default_options, task_options) ray_option_utils.validate_task_options(updated_options, in_options=True) - # only update runtime_env when ".options()" specifies new runtime_env + # Only update runtime_env and re-calculate serialized runtime env info when + # ".options()" specifies new runtime_env. + serialized_runtime_env_info = self._serialized_base_runtime_env_info if "runtime_env" in task_options: updated_options["runtime_env"] = parse_runtime_env( updated_options["runtime_env"] ) + # Re-calculate runtime env info based on updated runtime env. + if updated_options["runtime_env"]: + serialized_runtime_env_info = get_runtime_env_info( + updated_options["runtime_env"], + is_job_runtime_env=False, + serialize=True, + ) class FuncWrapper: def remote(self, *args, **kwargs): - return func_cls._remote(args=args, kwargs=kwargs, **updated_options) + return func_cls._remote( + args=args, + kwargs=kwargs, + serialized_runtime_env_info=serialized_runtime_env_info, + **updated_options, + ) @DeveloperAPI def bind(self, *args, **kwargs): @@ -263,7 +300,13 @@ class or functions. @wrap_auto_init @_tracing_task_invocation - def _remote(self, args=None, kwargs=None, **task_options): + def _remote( + self, + args=None, + kwargs=None, + serialized_runtime_env_info: Optional[str] = None, + **task_options, + ): """Submit the remote function for execution.""" # We pop the "max_calls" coming from "@ray.remote" here. We no longer need # it in "_remote()". @@ -329,7 +372,6 @@ def _remote(self, args=None, kwargs=None, **task_options): # TODO(suquark): cleanup these fields name = task_options["name"] - runtime_env = parse_runtime_env(task_options["runtime_env"]) placement_group = task_options["placement_group"] placement_group_bundle_index = task_options["placement_group_bundle_index"] placement_group_capture_child_tasks = task_options[ @@ -404,19 +446,12 @@ def _remote(self, args=None, kwargs=None, **task_options): else: scheduling_strategy = "DEFAULT" - serialized_runtime_env_info = None - if runtime_env is not None: - serialized_runtime_env_info = get_runtime_env_info( - runtime_env, - is_job_runtime_env=False, - serialize=True, - ) - if _task_launch_hook: _task_launch_hook(self._function_descriptor, resources, scheduling_strategy) # Override enable_task_events to default for actor if not specified (i.e. 
None) enable_task_events = task_options.get("enable_task_events") + labels = task_options.get("_labels") def invocation(args, kwargs): if self._is_cross_language: @@ -447,6 +482,7 @@ def invocation(args, kwargs): serialized_runtime_env_info or "{}", generator_backpressure_num_objects, enable_task_events, + labels, ) # Reset worker's debug context from the last "remote" command # (which applies only to this .remote call). diff --git a/python/ray/runtime_context.py b/python/ray/runtime_context.py index 78f7cee9502d..5cacae69371b 100644 --- a/python/ray/runtime_context.py +++ b/python/ray/runtime_context.py @@ -124,7 +124,7 @@ def get_worker_id(self) -> str: @property @Deprecated(message="Use get_task_id() instead", warning=True) def task_id(self): - """Get current task ID for this worker or driver. + """Get current task ID for this worker. Task ID is the id of a Ray task. This shouldn't be used in a driver process. @@ -155,7 +155,7 @@ def f(): Returns: The current worker's task id. None if there's no task id. """ - # only worker mode has actor_id + # only worker mode has task_id assert ( self.worker.mode == ray._private.worker.WORKER_MODE ), f"This method is only available when the process is a\ worker." task_id = self.worker.current_task_id return task_id if not task_id.is_nil() else None def get_task_id(self) -> Optional[str]: - """Get current task ID for this worker or driver. + """Get current task ID for this worker. Task ID is the id of a Ray task. The ID will be in hex format. This shouldn't be used in a driver process. @@ -201,7 +201,7 @@ def get_task_id(): Returns: The current worker's task id in hex. None if there's no task id. """ - # only worker mode has actor_id + # only worker mode has task_id if self.worker.mode != ray._private.worker.WORKER_MODE: logger.warning( "This method is only available when the process is a " @@ -212,12 +212,116 @@ def get_task_id(): return task_id.hex() if not task_id.is_nil() else None def _get_current_task_id(self) -> TaskID: - async_task_id = ray._raylet.async_task_id.get() - if async_task_id is None: - task_id = self.worker.current_task_id - else: - task_id = async_task_id - return task_id + return self.worker.current_task_id + + def get_task_name(self) -> Optional[str]: + """Get current task name for this worker. + + Task name by default is the task's function call string. It can also be + specified in options when submitting a task. + + Example: + + .. testcode:: + + import ray + + @ray.remote + class Actor: + def get_task_name(self): + return ray.get_runtime_context().get_task_name() + + @ray.remote + class AsyncActor: + async def get_task_name(self): + return ray.get_runtime_context().get_task_name() + + @ray.remote + def get_task_name(): + return ray.get_runtime_context().get_task_name() + + a = Actor.remote() + b = AsyncActor.remote() + # Task names are available for actor tasks. + print(ray.get(a.get_task_name.remote())) + # Task names are available for async actor tasks. + print(ray.get(b.get_task_name.remote())) + # Task names are available for normal tasks. + # Get default task name + print(ray.get(get_task_name.remote())) + # Get specified task name + print(ray.get(get_task_name.options(name="task_name").remote())) + + ..
testoutput:: :options: +MOCK + + Actor.get_task_name + AsyncActor.get_task_name + get_task_name + task_name + + Returns: + The current worker's task name + """ + # only worker mode has task_name + if self.worker.mode != ray._private.worker.WORKER_MODE: + logger.warning( + "This method is only available when the process is a " + f"worker. Current mode: {self.worker.mode}" + ) + return None + return self.worker.current_task_name + + def get_task_function_name(self) -> Optional[str]: + """Get current task function name string for this worker. + + Example: + + .. testcode:: + + import ray + + @ray.remote + class Actor: + def get_task_function_name(self): + return ray.get_runtime_context().get_task_function_name() + + @ray.remote + class AsyncActor: + async def get_task_function_name(self): + return ray.get_runtime_context().get_task_function_name() + + @ray.remote + def get_task_function_name(): + return ray.get_runtime_context().get_task_function_name() + + a = Actor.remote() + b = AsyncActor.remote() + # Task functions are available for actor tasks. + print(ray.get(a.get_task_function_name.remote())) + # Task functions are available for async actor tasks. + print(ray.get(b.get_task_function_name.remote())) + # Task functions are available for normal tasks. + print(ray.get(get_task_function_name.remote())) + + .. testoutput:: + :options: +MOCK + + [python module name].Actor.get_task_function_name + [python module name].AsyncActor.get_task_function_name + [python module name].get_task_function_name + + Returns: + The current worker's task function call string + """ + # only worker mode has task_function_name + if self.worker.mode != ray._private.worker.WORKER_MODE: + logger.warning( + "This method is only available when the process is a " + f"worker. Current mode: {self.worker.mode}" + ) + return None + return self.worker.current_task_function_name @property @Deprecated(message="Use get_actor_id() instead", warning=True) diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index a69c1369db72..eed702bb7438 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -15,6 +15,7 @@ from typing import Optional, Set, List, Tuple from ray.dashboard.modules.metrics import install_and_start_prometheus from ray.util.check_open_ports import check_open_ports +import requests import click import psutil @@ -621,6 +622,15 @@ def debug(address: str, verbose: bool): type=str, help="a JSON serialized dictionary mapping label name to label value.", ) +@click.option( + "--include-log-monitor", + default=None, + type=bool, + help="If set to True or left unset, a log monitor will start monitoring " + "the log files of all processes on this node and push their contents to GCS.
" + "Only one log monitor should be started per physical host to avoid log " + "duplication on the driver process.", +) @add_click_logging_options @PublicAPI def start( @@ -667,6 +677,7 @@ def start( ray_debugger_external, disable_usage_stats, labels, + include_log_monitor, ): """Start Ray processes manually on the local machine.""" @@ -756,6 +767,7 @@ def start( no_monitor=no_monitor, tracing_startup_hook=tracing_startup_hook, ray_debugger_external=ray_debugger_external, + include_log_monitor=include_log_monitor, ) if ray_constants.RAY_START_HOOK in os.environ: @@ -2583,6 +2595,15 @@ def launch_prometheus(): install_and_start_prometheus.main() +@metrics_group.command(name="shutdown-prometheus") +def shutdown_prometheus(): + try: + requests.post("http://localhost:9090/-/quit") + except requests.exceptions.RequestException as e: + print(f"An error occurred: {e}") + sys.exit(1) + + def add_command_alias(command, name, hidden): new_command = copy.deepcopy(command) new_command.hidden = hidden diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py index 8fae61c95329..6e45bae19ceb 100644 --- a/python/ray/serve/_private/constants.py +++ b/python/ray/serve/_private/constants.py @@ -194,9 +194,8 @@ # Logging format with record key to format string dict SERVE_LOG_RECORD_FORMAT = { SERVE_LOG_REQUEST_ID: "%(request_id)s", - SERVE_LOG_ROUTE: "%(route)s", SERVE_LOG_APPLICATION: "%(application)s", - SERVE_LOG_MESSAGE: "%(filename)s:%(lineno)d - %(message)s", + SERVE_LOG_MESSAGE: "-- %(message)s", SERVE_LOG_LEVEL_NAME: "%(levelname)s", SERVE_LOG_TIME: "%(asctime)s", } @@ -356,3 +355,17 @@ RAY_SERVE_FORCE_LOCAL_TESTING_MODE = ( os.environ.get("RAY_SERVE_FORCE_LOCAL_TESTING_MODE", "0") == "1" ) + +# Run sync methods defined in the replica in a thread pool by default. +RAY_SERVE_RUN_SYNC_IN_THREADPOOL = ( + os.environ.get("RAY_SERVE_RUN_SYNC_IN_THREADPOOL", "0") == "1" +) + +RAY_SERVE_RUN_SYNC_IN_THREADPOOL_WARNING = ( + "Calling sync method '{method_name}' directly on the " + "asyncio loop. In a future version, sync methods will be run in a " + "threadpool by default. Ensure your sync methods are thread safe " + "or keep the existing behavior by making them `async def`. Opt " + "into the new behavior by setting " + "RAY_SERVE_RUN_SYNC_IN_THREADPOOL=1." 
+) diff --git a/python/ray/serve/_private/controller.py b/python/ray/serve/_private/controller.py index 8eff4c80315a..4aa6906b241f 100644 --- a/python/ray/serve/_private/controller.py +++ b/python/ray/serve/_private/controller.py @@ -226,8 +226,7 @@ def reconfigure_global_logging_config(self, global_logging_config: LoggingConfig self.global_logging_config = global_logging_config self.long_poll_host.notify_changed( - LongPollNamespace.GLOBAL_LOGGING_CONFIG, - global_logging_config, + {LongPollNamespace.GLOBAL_LOGGING_CONFIG: global_logging_config} ) configure_component_logger( component_name="controller", diff --git a/python/ray/serve/_private/default_impl.py b/python/ray/serve/_private/default_impl.py index f47dfa85c178..489f0aaa25f9 100644 --- a/python/ray/serve/_private/default_impl.py +++ b/python/ray/serve/_private/default_impl.py @@ -56,6 +56,12 @@ def create_deployment_scheduler( ) +def create_replica_impl(**kwargs): + from ray.serve._private.replica import Replica + + return Replica(**kwargs) + + def create_dynamic_handle_options(**kwargs): return DynamicHandleOptions(**kwargs) diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py index ca0fb2d446c6..09fa27876397 100644 --- a/python/ray/serve/_private/deployment_state.py +++ b/python/ray/serve/_private/deployment_state.py @@ -1384,6 +1384,13 @@ def deployment_name(self) -> str: def app_name(self) -> str: return self._id.app_name + @property + def _failed_to_start_threshold(self) -> int: + return min( + MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT, + self._target_state.target_num_replicas * 3, + ) + def get_alive_replica_actor_ids(self) -> Set[str]: return {replica.actor_id for replica in self._replicas.get()} @@ -1448,16 +1455,17 @@ def broadcast_running_replicas_if_changed(self) -> None: return self._long_poll_host.notify_changed( - (LongPollNamespace.RUNNING_REPLICAS, self._id), - running_replica_infos, - ) - # NOTE(zcin): notify changed for Java routers. Since Java only - # supports 1.x API, there is no concept of applications in Java, - # so the key should remain a string describing the deployment - # name. If there are no Java routers, this is a no-op. - self._long_poll_host.notify_changed( - (LongPollNamespace.RUNNING_REPLICAS, self._id.name), - running_replica_infos, + { + (LongPollNamespace.RUNNING_REPLICAS, self._id): running_replica_infos, + # NOTE(zcin): notify changed for Java routers. Since Java only + # supports 1.x API, there is no concept of applications in Java, + # so the key should remain a string describing the deployment + # name. If there are no Java routers, this is a no-op. 
+ ( LongPollNamespace.RUNNING_REPLICAS, + self._id.name, + ): running_replica_infos, + } ) self._last_broadcasted_running_replica_infos = running_replica_infos self._multiplexed_model_ids_updated = False @@ -1473,8 +1481,7 @@ def broadcast_deployment_config_if_changed(self) -> None: return self._long_poll_host.notify_changed( - (LongPollNamespace.DEPLOYMENT_CONFIG, self._id), - current_deployment_config, + {(LongPollNamespace.DEPLOYMENT_CONFIG, self._id): current_deployment_config} ) self._last_broadcasted_deployment_config = current_deployment_config @@ -1845,11 +1852,10 @@ def scale_deployment_replicas( if to_add > 0: # Exponential backoff - failed_to_start_threshold = min( - MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT, - self._target_state.target_num_replicas * 3, - ) - if self._replica_constructor_retry_counter >= failed_to_start_threshold: + if ( + self._replica_constructor_retry_counter + >= self._failed_to_start_threshold + ): # Wait 1, 2, 4, ... seconds before consecutive retries, with random # offset added to avoid synchronization if ( @@ -1909,17 +1915,13 @@ def check_curr_status(self) -> Tuple[bool, bool]: ) failed_to_start_count = self._replica_constructor_retry_counter - failed_to_start_threshold = min( - MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT, - self._target_state.target_num_replicas * 3, - ) # Got to make a call to complete current deploy() goal after # start failure threshold reached, while we might still have # pending replicas in current goal. if ( - failed_to_start_count >= failed_to_start_threshold - and failed_to_start_threshold != 0 + failed_to_start_count >= self._failed_to_start_threshold + and self._failed_to_start_threshold != 0 ): if running_at_target_version_replica_cnt > 0: # At least one RUNNING replica at target state, partial @@ -2043,17 +2045,27 @@ def record_replica_startup_failure(self, error_msg: str): self._replica_constructor_retry_counter += 1 self._replica_constructor_error_msg = error_msg + retrying_msg = "Retrying" + if self._failed_to_start_threshold != 0: + remaining_retries = ( + self._failed_to_start_threshold + - self._replica_constructor_retry_counter + ) + retrying_msg += f" {remaining_retries} more time(s)" + + message = ( + f"A replica failed to start with an exception. {retrying_msg}. Error:\n" + f"{error_msg}" + ) + self._curr_status_info = self._curr_status_info.update_message(message) + def update_replica_startup_backoff_time(self): """Updates the replica startup backoff time.""" # If replicas have failed enough times, execute exponential backoff # Wait 1, 2, 4, ...
seconds before consecutive retries (or use a custom # backoff factor by setting EXPONENTIAL_BACKOFF_FACTOR) - failed_to_start_threshold = min( - MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT, - self._target_state.target_num_replicas * 3, - ) - if self._replica_constructor_retry_counter > failed_to_start_threshold: + if self._replica_constructor_retry_counter > self._failed_to_start_threshold: self._backoff_time_s = min( EXPONENTIAL_BACKOFF_FACTOR * self._backoff_time_s, MAX_BACKOFF_TIME_S ) diff --git a/python/ray/serve/_private/endpoint_state.py b/python/ray/serve/_private/endpoint_state.py index abc4c0615ad6..fd2074fd6669 100644 --- a/python/ray/serve/_private/endpoint_state.py +++ b/python/ray/serve/_private/endpoint_state.py @@ -46,7 +46,7 @@ def _checkpoint(self): def _notify_route_table_changed(self): self._long_poll_host.notify_changed( - LongPollNamespace.ROUTE_TABLE, self._endpoints + {LongPollNamespace.ROUTE_TABLE: self._endpoints} ) def _get_endpoint_for_route(self, route: str) -> Optional[DeploymentID]: diff --git a/python/ray/serve/_private/local_testing_mode.py b/python/ray/serve/_private/local_testing_mode.py index 6ccc16cd3628..af38c04c5e65 100644 --- a/python/ray/serve/_private/local_testing_mode.py +++ b/python/ray/serve/_private/local_testing_mode.py @@ -10,7 +10,10 @@ import ray from ray import cloudpickle from ray.serve._private.common import DeploymentID, RequestMetadata -from ray.serve._private.constants import SERVE_LOGGER_NAME +from ray.serve._private.constants import ( + RAY_SERVE_RUN_SYNC_IN_THREADPOOL, + SERVE_LOGGER_NAME, +) from ray.serve._private.replica import UserCallableWrapper from ray.serve._private.replica_result import ReplicaResult from ray.serve._private.router import Router @@ -66,6 +69,7 @@ def make_local_deployment_handle( deployment.init_args, deployment.init_kwargs, deployment_id=deployment_id, + run_sync_methods_in_threadpool=RAY_SERVE_RUN_SYNC_IN_THREADPOOL, ) try: logger.info(f"Initializing local replica class for {deployment_id}.") @@ -310,4 +314,6 @@ def generator_result_callback(item: Any): return noop_future def shutdown(self): - pass + noop_future = concurrent.futures.Future() + noop_future.set_result(None) + return noop_future diff --git a/python/ray/serve/_private/logging_utils.py b/python/ray/serve/_private/logging_utils.py index 207f0574c317..5081829670bc 100644 --- a/python/ray/serve/_private/logging_utils.py +++ b/python/ray/serve/_private/logging_utils.py @@ -141,8 +141,6 @@ def format(self, record: logging.LogRecord) -> str: record_formats_attrs = [] if SERVE_LOG_REQUEST_ID in record.__dict__: record_formats_attrs.append(SERVE_LOG_RECORD_FORMAT[SERVE_LOG_REQUEST_ID]) - if SERVE_LOG_ROUTE in record.__dict__: - record_formats_attrs.append(SERVE_LOG_RECORD_FORMAT[SERVE_LOG_ROUTE]) record_formats_attrs.append(SERVE_LOG_RECORD_FORMAT[SERVE_LOG_MESSAGE]) record_format += " ".join(record_formats_attrs) @@ -153,9 +151,9 @@ def format(self, record: logging.LogRecord) -> str: return formatter.format(record) -def access_log_msg(*, method: str, status: str, latency_ms: float): +def access_log_msg(*, method: str, route: str, status: str, latency_ms: float): """Returns a formatted message for an HTTP or ServeHandle access log.""" - return f"{method.upper()} {status.upper()} {latency_ms:.1f}ms" + return f"{method} {route} {status} {latency_ms:.1f}ms" def log_to_stderr_filter(record: logging.LogRecord) -> bool: diff --git a/python/ray/serve/_private/long_poll.py b/python/ray/serve/_private/long_poll.py index f3538913b76b..d6fb52e72310 100644 --- 
a/python/ray/serve/_private/long_poll.py +++ b/python/ray/serve/_private/long_poll.py @@ -4,6 +4,7 @@ import random from asyncio.events import AbstractEventLoop from collections import defaultdict +from collections.abc import Mapping from dataclasses import dataclass from enum import Enum, auto from typing import Any, Callable, DefaultDict, Dict, Optional, Set, Tuple, Union @@ -179,12 +180,12 @@ class LongPollHost: The desired use case is to embed this in an Ray actor. Client will be expected to call actor.listen_for_change.remote(...). On the host side, - you can call host.notify_changed(key, object) to update the state and + you can call host.notify_changed({key: object}) to update the state and potentially notify whoever is polling for these values. Internally, we use snapshot_ids for each object to identify client with outdated object and immediately return the result. If the client has the - up-to-date verison, then the listen_for_change call will only return when + up-to-date version, then the listen_for_change call will only return when the object is updated. """ @@ -306,15 +307,15 @@ async def listen_for_change( self._count_send(LongPollState.TIME_OUT) return LongPollState.TIME_OUT else: - updated_object_key: str = async_task_to_watched_keys[done.pop()] - updated_object = { - updated_object_key: UpdatedObject( + updated_objects = {} + for task in done: + updated_object_key = async_task_to_watched_keys[task] + updated_objects[updated_object_key] = UpdatedObject( self.object_snapshots[updated_object_key], self.snapshot_ids[updated_object_key], ) - } - self._count_send(updated_object) - return updated_object + self._count_send(updated_objects) + return updated_objects async def listen_for_change_java( self, @@ -403,21 +404,22 @@ def _listen_result_to_proto_bytes( proto = LongPollResult(**data) return proto.SerializeToString() - def notify_changed( - self, - object_key: KeyType, - updated_object: Any, - ): - try: - self.snapshot_ids[object_key] += 1 - except KeyError: - # Initial snapshot id must be >= 0, so that the long poll client - # can send a negative initial snapshot id to get a fast update. - # They should also be randomized; - # see https://github.com/ray-project/ray/pull/45881#discussion_r1645243485 - self.snapshot_ids[object_key] = random.randint(0, 1_000_000) - self.object_snapshots[object_key] = updated_object - logger.debug(f"LongPollHost: Notify change for key {object_key}.") - - for event in self.notifier_events.pop(object_key, set()): - event.set() + def notify_changed(self, updates: Mapping[KeyType, Any]) -> None: + """ + Update the current snapshot of some objects + and notify any long poll clients. + """ + for object_key, updated_object in updates.items(): + try: + self.snapshot_ids[object_key] += 1 + except KeyError: + # Initial snapshot id must be >= 0, so that the long poll client + # can send a negative initial snapshot id to get a fast update. 
+ # They should also be randomized; see + # https://github.com/ray-project/ray/pull/45881#discussion_r1645243485 + self.snapshot_ids[object_key] = random.randint(0, 1_000_000) + self.object_snapshots[object_key] = updated_object + logger.debug(f"LongPollHost: Notify change for key {object_key}.") + + for event in self.notifier_events.pop(object_key, set()): + event.set() diff --git a/python/ray/serve/_private/proxy.py b/python/ray/serve/_private/proxy.py index 614a8dc39508..2b5967a7a75f 100644 --- a/python/ray/serve/_private/proxy.py +++ b/python/ray/serve/_private/proxy.py @@ -454,9 +454,11 @@ async def proxy_request(self, proxy_request: ProxyRequest) -> ResponseGenerator: latency_ms = (time.time() - start_time) * 1000.0 if response_handler_info.should_record_access_log: + request_context = ray.serve.context._serve_request_context.get() logger.info( access_log_msg( method=proxy_request.method, + route=request_context.route, status=str(status.code), latency_ms=latency_ms, ), @@ -988,8 +990,7 @@ async def send_request_to_replica( status_code = str(asgi_message["status"]) status = ResponseStatus( code=status_code, - # TODO(edoakes): we need a more nuanced check than this. - is_error=status_code != "200", + is_error=not status_code.startswith("2"), ) expecting_trailers = asgi_message.get("trailers", False) elif asgi_message["type"] == "websocket.accept": @@ -1010,11 +1011,16 @@ async def send_request_to_replica( # the trailers message has been sent. if not asgi_message.get("more_trailers", False): response_generator.stop_checking_for_disconnect() - elif asgi_message["type"] == "websocket.disconnect": + elif asgi_message["type"] in [ + "websocket.close", + "websocket.disconnect", + ]: + status_code = str(asgi_message["code"]) status = ResponseStatus( - code=str(asgi_message["code"]), - # TODO(edoakes): we need a more nuanced check than this. - is_error=False, + code=status_code, + # All status codes are considered errors aside from: + # 1000 (CLOSE_NORMAL), 1001 (CLOSE_GOING_AWAY). + is_error=status_code not in ["1000", "1001"], ) response_generator.stop_checking_for_disconnect() diff --git a/python/ray/serve/_private/proxy_request_response.py b/python/ray/serve/_private/proxy_request_response.py index 8050c4be215d..0ca2235fd3dd 100644 --- a/python/ray/serve/_private/proxy_request_response.py +++ b/python/ray/serve/_private/proxy_request_response.py @@ -58,7 +58,8 @@ def request_type(self) -> str: @property def method(self) -> str: - return self.scope.get("method", "websocket").upper() + # WebSocket messages don't have a 'method' field. 
+ return self.scope.get("method", "WS").upper() @property def route_path(self) -> str: diff --git a/python/ray/serve/_private/replica.py b/python/ray/serve/_private/replica.py index b90c837b6cc0..23fc7d237f94 100644 --- a/python/ray/serve/_private/replica.py +++ b/python/ray/serve/_private/replica.py @@ -1,5 +1,6 @@ import asyncio import concurrent.futures +import functools import inspect import logging import os @@ -7,12 +8,23 @@ import threading import time import traceback +import warnings +from abc import ABC, abstractmethod from contextlib import contextmanager -from functools import wraps from importlib import import_module -from typing import Any, AsyncGenerator, Callable, Dict, Optional, Tuple, Union +from typing import ( + Any, + AsyncGenerator, + Callable, + Dict, + Generator, + Optional, + Tuple, + Union, +) import starlette.responses +from anyio import to_thread from starlette.types import ASGIApp, Message import ray @@ -37,11 +49,14 @@ HEALTH_CHECK_METHOD, RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE, RAY_SERVE_REPLICA_AUTOSCALING_METRIC_RECORD_PERIOD_S, + RAY_SERVE_RUN_SYNC_IN_THREADPOOL, + RAY_SERVE_RUN_SYNC_IN_THREADPOOL_WARNING, RECONFIGURE_METHOD, SERVE_CONTROLLER_NAME, SERVE_LOGGER_NAME, SERVE_NAMESPACE, ) +from ray.serve._private.default_impl import create_replica_impl from ray.serve._private.http_util import ( ASGIAppReplicaWrapper, ASGIArgs, @@ -231,31 +246,23 @@ def _add_autoscaling_metrics_point(self) -> None: ) -class ReplicaActor: - """Actor definition for replicas of Ray Serve deployments. +StatusCodeCallback = Callable[[str], None] - This class defines the interface that the controller and deployment handles - (i.e., from proxies and other replicas) use to interact with a replica. - All interaction with the user-provided callable is done via the - `UserCallableWrapper` class. - """ - - async def __init__( +class ReplicaBase(ABC): + def __init__( self, replica_id: ReplicaID, - serialized_deployment_def: bytes, - serialized_init_args: bytes, - serialized_init_kwargs: bytes, - deployment_config_proto_bytes: bytes, + deployment_def: Callable, + init_args: Tuple, + init_kwargs: Dict, + deployment_config: DeploymentConfig, version: DeploymentVersion, ): self._version = version self._replica_id = replica_id self._deployment_id = replica_id.deployment_id - self._deployment_config = DeploymentConfig.from_proto_bytes( - deployment_config_proto_bytes - ) + self._deployment_config = deployment_config self._component_name = f"{self._deployment_id.name}" if self._deployment_id.app_name: self._component_name = ( @@ -266,15 +273,12 @@ async def __init__( self._configure_logger_and_profilers(self._deployment_config.logging_config) self._event_loop = get_or_create_event_loop() - deployment_def = cloudpickle.loads(serialized_deployment_def) - if isinstance(deployment_def, str): - deployment_def = _load_deployment_def_from_import_path(deployment_def) - self._user_callable_wrapper = UserCallableWrapper( deployment_def, - cloudpickle.loads(serialized_init_args), - cloudpickle.loads(serialized_init_kwargs), + init_args, + init_kwargs, deployment_id=self._deployment_id, + run_sync_methods_in_threadpool=RAY_SERVE_RUN_SYNC_IN_THREADPOOL, ) # Guards against calling the user's callable constructor multiple times. 
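With `RAY_SERVE_RUN_SYNC_IN_THREADPOOL` enabled, the `UserCallableWrapper` above offloads sync user methods to a thread pool through `anyio` instead of calling them on the replica's asyncio loop. A minimal sketch of that mechanism, assuming a `CapacityLimiter` sized like `max_ongoing_requests` (the handler and limiter here are illustrative, not Serve's actual wrapper):

import time

import anyio
from anyio import to_thread

# Assumed bound on concurrent sync calls, analogous to max_ongoing_requests.
limiter = anyio.CapacityLimiter(4)


def blocking_handler(x: int) -> int:
    # Stands in for a user-defined sync method that would otherwise
    # block the replica's event loop.
    time.sleep(0.1)
    return x * 2


async def call_in_threadpool(x: int) -> int:
    # Offload the sync callable to a worker thread, bounded by the limiter.
    return await to_thread.run_sync(blocking_handler, x, limiter=limiter)


async def main() -> None:
    print([await call_in_threadpool(i) for i in range(3)])  # [0, 2, 4]


anyio.run(main)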
@@ -308,6 +312,7 @@ def _set_internal_replica_context(self, *, servable_object: Callable = None): def _configure_logger_and_profilers( self, logging_config: Union[None, Dict, LoggingConfig] ): + if logging_config is None: logging_config = {} if isinstance(logging_config, dict): @@ -330,18 +335,10 @@ def _configure_logger_and_profilers( component_id=self._component_id, ) - def push_proxy_handle(self, handle: ActorHandle): - pass - - def get_num_ongoing_requests(self) -> int: - """Fetch the number of ongoing requests at this replica (queue length). - - This runs on a separate thread (using a Ray concurrency group) so it will - not be blocked by user code. - """ + def get_num_ongoing_requests(self): return self._metrics_manager.get_num_ongoing_requests() - def _maybe_get_asgi_route( + def _maybe_get_http_route( self, request_metadata: RequestMetadata, request_args: Tuple[Any] ) -> Optional[str]: """Get the matched route string for ASGI apps to be used in logs & metrics. @@ -374,49 +371,43 @@ def _maybe_get_asgi_route( return route - @contextmanager - def _wrap_user_method_call( + def _maybe_get_http_method( self, request_metadata: RequestMetadata, request_args: Tuple[Any] - ): - """Context manager that wraps user method calls. + ) -> Optional[str]: + """Get the HTTP method to be used in logs & metrics. - 1) Sets the request context var with appropriate metadata. - 2) Records the access log message (if not disabled). - 3) Records per-request metrics via the metrics manager. + If this is not an HTTP request, returns None. """ - route = self._maybe_get_asgi_route(request_metadata, request_args) - ray.serve.context._serve_request_context.set( - ray.serve.context._RequestContext( - route=route, - request_id=request_metadata.request_id, - _internal_request_id=request_metadata.internal_request_id, - app_name=self._deployment_id.app_name, - multiplexed_model_id=request_metadata.multiplexed_model_id, - grpc_context=request_metadata.grpc_context, - ) - ) + if request_metadata.is_http_request: + req: StreamingHTTPRequest = request_args[0] + # WebSocket messages don't have a 'method' field. + return req.asgi_scope.get("method", "WS") + + return None + @contextmanager + def _handle_errors_and_metrics( + self, request_metadata: RequestMetadata, request_args: Tuple[Any] + ) -> Generator[StatusCodeCallback, None, None]: start_time = time.time() user_exception = None + + status_code = None + + def _status_code_callback(s: str): + nonlocal status_code + status_code = s + try: self._metrics_manager.inc_num_ongoing_requests() - yield + yield _status_code_callback except asyncio.CancelledError as e: user_exception = e - - # Recursively cancel child requests - requests_pending_assignment = ( - ray.serve.context._get_requests_pending_assignment( - request_metadata.internal_request_id - ) - ) - for task in requests_pending_assignment.values(): - task.cancel() + self._on_request_cancelled(request_metadata, e) except Exception as e: user_exception = e logger.exception("Request failed.") - if ray.util.pdb._is_ray_debugger_post_mortem_enabled(): - ray.util.pdb._post_mortem() + self._on_request_failed(request_metadata, e) finally: self._metrics_manager.dec_num_ongoing_requests() @@ -428,16 +419,21 @@ def _wrap_user_method_call( else: status_str = "ERROR" + http_method = self._maybe_get_http_method(request_metadata, request_args) + http_route = request_metadata.route + # Set in _wrap_user_method_call. 
logger.info( access_log_msg( - method=request_metadata.call_method, - status=status_str, + method=http_method or "CALL", + route=http_route or request_metadata.call_method, + # Prefer the HTTP status code if it was populated. + status=status_code or status_str, latency_ms=latency_ms, ), extra={"serve_access_log": True}, ) self._metrics_manager.record_request_metrics( - route=route, + route=http_route, status_str=status_str, latency_ms=latency_ms, was_error=user_exception is not None, @@ -451,6 +447,7 @@ async def _call_user_generator( request_metadata: RequestMetadata, request_args: Tuple[Any], request_kwargs: Dict[str, Any], + status_code_callback: StatusCodeCallback, ) -> AsyncGenerator[Any, None]: """Calls a user method for a streaming call and yields its results. @@ -476,6 +473,7 @@ def _enqueue_thread_safe(item: Any): ) ) + first_message_peeked = False while True: wait_for_message_task = self._event_loop.create_task( result_queue.wait_for_message() @@ -492,6 +490,16 @@ def _enqueue_thread_safe(item: Any): # and use vanilla pickle (we know it's safe because these messages # only contain primitive Python types). if request_metadata.is_http_request: + # Peek the first ASGI message to determine the status code. + if not first_message_peeked: + msg = messages[0] + first_message_peeked = True + if msg["type"] == "http.response.start": + # HTTP responses begin with exactly one + # "http.response.start" message containing the "status" + # field. Other response types like WebSockets may not. + status_code_callback(str(msg["status"])) + yield pickle.dumps(messages) else: for msg in messages: @@ -516,13 +524,8 @@ def _enqueue_thread_safe(item: Any): wait_for_message_task.cancel() async def handle_request( - self, - pickled_request_metadata: bytes, - *request_args, - **request_kwargs, + self, request_metadata: RequestMetadata, *request_args, **request_kwargs ) -> Tuple[bytes, Any]: - """Entrypoint for `stream=False` calls.""" - request_metadata = pickle.loads(pickled_request_metadata) with self._wrap_user_method_call(request_metadata, request_args): return await asyncio.wrap_future( self._user_callable_wrapper.call_user_method( @@ -531,40 +534,23 @@ async def handle_request( ) async def handle_request_streaming( - self, - pickled_request_metadata: bytes, - *request_args, - **request_kwargs, + self, request_metadata: RequestMetadata, *request_args, **request_kwargs ) -> AsyncGenerator[Any, None]: """Generator that is the entrypoint for all `stream=True` handle calls.""" - request_metadata = pickle.loads(pickled_request_metadata) - with self._wrap_user_method_call(request_metadata, request_args): + with self._wrap_user_method_call( + request_metadata, request_args + ) as status_code_callback: async for result in self._call_user_generator( request_metadata, request_args, request_kwargs, + status_code_callback=status_code_callback, ): yield result async def handle_request_with_rejection( - self, - pickled_request_metadata: bytes, - *request_args, - **request_kwargs, - ) -> AsyncGenerator[Any, None]: - """Entrypoint for all requests with strict max_ongoing_requests enforcement. - - The first response from this generator is always a system message indicating - if the request was accepted (the replica has capacity for the request) or - rejected (the replica is already at max_ongoing_requests). - - For non-streaming requests, there will only be one more message, the unary - result of the user request handler. 
- - For streaming requests, the subsequent messages will be the results of the - user request handler (which must be a generator). - """ - request_metadata = pickle.loads(pickled_request_metadata) + self, request_metadata: RequestMetadata, *request_args, **request_kwargs + ): limit = self._deployment_config.max_ongoing_requests num_ongoing_requests = self.get_num_ongoing_requests() if num_ongoing_requests >= limit: @@ -573,21 +559,19 @@ async def handle_request_with_rejection( f"rejecting request {request_metadata.request_id}.", extra={"log_to_stderr": False}, ) - yield pickle.dumps( - ReplicaQueueLengthInfo( - accepted=False, num_ongoing_requests=num_ongoing_requests - ) + yield ReplicaQueueLengthInfo( + accepted=False, num_ongoing_requests=num_ongoing_requests ) return - with self._wrap_user_method_call(request_metadata, request_args): - yield pickle.dumps( - ReplicaQueueLengthInfo( - accepted=True, - # NOTE(edoakes): `_wrap_user_method_call` will increment the number - # of ongoing requests to include this one, so re-fetch the value. - num_ongoing_requests=self.get_num_ongoing_requests(), - ) + with self._wrap_user_method_call( + request_metadata, request_args + ) as status_code_callback: + yield ReplicaQueueLengthInfo( + accepted=True, + # NOTE(edoakes): `_wrap_user_method_call` will increment the number + # of ongoing requests to include this one, so re-fetch the value. + num_ongoing_requests=self.get_num_ongoing_requests(), ) if request_metadata.is_streaming: @@ -595,6 +579,7 @@ async def handle_request_with_rejection( request_metadata, request_args, request_kwargs, + status_code_callback=status_code_callback, ): yield result else: @@ -604,81 +589,29 @@ async def handle_request_with_rejection( ) ) - async def handle_request_from_java( - self, - proto_request_metadata: bytes, - *request_args, - **request_kwargs, - ) -> Any: - from ray.serve.generated.serve_pb2 import ( - RequestMetadata as RequestMetadataProto, - ) - - proto = RequestMetadataProto.FromString(proto_request_metadata) - request_metadata: RequestMetadata = RequestMetadata( - request_id=proto.request_id, - internal_request_id=proto.internal_request_id, - call_method=proto.call_method, - multiplexed_model_id=proto.multiplexed_model_id, - route=proto.route, - ) - with self._wrap_user_method_call(request_metadata, request_args): - return await asyncio.wrap_future( - self._user_callable_wrapper.call_user_method( - request_metadata, request_args, request_kwargs - ) - ) - - async def is_allocated(self) -> str: - """poke the replica to check whether it's alive. - - When calling this method on an ActorHandle, it will complete as - soon as the actor has started running. We use this mechanism to - detect when a replica has been allocated a worker slot. - At this time, the replica can transition from PENDING_ALLOCATION - to PENDING_INITIALIZATION startup state. - - Returns: - The PID, actor ID, node ID, node IP, and log filepath id of the replica. - """ - - return ( - os.getpid(), - ray.get_runtime_context().get_actor_id(), - ray.get_runtime_context().get_worker_id(), - ray.get_runtime_context().get_node_id(), - ray.util.get_node_ip_address(), - get_component_logger_file_path(), - ) - - async def initialize_and_get_metadata( - self, - deployment_config: DeploymentConfig = None, - _after: Optional[Any] = None, - ) -> Tuple[DeploymentConfig, DeploymentVersion, Optional[float], Optional[int]]: - """Handles initializing the replica. 
+ @abstractmethod + async def _on_initialized(self): + raise NotImplementedError - Returns: 3-tuple containing - 1. DeploymentConfig of the replica - 2. DeploymentVersion of the replica - 3. Initialization duration in seconds - """ - # Unused `_after` argument is for scheduling: passing an ObjectRef - # allows delaying this call until after the `_after` call has returned. + async def initialize(self, deployment_config: DeploymentConfig): try: # Ensure that initialization is only performed once. # When controller restarts, it will call this method again. async with self._user_callable_initialized_lock: - initialization_start_time = time.time() + self._initialization_start_time = time.time() if not self._user_callable_initialized: self._user_callable_asgi_app = await asyncio.wrap_future( self._user_callable_wrapper.initialize_callable() ) + await self._on_initialized() self._user_callable_initialized = True - self._set_internal_replica_context( - servable_object=self._user_callable_wrapper.user_callable - ) + if deployment_config: + await asyncio.wrap_future( + self._user_callable_wrapper.set_sync_method_threadpool_limit( + deployment_config.max_ongoing_requests + ) + ) await asyncio.wrap_future( self._user_callable_wrapper.call_reconfigure( deployment_config.user_config @@ -689,20 +622,10 @@ async def initialize_and_get_metadata( # an initial health check. If an initial health check fails, # consider it an initialization failure. await self.check_health() - - # Save the initialization latency if the replica is initializing - # for the first time. - if self._initialization_latency is None: - self._initialization_latency = time.time() - initialization_start_time - - return self._get_metadata() except Exception: raise RuntimeError(traceback.format_exc()) from None - async def reconfigure( - self, - deployment_config: DeploymentConfig, - ) -> Tuple[DeploymentConfig, DeploymentVersion, Optional[float], Optional[int]]: + async def reconfigure(self, deployment_config: DeploymentConfig): try: user_config_changed = ( deployment_config.user_config != self._deployment_config.user_config @@ -722,6 +645,11 @@ async def reconfigure( if logging_config_changed: self._configure_logger_and_profilers(deployment_config.logging_config) + await asyncio.wrap_future( + self._user_callable_wrapper.set_sync_method_threadpool_limit( + deployment_config.max_ongoing_requests + ) + ) if user_config_changed: await asyncio.wrap_future( self._user_callable_wrapper.call_reconfigure( @@ -734,12 +662,10 @@ async def reconfigure( self._set_internal_replica_context( servable_object=self._user_callable_wrapper.user_callable ) - - return self._get_metadata() except Exception: raise RuntimeError(traceback.format_exc()) from None - def _get_metadata( + def get_metadata( self, ) -> Tuple[DeploymentConfig, DeploymentVersion, Optional[float], Optional[int]]: return ( @@ -749,26 +675,22 @@ def _get_metadata( self._port, ) - def _save_cpu_profile_data(self) -> str: - """Saves CPU profiling data, if CPU profiling is enabled. - - Logs a warning if CPU profiling is disabled. 
- """ + @abstractmethod + def _on_request_cancelled( + self, request_metadata: RequestMetadata, e: asyncio.CancelledError + ): + pass - if self.cpu_profiler is not None: - import marshal + @abstractmethod + def _on_request_failed(self, request_metadata: RequestMetadata, e: Exception): + pass - self.cpu_profiler.snapshot_stats() - with open(self.cpu_profiler_log, "wb") as f: - marshal.dump(self.cpu_profiler.stats, f) - logger.info(f'Saved CPU profile data to file "{self.cpu_profiler_log}"') - return self.cpu_profiler_log - else: - logger.error( - "Attempted to save CPU profile data, but failed because no " - "CPU profiler was running! Enable CPU profiling by enabling " - "the RAY_SERVE_ENABLE_CPU_PROFILING env var." - ) + @abstractmethod + @contextmanager + def _wrap_user_method_call( + self, request_metadata: RequestMetadata, request_args: Tuple[Any] + ) -> Generator[StatusCodeCallback, None, None]: + pass async def _drain_ongoing_requests(self): """Wait for any ongoing requests to finish. @@ -825,6 +747,254 @@ async def check_health(self): await asyncio.wrap_future(f) +class Replica(ReplicaBase): + async def _on_initialized(self): + self._set_internal_replica_context( + servable_object=self._user_callable_wrapper.user_callable + ) + + # Save the initialization latency if the replica is initializing + # for the first time. + if self._initialization_latency is None: + self._initialization_latency = time.time() - self._initialization_start_time + + def _on_request_cancelled( + self, request_metadata: RequestMetadata, e: asyncio.CancelledError + ): + """Recursively cancels child requests.""" + requests_pending_assignment = ( + ray.serve.context._get_requests_pending_assignment( + request_metadata.internal_request_id + ) + ) + for task in requests_pending_assignment.values(): + task.cancel() + + def _on_request_failed(self, request_metadata: RequestMetadata, e: Exception): + if ray.util.pdb._is_ray_debugger_post_mortem_enabled(): + ray.util.pdb._post_mortem() + + @contextmanager + def _wrap_user_method_call( + self, request_metadata: RequestMetadata, request_args: Tuple[Any] + ) -> Generator[StatusCodeCallback, None, None]: + """Context manager that wraps user method calls. + + 1) Sets the request context var with appropriate metadata. + 2) Records the access log message (if not disabled). + 3) Records per-request metrics via the metrics manager. + """ + request_metadata.route = self._maybe_get_http_route( + request_metadata, request_args + ) + ray.serve.context._serve_request_context.set( + ray.serve.context._RequestContext( + route=request_metadata.route, + request_id=request_metadata.request_id, + _internal_request_id=request_metadata.internal_request_id, + app_name=self._deployment_id.app_name, + multiplexed_model_id=request_metadata.multiplexed_model_id, + grpc_context=request_metadata.grpc_context, + ) + ) + + with self._handle_errors_and_metrics( + request_metadata, request_args + ) as status_code_callback: + yield status_code_callback + + +class ReplicaActor: + """Actor definition for replicas of Ray Serve deployments. + + This class defines the interface that the controller and deployment handles + (i.e., from proxies and other replicas) use to interact with a replica. + + All interaction with the user-provided callable is done via the + `UserCallableWrapper` class. 
+    """
+
+    async def __init__(
+        self,
+        replica_id: ReplicaID,
+        serialized_deployment_def: bytes,
+        serialized_init_args: bytes,
+        serialized_init_kwargs: bytes,
+        deployment_config_proto_bytes: bytes,
+        version: DeploymentVersion,
+    ):
+        deployment_config = DeploymentConfig.from_proto_bytes(
+            deployment_config_proto_bytes
+        )
+        deployment_def = cloudpickle.loads(serialized_deployment_def)
+        if isinstance(deployment_def, str):
+            deployment_def = _load_deployment_def_from_import_path(deployment_def)
+
+        self._replica_impl: ReplicaBase = create_replica_impl(
+            replica_id=replica_id,
+            deployment_def=deployment_def,
+            init_args=cloudpickle.loads(serialized_init_args),
+            init_kwargs=cloudpickle.loads(serialized_init_kwargs),
+            deployment_config=deployment_config,
+            version=version,
+        )
+
+    def push_proxy_handle(self, handle: ActorHandle):
+        pass
+
+    def get_num_ongoing_requests(self) -> int:
+        """Fetch the number of ongoing requests at this replica (queue length).
+
+        This runs on a separate thread (using a Ray concurrency group) so it will
+        not be blocked by user code.
+        """
+        return self._replica_impl.get_num_ongoing_requests()
+
+    async def is_allocated(self) -> Tuple[int, str, str, str, str, str]:
+        """Poke the replica to check whether it's alive.
+
+        When calling this method on an ActorHandle, it will complete as
+        soon as the actor has started running. We use this mechanism to
+        detect when a replica has been allocated a worker slot.
+        At this time, the replica can transition from PENDING_ALLOCATION
+        to PENDING_INITIALIZATION startup state.
+
+        Returns:
+            The PID, actor ID, worker ID, node ID, node IP, and log filepath
+            of the replica.
+        """
+
+        return (
+            os.getpid(),
+            ray.get_runtime_context().get_actor_id(),
+            ray.get_runtime_context().get_worker_id(),
+            ray.get_runtime_context().get_node_id(),
+            ray.util.get_node_ip_address(),
+            get_component_logger_file_path(),
+        )
+
+    async def initialize_and_get_metadata(
+        self, deployment_config: DeploymentConfig = None, _after: Optional[Any] = None
+    ):
+        """Handles initializing the replica.
+
+        Returns: 4-tuple containing
+            1. DeploymentConfig of the replica
+            2. DeploymentVersion of the replica
+            3. Initialization duration in seconds
+            4. Port the replica is listening on (if any)
+        """
+        # Unused `_after` argument is for scheduling: passing an ObjectRef
+        # allows delaying this call until after the `_after` call has returned.
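The `_after` comment above describes a common Ray idiom: because Ray resolves `ObjectRef` arguments before running a task, passing a ref from a previous call sequences the two calls without transferring any meaningful data. A hedged, self-contained sketch using a toy actor (not the real replica class):

```python
import ray


@ray.remote
class ToyReplica:
    def is_allocated(self) -> str:
        # Completes as soon as the actor is scheduled and running, so the
        # caller can use it as an allocation/liveness probe.
        return "allocated"

    def initialize_and_get_metadata(self, config, _after=None) -> dict:
        # `_after` is unused here; Ray already resolved it before this ran.
        return {"config": config, "init_latency_s": 0.0}


ray.init()
replica = ToyReplica.remote()
allocated_ref = replica.is_allocated.remote()
# This task is not run until `allocated_ref` is available.
metadata = ray.get(
    replica.initialize_and_get_metadata.remote(
        {"num_replicas": 1}, _after=allocated_ref
    )
)
print(metadata)
```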
+ await self._replica_impl.initialize(deployment_config) + return self._replica_impl.get_metadata() + + async def check_health(self): + await self._replica_impl.check_health() + + async def reconfigure( + self, deployment_config + ) -> Tuple[DeploymentConfig, DeploymentVersion, Optional[float], Optional[int]]: + await self._replica_impl.reconfigure(deployment_config) + return self._replica_impl.get_metadata() + + async def handle_request( + self, + pickled_request_metadata: bytes, + *request_args, + **request_kwargs, + ) -> Tuple[bytes, Any]: + """Entrypoint for `stream=False` calls.""" + request_metadata = pickle.loads(pickled_request_metadata) + return await self._replica_impl.handle_request( + request_metadata, *request_args, **request_kwargs + ) + + async def handle_request_streaming( + self, + pickled_request_metadata: bytes, + *request_args, + **request_kwargs, + ) -> AsyncGenerator[Any, None]: + """Generator that is the entrypoint for all `stream=True` handle calls.""" + request_metadata = pickle.loads(pickled_request_metadata) + async for result in self._replica_impl.handle_request_streaming( + request_metadata, *request_args, **request_kwargs + ): + yield result + + async def handle_request_with_rejection( + self, + pickled_request_metadata: bytes, + *request_args, + **request_kwargs, + ) -> AsyncGenerator[Any, None]: + """Entrypoint for all requests with strict max_ongoing_requests enforcement. + + The first response from this generator is always a system message indicating + if the request was accepted (the replica has capacity for the request) or + rejected (the replica is already at max_ongoing_requests). + + For non-streaming requests, there will only be one more message, the unary + result of the user request handler. + + For streaming requests, the subsequent messages will be the results of the + user request handler (which must be a generator). + """ + request_metadata = pickle.loads(pickled_request_metadata) + async for result in self._replica_impl.handle_request_with_rejection( + request_metadata, *request_args, **request_kwargs + ): + if isinstance(result, ReplicaQueueLengthInfo): + yield pickle.dumps(result) + else: + yield result + + async def handle_request_from_java( + self, + proto_request_metadata: bytes, + *request_args, + **request_kwargs, + ) -> Any: + from ray.serve.generated.serve_pb2 import ( + RequestMetadata as RequestMetadataProto, + ) + + proto = RequestMetadataProto.FromString(proto_request_metadata) + request_metadata: RequestMetadata = RequestMetadata( + request_id=proto.request_id, + internal_request_id=proto.internal_request_id, + call_method=proto.call_method, + multiplexed_model_id=proto.multiplexed_model_id, + route=proto.route, + ) + return await self._replica_impl.handle_request( + request_metadata, *request_args, **request_kwargs + ) + + async def perform_graceful_shutdown(self): + await self._replica_impl.perform_graceful_shutdown() + + def _save_cpu_profile_data(self) -> str: + """Saves CPU profiling data, if CPU profiling is enabled. + + Logs a warning if CPU profiling is disabled. + """ + + if self.cpu_profiler is not None: + import marshal + + self.cpu_profiler.snapshot_stats() + with open(self.cpu_profiler_log, "wb") as f: + marshal.dump(self.cpu_profiler.stats, f) + logger.info(f'Saved CPU profile data to file "{self.cpu_profiler_log}"') + return self.cpu_profiler_log + else: + logger.error( + "Attempted to save CPU profile data, but failed because no " + "CPU profiler was running! 
Enable CPU profiling by enabling " + "the RAY_SERVE_ENABLE_CPU_PROFILING env var." + ) + + class UserCallableWrapper: """Wraps a user-provided callable that is used to handle requests to a replica.""" @@ -835,6 +1005,7 @@ def __init__( init_kwargs: Dict, *, deployment_id: DeploymentID, + run_sync_methods_in_threadpool: bool, ): if not (inspect.isfunction(deployment_def) or inspect.isclass(deployment_def)): raise TypeError( @@ -848,6 +1019,8 @@ def __init__( self._is_function = inspect.isfunction(deployment_def) self._deployment_id = deployment_id self._destructor_called = False + self._run_sync_methods_in_threadpool = run_sync_methods_in_threadpool + self._warned_about_sync_method_change = False # Will be populated in `initialize_callable`. self._callable = None @@ -878,7 +1051,7 @@ def _run_on_user_code_event_loop(f: Callable) -> Callable: f ), "_run_on_user_code_event_loop can only be used on coroutine functions." - @wraps(f) + @functools.wraps(f) def wrapper(self, *args, **kwargs) -> concurrent.futures.Future: return asyncio.run_coroutine_threadsafe( f(self, *args, **kwargs), @@ -887,6 +1060,12 @@ def wrapper(self, *args, **kwargs) -> concurrent.futures.Future: return wrapper + @_run_on_user_code_event_loop + async def set_sync_method_threadpool_limit(self, limit: int): + # NOTE(edoakes): the limit is thread local, so this must + # be run on the user code event loop. + to_thread.current_default_thread_limiter().total_tokens = limit + def _get_user_callable_method(self, method_name: str) -> Callable: if self._is_function: return self._callable @@ -927,17 +1106,89 @@ async def _send_user_result_over_asgi( else: await Response(result).send(scope, receive, send) - async def _call_func_or_gen(self, callable: Callable, *args, **kwargs) -> Any: + async def _call_func_or_gen( + self, + callable: Callable, + *, + args: Optional[Tuple[Any]] = None, + kwargs: Optional[Dict[str, Any]] = None, + request_metadata: Optional[RequestMetadata] = None, + generator_result_callback: Optional[Callable] = None, + run_sync_methods_in_threadpool_override: Optional[bool] = None, + ) -> Tuple[Any, bool]: """Call the callable with the provided arguments. This is a convenience wrapper that will work for `def`, `async def`, generator, and async generator functions. + + Returns the result and a boolean indicating if the result was a sync generator + that has already been consumed. """ - result = callable(*args, **kwargs) - if inspect.iscoroutine(result): - result = await result + sync_gen_consumed = False + args = args if args is not None else tuple() + kwargs = kwargs if kwargs is not None else dict() + run_sync_in_threadpool = ( + self._run_sync_methods_in_threadpool + if run_sync_methods_in_threadpool_override is None + else run_sync_methods_in_threadpool_override + ) + is_sync_method = ( + inspect.isfunction(callable) or inspect.ismethod(callable) + ) and not ( + inspect.iscoroutinefunction(callable) + or inspect.isasyncgenfunction(callable) + ) - return result + if is_sync_method and run_sync_in_threadpool: + is_generator = inspect.isgeneratorfunction(callable) + if is_generator: + sync_gen_consumed = True + if request_metadata and not request_metadata.is_streaming: + # TODO(edoakes): make this check less redundant with the one in + # _handle_user_method_result. + raise TypeError( + f"Method '{callable.__name__}' returned a generator. " + "You must use `handle.options(stream=True)` to call " + "generators on a deployment." 
+ ) + + def run_callable(): + result = callable(*args, **kwargs) + if is_generator: + for r in result: + # TODO(edoakes): make this less redundant with the handling in + # _handle_user_method_result. + if request_metadata and request_metadata.is_grpc_request: + r = (request_metadata.grpc_context, r.SerializeToString()) + generator_result_callback(r) + + result = None + + return result + + # NOTE(edoakes): we use anyio.to_thread here because it's what Starlette + # uses (and therefore FastAPI too). The max size of the threadpool is + # set to max_ongoing_requests in the replica wrapper. + # anyio.to_thread propagates ContextVars to the worker thread automatically. + result = await to_thread.run_sync(run_callable) + else: + if ( + is_sync_method + and not self._warned_about_sync_method_change + and run_sync_methods_in_threadpool_override is None + ): + self._warned_about_sync_method_change = True + warnings.warn( + RAY_SERVE_RUN_SYNC_IN_THREADPOOL_WARNING.format( + method_name=callable.__name__, + ) + ) + + result = callable(*args, **kwargs) + if inspect.iscoroutine(result): + result = await result + + return result, sync_gen_consumed @property def user_callable(self) -> Optional[Callable]: @@ -974,8 +1225,10 @@ async def initialize_callable(self) -> Optional[ASGIApp]: self._callable = self._deployment_def.__new__(self._deployment_def) await self._call_func_or_gen( self._callable.__init__, - *self._init_args, - **self._init_kwargs, + args=self._init_args, + kwargs=self._init_kwargs, + # Always run the constructor on the main user code thread. + run_sync_methods_in_threadpool_override=False, ) if isinstance(self._callable, ASGIAppReplicaWrapper): @@ -997,7 +1250,7 @@ async def initialize_callable(self) -> Optional[ASGIApp]: def _raise_if_not_initialized(self, method_name: str): if self._callable is None: raise RuntimeError( - "`initialize_callable` must be called before `{method_name}`." + f"`initialize_callable` must be called before `{method_name}`." ) def call_user_health_check(self) -> Optional[concurrent.futures.Future]: @@ -1037,7 +1290,7 @@ async def call_reconfigure(self, user_config: Any): ) await self._call_func_or_gen( getattr(self._callable, RECONFIGURE_METHOD), - user_config, + args=(user_config,), ) def _prepare_args_for_http_request( @@ -1109,6 +1362,7 @@ async def _handle_user_method_result( user_method_name: str, request_metadata: RequestMetadata, *, + sync_gen_consumed: bool, generator_result_callback: Optional[Callable], is_asgi_app: bool, asgi_args: Optional[ASGIArgs], @@ -1142,7 +1396,7 @@ async def _handle_user_method_result( # For the FastAPI codepath, the response has already been sent over # ASGI, but for the vanilla deployment codepath we need to send it. await self._send_user_result_over_asgi(result, asgi_args) - elif not request_metadata.is_http_request: + elif not request_metadata.is_http_request and not sync_gen_consumed: # If a unary method is called with stream=True for anything EXCEPT # an HTTP request, raise an error. 
# HTTP requests are always streaming regardless of if the method @@ -1227,19 +1481,32 @@ async def call_user_method( request_args[0], request_metadata, user_method_params ) - result = await self._handle_user_method_result( - await self._call_func_or_gen( - user_method, *request_args, **request_kwargs - ), + result, sync_gen_consumed = await self._call_func_or_gen( + user_method, + args=request_args, + kwargs=request_kwargs, + request_metadata=request_metadata, + generator_result_callback=generator_result_callback + if request_metadata.is_streaming + else None, + ) + return await self._handle_user_method_result( + result, user_method_name, request_metadata, + sync_gen_consumed=sync_gen_consumed, generator_result_callback=generator_result_callback, is_asgi_app=is_asgi_app, asgi_args=asgi_args, ) except Exception: - if request_metadata.is_http_request and asgi_args is not None: + if ( + request_metadata.is_http_request + and asgi_args is not None + # If the callable is an ASGI app, it already sent a 500 status response. + and not is_asgi_app + ): await self._send_user_result_over_asgi( starlette.responses.Response( "Internal Server Error", status_code=500 @@ -1252,8 +1519,6 @@ async def call_user_method( if receive_task is not None and not receive_task.done(): receive_task.cancel() - return result - @_run_on_user_code_event_loop async def call_destructor(self): """Explicitly call the `__del__` method of the user callable. @@ -1277,7 +1542,11 @@ async def call_destructor(self): try: if hasattr(self._callable, "__del__"): # Make sure to accept `async def __del__(self)` as well. - await self._call_func_or_gen(self._callable.__del__) + await self._call_func_or_gen( + self._callable.__del__, + # Always run the destructor on the main user callable thread. + run_sync_methods_in_threadpool_override=False, + ) if hasattr(self._callable, "__serve_multiplex_wrapper"): await getattr(self._callable, "__serve_multiplex_wrapper").shutdown() diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 9cd8c10f5f82..85d391c95d52 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -327,7 +327,7 @@ def assign_request( pass @abstractmethod - def shutdown(self): + def shutdown(self) -> concurrent.futures.Future: pass @@ -680,7 +680,7 @@ def assign_request( loop=self._asyncio_loop, ) - def shutdown(self): - asyncio.run_coroutine_threadsafe( + def shutdown(self) -> concurrent.futures.Future: + return asyncio.run_coroutine_threadsafe( self._asyncio_router.shutdown(), loop=self._asyncio_loop - ).result() + ) diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index 182795889d47..13b92c7fcaae 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -474,6 +474,7 @@ def _run( else: client = _private_api.serve_start( http_options={"location": "EveryNode"}, + global_logging_config=logging_config, ) # Record after Ray has been started. 
ServeUsageTag.API_VERSION.record("v2") diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 0eba1c5dc5ee..ead2e174948b 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -76,16 +76,6 @@ def __init__( extra={"log_to_stderr": False}, ) - def _get_or_create_router(self) -> Router: - if self._router is None: - self._router = self._create_router( - handle_id=self.handle_id, - deployment_id=self.deployment_id, - handle_options=self.init_options, - ) - - return self._router - @staticmethod def _gen_handle_tag(app_name: str, deployment_name: str, handle_id: str): if app_name: @@ -150,8 +140,13 @@ def _init(self, **kwargs): f"was initialized with {self.init_options}." ) - self.init_options = create_init_handle_options(**kwargs) - self._get_or_create_router() + init_options = create_init_handle_options(**kwargs) + self._router = self._create_router( + handle_id=self.handle_id, + deployment_id=self.deployment_id, + handle_options=init_options, + ) + self.init_options = init_options # Record handle api telemetry when not in the proxy if ( @@ -209,7 +204,13 @@ def __getattr__(self, name): def shutdown(self): if self._router: - self._router.shutdown() + shutdown_future = self._router.shutdown() + shutdown_future.result() + + async def shutdown_async(self): + if self._router: + shutdown_future = self._router.shutdown() + await asyncio.wrap_future(shutdown_future) def __repr__(self): return f"{self.__class__.__name__}" f"(deployment='{self.deployment_name}')" diff --git a/python/ray/serve/tests/BUILD b/python/ray/serve/tests/BUILD index 369b9a339c6f..9f3208084538 100644 --- a/python/ray/serve/tests/BUILD +++ b/python/ray/serve/tests/BUILD @@ -467,3 +467,25 @@ py_test_module_list( "//python/ray/serve:serve_lib", ], ) + + +# Test currently off-by-default behavior to run replica sync methods in a threadpool. +# TODO(edoakes): remove this once the FF is flipped on by default. +py_test_module_list( + size = "small", + env = {"RAY_SERVE_RUN_SYNC_IN_THREADPOOL": "1"}, + files = [ + "test_replica_sync_methods.py", + ], + name_suffix = "_with_run_sync_in_threadpool", + tags = [ + "exclusive", + "no_windows", + "team:serve", + ], + deps = [ + ":common", + ":conftest", + "//python/ray/serve:serve_lib", + ], +) diff --git a/python/ray/serve/tests/test_cli.py b/python/ray/serve/tests/test_cli.py index 943024cfbe38..2b2c0ff279dd 100644 --- a/python/ray/serve/tests/test_cli.py +++ b/python/ray/serve/tests/test_cli.py @@ -606,6 +606,34 @@ def check_for_failed_deployment(): wait_for_condition(check_for_failed_deployment) +@pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") +def test_status_constructor_retry_error(ray_start_stop): + """Deploys Serve deployment that errors out in constructor, checks that the + retry message is surfaced. 
+ """ + + config_file_name = os.path.join( + os.path.dirname(__file__), "test_config_files", "deployment_fail_2.yaml" + ) + + subprocess.check_output(["serve", "deploy", config_file_name]) + + def check_for_failed_deployment(): + cli_output = subprocess.check_output( + ["serve", "status", "-a", "http://localhost:52365/"] + ) + status = yaml.safe_load(cli_output)["applications"][SERVE_DEFAULT_APP_NAME] + assert status["status"] == "DEPLOYING" + + deployment_status = status["deployments"]["A"] + assert deployment_status["status"] == "UPDATING" + assert deployment_status["status_trigger"] == "CONFIG_UPDATE_STARTED" + assert "ZeroDivisionError" in deployment_status["message"] + return True + + wait_for_condition(check_for_failed_deployment) + + @pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") def test_status_package_unavailable_in_controller(ray_start_stop): """Test that exceptions raised from packages that are installed on deployment actors diff --git a/python/ray/serve/tests/test_config_files/deployment_fail_2.yaml b/python/ray/serve/tests/test_config_files/deployment_fail_2.yaml new file mode 100644 index 000000000000..38da015b0bb3 --- /dev/null +++ b/python/ray/serve/tests/test_config_files/deployment_fail_2.yaml @@ -0,0 +1,3 @@ +applications: + - name: default + import_path: ray.serve.tests.test_config_files.fail_2.node diff --git a/python/ray/serve/tests/test_config_files/fail_2.py b/python/ray/serve/tests/test_config_files/fail_2.py new file mode 100644 index 000000000000..2e95aa93d98f --- /dev/null +++ b/python/ray/serve/tests/test_config_files/fail_2.py @@ -0,0 +1,13 @@ +import time + +from ray import serve + + +@serve.deployment +class A: + def __init__(self): + time.sleep(5) + 1 / 0 + + +node = A.bind() diff --git a/python/ray/serve/tests/test_controller_recovery.py b/python/ray/serve/tests/test_controller_recovery.py index 0042323221b3..51d641dbedfc 100644 --- a/python/ray/serve/tests/test_controller_recovery.py +++ b/python/ray/serve/tests/test_controller_recovery.py @@ -64,7 +64,7 @@ def __call__(self, *args): replica_version_hash = None for replica in deployment_dict[id]: - ref = replica.actor_handle._get_metadata.remote() + ref = replica.actor_handle.initialize_and_get_metadata.remote() _, version, _, _ = ray.get(ref) if replica_version_hash is None: replica_version_hash = hash(version) @@ -116,7 +116,7 @@ def __call__(self, *args): # Ensure recovered replica version has are the same for replica_name in recovered_replica_names: actor_handle = ray.get_actor(replica_name, namespace=SERVE_NAMESPACE) - ref = actor_handle._get_metadata.remote() + ref = actor_handle.initialize_and_get_metadata.remote() _, version, _, _ = ray.get(ref) assert replica_version_hash == hash( version @@ -487,7 +487,7 @@ def check_proxy_handle_in_controller(): resp = requests.get("http://127.0.0.1:8000") assert resp.status_code == 200 wait_for_condition( - check_log_file, log_file=file_path, expected_regex=['.*"message":.*GET 200.*'] + check_log_file, log_file=file_path, expected_regex=['.*"message":.*GET / 200.*'] ) diff --git a/python/ray/serve/tests/test_gcs_failure.py b/python/ray/serve/tests/test_gcs_failure.py index cb582a3df51c..3dabb58a99fc 100644 --- a/python/ray/serve/tests/test_gcs_failure.py +++ b/python/ray/serve/tests/test_gcs_failure.py @@ -30,9 +30,14 @@ def serve_ha(external_redis, monkeypatch): # noqa: F811 ) serve.start() yield (address_info, _get_global_client()) - ray.shutdown() + + # When GCS is down, right now some core worker members are not 
cleared + # properly in ray.shutdown. + ray.worker._global_node.start_gcs_server() + # Clear cache and global serve client serve.shutdown() + ray.shutdown() @pytest.mark.skipif( @@ -127,6 +132,7 @@ def router_populated_with_replicas( else: replicas = get_replicas_func() + print(f"Replica set in router: {replicas}") assert len(replicas) >= threshold # Return early if we don't need to check cache @@ -299,7 +305,4 @@ def test_proxy_router_updated_replicas_then_gcs_failure(serve_ha): if __name__ == "__main__": - # When GCS is down, right now some core worker members are not cleared - # properly in ray.shutdown. Given that this is not hi-pri issue, - # using --forked for isolation. - sys.exit(pytest.main(["-v", "-s", "--forked", __file__])) + sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_handle_2.py b/python/ray/serve/tests/test_handle_2.py index cc58f970f5b7..6b238d8211d9 100644 --- a/python/ray/serve/tests/test_handle_2.py +++ b/python/ray/serve/tests/test_handle_2.py @@ -472,5 +472,30 @@ async def _assert_one_waiter(): tasks = pending +def test_shutdown(serve_instance): + @serve.deployment + class Hi: + def __call__(self): + return "hi" + + h = serve.run(Hi.bind()) + assert h.remote().result() == "hi" + + h.shutdown() + + +@pytest.mark.asyncio +async def test_shutdown_async(serve_instance): + @serve.deployment + class Hi: + def __call__(self): + return "hi" + + h = serve.run(Hi.bind()) + assert await h.remote() == "hi" + + await h.shutdown_async() + + if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_logging.py b/python/ray/serve/tests/test_logging.py index b5b723a6d987..24f9a47a01a1 100644 --- a/python/ray/serve/tests/test_logging.py +++ b/python/ray/serve/tests/test_logging.py @@ -14,13 +14,15 @@ import pytest import requests import starlette +from fastapi import FastAPI +from starlette.responses import PlainTextResponse import ray import ray.util.state as state_api from ray import serve from ray._private.ray_logging.formatters import JSONFormatter from ray._private.test_utils import wait_for_condition -from ray.serve._private.common import ReplicaID, ServeComponentType +from ray.serve._private.common import DeploymentID, ReplicaID, ServeComponentType from ray.serve._private.constants import SERVE_LOG_EXTRA_FIELDS, SERVE_LOGGER_NAME from ray.serve._private.logging_utils import ( ServeComponentFilter, @@ -97,6 +99,97 @@ def __call__(self): assert rotation_config["backup_count"] == backup_count +def test_http_access_log(serve_instance): + name = "deployment_name" + + fastapi_app = FastAPI() + + @serve.deployment(name=name) + @serve.ingress(fastapi_app) + class Handler: + def __init__(self): + self._replica_unique_id = serve.get_replica_context().replica_id.unique_id + + @fastapi_app.get("/") + def get_root(self): + return PlainTextResponse(self._replica_unique_id) + + @fastapi_app.post("/") + def post_root(self): + return PlainTextResponse(self._replica_unique_id) + + @fastapi_app.get("/{status}") + def template(self, status: str): + return PlainTextResponse(self._replica_unique_id, status_code=int(status)) + + @fastapi_app.put("/fail") + def fail(self): + raise RuntimeError("OOPS!") + + serve.run(Handler.bind()) + + f = io.StringIO() + with redirect_stderr(f): + + def check_log( + replica_id: ReplicaID, + method: str, + route: str, + status_code: str, + fail: bool = False, + ): + s = f.getvalue() + return all( + [ + name in s, + _get_expected_replica_log_content(replica_id) in s, + f"-- 
{method} {route} {status_code}" in s, + "ms" in s, + ("OOPS!" in s and "RuntimeError" in s) + if fail + else True, # Check for stacktrace. + ] + ) + + r = requests.get("http://localhost:8000/") + assert r.status_code == 200 + replica_id = ReplicaID(unique_id=r.text, deployment_id=DeploymentID(name=name)) + wait_for_condition( + check_log, replica_id=replica_id, method="GET", route="/", status_code="200" + ) + + r = requests.post("http://localhost:8000/") + assert r.status_code == 200 + wait_for_condition( + check_log, + replica_id=replica_id, + method="POST", + route="/", + status_code="200", + ) + + r = requests.get("http://localhost:8000/350") + assert r.status_code == 350 + wait_for_condition( + check_log, + replica_id=replica_id, + method="GET", + route="/{status}", + status_code="350", + ) + + r = requests.put("http://localhost:8000/fail") + assert r.status_code == 500 + wait_for_condition( + check_log, + replica_id=replica_id, + method="PUT", + route="/fail", + status_code="500", + fail=True, + ) + + def test_handle_access_log(serve_instance): name = "handler" @@ -122,7 +215,7 @@ def check_log(replica_id: ReplicaID, method_name: str, fail: bool = False): [ name in s, _get_expected_replica_log_content(replica_id) in s, - method_name.upper() in s, + method_name in s, ("ERROR" if fail else "OK") in s, "ms" in s, ("blah blah blah" in s and "RuntimeError" in s) @@ -258,6 +351,9 @@ def fn(*args): "actor_id": ray.get_runtime_context().get_actor_id(), "worker_id": ray.get_runtime_context().get_worker_id(), "node_id": ray.get_runtime_context().get_node_id(), + "task_name": ray.get_runtime_context().get_task_name(), + "task_func_name": ray.get_runtime_context().get_task_function_name(), + "actor_name": ray.get_runtime_context().get_actor_name(), } @serve.deployment( @@ -276,6 +372,9 @@ def __call__(self, req: starlette.requests.Request): "actor_id": ray.get_runtime_context().get_actor_id(), "worker_id": ray.get_runtime_context().get_worker_id(), "node_id": ray.get_runtime_context().get_node_id(), + "task_name": ray.get_runtime_context().get_task_name(), + "task_func_name": ray.get_runtime_context().get_task_function_name(), + "actor_name": ray.get_runtime_context().get_actor_name(), } serve.run(fn.bind(), name="app1", route_prefix="/fn") @@ -288,15 +387,14 @@ def __call__(self, req: starlette.requests.Request): # Check the component log expected_log_infos = [ - f"{resp['request_id']} {resp['route']} replica.py", - f"{resp2['request_id']} {resp2['route']} replica.py", + f"{resp['request_id']} -- ", + f"{resp2['request_id']} -- ", ] # Check User log user_log_regexes = [ - f".*{resp['request_id']} {resp['route']}.* user func.*", - f".*{resp2['request_id']} {resp2['route']}.* user log " - "message from class method.*", + f".*{resp['request_id']} -- user func.*", + f".*{resp2['request_id']} -- user log.*" "message from class method.*", ] def check_log(): @@ -326,6 +424,9 @@ def check_log(): f'"worker_id": "{resp["worker_id"]}", ' f'"node_id": "{resp["node_id"]}", ' f'"actor_id": "{resp["actor_id"]}", ' + f'"task_name": "{resp["task_name"]}", ' + f'"task_func_name": "{resp["task_func_name"]}", ' + f'"actor_name": "{resp["actor_name"]}", ' f'"deployment": "{resp["app_name"]}_fn", ' f'"replica": "{method_replica_id}", ' f'"component_name": "replica".*' @@ -338,17 +439,17 @@ def check_log(): f'"worker_id": "{resp2["worker_id"]}", ' f'"node_id": "{resp2["node_id"]}", ' f'"actor_id": "{resp2["actor_id"]}", ' + f'"task_name": "{resp2["task_name"]}", ' + f'"task_func_name": "{resp2["task_func_name"]}", ' + 
f'"actor_name": "{resp2["actor_name"]}", ' f'"deployment": "{resp2["app_name"]}_Model", ' f'"replica": "{class_method_replica_id}", ' f'"component_name": "replica".*' ) else: - user_method_log_regex = ( - f".*{resp['request_id']} {resp['route']}.* user func.*" - ) + user_method_log_regex = f".*{resp['request_id']} -- user func.*" user_class_method_log_regex = ( - f".*{resp2['request_id']} {resp2['route']}.* " - "user log message from class method.*" + f".*{resp2['request_id']} -- .*" "user log message from class method.*" ) def check_log_file(log_file: str, expected_regex: list): diff --git a/python/ray/serve/tests/test_long_poll.py b/python/ray/serve/tests/test_long_poll.py index 86bf03880e33..2ba31d414e05 100644 --- a/python/ray/serve/tests/test_long_poll.py +++ b/python/ray/serve/tests/test_long_poll.py @@ -38,7 +38,7 @@ def test_notifier_events_cleared_without_update(serve_instance): host = ray.remote(LongPollHost).remote( listen_for_change_request_timeout_s=(0.1, 0.1) ) - ray.get(host.notify_changed.remote("key_1", 999)) + ray.get(host.notify_changed.remote({"key_1": 999})) # Get an initial object snapshot for the key. object_ref = host.listen_for_change.remote({"key_1": -1}) @@ -60,8 +60,8 @@ def test_host_standalone(serve_instance): host = ray.remote(LongPollHost).remote() # Write two values - ray.get(host.notify_changed.remote("key_1", 999)) - ray.get(host.notify_changed.remote("key_2", 999)) + ray.get(host.notify_changed.remote({"key_1": 999})) + ray.get(host.notify_changed.remote({"key_2": 999})) object_ref = host.listen_for_change.remote({"key_1": -1, "key_2": -1}) # We should be able to get the result immediately @@ -77,7 +77,7 @@ def test_host_standalone(serve_instance): assert len(not_done) == 1 # Now update the value, we should immediately get updated value - ray.get(host.notify_changed.remote("key_2", 999)) + ray.get(host.notify_changed.remote({"key_2": 999})) result = ray.get(object_ref) assert len(result) == 1 assert "key_2" in result @@ -88,13 +88,13 @@ def test_long_poll_wait_for_keys(serve_instance): # are set. 
host = ray.remote(LongPollHost).remote()
     object_ref = host.listen_for_change.remote({"key_1": -1, "key_2": -1})
-    ray.get(host.notify_changed.remote("key_1", 999))
-    ray.get(host.notify_changed.remote("key_2", 999))
-    # We should be able to get the one of the result immediately
+    ray.get(host.notify_changed.remote({"key_1": 123, "key_2": 456}))
+
+    # We should be able to get both results immediately
     result: Dict[str, UpdatedObject] = ray.get(object_ref)
-    assert set(result.keys()).issubset({"key_1", "key_2"})
-    assert {v.object_snapshot for v in result.values()} == {999}
+    assert result.keys() == {"key_1", "key_2"}
+    assert {v.object_snapshot for v in result.values()} == {123, 456}
 
 
 def test_long_poll_restarts(serve_instance):
@@ -106,7 +106,7 @@ class RestartableLongPollHost:
     def __init__(self) -> None:
         print("actor started")
         self.host = LongPollHost()
-        self.host.notify_changed("timer", time.time())
+        self.host.notify_changed({"timer": time.time()})
         self.should_exit = False
 
     async def listen_for_change(self, key_to_ids):
@@ -142,8 +142,8 @@ async def test_client_callbacks(serve_instance):
     host = ray.remote(LongPollHost).remote()
 
     # Write two values
-    ray.get(host.notify_changed.remote("key_1", 100))
-    ray.get(host.notify_changed.remote("key_2", 999))
+    ray.get(host.notify_changed.remote({"key_1": 100}))
+    ray.get(host.notify_changed.remote({"key_2": 999}))
 
     callback_results = dict()
 
@@ -167,7 +167,7 @@ def key_2_callback(result):
         timeout=1,
     )
 
-    ray.get(host.notify_changed.remote("key_2", 1999))
+    ray.get(host.notify_changed.remote({"key_2": 1999}))
 
     await async_wait_for_condition(
         lambda: callback_results == {"key_1": 100, "key_2": 999},
@@ -178,7 +178,7 @@ def key_2_callback(result):
 @pytest.mark.asyncio
 async def test_client_threadsafe(serve_instance):
     host = ray.remote(LongPollHost).remote()
-    ray.get(host.notify_changed.remote("key_1", 100))
+    ray.get(host.notify_changed.remote({"key_1": 100}))
 
     e = asyncio.Event()
 
@@ -198,7 +198,7 @@ def key_1_callback(_):
 
 def test_listen_for_change_java(serve_instance):
     host = ray.remote(LongPollHost).remote()
-    ray.get(host.notify_changed.remote("key_1", 999))
+    ray.get(host.notify_changed.remote({"key_1": 999}))
     request_1 = {"keys_to_snapshot_ids": {"key_1": -1}}
     object_ref = host.listen_for_change_java.remote(
         LongPollRequest(**request_1).SerializeToString()
     )
@@ -211,7 +211,7 @@ def test_listen_for_change_java(serve_instance):
     endpoints: Dict[DeploymentID, EndpointInfo] = dict()
     endpoints["deployment_name"] = EndpointInfo(route="/test/xlang/poll")
     endpoints["deployment_name1"] = EndpointInfo(route="/test/xlang/poll1")
-    ray.get(host.notify_changed.remote(LongPollNamespace.ROUTE_TABLE, endpoints))
+    ray.get(host.notify_changed.remote({LongPollNamespace.ROUTE_TABLE: endpoints}))
     object_ref_2 = host.listen_for_change_java.remote(
         LongPollRequest(**request_2).SerializeToString()
     )
@@ -240,7 +240,7 @@ def test_listen_for_change_java(serve_instance):
     ]
     ray.get(
         host.notify_changed.remote(
-            (LongPollNamespace.RUNNING_REPLICAS, "deployment_name"), replicas
+            {(LongPollNamespace.RUNNING_REPLICAS, "deployment_name"): replicas}
         )
    )
    object_ref_3 = host.listen_for_change_java.remote(
diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py
index f93e37661394..6f64666a96ba 100644
--- a/python/ray/serve/tests/test_metrics.py
+++ b/python/ray/serve/tests/test_metrics.py
@@ -6,7 +6,11 @@ import grpc
 import pytest
 import requests
-from fastapi import FastAPI
+from fastapi import FastAPI, WebSocket
+from starlette.requests import Request
+from starlette.responses import PlainTextResponse
+from websockets.exceptions import ConnectionClosed
+from websockets.sync.client import connect
 
 import ray
 import ray.util.state as state_api
@@ -583,6 +587,161 @@ def f(*args):
     print("serve_grpc_request_latency_ms_sum working as expected.")
 
 
+def test_proxy_metrics_http_status_code_is_error(serve_start_shutdown):
+    """Verify that 2xx status codes aren't errors, others are."""
+
+    def check_request_count_metrics(
+        expected_error_count: int,
+        expected_success_count: int,
+    ):
+        resp = requests.get("http://127.0.0.1:9999").text
+        error_count = 0
+        success_count = 0
+        for line in resp.split("\n"):
+            if line.startswith("ray_serve_num_http_error_requests_total"):
+                error_count += int(float(line.split(" ")[-1]))
+            if line.startswith("ray_serve_num_http_requests_total"):
+                success_count += int(float(line.split(" ")[-1]))
+
+        assert error_count == expected_error_count
+        assert success_count == expected_success_count
+        return True
+
+    @serve.deployment
+    async def return_status_code(request: Request):
+        code = int((await request.body()).decode("utf-8"))
+        return PlainTextResponse("", status_code=code)
+
+    serve.run(return_status_code.bind())
+
+    # 200 is not an error.
+    r = requests.get("http://127.0.0.1:8000/", data=b"200")
+    assert r.status_code == 200
+    wait_for_condition(
+        check_request_count_metrics,
+        expected_error_count=0,
+        expected_success_count=1,
+    )
+
+    # 2xx is not an error.
+    r = requests.get("http://127.0.0.1:8000/", data=b"250")
+    assert r.status_code == 250
+    wait_for_condition(
+        check_request_count_metrics,
+        expected_error_count=0,
+        expected_success_count=2,
+    )
+
+    # 3xx is an error.
+    r = requests.get("http://127.0.0.1:8000/", data=b"300")
+    assert r.status_code == 300
+    wait_for_condition(
+        check_request_count_metrics,
+        expected_error_count=1,
+        expected_success_count=3,
+    )
+
+    # 4xx is an error.
+    r = requests.get("http://127.0.0.1:8000/", data=b"400")
+    assert r.status_code == 400
+    wait_for_condition(
+        check_request_count_metrics,
+        expected_error_count=2,
+        expected_success_count=4,
+    )
+
+    # 5xx is an error.
+    r = requests.get("http://127.0.0.1:8000/", data=b"500")
+    assert r.status_code == 500
+    wait_for_condition(
+        check_request_count_metrics,
+        expected_error_count=3,
+        expected_success_count=5,
+    )
+
+
+def test_proxy_metrics_websocket_status_code_is_error(serve_start_shutdown):
+    """Verify that status codes aside from 1000 or 1001 are errors."""
+
+    def check_request_count_metrics(
+        expected_error_count: int,
+        expected_success_count: int,
+    ):
+        resp = requests.get("http://127.0.0.1:9999").text
+        error_count = 0
+        success_count = 0
+        for line in resp.split("\n"):
+            if line.startswith("ray_serve_num_http_error_requests_total"):
+                error_count += int(float(line.split(" ")[-1]))
+            if line.startswith("ray_serve_num_http_requests_total"):
+                success_count += int(float(line.split(" ")[-1]))
+
+        assert error_count == expected_error_count
+        assert success_count == expected_success_count
+        return True
+
+    fastapi_app = FastAPI()
+
+    @serve.deployment
+    @serve.ingress(fastapi_app)
+    class WebSocketServer:
+        @fastapi_app.websocket("/")
+        async def accept_then_close(self, ws: WebSocket):
+            await ws.accept()
+            code = int(await ws.receive_text())
+            await ws.close(code=code)
+
+    serve.run(WebSocketServer.bind())
+
+    # Regular disconnect (1000) is not an error.
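The WebSocket checks that follow hinge on close-code semantics: 1000 (normal closure) and 1001 (going away) count as successful requests, while any other code increments the error counter. A tiny sketch of that classification rule, mirroring the assertions below rather than the proxy's actual implementation:

```python
def is_websocket_error(close_code: int) -> bool:
    # 1000 = normal closure, 1001 = going away; everything else is an error.
    return close_code not in (1000, 1001)


assert not is_websocket_error(1000)
assert not is_websocket_error(1001)
assert is_websocket_error(1011)  # internal error
assert is_websocket_error(3000)  # registered/custom codes also count as errors
```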
+ with connect("ws://localhost:8000/") as ws: + with pytest.raises(ConnectionClosed): + ws.send("1000") + ws.recv() + + wait_for_condition( + check_request_count_metrics, + expected_error_count=0, + expected_success_count=1, + ) + + # Goaway disconnect (1001) is not an error. + with connect("ws://localhost:8000/") as ws: + with pytest.raises(ConnectionClosed): + ws.send("1001") + ws.recv() + + wait_for_condition( + check_request_count_metrics, + expected_error_count=0, + expected_success_count=2, + ) + + # Other codes are errors. + with connect("ws://localhost:8000/") as ws: + with pytest.raises(ConnectionClosed): + ws.send("1011") + ws.recv() + + wait_for_condition( + check_request_count_metrics, + expected_error_count=1, + expected_success_count=3, + ) + + # Other codes are errors. + with connect("ws://localhost:8000/") as ws: + with pytest.raises(ConnectionClosed): + ws.send("3000") + ws.recv() + + wait_for_condition( + check_request_count_metrics, + expected_error_count=2, + expected_success_count=4, + ) + + def test_replica_metrics_fields(serve_start_shutdown): """Test replica metrics fields""" @@ -1581,7 +1740,7 @@ def test_long_poll_host_sends_counted(serve_instance): ) # Write a value. - ray.get(host.notify_changed.remote("key_1", 999)) + ray.get(host.notify_changed.remote({"key_1": 999})) object_ref = host.listen_for_change.remote({"key_1": -1}) # Check that the result's size is reported. @@ -1595,8 +1754,8 @@ def test_long_poll_host_sends_counted(serve_instance): ) # Write two new values. - ray.get(host.notify_changed.remote("key_1", 1000)) - ray.get(host.notify_changed.remote("key_2", 1000)) + ray.get(host.notify_changed.remote({"key_1": 1000})) + ray.get(host.notify_changed.remote({"key_2": 1000})) object_ref = host.listen_for_change.remote( {"key_1": result_1["key_1"].snapshot_id, "key_2": -1} ) diff --git a/python/ray/serve/tests/test_multiplex.py b/python/ray/serve/tests/test_multiplex.py index 994605565fd5..1da243af2212 100644 --- a/python/ray/serve/tests/test_multiplex.py +++ b/python/ray/serve/tests/test_multiplex.py @@ -21,7 +21,7 @@ def _get_replica_scheduler(handle: DeploymentHandle) -> ReplicaScheduler: # TODO(edoakes): we shouldn't be reaching into private fields, but better # to isolate it to one place (this function). 
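Both metrics tests above scrape Prometheus' line-oriented text exposition format and sum counters by name prefix, via their `check_request_count_metrics` helpers. A small standalone sketch of that parsing pattern (the sample payload is made up; the real tests fetch `http://127.0.0.1:9999`):

```python
def sum_counter(prometheus_text: str, metric_name: str) -> int:
    # Each sample is one line: `<name>{<labels>} <value>`; summing across
    # label sets gives the total count for the metric.
    total = 0
    for line in prometheus_text.splitlines():
        if line.startswith(metric_name):
            total += int(float(line.split(" ")[-1]))
    return total


sample = (
    'ray_serve_num_http_requests_total{route="/a"} 3.0\n'
    'ray_serve_num_http_requests_total{route="/b"} 2.0\n'
    'ray_serve_num_http_error_requests_total{route="/a"} 1.0\n'
)
assert sum_counter(sample, "ray_serve_num_http_requests_total") == 5
assert sum_counter(sample, "ray_serve_num_http_error_requests_total") == 1
```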
- return handle._get_or_create_router()._asyncio_router._replica_scheduler + return handle._router._asyncio_router._replica_scheduler @pytest.fixture() diff --git a/python/ray/serve/tests/test_replica_sync_methods.py b/python/ray/serve/tests/test_replica_sync_methods.py new file mode 100644 index 000000000000..d6485704138f --- /dev/null +++ b/python/ray/serve/tests/test_replica_sync_methods.py @@ -0,0 +1,127 @@ +import asyncio +import sys + +import pytest +import requests +from anyio import to_thread +from fastapi import FastAPI +from starlette.responses import PlainTextResponse + +import ray +from ray import serve +from ray._private.test_utils import SignalActor, wait_for_condition +from ray.serve._private.constants import RAY_SERVE_RUN_SYNC_IN_THREADPOOL + + +@pytest.mark.skipif( + not RAY_SERVE_RUN_SYNC_IN_THREADPOOL, + reason="Run sync method in threadpool FF disabled.", +) +@pytest.mark.parametrize("use_fastapi", [False, True]) +def test_not_running_in_asyncio_loop(serve_instance, use_fastapi: bool): + if use_fastapi: + fastapi_app = FastAPI() + + @serve.deployment + @serve.ingress(fastapi_app) + class D: + @fastapi_app.get("/") + def root(self): + with pytest.raises(RuntimeError, match="no running event loop"): + asyncio.get_running_loop() + + else: + + @serve.deployment + class D: + def __call__(self) -> str: + with pytest.raises(RuntimeError, match="no running event loop"): + asyncio.get_running_loop() + + serve.run(D.bind()) + # Would error if the check fails. + requests.get("http://localhost:8000/").raise_for_status() + + +@pytest.mark.skipif( + not RAY_SERVE_RUN_SYNC_IN_THREADPOOL, + reason="Run sync method in threadpool FF disabled.", +) +def test_concurrent_execution(serve_instance): + signal_actor = SignalActor.remote() + + @serve.deployment + class D: + def do_sync(self): + ray.get(signal_actor.wait.remote()) + + async def do_async(self): + await signal_actor.wait.remote() + + h = serve.run(D.bind()) + + sync_results = [h.do_sync.remote(), h.do_sync.remote()] + async_results = [h.do_async.remote(), h.do_async.remote()] + + wait_for_condition(lambda: ray.get(signal_actor.cur_num_waiters.remote()) == 4) + ray.get(signal_actor.send.remote()) + [r.result() for r in sync_results + async_results] + + +@pytest.mark.skipif( + not RAY_SERVE_RUN_SYNC_IN_THREADPOOL, + reason="Run sync method in threadpool FF disabled.", +) +@pytest.mark.parametrize("use_fastapi", [False, True]) +def test_context_vars_propagated(serve_instance, use_fastapi: bool): + if use_fastapi: + fastapi_app = FastAPI() + + @serve.deployment + @serve.ingress(fastapi_app) + class D: + @fastapi_app.get("/") + def root(self): + return PlainTextResponse( + serve.context._serve_request_context.get().request_id + ) + + else: + + @serve.deployment + class D: + def __call__(self) -> str: + return PlainTextResponse( + serve.context._serve_request_context.get().request_id + ) + + serve.run(D.bind()) + + r = requests.get("http://localhost:8000/", headers={"X-Request-Id": "TEST-ID"}) + r.raise_for_status() + # If context vars weren't propagated, the request ID would be empty. + assert r.text == "TEST-ID" + + +@pytest.mark.skipif( + not RAY_SERVE_RUN_SYNC_IN_THREADPOOL, + reason="Run sync method in threadpool FF disabled.", +) +def test_thread_limit_set_to_max_ongoing_requests(serve_instance): + @serve.deployment + class D: + async def __call__(self): + return to_thread.current_default_thread_limiter().total_tokens + + h = serve.run(D.bind()) + + # Check that it's set if max_ongoing_requests is defaulted. 
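The new test file above exercises two properties of the sync-methods-in-threadpool feature flag: sync methods see no running event loop yet still see request-scoped ContextVars, and the worker pool's capacity follows `max_ongoing_requests` (the assertion just below checks the defaulted value of 5). A standalone sketch of both anyio behaviors, illustrative only and not Serve's internal code:

```python
import asyncio
import contextvars

from anyio import to_thread

request_id = contextvars.ContextVar("request_id", default="")


def sync_work() -> str:
    # Runs in a worker thread: there is no running asyncio loop here, but
    # anyio copies the caller's context, so the ContextVar is still visible.
    return request_id.get()


async def main():
    request_id.set("TEST-ID")
    # Serve pins this limiter to max_ongoing_requests; here we just show that
    # the default limiter's capacity is adjustable.
    to_thread.current_default_thread_limiter().total_tokens = 10
    assert await to_thread.run_sync(sync_work) == "TEST-ID"


asyncio.run(main())
```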
+ assert h.remote().result() == 5 + + # Update to a custom value, check again. + h = serve.run(D.options(max_ongoing_requests=10).bind()) + assert h.remote().result() == 10 + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/unit/test_deployment_state.py b/python/ray/serve/tests/unit/test_deployment_state.py index dfeb9fc7524c..42facaf40282 100644 --- a/python/ray/serve/tests/unit/test_deployment_state.py +++ b/python/ray/serve/tests/unit/test_deployment_state.py @@ -2492,9 +2492,7 @@ def create_deployment_state( check_counts(ds1, total=3, by_state=[(ReplicaState.STOPPING, 3, None)]) assert ds1._replica_constructor_retry_counter == 3 - # An error message should show up after - # 3 * num_replicas startup failures. - assert "" == ds1.curr_status_info.message + assert "Retrying 6 more time(s)" in ds1.curr_status_info.message # Set all of ds1's replicas to stopped. for replica in ds1._replicas.get(): @@ -2512,7 +2510,7 @@ def create_deployment_state( assert ds1.curr_status_info.status == DeploymentStatus.UPDATING check_counts(ds1, total=3, by_state=[(ReplicaState.STOPPING, 3, None)]) assert ds1._replica_constructor_retry_counter == 6 - assert "" == ds1.curr_status_info.message + assert "Retrying 3 more time(s)" in ds1.curr_status_info.message # Set all of ds1's replicas to stopped. for replica in ds1._replicas.get(): @@ -2527,7 +2525,7 @@ def create_deployment_state( assert ds1.curr_status_info.status == DeploymentStatus.UPDATING check_counts(ds1, total=3, by_state=[(ReplicaState.STOPPING, 3, None)]) assert ds1._replica_constructor_retry_counter == 9 - assert "" == ds1.curr_status_info.message + assert "Retrying 0 more time(s)" in ds1.curr_status_info.message # Set all of ds1's replicas to stopped. 
for replica in ds1._replicas.get(): @@ -2540,7 +2538,7 @@ def create_deployment_state( assert ds1.curr_status_info.status == DeploymentStatus.UNHEALTHY check_counts(ds1, total=0) assert ds1._replica_constructor_retry_counter == 9 - assert "Replica scheduling failed" in ds1.curr_status_info.message + assert "The deployment failed to start" in ds1.curr_status_info.message def test_deploy_with_transient_constructor_failure(mock_deployment_state_manager): diff --git a/python/ray/serve/tests/unit/test_proxy_request_response.py b/python/ray/serve/tests/unit/test_proxy_request_response.py index 70e2fdb2d581..7b2c4388b657 100644 --- a/python/ray/serve/tests/unit/test_proxy_request_response.py +++ b/python/ray/serve/tests/unit/test_proxy_request_response.py @@ -57,7 +57,7 @@ def test_method(self): """ proxy_request = self.create_asgi_proxy_request(scope={}) assert isinstance(proxy_request, ProxyRequest) - assert proxy_request.method == "WEBSOCKET" + assert proxy_request.method == "WS" method = "fake-method" proxy_request = self.create_asgi_proxy_request(scope={"method": method}) diff --git a/python/ray/serve/tests/unit/test_user_callable_wrapper.py b/python/ray/serve/tests/unit/test_user_callable_wrapper.py index b03c9ca7e39e..39188f32c421 100644 --- a/python/ray/serve/tests/unit/test_user_callable_wrapper.py +++ b/python/ray/serve/tests/unit/test_user_callable_wrapper.py @@ -4,7 +4,7 @@ import sys import threading from dataclasses import dataclass -from typing import AsyncGenerator, Callable, Generator, Optional +from typing import Any, AsyncGenerator, Callable, Dict, Generator, Optional, Tuple import pytest from fastapi import FastAPI @@ -90,13 +90,18 @@ async def basic_async_generator(n: int, raise_exception: bool = False): def _make_user_callable_wrapper( - callable: Optional[Callable] = None, *init_args, **init_kwargs + callable: Optional[Callable] = None, + *, + init_args: Optional[Tuple[Any]] = None, + init_kwargs: Optional[Dict[str, Any]] = None, + run_sync_methods_in_threadpool: bool = False, ) -> UserCallableWrapper: return UserCallableWrapper( callable if callable is not None else BasicClass, - init_args, - init_kwargs, + init_args or tuple(), + init_kwargs or dict(), deployment_id=DeploymentID(name="test_name"), + run_sync_methods_in_threadpool=run_sync_methods_in_threadpool, ) @@ -144,8 +149,11 @@ def test_calling_methods_before_initialize(): user_callable_wrapper.call_reconfigure(None).result() -def test_basic_class_callable(): - user_callable_wrapper = _make_user_callable_wrapper() +@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True]) +def test_basic_class_callable(run_sync_methods_in_threadpool: bool): + user_callable_wrapper = _make_user_callable_wrapper( + run_sync_methods_in_threadpool=run_sync_methods_in_threadpool + ) user_callable_wrapper.initialize_callable().result() @@ -215,8 +223,11 @@ def test_basic_class_callable(): ).result() -def test_basic_class_callable_generators(): - user_callable_wrapper = _make_user_callable_wrapper() +@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True]) +def test_basic_class_callable_generators(run_sync_methods_in_threadpool: bool): + user_callable_wrapper = _make_user_callable_wrapper( + run_sync_methods_in_threadpool=run_sync_methods_in_threadpool + ) user_callable_wrapper.initialize_callable().result() result_list = [] @@ -291,9 +302,12 @@ def test_basic_class_callable_generators(): assert result_list == [0] +@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True]) 
@pytest.mark.parametrize("fn", [basic_sync_function, basic_async_function])
-def test_basic_function_callable(fn: Callable):
-    user_callable_wrapper = _make_user_callable_wrapper(fn)
+def test_basic_function_callable(fn: Callable, run_sync_methods_in_threadpool: bool):
+    user_callable_wrapper = _make_user_callable_wrapper(
+        fn, run_sync_methods_in_threadpool=run_sync_methods_in_threadpool
+    )
     user_callable_wrapper.initialize_callable().result()
 
     # Call non-generator function with is_streaming.
@@ -325,9 +339,14 @@ def test_basic_function_callable(fn: Callable):
     ).result()
 
 
+@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True])
 @pytest.mark.parametrize("fn", [basic_sync_generator, basic_async_generator])
-def test_basic_function_callable_generators(fn: Callable):
-    user_callable_wrapper = _make_user_callable_wrapper(fn)
+def test_basic_function_callable_generators(
+    fn: Callable, run_sync_methods_in_threadpool: bool
+):
+    user_callable_wrapper = _make_user_callable_wrapper(
+        fn, run_sync_methods_in_threadpool=run_sync_methods_in_threadpool
+    )
     user_callable_wrapper.initialize_callable().result()
 
     result_list = []
@@ -366,36 +385,68 @@ def test_basic_function_callable_generators(fn: Callable):
 
 
 @pytest.mark.asyncio
-async def test_user_code_runs_on_separate_loop():
+@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True])
+async def test_user_code_runs_on_separate_loop(run_sync_methods_in_threadpool: bool):
     main_loop = asyncio.get_running_loop()
 
     class GetLoop:
         def __init__(self):
             self._constructor_loop = asyncio.get_running_loop()
 
-        def check_health(self):
+        async def check_health(self):
             check_health_loop = asyncio.get_running_loop()
             assert (
                 check_health_loop == self._constructor_loop
             ), "User constructor and health check should run on the same loop."
             return check_health_loop
 
-        def __call__(self) -> asyncio.AbstractEventLoop:
+        async def call_async(self) -> Optional[asyncio.AbstractEventLoop]:
             user_method_loop = asyncio.get_running_loop()
             assert (
                 user_method_loop == self._constructor_loop
             ), "User constructor and other methods should run on the same loop."
+
             return user_method_loop
 
-    user_callable_wrapper = _make_user_callable_wrapper(GetLoop)
+        def call_sync(self):
+            if run_sync_methods_in_threadpool:
+                with pytest.raises(RuntimeError, match="no running event loop"):
+                    asyncio.get_running_loop()
+
+                user_method_loop = None
+            else:
+                user_method_loop = asyncio.get_running_loop()
+                assert (
+                    user_method_loop == self._constructor_loop
+                ), "User constructor and other methods should run on the same loop."
+
+            return user_method_loop
+
+    user_callable_wrapper = _make_user_callable_wrapper(
+        GetLoop, run_sync_methods_in_threadpool=run_sync_methods_in_threadpool
+    )
     user_callable_wrapper.initialize_callable().result()
-    request_metadata = _make_request_metadata()
+
+    # Async methods should all run on the same loop.
+    request_metadata = _make_request_metadata(call_method="call_async")
     user_code_loop = user_callable_wrapper.call_user_method(
         request_metadata, tuple(), dict()
     ).result()
     assert isinstance(user_code_loop, asyncio.AbstractEventLoop)
     assert user_code_loop != main_loop
 
+    # Sync methods should run on the same event loop when
+    # run_sync_methods_in_threadpool is off; otherwise they run in a worker
+    # thread with no asyncio loop at all.
+ request_metadata = _make_request_metadata(call_method="call_sync") + user_code_loop = user_callable_wrapper.call_user_method( + request_metadata, tuple(), dict() + ).result() + if run_sync_methods_in_threadpool: + assert user_code_loop is None + else: + assert isinstance(user_code_loop, asyncio.AbstractEventLoop) + assert user_code_loop != main_loop + # `check_health` method asserts that it runs on the correct loop. user_callable_wrapper.call_user_health_check().result() @@ -412,7 +463,7 @@ def __call__(self) -> str: msg = "hello world" user_callable_wrapper = _make_user_callable_wrapper( AsyncInitializer, - msg, + init_args=(msg,), ) user_callable_wrapper.initialize_callable().result() request_metadata = _make_request_metadata() @@ -498,8 +549,11 @@ def stream(self, msg: serve_pb2.UserDefinedMessage): yield serve_pb2.UserDefinedResponse(greeting=f"Hello {msg.greeting} {i}!") -def test_grpc_unary_request(): - user_callable_wrapper = _make_user_callable_wrapper(gRPCClass) +@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True]) +def test_grpc_unary_request(run_sync_methods_in_threadpool: bool): + user_callable_wrapper = _make_user_callable_wrapper( + gRPCClass, run_sync_methods_in_threadpool=run_sync_methods_in_threadpool + ) user_callable_wrapper.initialize_callable().result() grpc_request = gRPCRequest( @@ -518,8 +572,11 @@ def test_grpc_unary_request(): @pytest.mark.asyncio -def test_grpc_streaming_request(): - user_callable_wrapper = _make_user_callable_wrapper(gRPCClass) +@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True]) +def test_grpc_streaming_request(run_sync_methods_in_threadpool: bool): + user_callable_wrapper = _make_user_callable_wrapper( + gRPCClass, run_sync_methods_in_threadpool=run_sync_methods_in_threadpool + ) user_callable_wrapper.initialize_callable() grpc_request = gRPCRequest( diff --git a/python/ray/tests/conftest.py b/python/ray/tests/conftest.py index 47557bc36a32..d13c5a89b2d5 100644 --- a/python/ray/tests/conftest.py +++ b/python/ray/tests/conftest.py @@ -1,6 +1,7 @@ """ This file defines the common pytest fixtures used in current directory. 
""" + import json import logging import os @@ -34,6 +35,8 @@ redis_replicas, get_redis_cli, start_redis_instance, + start_redis_sentinel_instance, + redis_sentinel_replicas, find_available_port, wait_for_condition, find_free_port, @@ -201,6 +204,34 @@ def redis_alive(port, enable_tls): return False +def start_redis_with_sentinel(db_dir): + temp_dir = ray._private.utils.get_ray_temp_dir() + + redis_ports = find_available_port(49159, 55535, redis_sentinel_replicas() + 1) + sentinel_port = redis_ports[0] + master_port = redis_ports[1] + redis_processes = [ + start_redis_instance(temp_dir, p, listen_to_localhost_only=True, db_dir=db_dir)[ + 1 + ] + for p in redis_ports[1:] + ] + + # ensure all redis servers are up + for port in redis_ports[1:]: + wait_for_condition(redis_alive, 3, 100, port=port, enable_tls=False) + + # setup replicas of the master + for port in redis_ports[2:]: + redis_cli = get_redis_cli(port, False) + redis_cli.replicaof("127.0.0.1", master_port) + sentinel_process = start_redis_sentinel_instance( + temp_dir, sentinel_port, master_port + ) + address_str = f"127.0.0.1:{sentinel_port}" + return address_str, redis_processes + [sentinel_process] + + def start_redis(db_dir): retry_num = 0 while True: @@ -289,10 +320,14 @@ def kill_all_redis_server(): @contextmanager -def _setup_redis(request): +def _setup_redis(request, with_sentinel=False): with tempfile.TemporaryDirectory() as tmpdirname: kill_all_redis_server() - address_str, processes = start_redis(tmpdirname) + address_str, processes = ( + start_redis_with_sentinel(tmpdirname) + if with_sentinel + else start_redis(tmpdirname) + ) old_addr = os.environ.get("RAY_REDIS_ADDRESS") os.environ["RAY_REDIS_ADDRESS"] = address_str import uuid @@ -332,6 +367,12 @@ def external_redis(request): yield +@pytest.fixture +def external_redis_with_sentinel(request): + with _setup_redis(request, True): + yield + + @pytest.fixture def shutdown_only(maybe_external_redis): yield None @@ -535,6 +576,15 @@ def ray_start_cluster_head_with_external_redis(request, external_redis): yield res +@pytest.fixture +def ray_start_cluster_head_with_external_redis_sentinel( + request, external_redis_with_sentinel +): + param = getattr(request, "param", {}) + with _ray_start_cluster(do_init=True, num_nodes=1, **param) as res: + yield res + + @pytest.fixture def ray_start_cluster_head_with_env_vars(request, maybe_external_redis, monkeypatch): param = getattr(request, "param", {}) diff --git a/python/ray/tests/kuberay/test_autoscaling_config.py b/python/ray/tests/kuberay/test_autoscaling_config.py index 980f266bc9c5..699df522eb1a 100644 --- a/python/ray/tests/kuberay/test_autoscaling_config.py +++ b/python/ray/tests/kuberay/test_autoscaling_config.py @@ -71,7 +71,7 @@ def _get_basic_autoscaling_config() -> dict: "type": "kuberay", }, "available_node_types": { - "head-group": { + "headgroup": { "max_workers": 0, "min_workers": 0, "node_config": {}, @@ -84,7 +84,7 @@ def _get_basic_autoscaling_config() -> dict: }, "small-group": { "max_workers": 300, - "min_workers": 1, + "min_workers": 0, "node_config": {}, "resources": { "CPU": 1, @@ -97,7 +97,7 @@ def _get_basic_autoscaling_config() -> dict: # and modified max_workers. "gpu-group": { "max_workers": 200, - "min_workers": 1, + "min_workers": 0, "node_config": {}, "resources": { "CPU": 1, @@ -111,7 +111,7 @@ def _get_basic_autoscaling_config() -> dict: # and modified max_workers and node_config. 
"tpu-group": { "max_workers": 4, - "min_workers": 1, + "min_workers": 0, "node_config": {}, "resources": { "CPU": 1, @@ -127,7 +127,7 @@ def _get_basic_autoscaling_config() -> dict: "cluster_synced_files": [], "file_mounts": {}, "file_mounts_sync_continuously": False, - "head_node_type": "head-group", + "head_node_type": "headgroup", "head_setup_commands": [], "head_start_ray_commands": [], "idle_timeout_minutes": 1.0, diff --git a/python/ray/tests/kuberay/test_kuberay_node_provider.py b/python/ray/tests/kuberay/test_kuberay_node_provider.py index 4d2f94c1d086..3d5ea52009cb 100644 --- a/python/ray/tests/kuberay/test_kuberay_node_provider.py +++ b/python/ray/tests/kuberay/test_kuberay_node_provider.py @@ -126,7 +126,7 @@ def test_create_node_cap_at_max( { "raycluster-autoscaler-head-8zsc8": NodeData( kind="head", - type="head-group", + type="headgroup", replica_index=None, ip="10.4.2.6", status="up-to-date", @@ -149,7 +149,7 @@ def test_create_node_cap_at_max( { "raycluster-autoscaler-head-8zsc8": NodeData( kind="head", - type="head-group", + type="headgroup", replica_index=None, ip="10.4.2.6", status="up-to-date", @@ -217,7 +217,7 @@ def mock_get(node_provider, path): { "raycluster-autoscaler-head-8zsc8": NodeData( kind="head", - type="head-group", + type="headgroup", replica_index=None, ip="10.4.2.6", status="up-to-date", diff --git a/python/ray/tests/test_advanced.py b/python/ray/tests/test_advanced.py index a3cc669c8c90..8642099b042b 100644 --- a/python/ray/tests/test_advanced.py +++ b/python/ray/tests/test_advanced.py @@ -125,11 +125,11 @@ def test_internal_get_local_ongoing_lineage_reconstruction_tasks( ray_start_cluster_enabled, ): cluster = ray_start_cluster_enabled - cluster.add_node(resources={"head": 1}) + cluster.add_node(resources={"head": 2}) ray.init(address=cluster.address) - worker1 = cluster.add_node(resources={"worker": 1}) + worker1 = cluster.add_node(resources={"worker": 2}) - @ray.remote(resources={"head": 1}) + @ray.remote(num_cpus=0, resources={"head": 1}) class Counter: def __init__(self): self.count = 0 @@ -138,7 +138,9 @@ def inc(self): self.count = self.count + 1 return self.count - @ray.remote(max_retries=-1, num_cpus=0, resources={"worker": 1}) + @ray.remote( + max_retries=-1, num_cpus=0, resources={"worker": 1}, _labels={"key1": "value1"} + ) def task(counter): count = ray.get(counter.inc.remote()) if count > 1: @@ -146,10 +148,31 @@ def task(counter): time.sleep(100000) return [1] * 1024 * 1024 - counter = Counter.remote() - obj = task.remote(counter) + @ray.remote( + max_restarts=-1, + max_task_retries=-1, + num_cpus=0, + resources={"worker": 1}, + _labels={"key2": "value2"}, + ) + class Actor: + def run(self, counter): + count = ray.get(counter.inc.remote()) + if count > 1: + # lineage reconstruction + time.sleep(100000) + return [1] * 1024 * 1024 + + counter1 = Counter.remote() + obj1 = task.remote(counter1) # Wait for task to finish - ray.wait([obj], fetch_local=False) + ray.wait([obj1], fetch_local=False) + + counter2 = Counter.remote() + actor = Actor.remote() + obj2 = actor.run.remote(counter2) + # Wait for actor task to finish + ray.wait([obj2], fetch_local=False) assert len(get_local_ongoing_lineage_reconstruction_tasks()) == 0 @@ -158,16 +181,27 @@ def task(counter): def verify(expected_task_status): lineage_reconstruction_tasks = get_local_ongoing_lineage_reconstruction_tasks() - return ( - len(lineage_reconstruction_tasks) == 1 - and lineage_reconstruction_tasks[0][0].name == "task" - and lineage_reconstruction_tasks[0][0].resources == {"worker": 
1.0} + lineage_reconstruction_tasks.sort(key=lambda task: task[0].name) + assert len(lineage_reconstruction_tasks) == 2 + assert [ + lineage_reconstruction_tasks[0][0].name, + lineage_reconstruction_tasks[1][0].name, + ] == ["Actor.run", "task"] + assert ( + lineage_reconstruction_tasks[0][0].labels == {"key2": "value2"} and lineage_reconstruction_tasks[0][0].status == expected_task_status and lineage_reconstruction_tasks[0][1] == 1 ) + assert ( + lineage_reconstruction_tasks[1][0].labels == {"key1": "value1"} + and lineage_reconstruction_tasks[1][0].status == expected_task_status + and lineage_reconstruction_tasks[1][1] == 1 + ) + + return True wait_for_condition(lambda: verify(common_pb2.TaskStatus.PENDING_NODE_ASSIGNMENT)) - cluster.add_node(resources={"worker": 1}) + cluster.add_node(resources={"worker": 2}) wait_for_condition(lambda: verify(common_pb2.TaskStatus.SUBMITTED_TO_WORKER)) diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py index 001f38393390..55b6ec73f21d 100644 --- a/python/ray/tests/test_basic.py +++ b/python/ray/tests/test_basic.py @@ -581,7 +581,7 @@ def foo(): # TODO(suquark): The current implementation of `.options()` is so bad that we # cannot even access its options from outside. Here we hack the closures to # achieve our goal. Need futher efforts to clean up the tech debt. - assert f2.remote.__closure__[1].cell_contents == { + assert f2.remote.__closure__[2].cell_contents == { "_metadata": {"namespace": {"a": 11, "b": 2, "c": 3}}, "num_cpus": 1, "num_gpus": 1, @@ -593,7 +593,7 @@ def __init__(self, **options): f3 = foo.options(num_cpus=1, num_gpus=1, **mock_options2(a=11, c=3)) - assert f3.remote.__closure__[1].cell_contents == { + assert f3.remote.__closure__[2].cell_contents == { "_metadata": {"namespace": {"a": 1, "b": 2}, "namespace2": {"a": 11, "c": 3}}, "num_cpus": 1, "num_gpus": 1, diff --git a/python/ray/tests/test_cli_logger.py b/python/ray/tests/test_cli_logger.py index b164f5dc3725..bced27abee01 100644 --- a/python/ray/tests/test_cli_logger.py +++ b/python/ray/tests/test_cli_logger.py @@ -1,4 +1,6 @@ from ray.autoscaler._private import cli_logger +import io +from unittest.mock import patch import pytest @@ -14,6 +16,14 @@ def test_colorful_mock_random_function(): assert cm.bold("abc") == "abc" +def test_pathname(): + # Ensure that the `pathname` of the `LogRecord` points to the + # caller of `cli_logger`, not `cli_logger` itself. 
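The comment above describes the contract; mechanically, Python's logging fills LogRecord.pathname from the stack frame that issued the logging call, so a wrapper layer like cli_logger has to skip its own frames or every record points at the wrapper file. A minimal stdlib-only sketch of that frame-skipping technique (the log_info wrapper is hypothetical, not part of this patch):

import logging

logging.basicConfig(
    level=logging.INFO, format="%(pathname)s:%(lineno)d %(message)s"
)
_logger = logging.getLogger("demo")


def log_info(msg: str) -> None:
    # stacklevel=2 (Python 3.8+) attributes the LogRecord to the frame
    # that called log_info(), not to log_info() itself.
    _logger.info(msg, stacklevel=2)


log_info("attributed to the caller's file and line, not the wrapper's")

However cli_logger implements it internally, the test below asserts only the observable behavior: the record's pathname names the calling test file.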
+ with patch("sys.stdout", new=io.StringIO()) as mock_stdout: + cli_logger.cli_logger.info("123") + assert "test_cli_logger.py" in mock_stdout.getvalue() + + if __name__ == "__main__": import os import sys diff --git a/python/ray/tests/test_gcs_fault_tolerance.py b/python/ray/tests/test_gcs_fault_tolerance.py index bca9b83021de..eaecd0c3a073 100644 --- a/python/ray/tests/test_gcs_fault_tolerance.py +++ b/python/ray/tests/test_gcs_fault_tolerance.py @@ -20,6 +20,7 @@ wait_for_condition, wait_for_pid_to_exit, run_string_as_driver, + redis_sentinel_replicas, ) from ray.job_submission import JobSubmissionClient, JobStatus from ray._raylet import GcsClient @@ -871,6 +872,120 @@ def f(): wait_for_pid_to_exit(gcs_server_pid, 10000) +@pytest.mark.parametrize( + "ray_start_cluster_head_with_external_redis_sentinel", + [ + generate_system_config_map( + gcs_rpc_server_reconnect_timeout_s=60, + gcs_server_request_timeout_seconds=10, + redis_db_connect_retries=50, + ) + ], + indirect=True, +) +def test_redis_with_sentinel_failureover( + ray_start_cluster_head_with_external_redis_sentinel, +): + """This test is to cover ray cluster's behavior with Redis sentinel. + The expectation is Redis sentinel should manage failover + automatically, and GCS can continue talking to the same address + without any human intervention on Redis. + For this test we ensure: + - When Redis master failed, Ray should crash (TODO: GCS should + autommatically try re-connect to sentinel). + - When restart Ray, it should continue talking to sentinel, which + should return information about new master. + """ + cluster = ray_start_cluster_head_with_external_redis_sentinel + import redis + + redis_addr = os.environ.get("RAY_REDIS_ADDRESS") + ip, port = redis_addr.split(":") + redis_cli = redis.Redis(ip, port) + print(redis_cli.info("sentinel")) + redis_name = redis_cli.info("sentinel")["master0"]["name"] + + def get_sentinel_nodes(): + leader_address = ( + redis_cli.sentinel_master(redis_name)["ip"], + redis_cli.sentinel_master(redis_name)["port"], + ) + follower_addresses = [ + (x["ip"], x["port"]) for x in redis_cli.sentinel_slaves(redis_name) + ] + return [leader_address] + follower_addresses + + wait_for_condition(lambda: len(get_sentinel_nodes()) == redis_sentinel_replicas()) + + @ray.remote(max_restarts=-1) + class Counter: + def r(self, v): + return v + + def pid(self): + import os + + return os.getpid() + + c = Counter.options(name="c", namespace="test", lifetime="detached").remote() + c_pid = ray.get(c.pid.remote()) + c_process = psutil.Process(pid=c_pid) + r = ray.get(c.r.remote(10)) + assert r == 10 + + head_node = cluster.head_node + gcs_server_process = head_node.all_processes["gcs_server"][0].process + gcs_server_pid = gcs_server_process.pid + + leader_cli = redis.Redis(*get_sentinel_nodes()[0]) + leader_pid = leader_cli.info()["process_id"] + follower_cli = [redis.Redis(*x) for x in get_sentinel_nodes()[1:]] + + # Wait until all data is updated in the replica + leader_cli.set("_hole", "0") + wait_for_condition(lambda: all([b"_hole" in f.keys("*") for f in follower_cli])) + current_leader = get_sentinel_nodes()[0] + + # Now kill pid + leader_process = psutil.Process(pid=leader_pid) + leader_process.kill() + + print(">>> Waiting gcs server to exit", gcs_server_pid) + wait_for_pid_to_exit(gcs_server_pid, 1000) + print("GCS killed") + + wait_for_condition(lambda: current_leader != get_sentinel_nodes()[0]) + + # Kill Counter actor. 
It should restart after GCS is back + c_process.kill() + # Cleanup the in memory data and then start gcs + cluster.head_node.kill_gcs_server(False) + + print("Start gcs") + sleep(2) + cluster.head_node.start_gcs_server() + + assert len(ray.nodes()) == 1 + assert ray.nodes()[0]["alive"] + + driver_script = f""" +import ray +ray.init('{cluster.address}') +@ray.remote +def f(): + return 10 +assert ray.get(f.remote()) == 10 + +c = ray.get_actor("c", namespace="test") +v = ray.get(c.r.remote(10)) +assert v == 10 +print("DONE") +""" + + # Make sure the cluster is usable + wait_for_condition(lambda: "DONE" in run_string_as_driver(driver_script)) + + @pytest.mark.parametrize( "ray_start_regular", [ diff --git a/python/ray/tests/test_gcs_utils.py b/python/ray/tests/test_gcs_utils.py index c25beac6e598..50862f386346 100644 --- a/python/ray/tests/test_gcs_utils.py +++ b/python/ray/tests/test_gcs_utils.py @@ -267,7 +267,7 @@ async def test_gcs_aio_client_is_async(ray_start_regular): gcs_client = gcs_utils.GcsAioClient(address=gcs_address, nums_reconnect_retry=0) await gcs_client.internal_kv_put(b"A", b"B", False, b"NS", timeout=2) - with async_timeout.timeout(3): + async with async_timeout.timeout(3): none, result = await asyncio.gather( asyncio.sleep(2), gcs_client.internal_kv_get(b"A", b"NS", timeout=2) ) diff --git a/python/ray/tests/test_logging_2.py b/python/ray/tests/test_logging_2.py index b48b04e44a59..9d5be165f9ac 100644 --- a/python/ray/tests/test_logging_2.py +++ b/python/ray/tests/test_logging_2.py @@ -54,11 +54,14 @@ def f(): "worker_id": runtime_context.get_worker_id(), "node_id": runtime_context.get_node_id(), "task_id": runtime_context.get_task_id(), + "task_name": runtime_context.get_task_name(), + "task_func_name": runtime_context.get_task_function_name(), } for attr in should_exist: assert hasattr(record, attr) assert getattr(record, attr) == expected_values[attr] assert not hasattr(record, "actor_id") + assert not hasattr(record, "actor_name") obj_ref = f.remote() ray.get(obj_ref) @@ -77,7 +80,10 @@ def f(self): "worker_id": runtime_context.get_worker_id(), "node_id": runtime_context.get_node_id(), "actor_id": runtime_context.get_actor_id(), + "actor_name": runtime_context.get_actor_name(), "task_id": runtime_context.get_task_id(), + "task_name": runtime_context.get_task_name(), + "task_func_name": runtime_context.get_task_function_name(), } for attr in should_exist: assert hasattr(record, attr) diff --git a/python/ray/tests/test_output.py b/python/ray/tests/test_output.py index 4b84b8285534..505a02e84226 100644 --- a/python/ray/tests/test_output.py +++ b/python/ray/tests/test_output.py @@ -575,26 +575,27 @@ def test_disable_driver_logs_breakpoint(): @ray.remote def f(): while True: - start_time = time.time() - while time.time() - start_time < 1: + start_time = time.monotonic() + while time.monotonic() - start_time < 1: time.sleep(0.1) + print(f"slept {time.monotonic() - start_time} seconds") print("hello there") sys.stdout.flush() def kill(): - start_time = time.time() - while time.time() - start_time < 5: + start_time = time.monotonic() + while time.monotonic() - start_time < 5: time.sleep(0.1) sys.stdout.flush() - start_time = time.time() - while time.time() - start_time < 1: + start_time = time.monotonic() + while time.monotonic() - start_time < 1: time.sleep(0.1) os._exit(0) t = threading.Thread(target=kill) t.start() x = f.remote() -time.sleep(2) # Enough time to print one hello. +time.sleep(3) # Enough time to print one hello. breakpoint() # This should disable worker logs. 
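The Redis Sentinel fixtures and failover test above rely on clients discovering the master through a sentinel rather than through a fixed address, which is what lets the GCS keep one configured address across failovers. A minimal sketch of that client-side pattern with the redis-py package (the service name "mymaster" and the port are illustrative placeholders, and a sentinel is assumed to be running):

from redis.sentinel import Sentinel

# One or more sentinel endpoints; the sentinel tracks the master and replicas.
sentinel = Sentinel([("127.0.0.1", 26379)], socket_timeout=0.5)

# Ask the sentinel where the current master lives.
host, port = sentinel.discover_master("mymaster")

# Or get a client that transparently re-resolves the master after a failover.
master = sentinel.master_for("mymaster", socket_timeout=0.5)
master.set("key", "value")

The test fixtures do the equivalent by hand with sentinel_master() and sentinel_slaves() so that they can address and kill individual Redis processes.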
""" @@ -602,7 +603,7 @@ def kill(): out_str = proc.stdout.read().decode("ascii") num_hello = out_str.count("hello") assert num_hello >= 1, out_str - assert num_hello < 3, out_str + assert num_hello <= 3, out_str assert "Temporarily disabling Ray worker logs" in out_str, out_str # TODO(ekl) nice to test resuming logs too, but it's quite complicated diff --git a/python/ray/tests/test_runtime_context.py b/python/ray/tests/test_runtime_context.py index 3835a7e7eb3a..23bb39c6ab4d 100644 --- a/python/ray/tests/test_runtime_context.py +++ b/python/ray/tests/test_runtime_context.py @@ -264,6 +264,136 @@ def test_auto_init(shutdown_only): assert ray.is_initialized() +def test_get_task_name(shutdown_only): + ray.init() + + # for a normal task + @ray.remote + def get_task_name_for_normal_task(): + return ray.get_runtime_context().get_task_name() + + expected_task_name = "normal_task_name" + task_name = ray.get( + get_task_name_for_normal_task.options(name=expected_task_name).remote() + ) + assert ( + task_name == expected_task_name + ), f"Check normal task name failed. expected={expected_task_name}, \ +actual={task_name}" + + # for an actor task + @ray.remote + class Actor: + def get_task_name_for_actor_task(self): + return ray.get_runtime_context().get_task_name() + + expected_task_name = "Actor.get_task_name_for_actor_task" + actor = Actor.remote() + task_name = ray.get(actor.get_task_name_for_actor_task.remote()) + assert ( + task_name == expected_task_name + ), f"Check actor task name failed. expected={expected_task_name}, \ +actual={task_name}" + + # for a threaded actor task + @ray.remote + class ThreadedActor: + def get_task_name_for_threaded_actor_task(self): + return ray.get_runtime_context().get_task_name() + + expected_task_name = "ThreadedActor.get_task_name_for_threaded_actor_task" + threaded_actor = ThreadedActor.options(max_concurrency=2).remote() + task_name = ray.get(threaded_actor.get_task_name_for_threaded_actor_task.remote()) + assert ( + task_name == expected_task_name + ), f"Check actor task name failed. expected={expected_task_name}, \ +actual={task_name}" + + # for a async actor task + @ray.remote + class AsyncActor: + async def get_task_name_for_async_actor_task(self): + return ray.get_runtime_context().get_task_name() + + expected_task_name = "AsyncActor.get_task_name_for_async_actor_task" + async_actor = AsyncActor.remote() + task_name = ray.get(async_actor.get_task_name_for_async_actor_task.remote()) + assert ( + task_name == expected_task_name + ), f"Check actor task name failed. expected={expected_task_name}, \ +actual={task_name}" + + +def test_get_task_function_name(shutdown_only): + ray.init() + + # for a normal task + @ray.remote + def get_task_function_name_for_normal_task(): + return ray.get_runtime_context().get_task_function_name() + + expected_task_function_name = __name__ + ".get_task_function_name_for_normal_task" + task_function_name = ray.get(get_task_function_name_for_normal_task.remote()) + assert ( + task_function_name == expected_task_function_name + ), f"Check normal task function failed. 
expected={expected_task_function_name}, \
+actual={task_function_name}"
+
+    # for an actor task
+    @ray.remote
+    class Actor:
+        def get_task_function_name_for_actor_task(self):
+            return ray.get_runtime_context().get_task_function_name()
+
+    expected_task_function_name = (
+        __name__ + ".Actor.get_task_function_name_for_actor_task"
+    )
+    actor = Actor.remote()
+    task_function_name = ray.get(actor.get_task_function_name_for_actor_task.remote())
+    assert (
+        task_function_name == expected_task_function_name
+    ), f"Check actor task function failed. expected={expected_task_function_name}, \
+actual={task_function_name}"
+
+    # for a threaded actor task
+    @ray.remote
+    class ThreadedActor:
+        def get_task_function_name_for_threaded_actor_task(self):
+            return ray.get_runtime_context().get_task_function_name()
+
+    expected_task_function_name = (
+        __name__ + ".ThreadedActor.get_task_function_name_for_threaded_actor_task"
+    )
+    threaded_actor = ThreadedActor.options(max_concurrency=2).remote()
+    task_function_name = ray.get(
+        threaded_actor.get_task_function_name_for_threaded_actor_task.remote()
+    )
+    assert (
+        task_function_name == expected_task_function_name
+    ), f"Check actor task function failed. expected={expected_task_function_name}, \
+actual={task_function_name}"
+
+    # for an async actor task
+    @ray.remote
+    class AsyncActor:
+        async def get_task_function_name_for_async_actor_task(self):
+            return ray.get_runtime_context().get_task_function_name()
+
+    expected_task_function_name = (
+        __name__
+        + ".test_get_task_function_name.<locals>.AsyncActor.\
+get_task_function_name_for_async_actor_task"
+    )
+    async_actor = AsyncActor.remote()
+    task_function_name = ray.get(
+        async_actor.get_task_function_name_for_async_actor_task.remote()
+    )
+    assert (
+        task_function_name == expected_task_function_name
+    ), f"Check actor task function failed. expected={expected_task_function_name}, \
+actual={task_function_name}"
+
+
 def test_async_actor_task_id(shutdown_only):
     ray.init()
diff --git a/python/ray/util/collective/collective_group/nccl_util.py b/python/ray/util/collective/collective_group/nccl_util.py
index 05b05ef33a27..221d5885c411 100644
--- a/python/ray/util/collective/collective_group/nccl_util.py
+++ b/python/ray/util/collective/collective_group/nccl_util.py
@@ -63,7 +63,7 @@
 }
 
 # Older versions of cupy don't support bfloat16.
-    if hasattr(nccl, "NCCL_BFlOAT16"):
+    if hasattr(nccl, "NCCL_BFLOAT16"):
        TORCH_NCCL_DTYPE_MAP[torch.bfloat16] = nccl.NCCL_BFLOAT16
 
 TORCH_NUMPY_DTYPE_MAP = {
diff --git a/python/requirements/ml/data-requirements.txt b/python/requirements/ml/data-requirements.txt
index 6e2baa5592fe..de91b5010c7c 100644
--- a/python/requirements/ml/data-requirements.txt
+++ b/python/requirements/ml/data-requirements.txt
@@ -1,8 +1,8 @@
 # Used by CI for datasets and docs. 
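The <locals> component in the async-actor expectation above is ordinary Python qualified-name behavior for classes defined inside a function, not anything Ray-specific. A quick stdlib-only illustration:

def make_actor_class():
    class AsyncActor:
        async def method(self):
            return "ok"

    return AsyncActor


cls = make_actor_class()
# Classes (and functions) defined inside a function carry "<locals>"
# in their qualified name:
print(cls.__qualname__)         # make_actor_class.<locals>.AsyncActor
print(cls.method.__qualname__)  # make_actor_class.<locals>.AsyncActor.method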
# https://github.com/ray-project/ray/pull/29448#discussion_r1006256498 -dask[complete]==2022.10.1; python_version < '3.12' -distributed==2022.10.1; python_version < '3.12' +dask[complete]==2022.10.2; python_version < '3.12' +distributed==2022.10.2; python_version < '3.12' dask[complete]==2024.6.0; python_version >= '3.12' distributed==2024.6.0; python_version >= '3.12' aioboto3==11.2.0 diff --git a/python/requirements/ml/data-test-requirements.txt b/python/requirements/ml/data-test-requirements.txt index d2d435b09d88..9ad22340d031 100644 --- a/python/requirements/ml/data-test-requirements.txt +++ b/python/requirements/ml/data-test-requirements.txt @@ -18,4 +18,5 @@ delta-sharing pytest-mock decord snowflake-connector-python -pyiceberg[sql-sqlite]==0.7.0 \ No newline at end of file +pyiceberg[sql-sqlite]==0.7.0 +hudi==0.2.0rc1 diff --git a/python/requirements/ml/rllib-test-requirements.txt b/python/requirements/ml/rllib-test-requirements.txt index 027c57446e60..c67bf2cec445 100644 --- a/python/requirements/ml/rllib-test-requirements.txt +++ b/python/requirements/ml/rllib-test-requirements.txt @@ -5,7 +5,7 @@ # Atari ale_py==0.10.1 imageio==2.34.2 -opencv-python==4.8.1.78 +opencv-python-headless==4.8.1.78 # For testing MuJoCo envs with gymnasium. mujoco==3.2.4 diff --git a/python/requirements/test-requirements.txt b/python/requirements/test-requirements.txt index b73f554ec524..175affd3e7f7 100644 --- a/python/requirements/test-requirements.txt +++ b/python/requirements/test-requirements.txt @@ -14,8 +14,7 @@ beautifulsoup4==4.11.1 boto3==1.26.76 # Todo: investigate if we can get rid of this and exchange for ray.cloudpickle cloudpickle==2.2.0 -# Keep in sync with `ci/build/upload_build_info.sh` -cryptography==38.0.1 +cryptography==42.0.5 cython==0.29.37 fastapi==0.109.2 feather-format==0.4.1 @@ -45,8 +44,7 @@ Pillow==10.3.0; platform_system != "Windows" proxy.py==2.4.3 pydantic==2.5.0 pydot==1.4.2 -# Keep in sync with `ci/build/upload_build_info.sh` -PyOpenSSL==23.0.0 +pyopenssl==24.2.1 pygame==2.5.2 Pygments==2.18.0 pymongo==4.3.2 diff --git a/python/requirements_compiled.txt b/python/requirements_compiled.txt index 1347afee24c5..f3b39647b0e8 100644 --- a/python/requirements_compiled.txt +++ b/python/requirements_compiled.txt @@ -4,24 +4,18 @@ absl-py==1.4.0 # via # array-record - # chex # dm-control # dm-env - # dopamine-rl # etils # labmaze # ml-collections # mujoco # open-spiel - # optax - # orbax-checkpoint - # recsim # tensorboard # tensorflow # tensorflow-datasets # tensorflow-metadata # tensorflow-probability - # tf-slim accelerate==0.28.0 # via -r /ray/ci/../python/requirements/ml/core-requirements.txt adagio==0.2.4 @@ -76,9 +70,7 @@ aiosqlite==0.19.0 alabaster==0.7.13 # via sphinx ale-py==0.10.1 - # via - # -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt - # gymnasium + # via -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt alembic==1.12.1 # via # aim @@ -147,10 +139,6 @@ attrs==21.4.0 # open-spiel # sarif-om # semgrep -autorom==0.6.1 ; platform_machine != "arm64" - # via -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt -autorom-accept-rom-license==0.6.1 - # via autorom aws-sam-translator==1.81.0 # via cfn-lint aws-xray-sdk==2.12.1 @@ -242,8 +230,6 @@ braceexpand==0.1.7 # via webdataset bracex==2.4 # via wcmatch -cached-property==1.5.2 - # via orbax-checkpoint cachetools==5.3.2 # via # aim @@ -272,16 +258,12 @@ charset-normalizer==3.3.2 # via # requests # snowflake-connector-python -chex==0.1.7 - # via optax clang-format==12.0.1 # via 
-r /ray/ci/../python/requirements/lint-requirements.txt click==8.1.7 # via # -r /ray/ci/../python/requirements.txt # aim - # autorom - # autorom-accept-rom-license # black # click-option-group # dask @@ -352,7 +334,7 @@ crc32c==2.3 # via -r /ray/ci/../python/requirements/ml/data-requirements.txt crcmod==1.7 # via gsutil -cryptography==38.0.1 +cryptography==42.0.5 # via # -r /ray/ci/../python/requirements/test-requirements.txt # adal @@ -378,7 +360,7 @@ cython==0.29.37 # via # -r /ray/ci/../python/requirements/test-requirements.txt # gpy -dask==2022.10.1 ; python_version < "3.12" +dask==2022.10.2 ; python_version < "3.12" # via # -r /ray/ci/../python/requirements/ml/data-requirements.txt # distributed @@ -416,7 +398,7 @@ dill==0.3.7 # multiprocess distlib==0.3.7 # via virtualenv -distributed==2022.10.1 ; python_version < "3.12" +distributed==2022.10.2 ; python_version < "3.12" # via # -r /ray/ci/../python/requirements/ml/data-requirements.txt # dask @@ -429,7 +411,6 @@ dm-env==1.6 dm-tree==0.1.8 # via # -r /ray/ci/../python/requirements.txt - # chex # dm-control # dm-env # tensorflow-datasets @@ -450,10 +431,6 @@ docutils==0.19 # -r /ray/ci/../python/requirements/lint-requirements.txt # myst-parser # sphinx -dopamine-rl==4.0.5 ; (sys_platform != "darwin" or platform_machine != "arm64") and python_version < "3.12" - # via - # -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt - # recsim dulwich==0.21.6 # via comet-ml ecdsa==0.18.0 @@ -472,7 +449,7 @@ etils==1.5.2 ; python_version < "3.12" # via # -r /ray/ci/../python/requirements/ml/dl-cpu-requirements.txt # array-record - # orbax-checkpoint + # mujoco # tensorflow-datasets evaluate==0.4.0 # via -r /ray/ci/../python/requirements/ml/train-test-requirements.txt @@ -545,8 +522,6 @@ flatbuffers==23.5.26 # onnxruntime # tensorflow # tf2onnx -flax==0.7.2 - # via dopamine-rl fonttools==4.45.1 # via matplotlib fqdn==1.5.1 @@ -590,10 +565,6 @@ gast==0.4.0 # tensorflow-probability gcs-oauth2-boto-plugin==3.0 # via gsutil -gin-config==0.5.0 - # via - # dopamine-rl - # recsim gitdb==4.0.11 # via gitpython gitpython==3.1.40 @@ -677,6 +648,8 @@ gradio-client==0.6.1 # via gradio graphql-core==3.2.3 # via moto +graphviz==0.20.3 + # via -r /ray/ci/../python/requirements/test-requirements.txt greenlet==3.0.1 # via sqlalchemy grpcio==1.66.2 ; sys_platform != "darwin" @@ -704,7 +677,6 @@ gunicorn==20.1.0 gymnasium==1.0.0 # via # -r /ray/ci/../python/requirements.txt - # -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt # minigrid # pettingzoo # shimmy @@ -745,6 +717,8 @@ httpx==0.24.1 # -r /ray/ci/../python/requirements/test-requirements.txt # gradio # gradio-client +hudi==0.2.0rc1 + # via -r /ray/ci/../python/requirements/ml/data-test-requirements.txt huggingface-hub==0.19.4 # via # accelerate @@ -771,7 +745,7 @@ idna==3.7 # snowflake-connector-python # trustme # yarl -imageio==2.34.2 ; python_version < "3.12" +imageio==2.34.2 # via # -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt # moviepy @@ -788,10 +762,8 @@ importlib-metadata==6.11.0 # myst-nb importlib-resources==5.13.0 # via - # ale-py # etils # gradio - # orbax-checkpoint # prophet iniconfig==2.0.0 # via pytest @@ -829,21 +801,6 @@ isort==5.10.1 # via -r /ray/ci/../python/requirements/lint-requirements.txt itsdangerous==2.1.2 # via flask -jax==0.4.13 - # via - # chex - # dopamine-rl - # flax - # optax - # orbax-checkpoint -jax-jumpy==1.0.0 - # via gymnasium -jaxlib==0.4.13 - # via - # chex - # dopamine-rl - # optax - # orbax-checkpoint jedi==0.19.1 # via 
ipython jinja2==3.1.2 @@ -908,7 +865,6 @@ jsonschema==4.17.3 # jsonschema-spec # jupyter-events # jupyterlab-server - # kaggle-environments # nbformat # openapi-schema-validator # openapi-spec-validator @@ -966,8 +922,6 @@ jupyterlab-widgets==3.0.11 # via ipywidgets jupytext==1.16.3 # via -r /ray/ci/../python/requirements/test-requirements.txt -kaggle-environments==1.7.11 - # via -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt keras==2.15.0 # via tensorflow kiwisolver==1.4.5 @@ -1060,11 +1014,7 @@ mistune==0.8.4 ml-collections==0.1.1 # via open-spiel ml-dtypes==0.3.2 - # via - # jax - # jaxlib - # tensorflow - # tensorstore + # via tensorflow mlagents-envs==0.28.0 # via -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt mlflow==2.9.2 @@ -1105,9 +1055,7 @@ msgpack==1.0.7 # -r /ray/ci/../python/requirements.txt # -r /ray/ci/../python/requirements/ml/rllib-requirements.txt # distributed - # flax # msgpack-numpy - # orbax-checkpoint # ray msgpack-numpy==0.4.8 # via -r /ray/ci/../python/requirements/ml/rllib-requirements.txt @@ -1177,7 +1125,6 @@ nest-asyncio==1.5.8 # nbclassic # nbclient # notebook - # orbax-checkpoint netifaces==0.11.0 # via # hpbandster @@ -1218,7 +1165,6 @@ numpy==1.26.4 # altair # bayesian-optimization # bokeh - # chex # cma # cmaes # cmdstanpy @@ -1231,10 +1177,8 @@ numpy==1.26.4 # deepspeed # dm-control # dm-env - # dopamine-rl # etils # evaluate - # flax # gpy # gradio # gymnasium @@ -1242,9 +1186,6 @@ numpy==1.26.4 # hpbandster # hyperopt # imageio - # jax - # jax-jumpy - # jaxlib # labmaze # lightgbm # matplotlib @@ -1266,9 +1207,7 @@ numpy==1.26.4 # open-spiel # opencv-python # opt-einsum - # optax # optuna - # orbax-checkpoint # pandas # paramz # patsy @@ -1293,7 +1232,6 @@ numpy==1.26.4 # tensorflow # tensorflow-datasets # tensorflow-probability - # tensorstore # tf2onnx # tifffile # tinyscaler @@ -1336,7 +1274,7 @@ opencensus-context==0.1.3 opencensus-proto==0.1.0 # via opentelemetry-exporter-opencensus opencv-python==4.8.1.78 - # via dopamine-rl + # via -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt openpyxl==3.0.10 # via -r /ray/ci/../python/requirements/test-requirements.txt opentelemetry-api==1.1.0 @@ -1369,15 +1307,10 @@ opentelemetry-semantic-conventions==0.20b0 # via opentelemetry-sdk opt-einsum==3.3.0 # via - # jax # pyro-ppl # tensorflow -optax==0.1.7 - # via flax optuna==3.2.0 # via -r /ray/ci/../python/requirements/ml/tune-requirements.txt -orbax-checkpoint==0.2.3 - # via flax orjson==3.9.10 # via gradio packaging==23.0 @@ -1439,7 +1372,6 @@ pandas==1.5.3 ; python_version < "3.12" # dask # datasets # delta-sharing - # dopamine-rl # evaluate # gradio # mlflow @@ -1491,7 +1423,6 @@ pillow==10.3.0 ; platform_system != "Windows" # -r /ray/ci/../python/requirements/test-requirements.txt # aim # bokeh - # dopamine-rl # gradio # imageio # matplotlib @@ -1638,7 +1569,6 @@ pyflakes==2.3.1 pygame==2.5.2 # via # -r /ray/ci/../python/requirements/test-requirements.txt - # dopamine-rl # minigrid pyglet==1.5.15 # via -r /ray/ci/../python/requirements/ml/rllib-requirements.txt @@ -1675,7 +1605,7 @@ pyopengl==3.1.7 # via # dm-control # mujoco -pyopenssl==23.0.0 +pyopenssl==24.2.1 # via # -r /ray/ci/../python/requirements.txt # -r /ray/ci/../python/requirements/anyscale-requirements.txt @@ -1809,7 +1739,6 @@ pyyaml==6.0.1 # dask # datasets # distributed - # flax # gradio # huggingface-hub # jsonschema-spec @@ -1826,7 +1755,6 @@ pyyaml==6.0.1 # myst-nb # myst-parser # optuna - # orbax-checkpoint # pymars # 
pytorch-lightning # ray @@ -1862,8 +1790,6 @@ requests==2.31.0 # -r /ray/ci/../python/requirements.txt # adal # aim - # autorom - # autorom-accept-rom-license # azure-cli-core # azure-core # comet-ml @@ -1937,7 +1863,6 @@ rich==13.3.2 # -r /ray/ci/../python/requirements.txt # -r /ray/ci/../python/requirements/ml/rllib-requirements.txt # comet-ml - # flax # memray # pyiceberg # semgrep @@ -1989,8 +1914,6 @@ scipy==1.11.4 # gpy # hpbandster # hyperopt - # jax - # jaxlib # lightgbm # linear-operator # medpy @@ -1998,7 +1921,6 @@ scipy==1.11.4 # open-spiel # paramz # pymars - # recsim # scikit-image # scikit-learn # statsforecast @@ -2154,6 +2076,8 @@ statsmodels==0.14.0 # via # hpbandster # statsforecast +strictyaml==1.7.3 + # via pyiceberg supersuit==3.9.3 # via -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt sympy==1.13.1 @@ -2188,10 +2112,7 @@ tensorboardx==2.6.2.2 # -r /ray/ci/../python/requirements/test-requirements.txt # pytorch-lightning tensorflow==2.15.1 ; python_version < "3.12" and (sys_platform != "darwin" or platform_machine != "arm64") - # via - # -r /ray/ci/../python/requirements/ml/dl-cpu-requirements.txt - # dopamine-rl - # recsim + # via -r /ray/ci/../python/requirements/ml/dl-cpu-requirements.txt tensorflow-datasets==4.9.3 ; python_version < "3.12" # via # -r /ray/ci/../python/requirements/ml/data-test-requirements.txt @@ -2207,13 +2128,7 @@ tensorflow-io-gcs-filesystem==0.31.0 ; python_version < "3.12" tensorflow-metadata==1.14.0 # via tensorflow-datasets tensorflow-probability==0.23.0 ; python_version < "3.12" - # via - # -r /ray/ci/../python/requirements/ml/dl-cpu-requirements.txt - # dopamine-rl -tensorstore==0.1.63 - # via - # flax - # orbax-checkpoint + # via -r /ray/ci/../python/requirements/ml/dl-cpu-requirements.txt termcolor==2.4.0 # via # pytest-shutil @@ -2227,8 +2142,6 @@ terminado==0.18.1 # notebook testfixtures==7.0.0 # via -r /ray/ci/../python/requirements/test-requirements.txt -tf-slim==1.1.0 - # via dopamine-rl tf2onnx==1.15.1 ; sys_platform != "darwin" or platform_machine != "arm64" # via -r /ray/ci/../python/requirements/ml/rllib-requirements.txt threadpoolctl==3.1.0 @@ -2258,7 +2171,6 @@ tomlkit==0.13.0 toolz==0.12.1 # via # altair - # chex # dask # distributed # partd @@ -2394,7 +2306,6 @@ typing-extensions==4.8.0 # configspace # etils # fastapi - # flax # gradio # gradio-client # gymnasium @@ -2403,7 +2314,6 @@ typing-extensions==4.8.0 # mypy # myst-nb # nevergrad - # orbax-checkpoint # pydantic # pydantic-core # pytorch-lightning diff --git a/python/setup.py b/python/setup.py index 2e6958d021da..16017fa5447a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -228,7 +228,7 @@ def get_packages(self): pandas_dep = "pandas >= 1.3" numpy_dep = "numpy >= 1.20" pyarrow_deps = [ - "pyarrow >= 6.0.1", + "pyarrow >= 9.0.0", "pyarrow <18; sys_platform == 'darwin' and platform_machine == 'x86_64'", ] setup_spec.extras = { diff --git a/release/BUILD.bazel b/release/BUILD.bazel index a09070d9b313..f269add55138 100644 --- a/release/BUILD.bazel +++ b/release/BUILD.bazel @@ -309,7 +309,6 @@ py_library( bk_require("pybuildkite"), bk_require("pygithub"), bk_require("requests"), - bk_require("retry"), ], ) @@ -624,3 +623,18 @@ py_test( bk_require("pytest"), ], ) + +py_test( + name = "test_retry", + size = "small", + srcs = ["ray_release/tests/test_retry.py"], + exec_compatible_with = ["//:hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), + ], +) diff --git 
a/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_compute_aws.yaml b/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_compute_aws.yaml index 7966578a31b1..9e6cabef573d 100644 --- a/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_compute_aws.yaml +++ b/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_compute_aws.yaml @@ -12,7 +12,7 @@ worker_node_types: max_workers: 15 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_aws.yaml b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_aws.yaml index e315fc0b9f88..6ed2aa738ed9 100644 --- a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_aws.yaml +++ b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_aws.yaml @@ -12,7 +12,7 @@ worker_node_types: max_workers: 7 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_gce.yaml b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_gce.yaml index 7be4f970f0b1..be93c6d0aac6 100644 --- a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_gce.yaml +++ b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_gce.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 7 use_spot: false -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_compute.yaml b/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_compute.yaml index 1ac93d59eb91..a2b79c9cc489 100644 --- a/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_compute.yaml +++ b/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_compute.yaml @@ -7,7 +7,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_compute_aws.yaml b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_compute_aws.yaml index 17f69c81a906..3fe5ec7c083e 100644 --- a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_compute_aws.yaml +++ b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_compute_aws.yaml @@ -12,7 +12,7 @@ worker_node_types: max_workers: 15 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml b/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml index df7c2a8958a0..150990710680 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml b/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml index ee7d1436e7cf..c543315e24f3 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml @@ -14,7 +14,7 @@ 
worker_node_types: min_workers: 3 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/air_tests/air_benchmarks/mlperf-train/compute_cpu_16.yaml b/release/air_tests/air_benchmarks/mlperf-train/compute_cpu_16.yaml index be1577e57401..b45a2c038d78 100644 --- a/release/air_tests/air_benchmarks/mlperf-train/compute_cpu_16.yaml +++ b/release/air_tests/air_benchmarks/mlperf-train/compute_cpu_16.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/air_tests/horovod/compute_tpl_aws.yaml b/release/air_tests/horovod/compute_tpl_aws.yaml index d14997e25f0b..2ef09f059167 100644 --- a/release/air_tests/horovod/compute_tpl_aws.yaml +++ b/release/air_tests/horovod/compute_tpl_aws.yaml @@ -15,7 +15,7 @@ worker_node_types: min_workers: 1 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/benchmark-worker-startup/only_head_node_1gpu_64cpu.yaml b/release/benchmark-worker-startup/only_head_node_1gpu_64cpu.yaml index cddae87016f6..f377139d6f22 100644 --- a/release/benchmark-worker-startup/only_head_node_1gpu_64cpu.yaml +++ b/release/benchmark-worker-startup/only_head_node_1gpu_64cpu.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 0 -aws: +advanced_configurations_json: # Fix the volume size so that IOPS is constant even if the default changes. BlockDeviceMappings: - DeviceName: /dev/sda1 diff --git a/release/benchmarks/distributed/many_nodes_tests/compute_config.yaml b/release/benchmarks/distributed/many_nodes_tests/compute_config.yaml index 47f435a873ac..2a8de6119ea3 100644 --- a/release/benchmarks/distributed/many_nodes_tests/compute_config.yaml +++ b/release/benchmarks/distributed/many_nodes_tests/compute_config.yaml @@ -4,7 +4,7 @@ region: us-west-2 # NFS needs to be disabled for this test, since the test spawns too many nodes # and may hit the limit on the # of clients. 
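The same aws: to advanced_configurations_json: key rename recurs across all of the release compute configs in this patch. A hedged sketch of a one-shot migration, assuming PyYAML and using only key names visible in this diff:

import yaml


def migrate_compute_config(text: str) -> str:
    """Rename the legacy `aws` section, keeping its contents intact."""
    config = yaml.safe_load(text)
    if "aws" in config:
        config["advanced_configurations_json"] = config.pop("aws")
    return yaml.safe_dump(config, sort_keys=False)


example = "region: us-west-2\naws:\n  TagSpecifications: []\n"
print(migrate_compute_config(example))
# region: us-west-2
# advanced_configurations_json:
#   TagSpecifications: []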
-aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: @@ -23,7 +23,7 @@ head_node_type: worker_node_types: - name: worker_node - instance_type: m5.large + instance_type: m6i.large min_workers: 500 max_workers: 2000 use_spot: false diff --git a/release/benchmarks/object_store.yaml b/release/benchmarks/object_store.yaml index 6908c9e9bf7f..5353a1009c6b 100644 --- a/release/benchmarks/object_store.yaml +++ b/release/benchmarks/object_store.yaml @@ -5,14 +5,14 @@ max_workers: 49 head_node_type: name: head_node - instance_type: m4.16xlarge + instance_type: m6i.16xlarge resources: custom_resources: node: 1 worker_node_types: - name: worker_node - instance_type: m4.2xlarge + instance_type: m6i.2xlarge min_workers: 49 max_workers: 49 use_spot: false diff --git a/release/benchmarks/single_node.yaml b/release/benchmarks/single_node.yaml index 94297cbfbb8e..d94ea397f073 100644 --- a/release/benchmarks/single_node.yaml +++ b/release/benchmarks/single_node.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 0 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/dashboard/agent_stress_compute.yaml b/release/dashboard/agent_stress_compute.yaml index 340b63778529..5e3859905a07 100644 --- a/release/dashboard/agent_stress_compute.yaml +++ b/release/dashboard/agent_stress_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/jobs_tests/compute_tpl_4_xlarge.yaml b/release/jobs_tests/compute_tpl_4_xlarge.yaml index 7da54ffb0d6c..5f21711662d7 100644 --- a/release/jobs_tests/compute_tpl_4_xlarge.yaml +++ b/release/jobs_tests/compute_tpl_4_xlarge.yaml @@ -16,7 +16,7 @@ worker_node_types: max_workers: 4 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/jobs_tests/compute_tpl_gce_4_xlarge.yaml b/release/jobs_tests/compute_tpl_gce_4_xlarge.yaml index b2c81c32c83c..176c282a50f5 100644 --- a/release/jobs_tests/compute_tpl_gce_4_xlarge.yaml +++ b/release/jobs_tests/compute_tpl_gce_4_xlarge.yaml @@ -1,6 +1,6 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c max_workers: 4 @@ -16,7 +16,7 @@ worker_node_types: max_workers: 4 use_spot: false -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/jobs_tests/compute_tpl_gpu_node.yaml b/release/jobs_tests/compute_tpl_gpu_node.yaml index 36a2a3e0ce8c..27700e794664 100644 --- a/release/jobs_tests/compute_tpl_gpu_node.yaml +++ b/release/jobs_tests/compute_tpl_gpu_node.yaml @@ -13,7 +13,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/jobs_tests/compute_tpl_gpu_worker.yaml b/release/jobs_tests/compute_tpl_gpu_worker.yaml index 15955339513f..a98edd4945dc 100644 --- a/release/jobs_tests/compute_tpl_gpu_worker.yaml +++ b/release/jobs_tests/compute_tpl_gpu_worker.yaml @@ -13,7 +13,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/k8s_tests/compute_tpl.yaml b/release/k8s_tests/compute_tpl.yaml index c29a684f9efd..a5a788a2eec7 100644 --- a/release/k8s_tests/compute_tpl.yaml +++ b/release/k8s_tests/compute_tpl.yaml @@ -9,7 +9,7 @@ head_node_type: 
worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/k8s_tests/ray_v1alpha1_rayservice_template.yaml b/release/k8s_tests/ray_v1alpha1_rayservice_template.yaml index c47e9572ceb1..7bed4fc1066c 100644 --- a/release/k8s_tests/ray_v1alpha1_rayservice_template.yaml +++ b/release/k8s_tests/ray_v1alpha1_rayservice_template.yaml @@ -126,7 +126,7 @@ spec: serviceType: ClusterIP # the pod replicas in this group typed head (assuming there could be more than 1 in the future) replicas: 1 - # logical group name, for this called head-group, also can be functional + # logical group name, for this called headgroup, also can be functional # pod type head or worker # rayNodeType: head # Not needed since it is under the headgroup # the following params are used to complete the ray start: ray start --head --block --redis-port=6379 ... diff --git a/release/long_running_distributed_tests/compute_tpl.yaml b/release/long_running_distributed_tests/compute_tpl.yaml index 1fe5db1b9f87..68c144d651d3 100644 --- a/release/long_running_distributed_tests/compute_tpl.yaml +++ b/release/long_running_distributed_tests/compute_tpl.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 2 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/many_ppo.yaml b/release/long_running_tests/many_ppo.yaml index 63c8145d992e..941dd0fa091d 100644 --- a/release/long_running_tests/many_ppo.yaml +++ b/release/long_running_tests/many_ppo.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/many_ppo_gce.yaml b/release/long_running_tests/many_ppo_gce.yaml index 7be96bd65462..9981b725f770 100644 --- a/release/long_running_tests/many_ppo_gce.yaml +++ b/release/long_running_tests/many_ppo_gce.yaml @@ -11,7 +11,7 @@ head_node_type: worker_node_types: [] -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/long_running_tests/tpl_cpu_1.yaml b/release/long_running_tests/tpl_cpu_1.yaml index 0798e51be0d3..f09553f38347 100644 --- a/release/long_running_tests/tpl_cpu_1.yaml +++ b/release/long_running_tests/tpl_cpu_1.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/tpl_cpu_1_c5.yaml b/release/long_running_tests/tpl_cpu_1_c5.yaml index 6711e8ea7b66..d6cd546b8951 100644 --- a/release/long_running_tests/tpl_cpu_1_c5.yaml +++ b/release/long_running_tests/tpl_cpu_1_c5.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/tpl_cpu_1_c5_gce.yaml b/release/long_running_tests/tpl_cpu_1_c5_gce.yaml index 1f6a428ca7ec..52c35b2508e6 100644 --- a/release/long_running_tests/tpl_cpu_1_c5_gce.yaml +++ b/release/long_running_tests/tpl_cpu_1_c5_gce.yaml @@ -11,7 +11,7 @@ head_node_type: worker_node_types: [] -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/long_running_tests/tpl_cpu_1_gce.yaml b/release/long_running_tests/tpl_cpu_1_gce.yaml index 807ac26cac60..48eef743adfb 100644 --- a/release/long_running_tests/tpl_cpu_1_gce.yaml +++ 
b/release/long_running_tests/tpl_cpu_1_gce.yaml @@ -11,7 +11,7 @@ head_node_type: worker_node_types: [] -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/long_running_tests/tpl_cpu_1_large.yaml b/release/long_running_tests/tpl_cpu_1_large.yaml index ebe2058ab8b9..87e8548fc087 100644 --- a/release/long_running_tests/tpl_cpu_1_large.yaml +++ b/release/long_running_tests/tpl_cpu_1_large.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/tpl_cpu_1_large_gce.yaml b/release/long_running_tests/tpl_cpu_1_large_gce.yaml index f9392a87032a..93c00da4d01e 100644 --- a/release/long_running_tests/tpl_cpu_1_large_gce.yaml +++ b/release/long_running_tests/tpl_cpu_1_large_gce.yaml @@ -11,7 +11,7 @@ head_node_type: worker_node_types: [] -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/long_running_tests/tpl_cpu_2.yaml b/release/long_running_tests/tpl_cpu_2.yaml index 7e249f7ec82e..94b54c63b6d1 100644 --- a/release/long_running_tests/tpl_cpu_2.yaml +++ b/release/long_running_tests/tpl_cpu_2.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/tpl_cpu_3.yaml b/release/long_running_tests/tpl_cpu_3.yaml index 54ac50b78e38..4821923fe71c 100644 --- a/release/long_running_tests/tpl_cpu_3.yaml +++ b/release/long_running_tests/tpl_cpu_3.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 2 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/tpl_cpu_3_gce.yaml b/release/long_running_tests/tpl_cpu_3_gce.yaml index e08b73838512..c9d24ec1dd71 100644 --- a/release/long_running_tests/tpl_cpu_3_gce.yaml +++ b/release/long_running_tests/tpl_cpu_3_gce.yaml @@ -16,7 +16,7 @@ worker_node_types: max_workers: 2 use_spot: false -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/long_running_tests/tpl_cpu_4.yaml b/release/long_running_tests/tpl_cpu_4.yaml index c08501e94c67..43e2adbdad8a 100644 --- a/release/long_running_tests/tpl_cpu_4.yaml +++ b/release/long_running_tests/tpl_cpu_4.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/tpl_cpu_4_gce.yaml b/release/long_running_tests/tpl_cpu_4_gce.yaml index 4525893848e1..3a56f551ccc3 100644 --- a/release/long_running_tests/tpl_cpu_4_gce.yaml +++ b/release/long_running_tests/tpl_cpu_4_gce.yaml @@ -16,7 +16,7 @@ worker_node_types: max_workers: 3 use_spot: false -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/microbenchmark/experimental/accelerated_dag_gpu_microbenchmark.py b/release/microbenchmark/experimental/accelerated_dag_gpu_microbenchmark.py index f440e72752fb..895d43bdcdab 100644 --- a/release/microbenchmark/experimental/accelerated_dag_gpu_microbenchmark.py +++ b/release/microbenchmark/experimental/accelerated_dag_gpu_microbenchmark.py @@ -58,13 +58,16 @@ class TorchTensorWorker: def __init__(self): self.device = torch_utils.get_devices()[0] - def send(self, shape, dtype, value: int): 
- t = torch.ones(shape, dtype=dtype, device=self.device) * value + def send(self, shape, dtype, _): + t = torch.ones(shape, dtype=dtype, device=self.device) * 1 return t def recv(self, tensor): + # This benchmark tests the overhead of sending a tensor between + # actors. To minimize the overhead of shared memory transfer, + # we return only a byte string. assert tensor.device == self.device - return (tensor[0].item(), tensor.shape, tensor.dtype) + return b"x" @ray.remote(num_gpus=1) @@ -139,17 +142,15 @@ def exec_ray_dag( dag = dag.experimental_compile() def _run(): - i = np.random.randint(100) - ref = dag.execute(i) + ref = dag.execute(b"x") result = ray.get(ref) - assert result == (i, SHAPE, DTYPE) + assert result == b"x" else: def _run(): - i = np.random.randint(100) - result = ray.get(dag.execute(i)) - assert result == (i, SHAPE, DTYPE) + result = ray.get(dag.execute(b"x")) + assert result == b"x" results = timeit(label, _run) diff --git a/release/ml_user_tests/horovod/compute_tpl_aws.yaml b/release/ml_user_tests/horovod/compute_tpl_aws.yaml index 6c518ba272c1..61999ce38e8c 100644 --- a/release/ml_user_tests/horovod/compute_tpl_aws.yaml +++ b/release/ml_user_tests/horovod/compute_tpl_aws.yaml @@ -14,7 +14,7 @@ worker_node_types: min_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/ml_user_tests/horovod/compute_tpl_gce.yaml b/release/ml_user_tests/horovod/compute_tpl_gce.yaml index 2cad8d220fba..d2d46997fd4f 100644 --- a/release/ml_user_tests/horovod/compute_tpl_gce.yaml +++ b/release/ml_user_tests/horovod/compute_tpl_gce.yaml @@ -16,7 +16,7 @@ worker_node_types: min_workers: 3 use_spot: false -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml b/release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml index c4166af67959..376fd90539c7 100644 --- a/release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml +++ b/release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml @@ -20,7 +20,7 @@ worker_node_types: max_workers: 2 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/chaos_test/compute_template.yaml b/release/nightly_tests/chaos_test/compute_template.yaml index 4421ed956e29..f91504fb6937 100644 --- a/release/nightly_tests/chaos_test/compute_template.yaml +++ b/release/nightly_tests/chaos_test/compute_template.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: IamInstanceProfile: {"Name": "ray-autoscaler-v1"} head_node_type: diff --git a/release/nightly_tests/dask_on_ray/1tb_sort_compute.yaml b/release/nightly_tests/dask_on_ray/1tb_sort_compute.yaml index 80c5ea1325da..7bc19c0bf2f9 100644 --- a/release/nightly_tests/dask_on_ray/1tb_sort_compute.yaml +++ b/release/nightly_tests/dask_on_ray/1tb_sort_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dask_on_ray/chaos_dask_on_ray_stress_compute.yaml b/release/nightly_tests/dask_on_ray/chaos_dask_on_ray_stress_compute.yaml index e249486f0377..838abd890c33 100644 --- a/release/nightly_tests/dask_on_ray/chaos_dask_on_ray_stress_compute.yaml +++ b/release/nightly_tests/dask_on_ray/chaos_dask_on_ray_stress_compute.yaml @@ -1,7 +1,7 @@ cloud_id: 
{{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dask_on_ray/dask_on_ray_sort_compute_template.yaml b/release/nightly_tests/dask_on_ray/dask_on_ray_sort_compute_template.yaml index da67eec060c4..e87043b3d435 100644 --- a/release/nightly_tests/dask_on_ray/dask_on_ray_sort_compute_template.yaml +++ b/release/nightly_tests/dask_on_ray/dask_on_ray_sort_compute_template.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute.yaml b/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute.yaml index e249486f0377..838abd890c33 100644 --- a/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute.yaml +++ b/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute_k8s.yaml b/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute_k8s.yaml index 387ce28c725a..6e891770737b 100644 --- a/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute_k8s.yaml +++ b/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute_k8s.yaml @@ -1,7 +1,7 @@ cloud_id: cld_HSrCZdMCYDe1NmMCJhYRgQ4p region: us-west-2 -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/nightly_tests/dask_on_ray/large_scale_dask_on_ray_compute_template.yaml b/release/nightly_tests/dask_on_ray/large_scale_dask_on_ray_compute_template.yaml index aea2d4f78b07..67838b838f5d 100644 --- a/release/nightly_tests/dask_on_ray/large_scale_dask_on_ray_compute_template.yaml +++ b/release/nightly_tests/dask_on_ray/large_scale_dask_on_ray_compute_template.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dataset/aggregate_benchmark.py b/release/nightly_tests/dataset/aggregate_benchmark.py deleted file mode 100644 index 8085ed0ca6d9..000000000000 --- a/release/nightly_tests/dataset/aggregate_benchmark.py +++ /dev/null @@ -1,141 +0,0 @@ -from typing import Tuple - -import ray -from ray.data._internal.aggregate import ( - _AggregateOnKeyBase, - Max, - Mean, - Min, - Sum, -) -from ray.data.block import Block -from ray.data.dataset import Dataset -import pyarrow.compute as pac - -from benchmark import Benchmark - - -def run_h2oai(benchmark: Benchmark): - """This benchmark is originally from https://github.com/h2oai/db-benchmark - - Here we run all group-by queries from the benchmark on Ray Datasets. - The input files are pre-generated and stored in AWS S3 beforehand. - """ - - # Test input file schema={ - # id1: string, id2: string, id3: string, id4: int64, id5: int64, id6: int64, - # v1: int64, v2: int64, v3: double - # }) - test_input = [ - ("s3://air-example-data/h2oai_benchmark/G1_1e7_1e2_0_0.csv", "h2oai-500M") - ] - for path, test_name in test_input: - input_ds = ray.data.read_csv(path) - # Number of blocks (parallelism) should be set as number of available CPUs - # to get best performance. 
- num_blocks = int(ray.cluster_resources().get("CPU", 1)) - input_ds = input_ds.repartition(num_blocks).materialize() - - q_list = [ - (h2oai_q1, "q1"), - (h2oai_q3, "q3"), - (h2oai_q4, "q4"), - (h2oai_q5, "q5"), - (h2oai_q7, "q7"), - (h2oai_q8, "q8"), - ] - - for q, name in q_list: - benchmark.run_materialize_ds(f"{test_name}-{name}", q, ds=input_ds) - - -def h2oai_q1(ds: Dataset) -> Dataset: - return ds.groupby("id1").sum("v1") - - -def h2oai_q2(ds: Dataset) -> Dataset: - # TODO(chengsu): Run this after dataset supports multiple group-by keys. - # return ds.groupby(["id1", "id2"]).sum("v1") - raise NotImplementedError - - -def h2oai_q3(ds: Dataset) -> Dataset: - return ds.groupby("id3").aggregate(Sum("v1"), Mean("v3")) - - -def h2oai_q4(ds: Dataset) -> Dataset: - return ds.groupby("id4").aggregate(Mean("v1"), Mean("v2"), Mean("v3")) - - -def h2oai_q5(ds: Dataset) -> Dataset: - return ds.groupby("id6").aggregate(Sum("v1"), Sum("v2"), Sum("v3")) - - -def h2oai_q6(ds: Dataset) -> Dataset: - # TODO(chengsu): Run this after dataset supports multiple group-by keys. - # return ds.groupby(["id4", "id5"]).aggregate(Median("v3"), Std("v3")) - raise NotImplementedError - - -def h2oai_q7(ds: Dataset) -> Dataset: - ds = ds.groupby("id3").aggregate(Max("v1"), Min("v2")) - ds = ds.map_batches( - lambda df: df.assign(result=df["max(v1)"] - df["min(v2)"]), - batch_format="pandas", - ) - return ds - - -def h2oai_q8(ds: Dataset) -> Dataset: - def accumulate_block(agg: Tuple[float, float], block: Block) -> Tuple[float, float]: - column = block["v3"] - top_k_indices = pac.top_k_unstable(column, k=2) - top_k_result = pac.take(column, top_k_indices).to_pylist() - top_k_result.extend([float("-inf")] * (2 - len(top_k_result))) - top_k_result = (top_k_result[0], top_k_result[1]) - return merge(agg, top_k_result) - - def merge( - agg1: Tuple[float, float], - agg2: Tuple[float, float], - ) -> Tuple[float, float]: - if agg1[0] >= agg2[0]: - value1 = agg1[0] - value2 = max(agg1[1], agg2[0]) - else: - value1 = agg2[0] - value2 = max(agg1[0], agg2[1]) - return (value1, value2) - - class Top2(_AggregateOnKeyBase): - def __init__(self, on): - self._set_key_fn(on) - super().__init__( - init=lambda _: (float("-inf"), float("-inf")), - merge=merge, - accumulate_block=accumulate_block, - name=(f"top2({str(on)})"), - ) - - return ds.groupby("id6").aggregate(Top2("v3")) - - -def h2oai_q9(ds: Dataset) -> Dataset: - # TODO(chengsu): Run this after dataset supports multiple group-by keys. - # return ds.groupby(["id2", "id4"]).aggregate(pow(corr("v1", "v2"), 2)) - raise NotImplementedError - - -def h2oai_q10(ds: Dataset) -> Dataset: - # TODO(chengsu): Run this after dataset supports multiple group-by keys. 
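Several of the deleted queries above are marked as blocked on multi-key group-by support. For reference, this is roughly what a q2-style query looks like once multiple keys are supported (a sketch assuming a Ray Data version whose groupby accepts a list of keys):

import ray

ds = ray.data.from_items(
    [{"id1": f"a{i % 2}", "id2": f"b{i % 3}", "v1": i} for i in range(12)]
)
# Group by two keys at once and sum v1 within each (id1, id2) pair.
print(ds.groupby(["id1", "id2"]).sum("v1").take_all())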
- # return ds.groupby(["id1", "id2", "id3", "id4", "id5", "id6"]) - # .aggregate(Count(), Sum("v3")) - raise NotImplementedError - - -if __name__ == "__main__": - benchmark = Benchmark("aggregate") - - run_h2oai(benchmark) - - benchmark.write_result() diff --git a/release/nightly_tests/dataset/compute_gpu_4x4_aws.yaml b/release/nightly_tests/dataset/compute_gpu_4x4_aws.yaml index df0eb98234ea..e56edf8bbf28 100644 --- a/release/nightly_tests/dataset/compute_gpu_4x4_aws.yaml +++ b/release/nightly_tests/dataset/compute_gpu_4x4_aws.yaml @@ -14,7 +14,7 @@ worker_node_types: min_workers: 3 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dataset/multi_node_autoscaling_compute.yaml b/release/nightly_tests/dataset/multi_node_autoscaling_compute.yaml new file mode 100644 index 000000000000..7b3612d3b4b8 --- /dev/null +++ b/release/nightly_tests/dataset/multi_node_autoscaling_compute.yaml @@ -0,0 +1,18 @@ +# This config matches the default config for Anyscale workspaces with autoscaling. +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +max_workers: 0 + +head_node_type: + name: head_node + instance_type: m5.2xlarge + resources: + cpu: 0 + +worker_node_types: + - name: worker_node + instance_type: m5.2xlarge + min_workers: 0 + max_workers: 10 + use_spot: false diff --git a/release/nightly_tests/dataset/multi_node_read_images_benchmark_compute.yaml b/release/nightly_tests/dataset/multi_node_read_images_benchmark_compute.yaml deleted file mode 100644 index 9655daad50cd..000000000000 --- a/release/nightly_tests/dataset/multi_node_read_images_benchmark_compute.yaml +++ /dev/null @@ -1,15 +0,0 @@ -cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} -region: us-west-2 - -max_workers: 19 - -head_node_type: - name: head_node - instance_type: m5.16xlarge - -worker_node_types: - - name: worker_node - instance_type: m5.4xlarge - max_workers: 19 - min_workers: 19 - use_spot: false diff --git a/release/nightly_tests/dataset/multi_node_read_images_benchmark_compute_gce.yaml b/release/nightly_tests/dataset/multi_node_read_images_benchmark_compute_gce.yaml deleted file mode 100644 index bca10d5c5447..000000000000 --- a/release/nightly_tests/dataset/multi_node_read_images_benchmark_compute_gce.yaml +++ /dev/null @@ -1,17 +0,0 @@ -cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} -region: us-west1 -allowed_azs: - - us-west1-c - -max_workers: 19 - -head_node_type: - name: head_node - instance_type: n2-standard-64 # aws m5.16xlarge - -worker_node_types: - - name: worker_node - instance_type: n2-standard-16 # aws m5.4xlarge - max_workers: 19 - min_workers: 19 - use_spot: false diff --git a/release/nightly_tests/dataset/parquet_metadata_resolution.py b/release/nightly_tests/dataset/parquet_metadata_resolution.py deleted file mode 100644 index b9f634f3f17f..000000000000 --- a/release/nightly_tests/dataset/parquet_metadata_resolution.py +++ /dev/null @@ -1,36 +0,0 @@ -import argparse -import os - -from benchmark import Benchmark - -parser = argparse.ArgumentParser(description="Parquet Metadata Read") -parser.add_argument("--num-files", type=int, default=30) -parser.add_argument("--cloud", type=str, choices=["aws", "gcp"]) - - -if __name__ == "__main__": - args = parser.parse_args() - import ray - - print("Connecting to Ray cluster...") - ray.init(address="auto") - - num = args.num_files - - assert args.cloud in {"aws", "gcp"}, args.cloud - if args.cloud == "aws": - prefix = "s3://shuffling-data-loader-benchmarks/data/r10_000_000_000-f1000" - if 
args.cloud == "gcp": - # NOTE(@bveeramani): I made a mistake while transferring the files from S3 to - # GCS, so there's an extra "r10_000_000_000-f1000" in the URI. Don't worry about - # it. The files are the same. - prefix = "gs://shuffling-data-loader-benchmarks/data/r10_000_000_000-f1000/r10_000_000_000-f1000" # noqa: E501 - files = [f"{prefix}/input_data_{i}.parquet.snappy" for i in range(args.num_files)] - - def _trigger_parquet_metadata_load(): - # This should only read Parquet metadata. - ray.data.read_parquet(files).count() - - benchmark = Benchmark("parquet_metadata_resolution") - benchmark.run_fn("read_metadata", _trigger_parquet_metadata_load) - benchmark.write_result(os.environ["TEST_OUTPUT_JSON"]) diff --git a/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml b/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml index b8b25b2def6c..0ed874893d1d 100644 --- a/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml +++ b/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 999 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dataset/read_and_consume_benchmark.py b/release/nightly_tests/dataset/read_and_consume_benchmark.py new file mode 100644 index 000000000000..f833d27035ce --- /dev/null +++ b/release/nightly_tests/dataset/read_and_consume_benchmark.py @@ -0,0 +1,69 @@ +import ray + +from benchmark import Benchmark + +import argparse +from typing import Callable + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("path", type=str) + parser.add_argument( + "--format", + choices=["image", "parquet"], + required=True, + ) + + consume_group = parser.add_mutually_exclusive_group() + consume_group.add_argument("--count", action="store_true") + consume_group.add_argument("--iterate", action="store_true") + + return parser.parse_args() + + +def main(args): + benchmark = Benchmark("read-and-consume") + read_fn = get_read_fn(args) + consume_fn = get_consume_fn(args) + + def benchmark_fn(): + ds = read_fn(args.path) + consume_fn(ds) + + benchmark.run_fn(str(vars(args)), benchmark_fn) + benchmark.write_result() + + +def get_read_fn(args: argparse.Namespace) -> Callable[[str], ray.data.Dataset]: + if args.format == "image": + read_fn = ray.data.read_images + elif args.format == "parquet": + read_fn = ray.data.read_parquet + else: + assert False, f"Invalid data format argument: {args}" + + return read_fn + + +def get_consume_fn(args: argparse.Namespace) -> Callable[[ray.data.Dataset], None]: + if args.count: + + def consume_fn(ds): + ds.count() + + elif args.iterate: + + def consume_fn(ds): + for _ in ds.iter_internal_ref_bundles(): + pass + + else: + assert False, f"Invalid consume arguments: {args}" + + return consume_fn + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/release/nightly_tests/dataset/read_images_benchmark.py b/release/nightly_tests/dataset/read_images_benchmark.py deleted file mode 100644 index 2a5f68db6e2f..000000000000 --- a/release/nightly_tests/dataset/read_images_benchmark.py +++ /dev/null @@ -1,148 +0,0 @@ -import argparse -import os -import random -import shutil -import tempfile -from typing import List, Tuple - -from PIL import Image - -import ray - -from benchmark import Benchmark - - -def parse_args(): - parser = argparse.ArgumentParser() - group = parser.add_mutually_exclusive_group() - group.add_argument( - 
"--single-node", - action="store_true", - help="Run single-node read_images benchmark.", - ) - group.add_argument( - "--multi-node", - action="store_true", - help="Run multi-node read_images benchmark.", - ) - return parser.parse_args() - - -def main(args): - ray.init() - - benchmark = Benchmark("read-images") - if args.single_node: - run_images_benchmark_single_node(benchmark) - elif args.multi_node: - run_images_benchmark_multi_node(benchmark) - - benchmark.write_result() - - -def generate_images( - num_images: int, sizes: List[Tuple[int, int]], modes: List[str], formats: List[str] -) -> str: - - dimensions = [] - for mode in modes: - if mode in ["1", "L", "P"]: - dimension = 1 - elif mode in ["RGB", "YCbCr", "LAB", "HSV"]: - dimension = 3 - elif mode in ["RGBA", "CMYK", "I", "F"]: - dimension = 4 - else: - raise ValueError(f"Found unknown image mode: {mode}.") - dimensions.append(dimension) - - images_dir = tempfile.mkdtemp() - - for image_idx in range(num_images): - size = random.choice(sizes) - file_format = random.choice(formats) - mode_idx = random.randrange(len(modes)) - mode = modes[mode_idx] - dimension = dimensions[mode_idx] - - width, height = size - file_name = f"{images_dir}/{image_idx}.{file_format}" - pixels_per_dimension = [] - for _ in range(dimension): - pixels = os.urandom(width * height) - pixels_per_dimension.append(pixels) - - image = Image.new(mode, size) - if len(pixels_per_dimension) == 1: - image.putdata(pixels_per_dimension[0]) - else: - image.putdata(list(zip(*pixels_per_dimension))) - image.save(file_name) - - return images_dir - - -def run_images_benchmark_single_node(benchmark: Benchmark): - # Set global random seed. - random.seed(42) - - test_input = [ - generate_images(100, [(256, 256)], ["RGB"], ["jpg"]), - generate_images(100, [(2048, 2048)], ["RGB"], ["jpg"]), - generate_images( - 1000, [(64, 64), (256, 256)], ["RGB", "L"], ["jpg", "jpeg", "png"] - ), - ] - - benchmark.run_materialize_ds( - "images-100-256-rbg-jpg", ray.data.read_images, test_input[0] - ) - benchmark.run_materialize_ds( - "images-100-2048-rbg-jpg", ray.data.read_images, test_input[1] - ) - benchmark.run_materialize_ds( - "images-100-2048-to-256-rbg-jpg", - ray.data.read_images, - test_input[1], - size=(256, 256), - ) - benchmark.run_materialize_ds( - "images-1000-mix", - ray.data.read_images, - test_input[2], - size=(256, 256), - mode="RGB", - ) - - for root in test_input: - shutil.rmtree(root) - - # TODO(chengsu): run benchmark on 20G and 100G imagenet data in multi-nodes - # cluster. 
- benchmark.run_materialize_ds( - "images-imagenet-1g", - ray.data.read_images, - "s3://air-example-data-2/1G-image-data-synthetic-raw", - ) - - -def run_images_benchmark_multi_node(benchmark: Benchmark): - hundred_thousand_image_paths = [ - f"s3://air-example-data-2/100k-images-data-synthetic-raw/dog_{i}/dog_0.jpg" - for i in range(100_000) - ] - hundred_million_image_paths = [] - for _ in range(100_000_000 // 100_000): - hundred_million_image_paths.extend(hundred_thousand_image_paths) - - def fn(): - ds = ray.data.read_images(hundred_million_image_paths) - for _ in ds.iter_batches(batch_size=None, batch_format="pyarrow"): - pass - - benchmark.run_fn("images-100M", fn) - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/release/nightly_tests/dataset/read_parquet_benchmark.py b/release/nightly_tests/dataset/read_parquet_benchmark.py deleted file mode 100644 index 96ceff9ff55c..000000000000 --- a/release/nightly_tests/dataset/read_parquet_benchmark.py +++ /dev/null @@ -1,120 +0,0 @@ -import ray -from ray.data.dataset import Dataset - -from benchmark import Benchmark -from parquet_data_generator import generate_data - -import shutil -import tempfile -from typing import Optional - - -def read_parquet( - root: str, - override_num_blocks: Optional[int] = None, - use_threads: bool = False, - filter=None, - columns=None, -) -> Dataset: - return ray.data.read_parquet( - paths=root, - override_num_blocks=override_num_blocks, - use_threads=use_threads, - filter=filter, - columns=columns, - ) - - -def run_read_parquet_benchmark(benchmark: Benchmark): - # Test with different override_num_blocks (multi-processing for single node) - # and threading. - for override_num_blocks in [1, 2, 4]: - for use_threads in [True, False]: - test_name = f"read-parquet-downsampled-nyc-taxi-2009-{override_num_blocks}-{use_threads}" # noqa: E501 - benchmark.run_materialize_ds( - test_name, - read_parquet, - root="s3://anonymous@air-example-data/ursa-labs-taxi-data/downsampled_2009_full_year_data.parquet", # noqa: E501 - override_num_blocks=override_num_blocks, - use_threads=use_threads, - ) - - # TODO: Test below is currently excluded, due to failure around - # pickling the Dataset involving the filter expression. - # The error is present on Python < 3.8, and involves the pickle/pickle5 - # libraries. `pickle` is included as a default library from Python 3.8+, - # whereas Python versions before this must import the backported `pickle5` library - # to maintain the same functionality. - - # Test with projection and filter pushdowns. - # Since we have projection and filter pushdown, we can run the read on the full - # size of one year data fast enough on a single node. - # test_name = "read-parquet-nyc-taxi-2018-pushdown" - # filter_expr = (pa.dataset.field("passenger_count") <= 10) & ( - # pa.dataset.field("passenger_count") > 0 - # ) - # benchmark.run( - # test_name, - # read_parquet, - # root="s3://anonymous@air-example-data/ursa-labs-taxi-data/by_year/2018", - # columns=["passenger_count", "trip_distance"], - # filter=filter_expr, - # ) - - # Test with different number files to handle: from a few to many. - data_dirs = [] - # Each test set has same total number of rows, which are distributed - # to different number of files. 
- total_rows = 1024 * 1024 * 8 - for num_files in [8, 128, 1024]: - for compression in ["snappy", "gzip"]: - data_dirs.append(tempfile.mkdtemp()) - generate_data( - num_rows=total_rows, - num_files=num_files, - num_row_groups_per_file=16, - compression=compression, - data_dir=data_dirs[-1], - ) - test_name = f"read-parquet-random-data-{num_files}-{compression}" - benchmark.run_materialize_ds( - test_name, - read_parquet, - root=data_dirs[-1], - override_num_blocks=1, # We are testing one task to handle N files - ) - for dir in data_dirs: - shutil.rmtree(dir) - - # Test reading many small files. - num_files = 1000 - num_row_groups_per_file = 2 - total_rows = num_files * num_row_groups_per_file - compression = "gzip" - - many_files_dir = "s3://air-example-data-2/read-many-parquet-files/" - # If needed, use the following utility to generate files on S3. - # Otherwise, the benchmark will read pre-generated files in the above bucket. - # generate_data( - # num_rows=total_rows, - # num_files=num_files, - # num_row_groups_per_file=num_row_groups_per_file, - # compression=compression, - # data_dir=many_files_dir, - # ) - test_name = f"read-many-parquet-files-s3-{num_files}-{compression}" - benchmark.run_materialize_ds( - test_name, - read_parquet, - root=many_files_dir, - ) - - -if __name__ == "__main__": - ray.init() - - benchmark = Benchmark("read-parquet") - - run_read_parquet_benchmark(benchmark) - - benchmark.write_result() diff --git a/release/nightly_tests/dataset/read_tfrecords_benchmark.py b/release/nightly_tests/dataset/read_tfrecords_benchmark.py index 48d1bb229195..34fecb02fd41 100644 --- a/release/nightly_tests/dataset/read_tfrecords_benchmark.py +++ b/release/nightly_tests/dataset/read_tfrecords_benchmark.py @@ -1,3 +1,4 @@ +import os import random import shutil import tempfile @@ -7,11 +8,47 @@ from ray.data.dataset import Dataset from benchmark import Benchmark -from read_images_benchmark import generate_images +from PIL import Image import pyarrow as pa import numpy as np +def generate_images( + num_images: int, sizes: List[Tuple[int, int]], modes: List[str], formats: List[str] +) -> str: + dimensions = [] + for mode in modes: + if mode in ["1", "L", "P"]: + dimension = 1 + elif mode in ["RGB", "YCbCr", "LAB", "HSV"]: + dimension = 3 + elif mode in ["RGBA", "CMYK", "I", "F"]: + dimension = 4 + else: + raise ValueError(f"Found unknown image mode: {mode}.") + dimensions.append(dimension) + images_dir = tempfile.mkdtemp() + for image_idx in range(num_images): + size = random.choice(sizes) + file_format = random.choice(formats) + mode_idx = random.randrange(len(modes)) + mode = modes[mode_idx] + dimension = dimensions[mode_idx] + width, height = size + file_name = f"{images_dir}/{image_idx}.{file_format}" + pixels_per_dimension = [] + for _ in range(dimension): + pixels = os.urandom(width * height) + pixels_per_dimension.append(pixels) + image = Image.new(mode, size) + if len(pixels_per_dimension) == 1: + image.putdata(pixels_per_dimension[0]) + else: + image.putdata(list(zip(*pixels_per_dimension))) + image.save(file_name) + return images_dir + + def read_tfrecords(path: str) -> Dataset: return ray.data.read_tfrecords(paths=path).materialize() diff --git a/release/nightly_tests/dataset/shuffle_compute.yaml b/release/nightly_tests/dataset/shuffle_compute.yaml index eb7aacc0b8e7..b776f7edfa4c 100644 --- a/release/nightly_tests/dataset/shuffle_compute.yaml +++ b/release/nightly_tests/dataset/shuffle_compute.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 999 -aws: 
+advanced_configurations_json: IamInstanceProfile: {"Name": "ray-autoscaler-v1"} BlockDeviceMappings: - DeviceName: /dev/sda1 diff --git a/release/nightly_tests/decision_tree/autoscaling_compute.yaml b/release/nightly_tests/decision_tree/autoscaling_compute.yaml index 4eb361e1bcae..3031267a1bd8 100644 --- a/release/nightly_tests/decision_tree/autoscaling_compute.yaml +++ b/release/nightly_tests/decision_tree/autoscaling_compute.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 10 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/decision_tree/autoscaling_compute_gce.yaml b/release/nightly_tests/decision_tree/autoscaling_compute_gce.yaml index b54d1d50f812..5b7072d5b30f 100644 --- a/release/nightly_tests/decision_tree/autoscaling_compute_gce.yaml +++ b/release/nightly_tests/decision_tree/autoscaling_compute_gce.yaml @@ -1,11 +1,11 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c max_workers: 10 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/nightly_tests/placement_group_tests/compute.yaml b/release/nightly_tests/placement_group_tests/compute.yaml index d0fe68b4c17c..3baa53f9f36c 100644 --- a/release/nightly_tests/placement_group_tests/compute.yaml +++ b/release/nightly_tests/placement_group_tests/compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: @@ -20,7 +20,7 @@ worker_node_types: use_spot: false - name: fake_gpu_node instance_type: m5.4xlarge - min_workers: 0 + min_workers: 0 max_workers: 2 use_spot: false resources: diff --git a/release/nightly_tests/placement_group_tests/long_running_test_compute.yaml b/release/nightly_tests/placement_group_tests/long_running_test_compute.yaml index cc3005c09c5b..d990178123f0 100644 --- a/release/nightly_tests/placement_group_tests/long_running_test_compute.yaml +++ b/release/nightly_tests/placement_group_tests/long_running_test_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/placement_group_tests/pg_perf_test_compute.yaml b/release/nightly_tests/placement_group_tests/pg_perf_test_compute.yaml index 8764e0f6c4df..a3e1852cc568 100644 --- a/release/nightly_tests/placement_group_tests/pg_perf_test_compute.yaml +++ b/release/nightly_tests/placement_group_tests/pg_perf_test_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/shuffle/100tb_shuffle_compute.yaml b/release/nightly_tests/shuffle/100tb_shuffle_compute.yaml index 14a02cfa030e..17ed94f5d623 100644 --- a/release/nightly_tests/shuffle/100tb_shuffle_compute.yaml +++ b/release/nightly_tests/shuffle/100tb_shuffle_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances.yaml b/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances.yaml index b6f95b050839..f6a658058a5e 100644 --- a/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances.yaml +++ 
b/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/shuffle/shuffle_compute_autoscaling.yaml b/release/nightly_tests/shuffle/shuffle_compute_autoscaling.yaml index 38091a3f12b6..57653d15408a 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_autoscaling.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_autoscaling.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/shuffle/shuffle_compute_autoscaling_gce.yaml b/release/nightly_tests/shuffle/shuffle_compute_autoscaling_gce.yaml index 4a7af41b667c..30d141e8b544 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_autoscaling_gce.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_autoscaling_gce.yaml @@ -1,9 +1,9 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/nightly_tests/shuffle/shuffle_compute_large_scale.yaml b/release/nightly_tests/shuffle/shuffle_compute_large_scale.yaml index 27268db12a0d..7b2a779f6cd0 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_large_scale.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_large_scale.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/shuffle/shuffle_compute_multi.yaml b/release/nightly_tests/shuffle/shuffle_compute_multi.yaml index a726988aeda0..841d019c9545 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_multi.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_multi.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 3 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/shuffle/shuffle_compute_multi_gce.yaml b/release/nightly_tests/shuffle/shuffle_compute_multi_gce.yaml index 1332e3390e97..ff95d850892d 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_multi_gce.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_multi_gce.yaml @@ -1,11 +1,11 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c max_workers: 3 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/nightly_tests/shuffle/shuffle_compute_single.yaml b/release/nightly_tests/shuffle/shuffle_compute_single.yaml index df8d84edc81f..16b4bec73a91 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_single.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_single.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 0 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/stress_tests/placement_group_tests_compute.yaml b/release/nightly_tests/stress_tests/placement_group_tests_compute.yaml index 275b0a0a7d36..9b5476d95624 100644 --- a/release/nightly_tests/stress_tests/placement_group_tests_compute.yaml +++ b/release/nightly_tests/stress_tests/placement_group_tests_compute.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 5 -aws: 
+advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: @@ -12,18 +12,17 @@ aws: head_node_type: name: head_node - instance_type: m4.16xlarge + instance_type: m6i.16xlarge resources: cpu: 64 worker_node_types: - name: worker_node - instance_type: m4.large + instance_type: m6i.large min_workers: 5 max_workers: 5 use_spot: false resources: - cpu: 2 + cpu: 2 custom_resources: pg_custom: 666 - diff --git a/release/nightly_tests/stress_tests/placement_group_tests_compute_gce.yaml b/release/nightly_tests/stress_tests/placement_group_tests_compute_gce.yaml index 0e0285848708..b22a790f1b18 100644 --- a/release/nightly_tests/stress_tests/placement_group_tests_compute_gce.yaml +++ b/release/nightly_tests/stress_tests/placement_group_tests_compute_gce.yaml @@ -1,11 +1,11 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c max_workers: 5 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: @@ -25,7 +25,7 @@ worker_node_types: max_workers: 5 use_spot: false resources: - cpu: 2 + cpu: 2 custom_resources: pg_custom: 666 diff --git a/release/nightly_tests/stress_tests/smoke_test_compute.yaml b/release/nightly_tests/stress_tests/smoke_test_compute.yaml index 0af96e62373b..9ae9ea54cfe4 100644 --- a/release/nightly_tests/stress_tests/smoke_test_compute.yaml +++ b/release/nightly_tests/stress_tests/smoke_test_compute.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 4 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: @@ -12,11 +12,11 @@ aws: head_node_type: name: head_node - instance_type: m4.4xlarge + instance_type: m6i.4xlarge worker_node_types: - name: worker_node - instance_type: m4.large + instance_type: m6i.large min_workers: 4 max_workers: 4 use_spot: false diff --git a/release/nightly_tests/stress_tests/stress_test_threaded_actor_compute.yaml b/release/nightly_tests/stress_tests/stress_test_threaded_actor_compute.yaml index 93a576e66333..88ba8049b087 100644 --- a/release/nightly_tests/stress_tests/stress_test_threaded_actor_compute.yaml +++ b/release/nightly_tests/stress_tests/stress_test_threaded_actor_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/stress_tests/stress_tests_compute.yaml b/release/nightly_tests/stress_tests/stress_tests_compute.yaml index 5e9acaf7f7e8..9b85c0723e98 100644 --- a/release/nightly_tests/stress_tests/stress_tests_compute.yaml +++ b/release/nightly_tests/stress_tests/stress_tests_compute.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 100 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: @@ -12,13 +12,13 @@ aws: head_node_type: name: head_node - instance_type: m4.16xlarge + instance_type: m6i.16xlarge resources: cpu: 64 worker_node_types: - name: worker_node - instance_type: m4.large + instance_type: m6i.large min_workers: 100 max_workers: 100 use_spot: false diff --git a/release/nightly_tests/stress_tests/stress_tests_compute_large.yaml b/release/nightly_tests/stress_tests/stress_tests_compute_large.yaml index 3784bac975be..53aa4e77c3d6 100644 --- a/release/nightly_tests/stress_tests/stress_tests_compute_large.yaml +++ b/release/nightly_tests/stress_tests/stress_tests_compute_large.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 6 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: @@ 
-12,13 +12,13 @@ aws: head_node_type: name: head_node - instance_type: m4.16xlarge + instance_type: m6i.16xlarge resources: cpu: 64 worker_node_types: - name: worker_node - instance_type: m4.16xlarge + instance_type: m6i.16xlarge min_workers: 6 max_workers: 6 use_spot: false diff --git a/release/perf_metrics/metadata.json b/release/perf_metrics/metadata.json index 2ef9c5cd543e..d4a423494576 100644 --- a/release/perf_metrics/metadata.json +++ b/release/perf_metrics/metadata.json @@ -1 +1 @@ -{"release_version": "2.39.0"} \ No newline at end of file +{"release_version": "2.39.0"} diff --git a/release/ray_release/byod/requirements_byod_3.9.txt b/release/ray_release/byod/requirements_byod_3.9.txt index adff2b611647..f1dcf9ee13a8 100644 --- a/release/ray_release/byod/requirements_byod_3.9.txt +++ b/release/ray_release/byod/requirements_byod_3.9.txt @@ -144,6 +144,7 @@ ale-py==0.10.1 \ # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_byod_3.9.in + # gymnasium annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d @@ -574,33 +575,39 @@ crcmod==1.7 \ # via # -c release/ray_release/byod/requirements_compiled.txt # gsutil -cryptography==38.0.1 \ - --hash=sha256:0297ffc478bdd237f5ca3a7dc96fc0d315670bfa099c04dc3a4a2172008a405a \ - --hash=sha256:10d1f29d6292fc95acb597bacefd5b9e812099d75a6469004fd38ba5471a977f \ - --hash=sha256:16fa61e7481f4b77ef53991075de29fc5bacb582a1244046d2e8b4bb72ef66d0 \ - --hash=sha256:194044c6b89a2f9f169df475cc167f6157eb9151cc69af8a2a163481d45cc407 \ - --hash=sha256:1db3d807a14931fa317f96435695d9ec386be7b84b618cc61cfa5d08b0ae33d7 \ - --hash=sha256:3261725c0ef84e7592597606f6583385fed2a5ec3909f43bc475ade9729a41d6 \ - --hash=sha256:3b72c360427889b40f36dc214630e688c2fe03e16c162ef0aa41da7ab1455153 \ - --hash=sha256:3e3a2599e640927089f932295a9a247fc40a5bdf69b0484532f530471a382750 \ - --hash=sha256:3fc26e22840b77326a764ceb5f02ca2d342305fba08f002a8c1f139540cdfaad \ - --hash=sha256:5067ee7f2bce36b11d0e334abcd1ccf8c541fc0bbdaf57cdd511fdee53e879b6 \ - --hash=sha256:52e7bee800ec869b4031093875279f1ff2ed12c1e2f74923e8f49c916afd1d3b \ - --hash=sha256:64760ba5331e3f1794d0bcaabc0d0c39e8c60bf67d09c93dc0e54189dfd7cfe5 \ - --hash=sha256:765fa194a0f3372d83005ab83ab35d7c5526c4e22951e46059b8ac678b44fa5a \ - --hash=sha256:79473cf8a5cbc471979bd9378c9f425384980fcf2ab6534b18ed7d0d9843987d \ - --hash=sha256:896dd3a66959d3a5ddcfc140a53391f69ff1e8f25d93f0e2e7830c6de90ceb9d \ - --hash=sha256:89ed49784ba88c221756ff4d4755dbc03b3c8d2c5103f6d6b4f83a0fb1e85294 \ - --hash=sha256:ac7e48f7e7261207d750fa7e55eac2d45f720027d5703cd9007e9b37bbb59ac0 \ - --hash=sha256:ad7353f6ddf285aeadfaf79e5a6829110106ff8189391704c1d8801aa0bae45a \ - --hash=sha256:b0163a849b6f315bf52815e238bc2b2346604413fa7c1601eea84bcddb5fb9ac \ - --hash=sha256:b6c9b706316d7b5a137c35e14f4103e2115b088c412140fdbd5f87c73284df61 \ - --hash=sha256:c2e5856248a416767322c8668ef1845ad46ee62629266f84a8f007a317141013 \ - --hash=sha256:ca9f6784ea96b55ff41708b92c3f6aeaebde4c560308e5fbbd3173fbc466e94e \ - --hash=sha256:d1a5bd52d684e49a36582193e0b89ff267704cd4025abefb9e26803adeb3e5fb \ - --hash=sha256:d3971e2749a723e9084dd507584e2a2761f78ad2c638aa31e80bc7a15c9db4f9 \ - --hash=sha256:d4ef6cc305394ed669d4d9eebf10d3a101059bdcf2669c366ec1d14e4fb227bd \ - --hash=sha256:d9e69ae01f99abe6ad646947bba8941e896cb3aa805be2597a0400e0764b5818 +cryptography==42.0.5 \ + 
--hash=sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee \ + --hash=sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576 \ + --hash=sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d \ + --hash=sha256:1b95b98b0d2af784078fa69f637135e3c317091b615cd0905f8b8a087e86fa30 \ + --hash=sha256:1f71c10d1e88467126f0efd484bd44bca5e14c664ec2ede64c32f20875c0d413 \ + --hash=sha256:2424ff4c4ac7f6b8177b53c17ed5d8fa74ae5955656867f5a8affaca36a27abb \ + --hash=sha256:2bce03af1ce5a5567ab89bd90d11e7bbdff56b8af3acbbec1faded8f44cb06da \ + --hash=sha256:329906dcc7b20ff3cad13c069a78124ed8247adcac44b10bea1130e36caae0b4 \ + --hash=sha256:37dd623507659e08be98eec89323469e8c7b4c1407c85112634ae3dbdb926fdd \ + --hash=sha256:3eaafe47ec0d0ffcc9349e1708be2aaea4c6dd4978d76bf6eb0cb2c13636c6fc \ + --hash=sha256:5e6275c09d2badf57aea3afa80d975444f4be8d3bc58f7f80d2a484c6f9485c8 \ + --hash=sha256:6fe07eec95dfd477eb9530aef5bead34fec819b3aaf6c5bd6d20565da607bfe1 \ + --hash=sha256:7367d7b2eca6513681127ebad53b2582911d1736dc2ffc19f2c3ae49997496bc \ + --hash=sha256:7cde5f38e614f55e28d831754e8a3bacf9ace5d1566235e39d91b35502d6936e \ + --hash=sha256:9481ffe3cf013b71b2428b905c4f7a9a4f76ec03065b05ff499bb5682a8d9ad8 \ + --hash=sha256:98d8dc6d012b82287f2c3d26ce1d2dd130ec200c8679b6213b3c73c08b2b7940 \ + --hash=sha256:a011a644f6d7d03736214d38832e030d8268bcff4a41f728e6030325fea3e400 \ + --hash=sha256:a2913c5375154b6ef2e91c10b5720ea6e21007412f6437504ffea2109b5a33d7 \ + --hash=sha256:a30596bae9403a342c978fb47d9b0ee277699fa53bbafad14706af51fe543d16 \ + --hash=sha256:b03c2ae5d2f0fc05f9a2c0c997e1bc18c8229f392234e8a0194f202169ccd278 \ + --hash=sha256:b6cd2203306b63e41acdf39aa93b86fb566049aeb6dc489b70e34bcd07adca74 \ + --hash=sha256:b7ffe927ee6531c78f81aa17e684e2ff617daeba7f189f911065b2ea2d526dec \ + --hash=sha256:b8cac287fafc4ad485b8a9b67d0ee80c66bf3574f655d3b97ef2e1082360faf1 \ + --hash=sha256:ba334e6e4b1d92442b75ddacc615c5476d4ad55cc29b15d590cc6b86efa487e2 \ + --hash=sha256:ba3e4a42397c25b7ff88cdec6e2a16c2be18720f317506ee25210f6d31925f9c \ + --hash=sha256:c41fb5e6a5fe9ebcd58ca3abfeb51dffb5d83d6775405305bfa8715b76521922 \ + --hash=sha256:cd2030f6650c089aeb304cf093f3244d34745ce0cfcc39f20c6fbfe030102e2a \ + --hash=sha256:cd65d75953847815962c84a4654a84850b2bb4aed3f26fadcc1c13892e1e29f6 \ + --hash=sha256:e4985a790f921508f36f81831817cbc03b102d643b5fcb81cd33df3fa291a1a1 \ + --hash=sha256:e807b3188f9eb0eaa7bbb579b462c5ace579f1cedb28107ce8b48a9f7ad3679e \ + --hash=sha256:f12764b8fffc7a123f641d7d049d382b73f96a34117e0b637b80643169cec8ac \ + --hash=sha256:f8837fe1d6ac4a8052a9a8ddab256bc006242696f03368a4009be7ee3075cdb7 # via # -c release/ray_release/byod/requirements_compiled.txt # pyopenssl @@ -650,9 +657,9 @@ cython==0.29.37 \ # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_byod_3.9.in -dask[complete]==2022.10.1 ; python_version < "3.12" \ - --hash=sha256:2e6765bb6011c97c59fd4792540df679c703100443fcd99c82b98d8697295822 \ - --hash=sha256:79d283326045700af0de7e2be57fd663499958c63638bf5076839cbcde64aa3f +dask[complete]==2022.10.2 ; python_version < "3.12" \ + --hash=sha256:42cb43f601709575fa46ce09e74bea83fdd464187024f56954e09d9b428ceaab \ + --hash=sha256:928003a97b890a14c8a09a01f15320d261053bda530a8bf191d84f33db4a63b8 # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_byod_3.9.in @@ -673,9 +680,9 @@ diskcache==5.6.3 \ --hash=sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc 
\ --hash=sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19 # via petastorm -distributed==2022.10.1 ; python_version < "3.12" \ - --hash=sha256:31abab8ecc76951875828a3689d47dc4f20226b3ec99a0dc1af6183d02dbe5fe \ - --hash=sha256:42c6fe7d3bea491e23ce020879c411f2ecfecdb4914a6cb6b4a63530a7b0fa70 +distributed==2022.10.2 ; python_version < "3.12" \ + --hash=sha256:53f0a5bf6efab9a5ab3345cd913f6d3f3d4ea444ee2edbea331c7fef96fd67d0 \ + --hash=sha256:ae4fffdb55c6cb510ba1cbdf2856563af80ebf93e5ceacb91c1ce79e7da108d8 # via # -c release/ray_release/byod/requirements_compiled.txt # dask @@ -1264,7 +1271,7 @@ gsutil==5.27 \ # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_byod_3.9.in -gymnasium==1.0.0 \ +gymnasium[atari]==1.0.0 \ --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad # via @@ -2203,9 +2210,9 @@ pygments==2.18.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # rich -pyopenssl==23.0.0 \ - --hash=sha256:c1cc5f86bcacefc84dada7d31175cae1b1518d5f60d3d0bb595a67822a868a6f \ - --hash=sha256:df5fc28af899e74e19fccb5510df423581047e10ab6f1f4ba1763ff5fde844c0 +pyopenssl==24.2.1 \ + --hash=sha256:4247f0dbe3748d560dcbb2ff3ea01af0f9a1a001ef5f7c4c647956ed8cbf0e95 \ + --hash=sha256:967d5719b12b243588573f39b0c677637145c7a1ffedcd495a487e58177fbb8d # via # -c release/ray_release/byod/requirements_compiled.txt # gcs-oauth2-boto-plugin diff --git a/release/ray_release/byod/requirements_ml_byod_3.9.in b/release/ray_release/byod/requirements_ml_byod_3.9.in index 7ef915a8a698..69ca2cc2c734 100644 --- a/release/ray_release/byod/requirements_ml_byod_3.9.in +++ b/release/ray_release/byod/requirements_ml_byod_3.9.in @@ -6,7 +6,6 @@ bitsandbytes boto3 cmake crc32c -dataset datasets decord deepspeed>=0.12.3 diff --git a/release/ray_release/byod/requirements_ml_byod_3.9.txt b/release/ray_release/byod/requirements_ml_byod_3.9.txt index 7eeec76c2886..8d9e5a044f47 100644 --- a/release/ray_release/byod/requirements_ml_byod_3.9.txt +++ b/release/ray_release/byod/requirements_ml_byod_3.9.txt @@ -1,4 +1,3 @@ - # # This file is autogenerated by pip-compile with python 3.9 # To update, run: @@ -118,12 +117,6 @@ aiosignal==1.3.1 \ # via # -c release/ray_release/byod/requirements_compiled.txt # aiohttp -alembic==1.12.1 \ - --hash=sha256:47d52e3dfb03666ed945becb723d6482e52190917fdb47071440cfdba05d92cb \ - --hash=sha256:bca5877e9678b454706347bc10b97cb7d67f300320fa5c3a94423e8266e2823f - # via - # -c release/ray_release/byod/requirements_compiled.txt - # dataset annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d @@ -175,17 +168,12 @@ attrs==21.4.0 \ # aiohttp # jsonlines # jsonschema - # markdown-it-py backcall==0.2.0 \ --hash=sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e \ --hash=sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255 # via # -c release/ray_release/byod/requirements_compiled.txt # ipython -banal==1.0.6 \ - --hash=sha256:2fe02c9305f53168441948f4a03dfbfa2eacc73db30db4a93309083cb0e250a5 \ - --hash=sha256:877aacb16b17f8fa4fd29a7c44515c5a23dc1a7b26078bc41dd34829117d85e1 - # via dataset bitsandbytes==0.43.1 \ --hash=sha256:52c1c7189a6ca006555a9663e544e75f40520a97a26e075411f9f9aca0771fcd \ 
--hash=sha256:a81c826d576d6d691c7b4a7491c8fdc0f37f769795d6ca2e54afa605d2c260a3 @@ -510,12 +498,6 @@ comm==0.2.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # ipywidgets -commonmark==0.9.1 \ - --hash=sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60 \ - --hash=sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9 - # via - # -c release/ray_release/byod/requirements_compiled.txt - # rich configargparse==1.7 \ --hash=sha256:d249da6591465c6c26df64a9f73d2536e743be2f244eb3ebe61114af2f94f86b \ --hash=sha256:e7067471884de5478c58a511e529f0f9bd1c66bfef1dea90935438d6c23306d1 @@ -652,33 +634,39 @@ crcmod==1.7 \ # via # -c release/ray_release/byod/requirements_compiled.txt # gsutil -cryptography==38.0.1 \ - --hash=sha256:0297ffc478bdd237f5ca3a7dc96fc0d315670bfa099c04dc3a4a2172008a405a \ - --hash=sha256:10d1f29d6292fc95acb597bacefd5b9e812099d75a6469004fd38ba5471a977f \ - --hash=sha256:16fa61e7481f4b77ef53991075de29fc5bacb582a1244046d2e8b4bb72ef66d0 \ - --hash=sha256:194044c6b89a2f9f169df475cc167f6157eb9151cc69af8a2a163481d45cc407 \ - --hash=sha256:1db3d807a14931fa317f96435695d9ec386be7b84b618cc61cfa5d08b0ae33d7 \ - --hash=sha256:3261725c0ef84e7592597606f6583385fed2a5ec3909f43bc475ade9729a41d6 \ - --hash=sha256:3b72c360427889b40f36dc214630e688c2fe03e16c162ef0aa41da7ab1455153 \ - --hash=sha256:3e3a2599e640927089f932295a9a247fc40a5bdf69b0484532f530471a382750 \ - --hash=sha256:3fc26e22840b77326a764ceb5f02ca2d342305fba08f002a8c1f139540cdfaad \ - --hash=sha256:5067ee7f2bce36b11d0e334abcd1ccf8c541fc0bbdaf57cdd511fdee53e879b6 \ - --hash=sha256:52e7bee800ec869b4031093875279f1ff2ed12c1e2f74923e8f49c916afd1d3b \ - --hash=sha256:64760ba5331e3f1794d0bcaabc0d0c39e8c60bf67d09c93dc0e54189dfd7cfe5 \ - --hash=sha256:765fa194a0f3372d83005ab83ab35d7c5526c4e22951e46059b8ac678b44fa5a \ - --hash=sha256:79473cf8a5cbc471979bd9378c9f425384980fcf2ab6534b18ed7d0d9843987d \ - --hash=sha256:896dd3a66959d3a5ddcfc140a53391f69ff1e8f25d93f0e2e7830c6de90ceb9d \ - --hash=sha256:89ed49784ba88c221756ff4d4755dbc03b3c8d2c5103f6d6b4f83a0fb1e85294 \ - --hash=sha256:ac7e48f7e7261207d750fa7e55eac2d45f720027d5703cd9007e9b37bbb59ac0 \ - --hash=sha256:ad7353f6ddf285aeadfaf79e5a6829110106ff8189391704c1d8801aa0bae45a \ - --hash=sha256:b0163a849b6f315bf52815e238bc2b2346604413fa7c1601eea84bcddb5fb9ac \ - --hash=sha256:b6c9b706316d7b5a137c35e14f4103e2115b088c412140fdbd5f87c73284df61 \ - --hash=sha256:c2e5856248a416767322c8668ef1845ad46ee62629266f84a8f007a317141013 \ - --hash=sha256:ca9f6784ea96b55ff41708b92c3f6aeaebde4c560308e5fbbd3173fbc466e94e \ - --hash=sha256:d1a5bd52d684e49a36582193e0b89ff267704cd4025abefb9e26803adeb3e5fb \ - --hash=sha256:d3971e2749a723e9084dd507584e2a2761f78ad2c638aa31e80bc7a15c9db4f9 \ - --hash=sha256:d4ef6cc305394ed669d4d9eebf10d3a101059bdcf2669c366ec1d14e4fb227bd \ - --hash=sha256:d9e69ae01f99abe6ad646947bba8941e896cb3aa805be2597a0400e0764b5818 +cryptography==42.0.5 \ + --hash=sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee \ + --hash=sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576 \ + --hash=sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d \ + --hash=sha256:1b95b98b0d2af784078fa69f637135e3c317091b615cd0905f8b8a087e86fa30 \ + --hash=sha256:1f71c10d1e88467126f0efd484bd44bca5e14c664ec2ede64c32f20875c0d413 \ + --hash=sha256:2424ff4c4ac7f6b8177b53c17ed5d8fa74ae5955656867f5a8affaca36a27abb \ + --hash=sha256:2bce03af1ce5a5567ab89bd90d11e7bbdff56b8af3acbbec1faded8f44cb06da \ + 
--hash=sha256:329906dcc7b20ff3cad13c069a78124ed8247adcac44b10bea1130e36caae0b4 \ + --hash=sha256:37dd623507659e08be98eec89323469e8c7b4c1407c85112634ae3dbdb926fdd \ + --hash=sha256:3eaafe47ec0d0ffcc9349e1708be2aaea4c6dd4978d76bf6eb0cb2c13636c6fc \ + --hash=sha256:5e6275c09d2badf57aea3afa80d975444f4be8d3bc58f7f80d2a484c6f9485c8 \ + --hash=sha256:6fe07eec95dfd477eb9530aef5bead34fec819b3aaf6c5bd6d20565da607bfe1 \ + --hash=sha256:7367d7b2eca6513681127ebad53b2582911d1736dc2ffc19f2c3ae49997496bc \ + --hash=sha256:7cde5f38e614f55e28d831754e8a3bacf9ace5d1566235e39d91b35502d6936e \ + --hash=sha256:9481ffe3cf013b71b2428b905c4f7a9a4f76ec03065b05ff499bb5682a8d9ad8 \ + --hash=sha256:98d8dc6d012b82287f2c3d26ce1d2dd130ec200c8679b6213b3c73c08b2b7940 \ + --hash=sha256:a011a644f6d7d03736214d38832e030d8268bcff4a41f728e6030325fea3e400 \ + --hash=sha256:a2913c5375154b6ef2e91c10b5720ea6e21007412f6437504ffea2109b5a33d7 \ + --hash=sha256:a30596bae9403a342c978fb47d9b0ee277699fa53bbafad14706af51fe543d16 \ + --hash=sha256:b03c2ae5d2f0fc05f9a2c0c997e1bc18c8229f392234e8a0194f202169ccd278 \ + --hash=sha256:b6cd2203306b63e41acdf39aa93b86fb566049aeb6dc489b70e34bcd07adca74 \ + --hash=sha256:b7ffe927ee6531c78f81aa17e684e2ff617daeba7f189f911065b2ea2d526dec \ + --hash=sha256:b8cac287fafc4ad485b8a9b67d0ee80c66bf3574f655d3b97ef2e1082360faf1 \ + --hash=sha256:ba334e6e4b1d92442b75ddacc615c5476d4ad55cc29b15d590cc6b86efa487e2 \ + --hash=sha256:ba3e4a42397c25b7ff88cdec6e2a16c2be18720f317506ee25210f6d31925f9c \ + --hash=sha256:c41fb5e6a5fe9ebcd58ca3abfeb51dffb5d83d6775405305bfa8715b76521922 \ + --hash=sha256:cd2030f6650c089aeb304cf093f3244d34745ce0cfcc39f20c6fbfe030102e2a \ + --hash=sha256:cd65d75953847815962c84a4654a84850b2bb4aed3f26fadcc1c13892e1e29f6 \ + --hash=sha256:e4985a790f921508f36f81831817cbc03b102d643b5fcb81cd33df3fa291a1a1 \ + --hash=sha256:e807b3188f9eb0eaa7bbb579b462c5ace579f1cedb28107ce8b48a9f7ad3679e \ + --hash=sha256:f12764b8fffc7a123f641d7d049d382b73f96a34117e0b637b80643169cec8ac \ + --hash=sha256:f8837fe1d6ac4a8052a9a8ddab256bc006242696f03368a4009be7ee3075cdb7 # via # -c release/ray_release/byod/requirements_compiled.txt # pyopenssl @@ -694,10 +682,6 @@ dataproperty==1.0.1 \ # via # pytablewriter # tabledata -dataset==1.6.2 \ - --hash=sha256:77d362118f67a8cbb4848dbd30ab362b9fa7cfebdbfaf426c9c500cb38969a99 \ - --hash=sha256:dcca9ba7658473d3082b1adf87a650252a1cd665705b73fa7d4ee32116a107b9 - # via -r release/ray_release/byod/requirements_ml_byod_3.9.in datasets==2.14.0 \ --hash=sha256:1bb3d1c992a593949a8d3e445b358ac1db4ead00e6619ea2e5e7b6dfc222dde1 \ --hash=sha256:93081cc3d9d0ce860c81f950a3ba23d24704da2eacbe2722092ef4f6ae0ada96 @@ -867,68 +851,84 @@ fonttools==4.45.1 \ # via # -c release/ray_release/byod/requirements_compiled.txt # matplotlib -frozenlist==1.4.0 \ - --hash=sha256:007df07a6e3eb3e33e9a1fe6a9db7af152bbd8a185f9aaa6ece10a3529e3e1c6 \ - --hash=sha256:008eb8b31b3ea6896da16c38c1b136cb9fec9e249e77f6211d479db79a4eaf01 \ - --hash=sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251 \ - --hash=sha256:0c7c1b47859ee2cac3846fde1c1dc0f15da6cec5a0e5c72d101e0f83dcb67ff9 \ - --hash=sha256:0e5c8764c7829343d919cc2dfc587a8db01c4f70a4ebbc49abde5d4b158b007b \ - --hash=sha256:10ff5faaa22786315ef57097a279b833ecab1a0bfb07d604c9cbb1c4cdc2ed87 \ - --hash=sha256:17ae5cd0f333f94f2e03aaf140bb762c64783935cc764ff9c82dff626089bebf \ - --hash=sha256:19488c57c12d4e8095a922f328df3f179c820c212940a498623ed39160bc3c2f \ - --hash=sha256:1a0848b52815006ea6596c395f87449f693dc419061cc21e970f139d466dc0a0 \ - 
--hash=sha256:1e78fb68cf9c1a6aa4a9a12e960a5c9dfbdb89b3695197aa7064705662515de2 \ - --hash=sha256:261b9f5d17cac914531331ff1b1d452125bf5daa05faf73b71d935485b0c510b \ - --hash=sha256:2b8bcf994563466db019fab287ff390fffbfdb4f905fc77bc1c1d604b1c689cc \ - --hash=sha256:38461d02d66de17455072c9ba981d35f1d2a73024bee7790ac2f9e361ef1cd0c \ - --hash=sha256:490132667476f6781b4c9458298b0c1cddf237488abd228b0b3650e5ecba7467 \ - --hash=sha256:491e014f5c43656da08958808588cc6c016847b4360e327a62cb308c791bd2d9 \ - --hash=sha256:515e1abc578dd3b275d6a5114030b1330ba044ffba03f94091842852f806f1c1 \ - --hash=sha256:556de4430ce324c836789fa4560ca62d1591d2538b8ceb0b4f68fb7b2384a27a \ - --hash=sha256:5833593c25ac59ede40ed4de6d67eb42928cca97f26feea219f21d0ed0959b79 \ - --hash=sha256:6221d84d463fb110bdd7619b69cb43878a11d51cbb9394ae3105d082d5199167 \ - --hash=sha256:6918d49b1f90821e93069682c06ffde41829c346c66b721e65a5c62b4bab0300 \ - --hash=sha256:6c38721585f285203e4b4132a352eb3daa19121a035f3182e08e437cface44bf \ - --hash=sha256:71932b597f9895f011f47f17d6428252fc728ba2ae6024e13c3398a087c2cdea \ - --hash=sha256:7211ef110a9194b6042449431e08c4d80c0481e5891e58d429df5899690511c2 \ - --hash=sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab \ - --hash=sha256:7645a8e814a3ee34a89c4a372011dcd817964ce8cb273c8ed6119d706e9613e3 \ - --hash=sha256:76d4711f6f6d08551a7e9ef28c722f4a50dd0fc204c56b4bcd95c6cc05ce6fbb \ - --hash=sha256:7f4f399d28478d1f604c2ff9119907af9726aed73680e5ed1ca634d377abb087 \ - --hash=sha256:88f7bc0fcca81f985f78dd0fa68d2c75abf8272b1f5c323ea4a01a4d7a614efc \ - --hash=sha256:8d0edd6b1c7fb94922bf569c9b092ee187a83f03fb1a63076e7774b60f9481a8 \ - --hash=sha256:901289d524fdd571be1c7be054f48b1f88ce8dddcbdf1ec698b27d4b8b9e5d62 \ - --hash=sha256:93ea75c050c5bb3d98016b4ba2497851eadf0ac154d88a67d7a6816206f6fa7f \ - --hash=sha256:981b9ab5a0a3178ff413bca62526bb784249421c24ad7381e39d67981be2c326 \ - --hash=sha256:9ac08e601308e41eb533f232dbf6b7e4cea762f9f84f6357136eed926c15d12c \ - --hash=sha256:a02eb8ab2b8f200179b5f62b59757685ae9987996ae549ccf30f983f40602431 \ - --hash=sha256:a0c6da9aee33ff0b1a451e867da0c1f47408112b3391dd43133838339e410963 \ - --hash=sha256:a6c8097e01886188e5be3e6b14e94ab365f384736aa1fca6a0b9e35bd4a30bc7 \ - --hash=sha256:aa384489fefeb62321b238e64c07ef48398fe80f9e1e6afeff22e140e0850eef \ - --hash=sha256:ad2a9eb6d9839ae241701d0918f54c51365a51407fd80f6b8289e2dfca977cc3 \ - --hash=sha256:b206646d176a007466358aa21d85cd8600a415c67c9bd15403336c331a10d956 \ - --hash=sha256:b826d97e4276750beca7c8f0f1a4938892697a6bcd8ec8217b3312dad6982781 \ - --hash=sha256:b89ac9768b82205936771f8d2eb3ce88503b1556324c9f903e7156669f521472 \ - --hash=sha256:bd7bd3b3830247580de99c99ea2a01416dfc3c34471ca1298bccabf86d0ff4dc \ - --hash=sha256:bdf1847068c362f16b353163391210269e4f0569a3c166bc6a9f74ccbfc7e839 \ - --hash=sha256:c11b0746f5d946fecf750428a95f3e9ebe792c1ee3b1e96eeba145dc631a9672 \ - --hash=sha256:c5374b80521d3d3f2ec5572e05adc94601985cc526fb276d0c8574a6d749f1b3 \ - --hash=sha256:ca265542ca427bf97aed183c1676e2a9c66942e822b14dc6e5f42e038f92a503 \ - --hash=sha256:ce31ae3e19f3c902de379cf1323d90c649425b86de7bbdf82871b8a2a0615f3d \ - --hash=sha256:ceb6ec0a10c65540421e20ebd29083c50e6d1143278746a4ef6bcf6153171eb8 \ - --hash=sha256:d081f13b095d74b67d550de04df1c756831f3b83dc9881c38985834387487f1b \ - --hash=sha256:d5655a942f5f5d2c9ed93d72148226d75369b4f6952680211972a33e59b1dfdc \ - --hash=sha256:d5a32087d720c608f42caed0ef36d2b3ea61a9d09ee59a5142d6070da9041b8f \ - 
--hash=sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559 \ - --hash=sha256:dd65632acaf0d47608190a71bfe46b209719bf2beb59507db08ccdbe712f969b \ - --hash=sha256:de343e75f40e972bae1ef6090267f8260c1446a1695e77096db6cfa25e759a95 \ - --hash=sha256:e29cda763f752553fa14c68fb2195150bfab22b352572cb36c43c47bedba70eb \ - --hash=sha256:e41f3de4df3e80de75845d3e743b3f1c4c8613c3997a912dbf0229fc61a8b963 \ - --hash=sha256:e66d2a64d44d50d2543405fb183a21f76b3b5fd16f130f5c99187c3fb4e64919 \ - --hash=sha256:e74b0506fa5aa5598ac6a975a12aa8928cbb58e1f5ac8360792ef15de1aa848f \ - --hash=sha256:f0ed05f5079c708fe74bf9027e95125334b6978bf07fd5ab923e9e55e5fbb9d3 \ - --hash=sha256:f61e2dc5ad442c52b4887f1fdc112f97caeff4d9e6ebe78879364ac59f1663e1 \ - --hash=sha256:fec520865f42e5c7f050c2a79038897b1c7d1595e907a9e08e3353293ffc948e +frozenlist==1.4.1 \ + --hash=sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7 \ + --hash=sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98 \ + --hash=sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad \ + --hash=sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5 \ + --hash=sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae \ + --hash=sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e \ + --hash=sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a \ + --hash=sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701 \ + --hash=sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d \ + --hash=sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6 \ + --hash=sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6 \ + --hash=sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106 \ + --hash=sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75 \ + --hash=sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868 \ + --hash=sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a \ + --hash=sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0 \ + --hash=sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1 \ + --hash=sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826 \ + --hash=sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec \ + --hash=sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6 \ + --hash=sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950 \ + --hash=sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19 \ + --hash=sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0 \ + --hash=sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8 \ + --hash=sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a \ + --hash=sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09 \ + --hash=sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86 \ + --hash=sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c \ + --hash=sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5 \ + --hash=sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b \ + --hash=sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b \ + --hash=sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d \ + 
--hash=sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0 \ + --hash=sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea \ + --hash=sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776 \ + --hash=sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a \ + --hash=sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897 \ + --hash=sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7 \ + --hash=sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09 \ + --hash=sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9 \ + --hash=sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe \ + --hash=sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd \ + --hash=sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742 \ + --hash=sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09 \ + --hash=sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0 \ + --hash=sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932 \ + --hash=sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1 \ + --hash=sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a \ + --hash=sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49 \ + --hash=sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d \ + --hash=sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7 \ + --hash=sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480 \ + --hash=sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89 \ + --hash=sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e \ + --hash=sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b \ + --hash=sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82 \ + --hash=sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb \ + --hash=sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068 \ + --hash=sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8 \ + --hash=sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b \ + --hash=sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb \ + --hash=sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2 \ + --hash=sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11 \ + --hash=sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b \ + --hash=sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc \ + --hash=sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0 \ + --hash=sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497 \ + --hash=sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17 \ + --hash=sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0 \ + --hash=sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2 \ + --hash=sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439 \ + --hash=sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5 \ + --hash=sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac \ + --hash=sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825 \ + 
--hash=sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887 \ + --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ + --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 # via # -c release/ray_release/byod/requirements_compiled.txt # aiohttp @@ -964,8 +964,9 @@ fugue-sql-antlr==0.2.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # fugue -future==0.18.3 \ - --hash=sha256:34a17436ed1e96697a86f9de3d15a3b0be01d8bc8de9c1dffd59fb8234ed5307 +future==1.0.0 \ + --hash=sha256:929292d34f5872e70396626ef385ec22355a1fae8ad29e1a734c3e43f9fbc216 \ + --hash=sha256:bd2968309307861edae1458a4f8a4f3598c03be43b97521076aebf5d94c07b05 # via # -c release/ray_release/byod/requirements_compiled.txt # petastorm @@ -1295,7 +1296,6 @@ greenlet==3.0.1 \ # via # -c release/ray_release/byod/requirements_compiled.txt # gevent - # sqlalchemy gsutil==5.27 \ --hash=sha256:681a2d844acdf05fac989da6dd406944ae11cb27a4cf3c9edef74d2585ab5f05 # via @@ -1431,9 +1431,9 @@ jupyterlab-widgets==3.0.11 \ # via # -c release/ray_release/byod/requirements_compiled.txt # ipywidgets -jupytext==1.13.6 \ - --hash=sha256:2160774e30587fb427213231f0267ed070ba4ede41cf6121dbb2b14225eb83ba \ - --hash=sha256:c6c25918ddb6403d0d8504e08d35f6efc447baf0dbeb6a28b73adf39e866a0c4 +jupytext==1.16.3 \ + --hash=sha256:1ebac990461dd9f477ff7feec9e3003fa1acc89f3c16ba01b73f79fd76f01a98 \ + --hash=sha256:870e0d7a716dcb1303df6ad1cec65e3315a20daedd808a55cb3dae2d56e4ed20 # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in @@ -1680,19 +1680,14 @@ lxml==4.9.4 \ # via # -c release/ray_release/byod/requirements_compiled.txt # sacrebleu -mako==1.3.0 \ - --hash=sha256:57d4e997349f1a92035aa25c17ace371a4213f2ca42f99bee9a602500cfd54d9 \ - --hash=sha256:e3a9d388fd00e87043edbe8792f45880ac0114e9c4adc69f6e9bfb2c55e3b11b - # via - # -c release/ray_release/byod/requirements_compiled.txt - # alembic -markdown-it-py==1.1.0 \ - --hash=sha256:36be6bb3ad987bfdb839f5ba78ddf094552ca38ccbd784ae4f74a4e1419fc6e3 \ - --hash=sha256:98080fc0bc34c4f2bcf0846a096a9429acbd9d5d8e67ed34026c03c61c464389 +markdown-it-py==2.2.0 \ + --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ + --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 # via # -c release/ray_release/byod/requirements_compiled.txt # jupytext # mdit-py-plugins + # rich markupsafe==2.1.3 \ --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ --hash=sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e \ @@ -1757,7 +1752,6 @@ markupsafe==2.1.3 \ # via # -c release/ray_release/byod/requirements_compiled.txt # jinja2 - # mako # werkzeug matplotlib==3.7.4 \ --hash=sha256:0037d066cca1f4bda626c507cddeb6f7da8283bc6a214da2db13ff2162933c52 \ @@ -1829,6 +1823,12 @@ mdit-py-plugins==0.3.5 \ # via # -c release/ray_release/byod/requirements_compiled.txt # jupytext +mdurl==0.1.2 \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via + # -c release/ray_release/byod/requirements_compiled.txt + # markdown-it-py memray==1.10.0 ; platform_system != "Windows" and sys_platform != "darwin" and platform_machine != "aarch64" \ --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ --hash=sha256:22f2a47871c172a0539bd72737bb6b294fc10c510464066b825d90fcd3bb4916 \ @@ 
-1868,7 +1868,7 @@ memray==1.10.0 ; platform_system != "Windows" and sys_platform != "darwin" and p # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in -modin==0.22.2 \ +modin==0.22.2 ; python_version < "3.12" \ --hash=sha256:532fe0bfb2dcf06c0ad2d467721ef489fd58bb3ef7150bcf4a7ddd1069be1e4d \ --hash=sha256:fa897dc59d5b9a8496be044185689fdd337b9f26cc81c4144b217a2a94d029bc # via @@ -1952,81 +1952,97 @@ msgpack==1.0.7 \ # via # -c release/ray_release/byod/requirements_compiled.txt # locust -multidict==6.0.4 \ - --hash=sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9 \ - --hash=sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8 \ - --hash=sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03 \ - --hash=sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710 \ - --hash=sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161 \ - --hash=sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664 \ - --hash=sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569 \ - --hash=sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067 \ - --hash=sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313 \ - --hash=sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706 \ - --hash=sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2 \ - --hash=sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636 \ - --hash=sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49 \ - --hash=sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93 \ - --hash=sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603 \ - --hash=sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0 \ - --hash=sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60 \ - --hash=sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4 \ - --hash=sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e \ - --hash=sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1 \ - --hash=sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60 \ - --hash=sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951 \ - --hash=sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc \ - --hash=sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe \ - --hash=sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95 \ - --hash=sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d \ - --hash=sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8 \ - --hash=sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed \ - --hash=sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2 \ - --hash=sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775 \ - --hash=sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87 \ - --hash=sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c \ - --hash=sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2 \ - --hash=sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98 \ - --hash=sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3 \ - 
--hash=sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe \ - --hash=sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78 \ - --hash=sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660 \ - --hash=sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176 \ - --hash=sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e \ - --hash=sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988 \ - --hash=sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c \ - --hash=sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c \ - --hash=sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0 \ - --hash=sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449 \ - --hash=sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f \ - --hash=sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde \ - --hash=sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5 \ - --hash=sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d \ - --hash=sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac \ - --hash=sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a \ - --hash=sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9 \ - --hash=sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca \ - --hash=sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11 \ - --hash=sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35 \ - --hash=sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063 \ - --hash=sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b \ - --hash=sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982 \ - --hash=sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258 \ - --hash=sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1 \ - --hash=sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52 \ - --hash=sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480 \ - --hash=sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7 \ - --hash=sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461 \ - --hash=sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d \ - --hash=sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc \ - --hash=sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779 \ - --hash=sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a \ - --hash=sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547 \ - --hash=sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0 \ - --hash=sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171 \ - --hash=sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf \ - --hash=sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d \ - --hash=sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba +multidict==6.0.5 \ + --hash=sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556 \ + --hash=sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c \ + --hash=sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29 \ + 
--hash=sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b \ + --hash=sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8 \ + --hash=sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7 \ + --hash=sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd \ + --hash=sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40 \ + --hash=sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6 \ + --hash=sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3 \ + --hash=sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c \ + --hash=sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9 \ + --hash=sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5 \ + --hash=sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae \ + --hash=sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442 \ + --hash=sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9 \ + --hash=sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc \ + --hash=sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c \ + --hash=sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea \ + --hash=sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5 \ + --hash=sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50 \ + --hash=sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182 \ + --hash=sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453 \ + --hash=sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e \ + --hash=sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600 \ + --hash=sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733 \ + --hash=sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda \ + --hash=sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241 \ + --hash=sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461 \ + --hash=sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e \ + --hash=sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e \ + --hash=sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b \ + --hash=sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e \ + --hash=sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7 \ + --hash=sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386 \ + --hash=sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd \ + --hash=sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9 \ + --hash=sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf \ + --hash=sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee \ + --hash=sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5 \ + --hash=sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a \ + --hash=sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271 \ + --hash=sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54 \ + --hash=sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4 \ + --hash=sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496 \ + 
--hash=sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb \ + --hash=sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319 \ + --hash=sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3 \ + --hash=sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f \ + --hash=sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527 \ + --hash=sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed \ + --hash=sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604 \ + --hash=sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef \ + --hash=sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8 \ + --hash=sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5 \ + --hash=sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5 \ + --hash=sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626 \ + --hash=sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c \ + --hash=sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d \ + --hash=sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c \ + --hash=sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc \ + --hash=sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc \ + --hash=sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b \ + --hash=sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38 \ + --hash=sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450 \ + --hash=sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1 \ + --hash=sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f \ + --hash=sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3 \ + --hash=sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755 \ + --hash=sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226 \ + --hash=sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a \ + --hash=sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046 \ + --hash=sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf \ + --hash=sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479 \ + --hash=sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e \ + --hash=sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1 \ + --hash=sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a \ + --hash=sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83 \ + --hash=sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929 \ + --hash=sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93 \ + --hash=sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a \ + --hash=sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c \ + --hash=sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44 \ + --hash=sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89 \ + --hash=sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba \ + --hash=sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e \ + --hash=sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da \ + 
--hash=sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24 \ + --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ + --hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef # via # -c release/ray_release/byod/requirements_compiled.txt # aiohttp @@ -2219,6 +2235,12 @@ numpy==1.26.4 \ # triad # utilsforecast # xgboost +nvidia-nccl-cu12==2.20.5 \ + --hash=sha256:057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56 \ + --hash=sha256:1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01 + # via + # -c release/ray_release/byod/requirements_compiled.txt + # xgboost oauth2client==4.1.3 \ --hash=sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac \ --hash=sha256:d486741e451287f69568a4d26d70d9acd73a2bbfa275746c535b4209891cccc6 @@ -2250,6 +2272,7 @@ packaging==23.0 \ # evaluate # fugue-sql-antlr # huggingface-hub + # jupytext # lightning-utilities # matplotlib # modin @@ -2263,7 +2286,7 @@ packaging==23.0 \ # transformers # typepy # utilsforecast -pandas==1.5.3 \ +pandas==1.5.3 ; python_version < "3.12" \ --hash=sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813 \ --hash=sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792 \ --hash=sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406 \ @@ -2710,9 +2733,9 @@ pynvml==11.5.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # deepspeed -pyopenssl==23.0.0 \ - --hash=sha256:c1cc5f86bcacefc84dada7d31175cae1b1518d5f60d3d0bb595a67822a868a6f \ - --hash=sha256:df5fc28af899e74e19fccb5510df423581047e10ab6f1f4ba1763ff5fde844c0 +pyopenssl==24.2.1 \ + --hash=sha256:4247f0dbe3748d560dcbb2ff3ea01af0f9a1a001ef5f7c4c647956ed8cbf0e95 \ + --hash=sha256:967d5719b12b243588573f39b0c677637145c7a1ffedcd495a487e58177fbb8d # via # -c release/ray_release/byod/requirements_compiled.txt # gcs-oauth2-boto-plugin @@ -2963,95 +2986,86 @@ qpd==0.4.4 \ # via # -c release/ray_release/byod/requirements_compiled.txt # fugue -regex==2023.10.3 \ - --hash=sha256:00ba3c9818e33f1fa974693fb55d24cdc8ebafcb2e4207680669d8f8d7cca79a \ - --hash=sha256:00e871d83a45eee2f8688d7e6849609c2ca2a04a6d48fba3dff4deef35d14f07 \ - --hash=sha256:06e9abc0e4c9ab4779c74ad99c3fc10d3967d03114449acc2c2762ad4472b8ca \ - --hash=sha256:0b9ac09853b2a3e0d0082104036579809679e7715671cfbf89d83c1cb2a30f58 \ - --hash=sha256:0d47840dc05e0ba04fe2e26f15126de7c755496d5a8aae4a08bda4dd8d646c54 \ - --hash=sha256:0f649fa32fe734c4abdfd4edbb8381c74abf5f34bc0b3271ce687b23729299ed \ - --hash=sha256:107ac60d1bfdc3edb53be75e2a52aff7481b92817cfdddd9b4519ccf0e54a6ff \ - --hash=sha256:11175910f62b2b8c055f2b089e0fedd694fe2be3941b3e2633653bc51064c528 \ - --hash=sha256:12bd4bc2c632742c7ce20db48e0d99afdc05e03f0b4c1af90542e05b809a03d9 \ - --hash=sha256:16f8740eb6dbacc7113e3097b0a36065a02e37b47c936b551805d40340fb9971 \ - --hash=sha256:1c0e8fae5b27caa34177bdfa5a960c46ff2f78ee2d45c6db15ae3f64ecadde14 \ - --hash=sha256:2c54e23836650bdf2c18222c87f6f840d4943944146ca479858404fedeb9f9af \ - --hash=sha256:3367007ad1951fde612bf65b0dffc8fd681a4ab98ac86957d16491400d661302 \ - --hash=sha256:36362386b813fa6c9146da6149a001b7bd063dabc4d49522a1f7aa65b725c7ec \ - --hash=sha256:39807cbcbe406efca2a233884e169d056c35aa7e9f343d4e78665246a332f597 \ - --hash=sha256:39cdf8d141d6d44e8d5a12a8569d5a227f645c87df4f92179bd06e2e2705e76b \ - --hash=sha256:3b2c3502603fab52d7619b882c25a6850b766ebd1b18de3df23b2f939360e1bd \ - 
--hash=sha256:3ccf2716add72f80714b9a63899b67fa711b654be3fcdd34fa391d2d274ce767 \ - --hash=sha256:3fef4f844d2290ee0ba57addcec17eec9e3df73f10a2748485dfd6a3a188cc0f \ - --hash=sha256:4023e2efc35a30e66e938de5aef42b520c20e7eda7bb5fb12c35e5d09a4c43f6 \ - --hash=sha256:4a3ee019a9befe84fa3e917a2dd378807e423d013377a884c1970a3c2792d293 \ - --hash=sha256:4a8bf76e3182797c6b1afa5b822d1d5802ff30284abe4599e1247be4fd6b03be \ - --hash=sha256:4a992f702c9be9c72fa46f01ca6e18d131906a7180950958f766c2aa294d4b41 \ - --hash=sha256:4c34d4f73ea738223a094d8e0ffd6d2c1a1b4c175da34d6b0de3d8d69bee6bcc \ - --hash=sha256:4cd1bccf99d3ef1ab6ba835308ad85be040e6a11b0977ef7ea8c8005f01a3c29 \ - --hash=sha256:4ef80829117a8061f974b2fda8ec799717242353bff55f8a29411794d635d964 \ - --hash=sha256:58837f9d221744d4c92d2cf7201c6acd19623b50c643b56992cbd2b745485d3d \ - --hash=sha256:5a8f91c64f390ecee09ff793319f30a0f32492e99f5dc1c72bc361f23ccd0a9a \ - --hash=sha256:5addc9d0209a9afca5fc070f93b726bf7003bd63a427f65ef797a931782e7edc \ - --hash=sha256:6239d4e2e0b52c8bd38c51b760cd870069f0bdf99700a62cd509d7a031749a55 \ - --hash=sha256:66e2fe786ef28da2b28e222c89502b2af984858091675044d93cb50e6f46d7af \ - --hash=sha256:69c0771ca5653c7d4b65203cbfc5e66db9375f1078689459fe196fe08b7b4930 \ - --hash=sha256:6ac965a998e1388e6ff2e9781f499ad1eaa41e962a40d11c7823c9952c77123e \ - --hash=sha256:6c56c3d47da04f921b73ff9415fbaa939f684d47293f071aa9cbb13c94afc17d \ - --hash=sha256:6f85739e80d13644b981a88f529d79c5bdf646b460ba190bffcaf6d57b2a9863 \ - --hash=sha256:706e7b739fdd17cb89e1fbf712d9dc21311fc2333f6d435eac2d4ee81985098c \ - --hash=sha256:741ba2f511cc9626b7561a440f87d658aabb3d6b744a86a3c025f866b4d19e7f \ - --hash=sha256:7434a61b158be563c1362d9071358f8ab91b8d928728cd2882af060481244c9e \ - --hash=sha256:76066d7ff61ba6bf3cb5efe2428fc82aac91802844c022d849a1f0f53820502d \ - --hash=sha256:7979b834ec7a33aafae34a90aad9f914c41fd6eaa8474e66953f3f6f7cbd4368 \ - --hash=sha256:7eece6fbd3eae4a92d7c748ae825cbc1ee41a89bb1c3db05b5578ed3cfcfd7cb \ - --hash=sha256:7ef1e014eed78ab650bef9a6a9cbe50b052c0aebe553fb2881e0453717573f52 \ - --hash=sha256:81dce2ddc9f6e8f543d94b05d56e70d03a0774d32f6cca53e978dc01e4fc75b8 \ - --hash=sha256:82fcc1f1cc3ff1ab8a57ba619b149b907072e750815c5ba63e7aa2e1163384a4 \ - --hash=sha256:8d1f21af4c1539051049796a0f50aa342f9a27cde57318f2fc41ed50b0dbc4ac \ - --hash=sha256:90a79bce019c442604662d17bf69df99090e24cdc6ad95b18b6725c2988a490e \ - --hash=sha256:9145f092b5d1977ec8c0ab46e7b3381b2fd069957b9862a43bd383e5c01d18c2 \ - --hash=sha256:91dc1d531f80c862441d7b66c4505cd6ea9d312f01fb2f4654f40c6fdf5cc37a \ - --hash=sha256:979c24cbefaf2420c4e377ecd1f165ea08cc3d1fbb44bdc51bccbbf7c66a2cb4 \ - --hash=sha256:994645a46c6a740ee8ce8df7911d4aee458d9b1bc5639bc968226763d07f00fa \ - --hash=sha256:9b98b7681a9437262947f41c7fac567c7e1f6eddd94b0483596d320092004533 \ - --hash=sha256:9c6b4d23c04831e3ab61717a707a5d763b300213db49ca680edf8bf13ab5d91b \ - --hash=sha256:9c6d0ced3c06d0f183b73d3c5920727268d2201aa0fe6d55c60d68c792ff3588 \ - --hash=sha256:9fd88f373cb71e6b59b7fa597e47e518282455c2734fd4306a05ca219a1991b0 \ - --hash=sha256:a8f4e49fc3ce020f65411432183e6775f24e02dff617281094ba6ab079ef0915 \ - --hash=sha256:a9e908ef5889cda4de038892b9accc36d33d72fb3e12c747e2799a0e806ec841 \ - --hash=sha256:ad08a69728ff3c79866d729b095872afe1e0557251da4abb2c5faff15a91d19a \ - --hash=sha256:adbccd17dcaff65704c856bd29951c58a1bd4b2b0f8ad6b826dbd543fe740988 \ - --hash=sha256:b0c7d2f698e83f15228ba41c135501cfe7d5740181d5903e250e47f617eb4292 \ - 
--hash=sha256:b3ab05a182c7937fb374f7e946f04fb23a0c0699c0450e9fb02ef567412d2fa3 \ - --hash=sha256:b6104f9a46bd8743e4f738afef69b153c4b8b592d35ae46db07fc28ae3d5fb7c \ - --hash=sha256:ba7cd6dc4d585ea544c1412019921570ebd8a597fabf475acc4528210d7c4a6f \ - --hash=sha256:bc72c231f5449d86d6c7d9cc7cd819b6eb30134bb770b8cfdc0765e48ef9c420 \ - --hash=sha256:bce8814b076f0ce5766dc87d5a056b0e9437b8e0cd351b9a6c4e1134a7dfbda9 \ - --hash=sha256:be5e22bbb67924dea15039c3282fa4cc6cdfbe0cbbd1c0515f9223186fc2ec5f \ - --hash=sha256:be6b7b8d42d3090b6c80793524fa66c57ad7ee3fe9722b258aec6d0672543fd0 \ - --hash=sha256:bfe50b61bab1b1ec260fa7cd91106fa9fece57e6beba05630afe27c71259c59b \ - --hash=sha256:bff507ae210371d4b1fe316d03433ac099f184d570a1a611e541923f78f05037 \ - --hash=sha256:c148bec483cc4b421562b4bcedb8e28a3b84fcc8f0aa4418e10898f3c2c0eb9b \ - --hash=sha256:c15ad0aee158a15e17e0495e1e18741573d04eb6da06d8b84af726cfc1ed02ee \ - --hash=sha256:c2169b2dcabf4e608416f7f9468737583ce5f0a6e8677c4efbf795ce81109d7c \ - --hash=sha256:c55853684fe08d4897c37dfc5faeff70607a5f1806c8be148f1695be4a63414b \ - --hash=sha256:c65a3b5330b54103e7d21cac3f6bf3900d46f6d50138d73343d9e5b2900b2353 \ - --hash=sha256:c7964c2183c3e6cce3f497e3a9f49d182e969f2dc3aeeadfa18945ff7bdd7051 \ - --hash=sha256:cc3f1c053b73f20c7ad88b0d1d23be7e7b3901229ce89f5000a8399746a6e039 \ - --hash=sha256:ce615c92d90df8373d9e13acddd154152645c0dc060871abf6bd43809673d20a \ - --hash=sha256:d29338556a59423d9ff7b6eb0cb89ead2b0875e08fe522f3e068b955c3e7b59b \ - --hash=sha256:d8a993c0a0ffd5f2d3bda23d0cd75e7086736f8f8268de8a82fbc4bd0ac6791e \ - --hash=sha256:d9c727bbcf0065cbb20f39d2b4f932f8fa1631c3e01fcedc979bd4f51fe051c5 \ - --hash=sha256:dac37cf08fcf2094159922edc7a2784cfcc5c70f8354469f79ed085f0328ebdf \ - --hash=sha256:dd829712de97753367153ed84f2de752b86cd1f7a88b55a3a775eb52eafe8a94 \ - --hash=sha256:e54ddd0bb8fb626aa1f9ba7b36629564544954fff9669b15da3610c22b9a0991 \ - --hash=sha256:e77c90ab5997e85901da85131fd36acd0ed2221368199b65f0d11bca44549711 \ - --hash=sha256:ebedc192abbc7fd13c5ee800e83a6df252bec691eb2c4bedc9f8b2e2903f5e2a \ - --hash=sha256:ef71561f82a89af6cfcbee47f0fabfdb6e63788a9258e913955d89fdd96902ab \ - --hash=sha256:f0a47efb1dbef13af9c9a54a94a0b814902e547b7f21acb29434504d18f36e3a \ - --hash=sha256:f4f2ca6df64cbdd27f27b34f35adb640b5d2d77264228554e68deda54456eb11 \ - --hash=sha256:fb02e4257376ae25c6dd95a5aec377f9b18c09be6ebdefa7ad209b9137b73d48 +regex==2024.5.15 \ + --hash=sha256:0721931ad5fe0dda45d07f9820b90b2148ccdd8e45bb9e9b42a146cb4f695649 \ + --hash=sha256:10002e86e6068d9e1c91eae8295ef690f02f913c57db120b58fdd35a6bb1af35 \ + --hash=sha256:10e4ce0dca9ae7a66e6089bb29355d4432caed736acae36fef0fdd7879f0b0cb \ + --hash=sha256:119af6e56dce35e8dfb5222573b50c89e5508d94d55713c75126b753f834de68 \ + --hash=sha256:1337b7dbef9b2f71121cdbf1e97e40de33ff114801263b275aafd75303bd62b5 \ + --hash=sha256:13cdaf31bed30a1e1c2453ef6015aa0983e1366fad2667657dbcac7b02f67133 \ + --hash=sha256:1595f2d10dff3d805e054ebdc41c124753631b6a471b976963c7b28543cf13b0 \ + --hash=sha256:16093f563098448ff6b1fa68170e4acbef94e6b6a4e25e10eae8598bb1694b5d \ + --hash=sha256:1878b8301ed011704aea4c806a3cadbd76f84dece1ec09cc9e4dc934cfa5d4da \ + --hash=sha256:19068a6a79cf99a19ccefa44610491e9ca02c2be3305c7760d3831d38a467a6f \ + --hash=sha256:19dfb1c504781a136a80ecd1fff9f16dddf5bb43cec6871778c8a907a085bb3d \ + --hash=sha256:1b5269484f6126eee5e687785e83c6b60aad7663dafe842b34691157e5083e53 \ + --hash=sha256:1c1c174d6ec38d6c8a7504087358ce9213d4332f6293a94fbf5249992ba54efa \ + 
--hash=sha256:2431b9e263af1953c55abbd3e2efca67ca80a3de8a0437cb58e2421f8184717a \ + --hash=sha256:287eb7f54fc81546346207c533ad3c2c51a8d61075127d7f6d79aaf96cdee890 \ + --hash=sha256:2b4c884767504c0e2401babe8b5b7aea9148680d2e157fa28f01529d1f7fcf67 \ + --hash=sha256:35cb514e137cb3488bce23352af3e12fb0dbedd1ee6e60da053c69fb1b29cc6c \ + --hash=sha256:391d7f7f1e409d192dba8bcd42d3e4cf9e598f3979cdaed6ab11288da88cb9f2 \ + --hash=sha256:3ad070b823ca5890cab606c940522d05d3d22395d432f4aaaf9d5b1653e47ced \ + --hash=sha256:3cd7874d57f13bf70078f1ff02b8b0aa48d5b9ed25fc48547516c6aba36f5741 \ + --hash=sha256:3e507ff1e74373c4d3038195fdd2af30d297b4f0950eeda6f515ae3d84a1770f \ + --hash=sha256:455705d34b4154a80ead722f4f185b04c4237e8e8e33f265cd0798d0e44825fa \ + --hash=sha256:4a605586358893b483976cffc1723fb0f83e526e8f14c6e6614e75919d9862cf \ + --hash=sha256:4babf07ad476aaf7830d77000874d7611704a7fcf68c9c2ad151f5d94ae4bfc4 \ + --hash=sha256:4eee78a04e6c67e8391edd4dad3279828dd66ac4b79570ec998e2155d2e59fd5 \ + --hash=sha256:5397de3219a8b08ae9540c48f602996aa6b0b65d5a61683e233af8605c42b0f2 \ + --hash=sha256:5b5467acbfc153847d5adb21e21e29847bcb5870e65c94c9206d20eb4e99a384 \ + --hash=sha256:5eaa7ddaf517aa095fa8da0b5015c44d03da83f5bd49c87961e3c997daed0de7 \ + --hash=sha256:632b01153e5248c134007209b5c6348a544ce96c46005d8456de1d552455b014 \ + --hash=sha256:64c65783e96e563103d641760664125e91bd85d8e49566ee560ded4da0d3e704 \ + --hash=sha256:64f18a9a3513a99c4bef0e3efd4c4a5b11228b48aa80743be822b71e132ae4f5 \ + --hash=sha256:673b5a6da4557b975c6c90198588181029c60793835ce02f497ea817ff647cb2 \ + --hash=sha256:68811ab14087b2f6e0fc0c2bae9ad689ea3584cad6917fc57be6a48bbd012c49 \ + --hash=sha256:6e8d717bca3a6e2064fc3a08df5cbe366369f4b052dcd21b7416e6d71620dca1 \ + --hash=sha256:71a455a3c584a88f654b64feccc1e25876066c4f5ef26cd6dd711308aa538694 \ + --hash=sha256:72d7a99cd6b8f958e85fc6ca5b37c4303294954eac1376535b03c2a43eb72629 \ + --hash=sha256:7b59138b219ffa8979013be7bc85bb60c6f7b7575df3d56dc1e403a438c7a3f6 \ + --hash=sha256:7dbe2467273b875ea2de38ded4eba86cbcbc9a1a6d0aa11dcf7bd2e67859c435 \ + --hash=sha256:833616ddc75ad595dee848ad984d067f2f31be645d603e4d158bba656bbf516c \ + --hash=sha256:87e2a9c29e672fc65523fb47a90d429b70ef72b901b4e4b1bd42387caf0d6835 \ + --hash=sha256:8fe45aa3f4aa57faabbc9cb46a93363edd6197cbc43523daea044e9ff2fea83e \ + --hash=sha256:9e717956dcfd656f5055cc70996ee2cc82ac5149517fc8e1b60261b907740201 \ + --hash=sha256:9efa1a32ad3a3ea112224897cdaeb6aa00381627f567179c0314f7b65d354c62 \ + --hash=sha256:9ff11639a8d98969c863d4617595eb5425fd12f7c5ef6621a4b74b71ed8726d5 \ + --hash=sha256:a094801d379ab20c2135529948cb84d417a2169b9bdceda2a36f5f10977ebc16 \ + --hash=sha256:a0981022dccabca811e8171f913de05720590c915b033b7e601f35ce4ea7019f \ + --hash=sha256:a0bd000c6e266927cb7a1bc39d55be95c4b4f65c5be53e659537537e019232b1 \ + --hash=sha256:a32b96f15c8ab2e7d27655969a23895eb799de3665fa94349f3b2fbfd547236f \ + --hash=sha256:a81e3cfbae20378d75185171587cbf756015ccb14840702944f014e0d93ea09f \ + --hash=sha256:ac394ff680fc46b97487941f5e6ae49a9f30ea41c6c6804832063f14b2a5a145 \ + --hash=sha256:ada150c5adfa8fbcbf321c30c751dc67d2f12f15bd183ffe4ec7cde351d945b3 \ + --hash=sha256:b2b6f1b3bb6f640c1a92be3bbfbcb18657b125b99ecf141fb3310b5282c7d4ed \ + --hash=sha256:b802512f3e1f480f41ab5f2cfc0e2f761f08a1f41092d6718868082fc0d27143 \ + --hash=sha256:ba68168daedb2c0bab7fd7e00ced5ba90aebf91024dea3c88ad5063c2a562cca \ + --hash=sha256:bfc4f82cabe54f1e7f206fd3d30fda143f84a63fe7d64a81558d6e5f2e5aaba9 \ + 
--hash=sha256:c0c18345010870e58238790a6779a1219b4d97bd2e77e1140e8ee5d14df071aa \ + --hash=sha256:c3bea0ba8b73b71b37ac833a7f3fd53825924165da6a924aec78c13032f20850 \ + --hash=sha256:c486b4106066d502495b3025a0a7251bf37ea9540433940a23419461ab9f2a80 \ + --hash=sha256:c49e15eac7c149f3670b3e27f1f28a2c1ddeccd3a2812cba953e01be2ab9b5fe \ + --hash=sha256:c6a2b494a76983df8e3d3feea9b9ffdd558b247e60b92f877f93a1ff43d26656 \ + --hash=sha256:cab12877a9bdafde5500206d1020a584355a97884dfd388af3699e9137bf7388 \ + --hash=sha256:cac27dcaa821ca271855a32188aa61d12decb6fe45ffe3e722401fe61e323cd1 \ + --hash=sha256:cdd09d47c0b2efee9378679f8510ee6955d329424c659ab3c5e3a6edea696294 \ + --hash=sha256:cf2430df4148b08fb4324b848672514b1385ae3807651f3567871f130a728cc3 \ + --hash=sha256:d0a3d8d6acf0c78a1fff0e210d224b821081330b8524e3e2bc5a68ef6ab5803d \ + --hash=sha256:d0c0c0003c10f54a591d220997dd27d953cd9ccc1a7294b40a4be5312be8797b \ + --hash=sha256:d1f059a4d795e646e1c37665b9d06062c62d0e8cc3c511fe01315973a6542e40 \ + --hash=sha256:d347a741ea871c2e278fde6c48f85136c96b8659b632fb57a7d1ce1872547600 \ + --hash=sha256:d3ee02d9e5f482cc8309134a91eeaacbdd2261ba111b0fef3748eeb4913e6a2c \ + --hash=sha256:d99ceffa25ac45d150e30bd9ed14ec6039f2aad0ffa6bb87a5936f5782fc1569 \ + --hash=sha256:e38a7d4e8f633a33b4c7350fbd8bad3b70bf81439ac67ac38916c4a86b465456 \ + --hash=sha256:e4682f5ba31f475d58884045c1a97a860a007d44938c4c0895f41d64481edbc9 \ + --hash=sha256:e5bb9425fe881d578aeca0b2b4b3d314ec88738706f66f219c194d67179337cb \ + --hash=sha256:e64198f6b856d48192bf921421fdd8ad8eb35e179086e99e99f711957ffedd6e \ + --hash=sha256:e6662686aeb633ad65be2a42b4cb00178b3fbf7b91878f9446075c404ada552f \ + --hash=sha256:ec54d5afa89c19c6dd8541a133be51ee1017a38b412b1321ccb8d6ddbeb4cf7d \ + --hash=sha256:f5b1dff3ad008dccf18e652283f5e5339d70bf8ba7c98bf848ac33db10f7bc7a \ + --hash=sha256:f8ec0c2fea1e886a19c3bee0cd19d862b3aa75dcdfb42ebe8ed30708df64687a \ + --hash=sha256:f9ebd0a36102fcad2f03696e8af4ae682793a5d30b46c647eaf280d6cfb32796 # via # -c release/ray_release/byod/requirements_compiled.txt # diffusers @@ -3080,9 +3094,9 @@ requests==2.31.0 \ # torchtext # transformers # wandb -requests-oauthlib==1.3.1 \ - --hash=sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5 \ - --hash=sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a +requests-oauthlib==2.0.0 \ + --hash=sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36 \ + --hash=sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9 # via # -c release/ray_release/byod/requirements_compiled.txt # google-auth-oauthlib @@ -3098,12 +3112,13 @@ retry-decorator==1.1.1 \ # -c release/ray_release/byod/requirements_compiled.txt # gcs-oauth2-boto-plugin # gsutil -rich==12.6.0 \ - --hash=sha256:a4eb26484f2c82589bd9a17c73d32a010b1e29d89f1604cd9bf3a2097b81bb5e \ - --hash=sha256:ba3a3775974105c221d31141f2c116f4fd65c5ceb0698657a11e9f295ec93fd0 +rich==13.3.2 \ + --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ + --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f # via # -c release/ray_release/byod/requirements_compiled.txt # memray + # typer rouge-score==0.1.2 \ --hash=sha256:c7d4da2683e68c9abf0135ef915d63a46643666f848e558a1b9f7ead17ff0f04 # via lm-eval @@ -3127,105 +3142,107 @@ s3transfer==0.6.2 \ sacrebleu==2.4.2 \ --hash=sha256:611a581d205828912f0b05f806b110180087184d3be2dc650fda7a729d6ecb89 # via lm-eval -safetensors==0.4.1 \ - 
--hash=sha256:04157d008385bea66d12fe90844a80d4a76dc25ec5230b5bd9a630496d1b7c03 \ - --hash=sha256:04dd14f53f5500eb4c4149674216ba1000670efbcf4b1b5c2643eb244e7882ea \ - --hash=sha256:097e9af2efa8778cd2f0cba451784253e62fa7cc9fc73c0744d27212f7294e25 \ - --hash=sha256:0bd0afd95c1e497f520e680ea01e0397c0868a3a3030e128438cf6e9e3fcd671 \ - --hash=sha256:0ddd050e01f3e843aa8c1c27bf68675b8a08e385d0045487af4d70418c3cb356 \ - --hash=sha256:16d8bbb7344e39cb9d4762e85c21df94ebeb03edac923dd94bb9ed8c10eac070 \ - --hash=sha256:1a45dbf03e8334d3a5dc93687d98b6dc422f5d04c7d519dac09b84a3c87dd7c6 \ - --hash=sha256:1d568628e9c43ca15eb96c217da73737c9ccb07520fafd8a1eba3f2750614105 \ - --hash=sha256:1faf5111c66a6ba91f85dff2e36edaaf36e6966172703159daeef330de4ddc7b \ - --hash=sha256:2297b359d91126c0f9d4fd17bae3cfa2fe3a048a6971b8db07db746ad92f850c \ - --hash=sha256:2304658e6ada81a5223225b4efe84748e760c46079bffedf7e321763cafb36c9 \ - --hash=sha256:2536b11ce665834201072e9397404170f93f3be10cca9995b909f023a04501ee \ - --hash=sha256:257d59e40a1b367cb544122e7451243d65b33c3f34d822a347f4eea6fdf97fdf \ - --hash=sha256:25a043cbb59d4f75e9dd87fdf5c009dd8830105a2c57ace49b72167dd9808111 \ - --hash=sha256:270b99885ec14abfd56c1d7f28ada81740a9220b4bae960c3de1c6fe84af9e4d \ - --hash=sha256:285b52a481e7ba93e29ad4ec5841ef2c4479ef0a6c633c4e2629e0508453577b \ - --hash=sha256:2b6a2814278b6660261aa9a9aae524616de9f1ec364e3716d219b6ed8f91801f \ - --hash=sha256:2d54c2f1826e790d1eb2d2512bfd0ee443f0206b423d6f27095057c7f18a0687 \ - --hash=sha256:2d87d993eaefe6611a9c241a8bd364a5f1ffed5771c74840363a6c4ed8d868f6 \ - --hash=sha256:2fe6926110e3d425c4b684a4379b7796fdc26ad7d16922ea1696c8e6ea7e920f \ - --hash=sha256:303d2c0415cf15a28f8d7f17379ea3c34c2b466119118a34edd9965983a1a8a6 \ - --hash=sha256:313e8472197bde54e3ec54a62df184c414582979da8f3916981b6a7954910a1b \ - --hash=sha256:35803201d980efcf964b75a0a2aee97fe5e9ecc5f3ad676b38fafdfe98e0620d \ - --hash=sha256:39d36f1d88468a87c437a1bc27c502e71b6ca44c385a9117a9f9ba03a75cc9c6 \ - --hash=sha256:3b0b7b2d5976fbed8a05e2bbdce5816a59e6902e9e7c7e07dc723637ed539787 \ - --hash=sha256:3b30abd0cddfe959d1daedf92edcd1b445521ebf7ddefc20860ed01486b33c90 \ - --hash=sha256:3c1b1d510c7aba71504ece87bf393ea82638df56303e371e5e2cf09d18977dd7 \ - --hash=sha256:3cfd1ca35eacc635f0eaa894e5c5ed83ffebd0f95cac298fd430014fa7323631 \ - --hash=sha256:3f6a520af7f2717c5ecba112041f2c8af1ca6480b97bf957aba81ed9642e654c \ - --hash=sha256:413e1f6ac248f7d1b755199a06635e70c3515493d3b41ba46063dec33aa2ebb7 \ - --hash=sha256:4177b456c6b0c722d82429127b5beebdaf07149d265748e97e0a34ff0b3694c8 \ - --hash=sha256:42c3710cec7e5c764c7999697516370bee39067de0aa089b7e2cfb97ac8c6b20 \ - --hash=sha256:44e230fbbe120de564b64f63ef3a8e6ff02840fa02849d9c443d56252a1646d4 \ - --hash=sha256:48901bd540f8a3c1791314bc5c8a170927bf7f6acddb75bf0a263d081a3637d4 \ - --hash=sha256:53134226053e56bd56e73f7db42596e7908ed79f3c9a1016e4c1dade593ac8e5 \ - --hash=sha256:573b6023a55a2f28085fc0a84e196c779b6cbef4d9e73acea14c8094fee7686f \ - --hash=sha256:5d95ea4d8b32233910734a904123bdd3979c137c461b905a5ed32511defc075f \ - --hash=sha256:5f25297148ec665f0deb8bd67e9564634d8d6841041ab5393ccfe203379ea88b \ - --hash=sha256:645b3f1138fce6e818e79d4128afa28f0657430764cc045419c1d069ff93f732 \ - --hash=sha256:660ca1d8bff6c7bc7c6b30b9b32df74ef3ab668f5df42cefd7588f0d40feadcb \ - --hash=sha256:6ace9e66a40f98a216ad661245782483cf79cf56eb2b112650bb904b0baa9db5 \ - --hash=sha256:6fd80f7794554091836d4d613d33a7d006e2b8d6ba014d06f97cebdfda744f64 \ - 
--hash=sha256:780dc21eb3fd32ddd0e8c904bdb0290f2454f4ac21ae71e94f9ce72db1900a5a \ - --hash=sha256:791edc10a3c359a2f5f52d5cddab0df8a45107d91027d86c3d44e57162e5d934 \ - --hash=sha256:7a8f6f679d97ea0135c7935c202feefbd042c149aa70ee759855e890c01c7814 \ - --hash=sha256:7ef010e9afcb4057fb6be3d0a0cfa07aac04fe97ef73fe4a23138d8522ba7c17 \ - --hash=sha256:7ff8a36e0396776d3ed9a106fc9a9d7c55d4439ca9a056a24bf66d343041d3e6 \ - --hash=sha256:82571d20288c975c1b30b08deb9b1c3550f36b31191e1e81fae87669a92217d0 \ - --hash=sha256:82cbb8f4d022f2e94498cbefca900698b8ded3d4f85212f47da614001ff06652 \ - --hash=sha256:83c2cfbe8c6304f0891e7bb378d56f66d2148972eeb5f747cd8a2246886f0d8c \ - --hash=sha256:845be0aafabf2a60c2d482d4e93023fecffe5e5443d801d7a7741bae9de41233 \ - --hash=sha256:88b4653059c903015284a9722f9a46838c654257173b279c8f6f46dbe80b612d \ - --hash=sha256:8b58ba13a9e82b4bc3fc221914f6ef237fe6c2adb13cede3ace64d1aacf49610 \ - --hash=sha256:8f69903ff49cb30b9227fb5d029bea276ea20d04b06803877a420c5b1b74c689 \ - --hash=sha256:8ff8e41c8037db17de0ea2a23bc684f43eaf623be7d34906fe1ac10985b8365e \ - --hash=sha256:911b48dc09e321a194def3a7431662ff4f03646832f3a8915bbf0f449b8a5fcb \ - --hash=sha256:998fbac99ca956c3a09fe07cc0b35fac26a521fa8865a690686d889f0ff4e4a6 \ - --hash=sha256:9a82bc2bd7a9a0e08239bdd6d7774d64121f136add93dfa344a2f1a6d7ef35fa \ - --hash=sha256:9d16b3b2fcc6fca012c74bd01b5619c655194d3e3c13e4d4d0e446eefa39a463 \ - --hash=sha256:a257de175c254d39ccd6a21341cd62eb7373b05c1e618a78096a56a857e0c316 \ - --hash=sha256:a79e16222106b2f5edbca1b8185661477d8971b659a3c814cc6f15181a9b34c8 \ - --hash=sha256:ae2d5a31cfb8a973a318f7c4d2cffe0bd1fe753cdf7bb41a1939d45a0a06f964 \ - --hash=sha256:ae2f67f04ed0bb2e56fd380a8bd3eef03f609df53f88b6f5c7e89c08e52aae00 \ - --hash=sha256:ae5497adc68669db2fed7cb2dad81e6a6106e79c9a132da3efdb6af1db1014fa \ - --hash=sha256:b287304f2b2220d51ccb51fd857761e78bcffbeabe7b0238f8dc36f2edfd9542 \ - --hash=sha256:b2f8877990a72ff595507b80f4b69036a9a1986a641f8681adf3425d97d3d2a5 \ - --hash=sha256:bb4cb3e37a9b961ddd68e873b29fe9ab4a081e3703412e34aedd2b7a8e9cafd9 \ - --hash=sha256:bbc2ce1f5ae5143a7fb72b71fa71db6a42b4f6cf912aa3acdc6b914084778e68 \ - --hash=sha256:bda3d98e2bcece388232cfc551ebf063b55bdb98f65ab54df397da30efc7dcc5 \ - --hash=sha256:bdc0d039e44a727824639824090bd8869535f729878fa248addd3dc01db30eae \ - --hash=sha256:bfa2e20342b81921b98edba52f8deb68843fa9c95250739a56b52ceda5ea5c61 \ - --hash=sha256:c3807ac3b16288dffebb3474b555b56fe466baa677dfc16290dcd02dca1ab228 \ - --hash=sha256:c3c9f0ca510e0de95abd6424789dcbc879942a3a4e29b0dfa99d9427bf1da75c \ - --hash=sha256:c8ed5d2c04cdc1afc6b3c28d59580448ac07732c50d94c15e14670f9c473a2ce \ - --hash=sha256:cba01c6b76e01ec453933b3b3c0157c59b52881c83eaa0f7666244e71aa75fd1 \ - --hash=sha256:ce7a28bc8af685a69d7e869d09d3e180a275e3281e29cf5f1c7319e231932cc7 \ - --hash=sha256:d10a9f7bae608ccfdc009351f01dc3d8535ff57f9488a58a4c38e45bf954fe93 \ - --hash=sha256:d3ac139377cfe71ba04573f1cda66e663b7c3e95be850e9e6c2dd4b5984bd513 \ - --hash=sha256:d5b3defa74f3723a388bfde2f5d488742bc4879682bd93267c09a3bcdf8f869b \ - --hash=sha256:d784938534e255473155e4d9f276ee69eb85455b6af1292172c731409bf9adee \ - --hash=sha256:d784a98c492c751f228a4a894c3b8a092ff08b24e73b5568938c28b8c0e8f8df \ - --hash=sha256:d8a85e3e47e0d4eebfaf9a58b40aa94f977a56050cb5598ad5396a9ee7c087c6 \ - --hash=sha256:d93321eea0dd7e81b283e47a1d20dee6069165cc158286316d0d06d340de8fe8 \ - --hash=sha256:da52ee0dc8ba03348ffceab767bd8230842fdf78f8a996e2a16445747143a778 \ - 
--hash=sha256:dab431699b5d45e0ca043bc580651ce9583dda594e62e245b7497adb32e99809 \ - --hash=sha256:dac4bb42f8679aadc59bd91a4c5a1784a758ad49d0912995945cd674089f628e \ - --hash=sha256:e056fb9e22d118cc546107f97dc28b449d88274207dd28872bd668c86216e4f6 \ - --hash=sha256:e09000b2599e1836314430f81a3884c66a5cbabdff5d9f175b5d560d4de38d78 \ - --hash=sha256:e0ccb5aa0f3be2727117e5631200fbb3a5b3a2b3757545a92647d6dd8be6658f \ - --hash=sha256:e57a5ab08b0ec7a7caf30d2ac79bb30c89168431aca4f8854464bb9461686925 \ - --hash=sha256:e9a7ffb1e551c6df51d267f5a751f042b183df22690f6feceac8d27364fd51d7 \ - --hash=sha256:e9c80ce0001efa16066358d2dd77993adc25f5a6c61850e4ad096a2232930bce \ - --hash=sha256:eb2c1da1cc39509d1a55620a5f4d14f8911c47a89c926a96e6f4876e864375a3 \ - --hash=sha256:edcf3121890b5f0616aa5a54683b1a5d2332037b970e507d6bb7841a3a596556 \ - --hash=sha256:f603bdd8deac6726d39f41688ed353c532dd53935234405d79e9eb53f152fbfb \ - --hash=sha256:f8934bdfd202ebd0697040a3dff40dd77bc4c5bbf3527ede0532f5e7fb4d970f \ - --hash=sha256:fdb4adb76e21bad318210310590de61c9f4adcef77ee49b4a234f9dc48867869 \ - --hash=sha256:fdb58dee173ef33634c3016c459d671ca12d11e6acf9db008261cbe58107e579 +safetensors==0.4.3 \ + --hash=sha256:018b691383026a2436a22b648873ed11444a364324e7088b99cd2503dd828400 \ + --hash=sha256:01e4b22e3284cd866edeabe4f4d896229495da457229408d2e1e4810c5187121 \ + --hash=sha256:01feb3089e5932d7e662eda77c3ecc389f97c0883c4a12b5cfdc32b589a811c3 \ + --hash=sha256:02318f01e332cc23ffb4f6716e05a492c5f18b1d13e343c49265149396284a44 \ + --hash=sha256:02ef3a24face643456020536591fbd3c717c5abaa2737ec428ccbbc86dffa7a4 \ + --hash=sha256:03a4447c784917c9bf01d8f2ac5080bc15c41692202cd5f406afba16629e84d6 \ + --hash=sha256:084fc436e317f83f7071fc6a62ca1c513b2103db325cd09952914b50f51cf78f \ + --hash=sha256:0bf4f9d6323d9f86eef5567eabd88f070691cf031d4c0df27a40d3b4aaee755b \ + --hash=sha256:0d52c958dc210265157573f81d34adf54e255bc2b59ded6218500c9b15a750eb \ + --hash=sha256:0d5ffc6a80f715c30af253e0e288ad1cd97a3d0086c9c87995e5093ebc075e50 \ + --hash=sha256:0d9cd8e1560dfc514b6d7859247dc6a86ad2f83151a62c577428d5102d872721 \ + --hash=sha256:0dd37306546b58d3043eb044c8103a02792cc024b51d1dd16bd3dd1f334cb3ed \ + --hash=sha256:1139eb436fd201c133d03c81209d39ac57e129f5e74e34bb9ab60f8d9b726270 \ + --hash=sha256:19bbdf95de2cf64f25cd614c5236c8b06eb2cfa47cbf64311f4b5d80224623a3 \ + --hash=sha256:1ab6527a20586d94291c96e00a668fa03f86189b8a9defa2cdd34a1a01acc7d5 \ + --hash=sha256:1b89381517891a7bb7d1405d828b2bf5d75528299f8231e9346b8eba092227f9 \ + --hash=sha256:1f598b713cc1a4eb31d3b3203557ac308acf21c8f41104cdd74bf640c6e538e3 \ + --hash=sha256:22d21760dc6ebae42e9c058d75aa9907d9f35e38f896e3c69ba0e7b213033856 \ + --hash=sha256:22f3b5d65e440cec0de8edaa672efa888030802e11c09b3d6203bff60ebff05a \ + --hash=sha256:2a0deb16a1d3ea90c244ceb42d2c6c276059616be21a19ac7101aa97da448faf \ + --hash=sha256:2a1f4430cc0c9d6afa01214a4b3919d0a029637df8e09675ceef1ca3f0dfa0df \ + --hash=sha256:2d603846a8585b9432a0fd415db1d4c57c0f860eb4aea21f92559ff9902bae4d \ + --hash=sha256:2f85fc50c4e07a21e95c24e07460fe6f7e2859d0ce88092838352b798ce711c2 \ + --hash=sha256:309b10dbcab63269ecbf0e2ca10ce59223bb756ca5d431ce9c9eeabd446569da \ + --hash=sha256:3615a96dd2dcc30eb66d82bc76cda2565f4f7bfa89fcb0e31ba3cea8a1a9ecbb \ + --hash=sha256:38e2a8666178224a51cca61d3cb4c88704f696eac8f72a49a598a93bbd8a4af9 \ + --hash=sha256:393e6e391467d1b2b829c77e47d726f3b9b93630e6a045b1d1fca67dc78bf632 \ + --hash=sha256:3f9cdca09052f585e62328c1c2923c70f46814715c795be65f0b93f57ec98a02 \ + 
--hash=sha256:41a727a7f5e6ad9f1db6951adee21bbdadc632363d79dc434876369a17de6ad6 \ + --hash=sha256:420a98f593ff9930f5822560d14c395ccbc57342ddff3b463bc0b3d6b1951550 \ + --hash=sha256:446e9fe52c051aeab12aac63d1017e0f68a02a92a027b901c4f8e931b24e5397 \ + --hash=sha256:455d538aa1aae4a8b279344a08136d3f16334247907b18a5c3c7fa88ef0d3c46 \ + --hash=sha256:4f9bac020faba7f5dc481e881b14b6425265feabb5bfc552551d21189c0eddc3 \ + --hash=sha256:53c4879b9c6bd7cd25d114ee0ef95420e2812e676314300624594940a8d6a91f \ + --hash=sha256:5757e4688f20df083e233b47de43845d1adb7e17b6cf7da5f8444416fc53828d \ + --hash=sha256:585c9ae13a205807b63bef8a37994f30c917ff800ab8a1ca9c9b5d73024f97ee \ + --hash=sha256:5d07cbca5b99babb692d76d8151bec46f461f8ad8daafbfd96b2fca40cadae65 \ + --hash=sha256:5fc6775529fb9f0ce2266edd3e5d3f10aab068e49f765e11f6f2a63b5367021d \ + --hash=sha256:622afd28968ef3e9786562d352659a37de4481a4070f4ebac883f98c5836563e \ + --hash=sha256:6f9568f380f513a60139971169c4a358b8731509cc19112369902eddb33faa4d \ + --hash=sha256:70a5319ef409e7f88686a46607cbc3c428271069d8b770076feaf913664a07ac \ + --hash=sha256:74707624b81f1b7f2b93f5619d4a9f00934d5948005a03f2c1845ffbfff42212 \ + --hash=sha256:7c4fa560ebd4522adddb71dcd25d09bf211b5634003f015a4b815b7647d62ebe \ + --hash=sha256:7de32d0d34b6623bb56ca278f90db081f85fb9c5d327e3c18fd23ac64f465768 \ + --hash=sha256:840b7ac0eff5633e1d053cc9db12fdf56b566e9403b4950b2dc85393d9b88d67 \ + --hash=sha256:840caf38d86aa7014fe37ade5d0d84e23dcfbc798b8078015831996ecbc206a3 \ + --hash=sha256:8651c7299cbd8b4161a36cd6a322fa07d39cd23535b144d02f1c1972d0c62f3c \ + --hash=sha256:868ad1b6fc41209ab6bd12f63923e8baeb1a086814cb2e81a65ed3d497e0cf8f \ + --hash=sha256:88887f69f7a00cf02b954cdc3034ffb383b2303bc0ab481d4716e2da51ddc10e \ + --hash=sha256:89f9f17b0dacb913ed87d57afbc8aad85ea42c1085bd5de2f20d83d13e9fc4b2 \ + --hash=sha256:8c496c5401c1b9c46d41a7688e8ff5b0310a3b9bae31ce0f0ae870e1ea2b8caf \ + --hash=sha256:8cf18888606dad030455d18f6c381720e57fc6a4170ee1966adb7ebc98d4d6a3 \ + --hash=sha256:8d22c1a10dff3f64d0d68abb8298a3fd88ccff79f408a3e15b3e7f637ef5c980 \ + --hash=sha256:90964917f5b0fa0fa07e9a051fbef100250c04d150b7026ccbf87a34a54012e0 \ + --hash=sha256:9bfb92f82574d9e58401d79c70c716985dc049b635fef6eecbb024c79b2c46ad \ + --hash=sha256:9c6ad011c1b4e3acff058d6b090f1da8e55a332fbf84695cf3100c649cc452d1 \ + --hash=sha256:a11c374eb63a9c16c5ed146457241182f310902bd2a9c18255781bb832b6748b \ + --hash=sha256:a7cef55929dcbef24af3eb40bedec35d82c3c2fa46338bb13ecf3c5720af8a61 \ + --hash=sha256:a844cdb5d7cbc22f5f16c7e2a0271170750763c4db08381b7f696dbd2c78a361 \ + --hash=sha256:ae7613a119a71a497d012ccc83775c308b9c1dab454806291427f84397d852fd \ + --hash=sha256:b1648568667f820b8c48317c7006221dc40aced1869908c187f493838a1362bc \ + --hash=sha256:b1e31be7945f66be23f4ec1682bb47faa3df34cb89fc68527de6554d3c4258a4 \ + --hash=sha256:b277482120df46e27a58082df06a15aebda4481e30a1c21eefd0921ae7e03f65 \ + --hash=sha256:b7ffba80aa49bd09195145a7fd233a7781173b422eeb995096f2b30591639517 \ + --hash=sha256:b852e47eb08475c2c1bd8131207b405793bfc20d6f45aff893d3baaad449ed14 \ + --hash=sha256:bb4f8c5d0358a31e9a08daeebb68f5e161cdd4018855426d3f0c23bb51087055 \ + --hash=sha256:bbae3b4b9d997971431c346edbfe6e41e98424a097860ee872721e176040a893 \ + --hash=sha256:befdf0167ad626f22f6aac6163477fcefa342224a22f11fdd05abb3995c1783c \ + --hash=sha256:c0acbe31340ab150423347e5b9cc595867d814244ac14218932a5cf1dd38eb39 \ + --hash=sha256:c41e1893d1206aa7054029681778d9a58b3529d4c807002c156d58426c225173 \ + 
--hash=sha256:c59d51f182c729f47e841510b70b967b0752039f79f1de23bcdd86462a9b09ee \ + --hash=sha256:cd6fff9e56df398abc5866b19a32124815b656613c1c5ec0f9350906fd798aac \ + --hash=sha256:cdd0a3b5da66e7f377474599814dbf5cbf135ff059cc73694de129b58a5e8a2c \ + --hash=sha256:cf476bca34e1340ee3294ef13e2c625833f83d096cfdf69a5342475602004f95 \ + --hash=sha256:d0dd4a1db09db2dba0f94d15addc7e7cd3a7b0d393aa4c7518c39ae7374623c3 \ + --hash=sha256:d1456f814655b224d4bf6e7915c51ce74e389b413be791203092b7ff78c936dd \ + --hash=sha256:d14d30c25897b2bf19b6fb5ff7e26cc40006ad53fd4a88244fdf26517d852dd7 \ + --hash=sha256:d244bcafeb1bc06d47cfee71727e775bca88a8efda77a13e7306aae3813fa7e4 \ + --hash=sha256:d8815b5e1dac85fc534a97fd339e12404db557878c090f90442247e87c8aeaea \ + --hash=sha256:d88b33980222085dd6001ae2cad87c6068e0991d4f5ccf44975d216db3b57376 \ + --hash=sha256:d8c5093206ef4b198600ae484230402af6713dab1bd5b8e231905d754022bec7 \ + --hash=sha256:d9c289f140a9ae4853fc2236a2ffc9a9f2d5eae0cb673167e0f1b8c18c0961ac \ + --hash=sha256:dcf5705cab159ce0130cd56057f5f3425023c407e170bca60b4868048bae64fd \ + --hash=sha256:e011cc162503c19f4b1fd63dfcddf73739c7a243a17dac09b78e57a00983ab35 \ + --hash=sha256:e066e8861eef6387b7c772344d1fe1f9a72800e04ee9a54239d460c400c72aab \ + --hash=sha256:e0b2104df1579d6ba9052c0ae0e3137c9698b2d85b0645507e6fd1813b70931a \ + --hash=sha256:e375d975159ac534c7161269de24ddcd490df2157b55c1a6eeace6cbb56903f0 \ + --hash=sha256:e4119532cd10dba04b423e0f86aecb96cfa5a602238c0aa012f70c3a40c44b50 \ + --hash=sha256:e7dbbde64b6c534548696808a0e01276d28ea5773bc9a2dfb97a88cd3dffe3df \ + --hash=sha256:e9afd5358719f1b2cf425fad638fc3c887997d6782da317096877e5b15b2ce93 \ + --hash=sha256:ec4b52ce9a396260eb9731eb6aea41a7320de22ed73a1042c2230af0212758ce \ + --hash=sha256:edb5698a7bc282089f64c96c477846950358a46ede85a1c040e0230344fdde10 \ + --hash=sha256:ee463219d9ec6c2be1d331ab13a8e0cd50d2f32240a81d498266d77d07b7e71e \ + --hash=sha256:efcc860be094b8d19ac61b452ec635c7acb9afa77beb218b1d7784c6d41fe8ad \ + --hash=sha256:f5e6883af9a68c0028f70a4c19d5a6ab6238a379be36ad300a22318316c00cb0 \ + --hash=sha256:f9650713b2cfa9537a2baf7dd9fee458b24a0aaaa6cafcea8bdd5fb2b8efdc34 \ + --hash=sha256:faefeb3b81bdfb4e5a55b9bbdf3d8d8753f65506e1d67d03f5c851a6c87150e9 \ + --hash=sha256:fb9c65bd82f9ef3ce4970dc19ee86be5f6f93d032159acf35e663c6bea02b237 \ + --hash=sha256:fe746d03ed8d193674a26105e4f0fe6c726f5bb602ffc695b409eaf02f04763d \ + --hash=sha256:fef5d70683643618244a4f5221053567ca3e77c2531e42ad48ae05fae909f542 # via # -c release/ray_release/byod/requirements_compiled.txt # accelerate @@ -3346,9 +3363,9 @@ sentencepiece==0.1.96 \ # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in -sentry-sdk==1.37.1 \ - --hash=sha256:7cd324dd2877fdc861f75cba4242bce23a58272a6fea581fcb218bb718bd9cc5 \ - --hash=sha256:a249c7364827ee89daaa078bb8b56ece0b3d52d9130961bef2302b79bdf7fe70 +sentry-sdk==2.10.0 \ + --hash=sha256:545fcc6e36c335faa6d6cda84669b6e17025f31efbf3b2211ec14efe008b75d1 \ + --hash=sha256:87b3d413c87d8e7f816cc9334bff255a83d8b577db2b22042651c30c19c09190 # via # -c release/ray_release/byod/requirements_compiled.txt # wandb @@ -3444,6 +3461,12 @@ setproctitle==1.3.3 \ # via # -c release/ray_release/byod/requirements_compiled.txt # wandb +shellingham==1.5.4 \ + --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ + --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de + # via + # -c release/ray_release/byod/requirements_compiled.txt + # typer 
six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 @@ -3470,50 +3493,15 @@ smmap==5.0.1 \ # via # -c release/ray_release/byod/requirements_compiled.txt # gitdb -sniffio==1.3.0 \ - --hash=sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101 \ - --hash=sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384 +sniffio==1.3.1 \ + --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ + --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc # via # -c release/ray_release/byod/requirements_compiled.txt # anyio -sqlalchemy==1.4.17 \ - --hash=sha256:196fb6bb2733834e506c925d7532f8eabad9d2304deef738a40846e54c31e236 \ - --hash=sha256:1dd77acbc19bee9c0ba858ff5e4e5d5c60895495c83b4df9bcdf4ad5e9b74f21 \ - --hash=sha256:216ff28fe803885ceb5b131dcee6507d28d255808dd5bcffcb3b5fa75be2e102 \ - --hash=sha256:461a4ea803ce0834822f372617a68ac97f9fa1281f2a984624554c651d7c3ae1 \ - --hash=sha256:4b09191ed22af149c07a880f309b7740f3f782ff13325bae5c6168a6aa57e715 \ - --hash=sha256:4c5e20666b33b03bf7f14953f0deb93007bf8c1342e985bd7c7cf25f46fac579 \ - --hash=sha256:4d93b62e98248e3e1ac1e91c2e6ee1e7316f704be1f734338b350b6951e6c175 \ - --hash=sha256:5732858e56d32fa7e02468f4fd2d8f01ddf709e5b93d035c637762890f8ed8b6 \ - --hash=sha256:58c02d1771bb0e61bc9ced8f3b36b5714d9ece8fd4bdbe2a44a892574c3bbc3c \ - --hash=sha256:651cdb3adcee13624ba22d5ff3e96f91e16a115d2ca489ddc16a8e4c217e8509 \ - --hash=sha256:6fe1c8dc26bc0005439cb78ebc78772a22cccc773f5a0e67cb3002d791f53f0f \ - --hash=sha256:7222f3236c280fab3a2d76f903b493171f0ffc29667538cc388a5d5dd0216a88 \ - --hash=sha256:7dc3d3285fb682316d580d84e6e0840fdd8ffdc05cb696db74b9dd746c729908 \ - --hash=sha256:7e45043fe11d503e1c3f9dcf5b42f92d122a814237cd9af68a11dae46ecfcae1 \ - --hash=sha256:7eb55d5583076c03aaf1510473fad2a61288490809049cb31028af56af7068ee \ - --hash=sha256:82922a320d38d7d6aa3a8130523ec7e8c70fa95f7ca7d0fd6ec114b626e4b10b \ - --hash=sha256:8e133e2551fa99c75849848a4ac08efb79930561eb629dd7d2dc9b7ee05256e6 \ - --hash=sha256:949ac299903d2ed8419086f81847381184e2264f3431a33af4679546dcc87f01 \ - --hash=sha256:a2d225c8863a76d15468896dc5af36f1e196b403eb9c7e0151e77ffab9e7df57 \ - --hash=sha256:a5f00a2be7d777119e15ccfb5ba0b2a92e8a193959281089d79821a001095f80 \ - --hash=sha256:b0ad951a6e590bbcfbfeadc5748ef5ec8ede505a8119a71b235f7481cc08371c \ - --hash=sha256:b59b2c0a3b1d93027f6b6b8379a50c354483fe1ebe796c6740e157bb2e06d39a \ - --hash=sha256:bc89e37c359dcd4d75b744e5e81af128ba678aa2ecea4be957e80e6e958a1612 \ - --hash=sha256:bde055c019e6e449ebc4ec61abd3e08690abeb028c7ada2a3b95d8e352b7b514 \ - --hash=sha256:c367ed95d41df584f412a9419b5ece85b0d6c2a08a51ae13ae47ef74ff9a9349 \ - --hash=sha256:dde05ae0987e43ec84e64d6722ce66305eda2a5e2b7d6fda004b37aabdfbb909 \ - --hash=sha256:ee6e7ca09ff274c55d19a1e15ee6f884fa0230c0d9b8d22a456e249d08dee5bf \ - --hash=sha256:f1c68f7bd4a57ffdb85eab489362828dddf6cd565a4c18eda4c446c1d5d3059d \ - --hash=sha256:f63e1f531a8bf52184e2afb53648511f3f8534decb7575b483a583d3cd8d13ed \ - --hash=sha256:fdad4a33140b77df61d456922b7974c1f1bb2c35238f6809f078003a620c4734 - # via - # -c release/ray_release/byod/requirements_compiled.txt - # alembic - # dataset -sqlglot==20.4.0 \ - --hash=sha256:401a2933298cf66901704cf2029272d8243ee72ac47b9fd8784254401b43ee43 \ - --hash=sha256:9a42135d0530de8150a2c5106e0c52abd3396d92501ebe97df7b371d20de5dc9 +sqlglot==25.6.1 \ + 
--hash=sha256:c1fcbaa00429979f16fb8cea20279a8b3f5312e76d97abb8f8c6a9b21be450d7 \ + --hash=sha256:ea40f3bf8452e2c1a696fe120163190bd67e49b346336e7db6d34400b57b7601 # via # -c release/ray_release/byod/requirements_compiled.txt # fugue @@ -3571,9 +3559,9 @@ statsmodels==0.14.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # statsforecast -sympy==1.12 \ - --hash=sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5 \ - --hash=sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8 +sympy==1.13.1 \ + --hash=sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f \ + --hash=sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8 # via # -c release/ray_release/byod/requirements_compiled.txt # torch @@ -3764,17 +3752,12 @@ tokenizers==0.15.2 \ # via # -c release/ray_release/byod/requirements_compiled.txt # transformers -toml==0.10.2 \ - --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ - --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f - # via - # -c release/ray_release/byod/requirements_compiled.txt - # jupytext tomli==2.0.1 \ --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f # via # -c release/ray_release/byod/requirements_compiled.txt + # jupytext # pytest torch==2.3.0 \ --hash=sha256:09c81c5859a5b819956c6925a405ef1cdda393c9d8a01ce3851453f699d3358c \ @@ -3881,9 +3864,9 @@ tqdm-multiprocess==0.0.11 \ --hash=sha256:3ebdf03e7a675150fa0bbceaa9c3c64b8cb556e9ffafa4fe6c078e51820524aa \ --hash=sha256:a74002a1222ea9cbe8cdc9bd460108c6009be359621fbee9b92d0515d4d180f7 # via lm-eval -traitlets==5.14.0 \ - --hash=sha256:f14949d23829023013c47df20b4a76ccd1a85effb786dc060f34de7948361b33 \ - --hash=sha256:fcdaa8ac49c04dfa0ed3ee3384ef6dfdb5d6f3741502be247279407679296772 +traitlets==5.14.3 \ + --hash=sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7 \ + --hash=sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f # via # -c release/ray_release/byod/requirements_compiled.txt # comm @@ -3900,9 +3883,9 @@ transformers==4.36.2 \ # -r release/ray_release/byod/requirements_ml_byod_3.9.in # lm-eval # peft -triad==0.9.3 \ - --hash=sha256:1862b5a78deb9d475c7747b605f2b32457e96c6719f8cbc4e7e95147f34f6f64 \ - --hash=sha256:e4dff41ffbb98bad4d9741c9dd632890cdfe0b873f23d76d2b5f9ca41d4440a7 +triad==0.9.8 \ + --hash=sha256:2c0ba7d83977c6d4e7b59e3cc70727f858014ef7676c62d184aa8e63f7bef5de \ + --hash=sha256:5b67673124891981daf8afbab44b2e6358932ca35ef3ff38a25bc3e0f6f03f17 # via # -c release/ray_release/byod/requirements_compiled.txt # adagio @@ -3930,9 +3913,9 @@ typepy[datetime]==1.3.2 \ # dataproperty # pytablewriter # tabledata -typer==0.9.0 \ - --hash=sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2 \ - --hash=sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee +typer==0.12.3 \ + --hash=sha256:070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914 \ + --hash=sha256:49e73131481d804288ef62598d97a1ceef3058905aa536a1134f90891ba35482 # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in @@ -3942,7 +3925,6 @@ typing-extensions==4.8.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in - # alembic # fastapi # huggingface-hub # ipython @@ -3954,9 +3936,9 @@ 
typing-extensions==4.8.0 \ # torch # typer # wandb -urllib3==1.26.18 \ - --hash=sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07 \ - --hash=sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0 +urllib3==1.26.19 \ + --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ + --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in @@ -3965,9 +3947,9 @@ urllib3==1.26.18 \ # requests # responses # sentry-sdk -utilsforecast==0.0.23 \ - --hash=sha256:188daa121c528965e26a3a38f409b66a15f9eef2b44684cc9426f3ddb1146841 \ - --hash=sha256:290882da47ebc7887663c05c46c67e19bc63898220be444ca6173d0a5fdeee4a +utilsforecast==0.2.0 \ + --hash=sha256:3db4245da4e361f26c8eaeef216c2d1206b20defbb033bf11d3e66ce2b1d6ef8 \ + --hash=sha256:a4825bf8da547e3dc552f9b9a7a8159341a118c3a5d122191f09bc3683cba433 # via # -c release/ray_release/byod/requirements_compiled.txt # statsforecast @@ -3992,9 +3974,9 @@ wandb==0.17.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in -wcwidth==0.2.12 \ - --hash=sha256:f01c104efdf57971bcb756f054dd58ddec5204dd15fa31d6503ea57947d97c02 \ - --hash=sha256:f26ec43d96c8cbfed76a5075dac87680124fa84e0855195a6184da9c187f133c +wcwidth==0.2.13 \ + --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ + --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 # via # -c release/ray_release/byod/requirements_compiled.txt # prompt-toolkit @@ -4011,13 +3993,15 @@ widgetsnbextension==4.0.11 \ # via # -c release/ray_release/byod/requirements_compiled.txt # ipywidgets -xgboost==1.7.6 \ - --hash=sha256:127cf1f5e2ec25cd41429394c6719b87af1456ce583e89f0bffd35d02ad18bcb \ - --hash=sha256:1c527554a400445e0c38186039ba1a00425dcdb4e40b37eed0e74cb39a159c47 \ - --hash=sha256:281c3c6f4fbed2d36bf95cd02a641afa95e72e9abde70064056da5e76233e8df \ - --hash=sha256:4c34675b4d2678c624ddde5d45361e7e16046923e362e4e609b88353e6b87124 \ - --hash=sha256:59b4b366d2cafc7f645e87d897983a5b59be02876194b1d213bd8d8b811d8ce8 \ - --hash=sha256:b1d5db49b199152d62bd9217c98760207d3de86d2b9d243260c573ffe638f80a +xgboost==2.1.0 \ + --hash=sha256:19d145eb847b070c32342b1bf2d7331c102783e07a484f8b13b7d759d707c6b0 \ + --hash=sha256:43b16205689249d7509daf7a6ab00ad0e6c570b3a9c263cb32b26e39d9477bb3 \ + --hash=sha256:7144980923e76ce741c7b03a14d3bd7514db6de5c7cabe96ba95b229d274f5ca \ + --hash=sha256:73673c9bb85927db7fe2e3aed6df6d35dba708cfd6767cc63d4ea11dda2dede5 \ + --hash=sha256:74904b91c42524a6c32147fe5718569e78fb65911ff4499b053f81d0964514d4 \ + --hash=sha256:840a0c6e2119d8c8f260a5dace996ea064a267f62b301a25d7d452488a7ac860 \ + --hash=sha256:b2a456eb0f3d3e8fd8ab37e44ac288292bf8ea8744c294be9fd88713d27af810 \ + --hash=sha256:cedc2e386e686795735448fd4597533acacc5ba6fb47dd910c204c468b80bb96 # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in @@ -4134,103 +4118,103 @@ xxhash==3.4.1 \ # -c release/ray_release/byod/requirements_compiled.txt # datasets # evaluate -yarl==1.9.3 \ - --hash=sha256:09c19e5f4404574fcfb736efecf75844ffe8610606f3fccc35a1515b8b6712c4 \ - --hash=sha256:0ab5baaea8450f4a3e241ef17e3d129b2143e38a685036b075976b9c415ea3eb \ - --hash=sha256:0d155a092bf0ebf4a9f6f3b7a650dc5d9a5bbb585ef83a52ed36ba46f55cc39d \ - 
--hash=sha256:126638ab961633f0940a06e1c9d59919003ef212a15869708dcb7305f91a6732 \ - --hash=sha256:1a0a4f3aaa18580038cfa52a7183c8ffbbe7d727fe581300817efc1e96d1b0e9 \ - --hash=sha256:1d93461e2cf76c4796355494f15ffcb50a3c198cc2d601ad8d6a96219a10c363 \ - --hash=sha256:26a1a8443091c7fbc17b84a0d9f38de34b8423b459fb853e6c8cdfab0eacf613 \ - --hash=sha256:271d63396460b6607b588555ea27a1a02b717ca2e3f2cf53bdde4013d7790929 \ - --hash=sha256:28a108cb92ce6cf867690a962372996ca332d8cda0210c5ad487fe996e76b8bb \ - --hash=sha256:29beac86f33d6c7ab1d79bd0213aa7aed2d2f555386856bb3056d5fdd9dab279 \ - --hash=sha256:2c757f64afe53a422e45e3e399e1e3cf82b7a2f244796ce80d8ca53e16a49b9f \ - --hash=sha256:2dad8166d41ebd1f76ce107cf6a31e39801aee3844a54a90af23278b072f1ccf \ - --hash=sha256:2dc72e891672343b99db6d497024bf8b985537ad6c393359dc5227ef653b2f17 \ - --hash=sha256:2f3c8822bc8fb4a347a192dd6a28a25d7f0ea3262e826d7d4ef9cc99cd06d07e \ - --hash=sha256:32435d134414e01d937cd9d6cc56e8413a8d4741dea36af5840c7750f04d16ab \ - --hash=sha256:3cfa4dbe17b2e6fca1414e9c3bcc216f6930cb18ea7646e7d0d52792ac196808 \ - --hash=sha256:3d5434b34100b504aabae75f0622ebb85defffe7b64ad8f52b8b30ec6ef6e4b9 \ - --hash=sha256:4003f380dac50328c85e85416aca6985536812c082387255c35292cb4b41707e \ - --hash=sha256:44e91a669c43f03964f672c5a234ae0d7a4d49c9b85d1baa93dec28afa28ffbd \ - --hash=sha256:4a14907b597ec55740f63e52d7fee0e9ee09d5b9d57a4f399a7423268e457b57 \ - --hash=sha256:4ce77d289f8d40905c054b63f29851ecbfd026ef4ba5c371a158cfe6f623663e \ - --hash=sha256:4d6d74a97e898c1c2df80339aa423234ad9ea2052f66366cef1e80448798c13d \ - --hash=sha256:51382c72dd5377861b573bd55dcf680df54cea84147c8648b15ac507fbef984d \ - --hash=sha256:525cd69eff44833b01f8ef39aa33a9cc53a99ff7f9d76a6ef6a9fb758f54d0ff \ - --hash=sha256:53ec65f7eee8655bebb1f6f1607760d123c3c115a324b443df4f916383482a67 \ - --hash=sha256:5f74b015c99a5eac5ae589de27a1201418a5d9d460e89ccb3366015c6153e60a \ - --hash=sha256:6280353940f7e5e2efaaabd686193e61351e966cc02f401761c4d87f48c89ea4 \ - --hash=sha256:632c7aeb99df718765adf58eacb9acb9cbc555e075da849c1378ef4d18bf536a \ - --hash=sha256:6465d36381af057d0fab4e0f24ef0e80ba61f03fe43e6eeccbe0056e74aadc70 \ - --hash=sha256:66a6dbf6ca7d2db03cc61cafe1ee6be838ce0fbc97781881a22a58a7c5efef42 \ - --hash=sha256:6d350388ba1129bc867c6af1cd17da2b197dff0d2801036d2d7d83c2d771a682 \ - --hash=sha256:7217234b10c64b52cc39a8d82550342ae2e45be34f5bff02b890b8c452eb48d7 \ - --hash=sha256:721ee3fc292f0d069a04016ef2c3a25595d48c5b8ddc6029be46f6158d129c92 \ - --hash=sha256:72a57b41a0920b9a220125081c1e191b88a4cdec13bf9d0649e382a822705c65 \ - --hash=sha256:73cc83f918b69110813a7d95024266072d987b903a623ecae673d1e71579d566 \ - --hash=sha256:778df71c8d0c8c9f1b378624b26431ca80041660d7be7c3f724b2c7a6e65d0d6 \ - --hash=sha256:79e1df60f7c2b148722fb6cafebffe1acd95fd8b5fd77795f56247edaf326752 \ - --hash=sha256:7c86d0d0919952d05df880a1889a4f0aeb6868e98961c090e335671dea5c0361 \ - --hash=sha256:7eaf13af79950142ab2bbb8362f8d8d935be9aaf8df1df89c86c3231e4ff238a \ - --hash=sha256:828235a2a169160ee73a2fcfb8a000709edf09d7511fccf203465c3d5acc59e4 \ - --hash=sha256:8535e111a064f3bdd94c0ed443105934d6f005adad68dd13ce50a488a0ad1bf3 \ - --hash=sha256:88d2c3cc4b2f46d1ba73d81c51ec0e486f59cc51165ea4f789677f91a303a9a7 \ - --hash=sha256:8a2538806be846ea25e90c28786136932ec385c7ff3bc1148e45125984783dc6 \ - --hash=sha256:8dab30b21bd6fb17c3f4684868c7e6a9e8468078db00f599fb1c14e324b10fca \ - --hash=sha256:8f18a7832ff85dfcd77871fe677b169b1bc60c021978c90c3bb14f727596e0ae \ - 
--hash=sha256:946db4511b2d815979d733ac6a961f47e20a29c297be0d55b6d4b77ee4b298f6 \ - --hash=sha256:96758e56dceb8a70f8a5cff1e452daaeff07d1cc9f11e9b0c951330f0a2396a7 \ - --hash=sha256:9a172c3d5447b7da1680a1a2d6ecdf6f87a319d21d52729f45ec938a7006d5d8 \ - --hash=sha256:9a5211de242754b5e612557bca701f39f8b1a9408dff73c6db623f22d20f470e \ - --hash=sha256:9df9a0d4c5624790a0dea2e02e3b1b3c69aed14bcb8650e19606d9df3719e87d \ - --hash=sha256:aa4643635f26052401750bd54db911b6342eb1a9ac3e74f0f8b58a25d61dfe41 \ - --hash=sha256:aed37db837ecb5962469fad448aaae0f0ee94ffce2062cf2eb9aed13328b5196 \ - --hash=sha256:af52725c7c39b0ee655befbbab5b9a1b209e01bb39128dce0db226a10014aacc \ - --hash=sha256:b0b8c06afcf2bac5a50b37f64efbde978b7f9dc88842ce9729c020dc71fae4ce \ - --hash=sha256:b61e64b06c3640feab73fa4ff9cb64bd8182de52e5dc13038e01cfe674ebc321 \ - --hash=sha256:b7831566595fe88ba17ea80e4b61c0eb599f84c85acaa14bf04dd90319a45b90 \ - --hash=sha256:b8bc5b87a65a4e64bc83385c05145ea901b613d0d3a434d434b55511b6ab0067 \ - --hash=sha256:b8d51817cf4b8d545963ec65ff06c1b92e5765aa98831678d0e2240b6e9fd281 \ - --hash=sha256:b9f9cafaf031c34d95c1528c16b2fa07b710e6056b3c4e2e34e9317072da5d1a \ - --hash=sha256:bb72d2a94481e7dc7a0c522673db288f31849800d6ce2435317376a345728225 \ - --hash=sha256:c25ec06e4241e162f5d1f57c370f4078797ade95c9208bd0c60f484834f09c96 \ - --hash=sha256:c405d482c320a88ab53dcbd98d6d6f32ada074f2d965d6e9bf2d823158fa97de \ - --hash=sha256:c4472fe53ebf541113e533971bd8c32728debc4c6d8cc177f2bff31d011ec17e \ - --hash=sha256:c4b1efb11a8acd13246ffb0bee888dd0e8eb057f8bf30112e3e21e421eb82d4a \ - --hash=sha256:c5f3faeb8100a43adf3e7925d556801d14b5816a0ac9e75e22948e787feec642 \ - --hash=sha256:c6f034386e5550b5dc8ded90b5e2ff7db21f0f5c7de37b6efc5dac046eb19c10 \ - --hash=sha256:c99ddaddb2fbe04953b84d1651149a0d85214780e4d0ee824e610ab549d98d92 \ - --hash=sha256:ca6b66f69e30f6e180d52f14d91ac854b8119553b524e0e28d5291a724f0f423 \ - --hash=sha256:cccdc02e46d2bd7cb5f38f8cc3d9db0d24951abd082b2f242c9e9f59c0ab2af3 \ - --hash=sha256:cd49a908cb6d387fc26acee8b7d9fcc9bbf8e1aca890c0b2fdfd706057546080 \ - --hash=sha256:cf7a4e8de7f1092829caef66fd90eaf3710bc5efd322a816d5677b7664893c93 \ - --hash=sha256:cfd77e8e5cafba3fb584e0f4b935a59216f352b73d4987be3af51f43a862c403 \ - --hash=sha256:d34c4f80956227f2686ddea5b3585e109c2733e2d4ef12eb1b8b4e84f09a2ab6 \ - --hash=sha256:d61a0ca95503867d4d627517bcfdc28a8468c3f1b0b06c626f30dd759d3999fd \ - --hash=sha256:d81657b23e0edb84b37167e98aefb04ae16cbc5352770057893bd222cdc6e45f \ - --hash=sha256:d92d897cb4b4bf915fbeb5e604c7911021a8456f0964f3b8ebbe7f9188b9eabb \ - --hash=sha256:dd318e6b75ca80bff0b22b302f83a8ee41c62b8ac662ddb49f67ec97e799885d \ - --hash=sha256:dd952b9c64f3b21aedd09b8fe958e4931864dba69926d8a90c90d36ac4e28c9a \ - --hash=sha256:e0e7e83f31e23c5d00ff618045ddc5e916f9e613d33c5a5823bc0b0a0feb522f \ - --hash=sha256:e0f17d1df951336a02afc8270c03c0c6e60d1f9996fcbd43a4ce6be81de0bd9d \ - --hash=sha256:e2a16ef5fa2382af83bef4a18c1b3bcb4284c4732906aa69422cf09df9c59f1f \ - --hash=sha256:e36021db54b8a0475805acc1d6c4bca5d9f52c3825ad29ae2d398a9d530ddb88 \ - --hash=sha256:e73db54c967eb75037c178a54445c5a4e7461b5203b27c45ef656a81787c0c1b \ - --hash=sha256:e741bd48e6a417bdfbae02e088f60018286d6c141639359fb8df017a3b69415a \ - --hash=sha256:f7271d6bd8838c49ba8ae647fc06469137e1c161a7ef97d778b72904d9b68696 \ - --hash=sha256:fc391e3941045fd0987c77484b2799adffd08e4b6735c4ee5f054366a2e1551d \ - --hash=sha256:fc94441bcf9cb8c59f51f23193316afefbf3ff858460cb47b5758bf66a14d130 \ - 
--hash=sha256:fe34befb8c765b8ce562f0200afda3578f8abb159c76de3ab354c80b72244c41 \ - --hash=sha256:fe8080b4f25dfc44a86bedd14bc4f9d469dfc6456e6f3c5d9077e81a5fedfba7 \ - --hash=sha256:ff34cb09a332832d1cf38acd0f604c068665192c6107a439a92abfd8acf90fe2 +yarl==1.9.4 \ + --hash=sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51 \ + --hash=sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce \ + --hash=sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559 \ + --hash=sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0 \ + --hash=sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81 \ + --hash=sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc \ + --hash=sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4 \ + --hash=sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c \ + --hash=sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130 \ + --hash=sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136 \ + --hash=sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e \ + --hash=sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec \ + --hash=sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7 \ + --hash=sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1 \ + --hash=sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455 \ + --hash=sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099 \ + --hash=sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129 \ + --hash=sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10 \ + --hash=sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142 \ + --hash=sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98 \ + --hash=sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa \ + --hash=sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7 \ + --hash=sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525 \ + --hash=sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c \ + --hash=sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9 \ + --hash=sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c \ + --hash=sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8 \ + --hash=sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b \ + --hash=sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf \ + --hash=sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23 \ + --hash=sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd \ + --hash=sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27 \ + --hash=sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f \ + --hash=sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece \ + --hash=sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434 \ + --hash=sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec \ + --hash=sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff \ + --hash=sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78 \ + --hash=sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d \ + 
--hash=sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863 \ + --hash=sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53 \ + --hash=sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31 \ + --hash=sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15 \ + --hash=sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5 \ + --hash=sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b \ + --hash=sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57 \ + --hash=sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3 \ + --hash=sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1 \ + --hash=sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f \ + --hash=sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad \ + --hash=sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c \ + --hash=sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7 \ + --hash=sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2 \ + --hash=sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b \ + --hash=sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2 \ + --hash=sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b \ + --hash=sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9 \ + --hash=sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be \ + --hash=sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e \ + --hash=sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984 \ + --hash=sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4 \ + --hash=sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074 \ + --hash=sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2 \ + --hash=sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392 \ + --hash=sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91 \ + --hash=sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541 \ + --hash=sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf \ + --hash=sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572 \ + --hash=sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66 \ + --hash=sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575 \ + --hash=sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14 \ + --hash=sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5 \ + --hash=sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1 \ + --hash=sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e \ + --hash=sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551 \ + --hash=sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17 \ + --hash=sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead \ + --hash=sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0 \ + --hash=sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe \ + --hash=sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234 \ + --hash=sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0 \ + 
--hash=sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7 \ + --hash=sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34 \ + --hash=sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42 \ + --hash=sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385 \ + --hash=sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78 \ + --hash=sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be \ + --hash=sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958 \ + --hash=sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749 \ + --hash=sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec # via # -c release/ray_release/byod/requirements_compiled.txt # aiohttp -zipp==3.17.0 \ - --hash=sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31 \ - --hash=sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0 +zipp==3.19.2 \ + --hash=sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19 \ + --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c # via # -c release/ray_release/byod/requirements_compiled.txt # importlib-metadata diff --git a/release/ray_release/cluster_manager/cluster_manager.py b/release/ray_release/cluster_manager/cluster_manager.py index fac34cc00eb6..3f42da467f91 100644 --- a/release/ray_release/cluster_manager/cluster_manager.py +++ b/release/ray_release/cluster_manager/cluster_manager.py @@ -108,8 +108,13 @@ def _annotate_cluster_compute( return cluster_compute cluster_compute = cluster_compute.copy() - aws = cluster_compute.get("aws", {}) - cluster_compute["aws"] = add_tags_to_aws_config( + if "aws" in cluster_compute: + raise ValueError( + "aws field is invalid in compute config, " + "use advanced_configurations_json instead" + ) + aws = cluster_compute.get("advanced_configurations_json", {}) + cluster_compute["advanced_configurations_json"] = add_tags_to_aws_config( aws, extra_tags, RELEASE_AWS_RESOURCE_TYPES_TO_TRACK_FOR_BILLING ) return cluster_compute diff --git a/release/ray_release/cluster_manager/minimal.py b/release/ray_release/cluster_manager/minimal.py index 8b202c247753..1cfe14c1e2f2 100644 --- a/release/ray_release/cluster_manager/minimal.py +++ b/release/ray_release/cluster_manager/minimal.py @@ -9,7 +9,7 @@ from ray_release.logger import logger from ray_release.cluster_manager.cluster_manager import ClusterManager from ray_release.util import format_link, anyscale_cluster_env_build_url -from retry import retry +from ray_release.retry import retry REPORT_S = 30.0 @@ -20,7 +20,12 @@ class MinimalClusterManager(ClusterManager): Builds app config and compute template but does not start or stop session. 
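+
+    Cluster env creation is retried on ClusterEnvCreateError (at most two
+    attempts, with a delay between them) via ray_release.retry.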
""" - @retry((ClusterEnvCreateError), delay=10, jitter=5, tries=2) + @retry( + init_delay_sec=10, + jitter_sec=5, + max_retry_count=2, + exceptions=(ClusterEnvCreateError,), + ) def create_cluster_env(self): assert self.cluster_env_id is None diff --git a/release/ray_release/config.py b/release/ray_release/config.py index 93b69e56601c..c38fa4b2f7e4 100644 --- a/release/ray_release/config.py +++ b/release/ray_release/config.py @@ -38,6 +38,11 @@ RELEASE_TEST_SCHEMA_FILE = bazel_runfile("release/ray_release/schema.json") +RELEASE_TEST_CONFIG_FILES = [ + "release/release_tests.yaml", + "release/release_data_tests.yaml", +] + def read_and_validate_release_test_collection( config_files: List[str], @@ -76,11 +81,22 @@ def _test_definition_invariant( def parse_test_definition(test_definitions: List[TestDefinition]) -> List[Test]: + default_definition = {} tests = [] for test_definition in test_definitions: + if test_definition["name"] == "DEFAULTS": + default_definition = copy.deepcopy(test_definition) + continue + + # Add default values to the test definition. + test_definition = deep_update( + copy.deepcopy(default_definition), test_definition + ) + if "variations" not in test_definition: tests.append(Test(test_definition)) continue + variations = test_definition.pop("variations") _test_definition_invariant( test_definition, diff --git a/release/ray_release/retry.py b/release/ray_release/retry.py new file mode 100644 index 000000000000..dec0bd9be925 --- /dev/null +++ b/release/ray_release/retry.py @@ -0,0 +1,42 @@ +"""Utils on retry.""" + +import time +from functools import wraps +from typing import Tuple + +# Default configuration for retry. +_DEFAULT_MAX_RETRY_COUNT: int = 10 +_DEFAULT_INIT_DELAY_SEC: int = 1 +_DEFAULT_MAX_DELAY_SEC: int = 30 +_DEFAULT_BACKOFF: int = 2 +_DEFAULT_JITTER_SEC: int = 1 +_DEFAULT_EXCEPTIONS: Tuple[Exception] = (Exception,) + + +def retry( + max_retry_count: int = _DEFAULT_MAX_RETRY_COUNT, + init_delay_sec: int = _DEFAULT_INIT_DELAY_SEC, + max_delay_sec: int = _DEFAULT_MAX_DELAY_SEC, + backoff: int = _DEFAULT_BACKOFF, + jitter_sec: int = _DEFAULT_JITTER_SEC, + exceptions: Tuple[Exception] = _DEFAULT_EXCEPTIONS, +): + def wrapper(fn): + @wraps(fn) + def wrapped(*args, **kwargs): + for cur_retry_count in range(max_retry_count): + try: + return fn(*args, **kwargs) + except exceptions: + if cur_retry_count + 1 == max_retry_count: + raise + + sleep_sec = min( + init_delay_sec * (backoff**cur_retry_count) + jitter_sec, + max_delay_sec, + ) + time.sleep(sleep_sec) + + return wrapped + + return wrapper diff --git a/release/ray_release/scripts/build_pipeline.py b/release/ray_release/scripts/build_pipeline.py index 735ddb2e8cdf..29e448d8f4fd 100644 --- a/release/ray_release/scripts/build_pipeline.py +++ b/release/ray_release/scripts/build_pipeline.py @@ -14,7 +14,10 @@ build_anyscale_base_byod_images, build_anyscale_custom_byod_image, ) -from ray_release.config import read_and_validate_release_test_collection +from ray_release.config import ( + read_and_validate_release_test_collection, + RELEASE_TEST_CONFIG_FILES, +) from ray_release.configs.global_config import init_global_config from ray_release.exception import ReleaseTestCLIError, ReleaseTestConfigError from ray_release.logger import logger @@ -92,7 +95,7 @@ def main( try: test_collection = read_and_validate_release_test_collection( - test_collection_file or ["release/release_tests.yaml"] + test_collection_file or RELEASE_TEST_CONFIG_FILES ) except ReleaseTestConfigError as e: raise ReleaseTestConfigError( diff --git 
a/release/ray_release/scripts/get_test_summary.py b/release/ray_release/scripts/get_test_summary.py index b4793a860d06..ccb5066713b2 100644 --- a/release/ray_release/scripts/get_test_summary.py +++ b/release/ray_release/scripts/get_test_summary.py @@ -5,7 +5,10 @@ import click from ray_release.buildkite.concurrency import get_test_resources -from ray_release.config import read_and_validate_release_test_collection +from ray_release.config import ( + read_and_validate_release_test_collection, + RELEASE_TEST_CONFIG_FILES, +) @click.command() @@ -26,7 +29,7 @@ def main(test_collection_file: Optional[str] = None, output: Optional[str] = Non output = output or os.path.join(os.path.dirname(__file__), "test_summary.csv") tests = read_and_validate_release_test_collection( - test_collection_file or ["release/release_tests.yaml"] + test_collection_file or RELEASE_TEST_CONFIG_FILES ) with open(output, "w") as f: diff --git a/release/ray_release/scripts/ray_bisect.py b/release/ray_release/scripts/ray_bisect.py index 46d3c521bb93..15ebd4c9d396 100644 --- a/release/ray_release/scripts/ray_bisect.py +++ b/release/ray_release/scripts/ray_bisect.py @@ -14,7 +14,10 @@ build_anyscale_base_byod_images, build_anyscale_custom_byod_image, ) -from ray_release.config import read_and_validate_release_test_collection +from ray_release.config import ( + read_and_validate_release_test_collection, + RELEASE_TEST_CONFIG_FILES, +) from ray_release.configs.global_config import init_global_config from ray_release.test import Test from ray_release.test_automation.release_state_machine import ReleaseTestStateMachine @@ -241,7 +244,7 @@ def _obtain_test_result( def _get_test(test_name: str, test_collection_file: Tuple[str]) -> Test: test_collection = read_and_validate_release_test_collection( - test_collection_file or ["release/release_tests.yaml"], + test_collection_file or RELEASE_TEST_CONFIG_FILES, ) return [test for test in test_collection if test["name"] == test_name][0] diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index 13830139554a..e03912b9f681 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -9,6 +9,7 @@ as_smoke_test, find_test, read_and_validate_release_test_collection, + RELEASE_TEST_CONFIG_FILES, ) from ray_release.configs.global_config import init_global_config from ray_release.env import DEFAULT_ENVIRONMENT, load_environment, populate_os_env @@ -114,7 +115,7 @@ def main( ) init_global_config(global_config_file) test_collection = read_and_validate_release_test_collection( - test_collection_file or ["release/release_tests.yaml"], + test_collection_file or RELEASE_TEST_CONFIG_FILES, test_definition_root, ) test = find_test(test_collection, test_name) diff --git a/release/ray_release/tests/test_cluster_manager.py b/release/ray_release/tests/test_cluster_manager.py index 170d9b71c1cb..1b652754a96f 100644 --- a/release/ray_release/tests/test_cluster_manager.py +++ b/release/ray_release/tests/test_cluster_manager.py @@ -275,19 +275,20 @@ def testClusterComputeExtraTags(self): # All ResourceTypes as in # ray_release.aws.RELEASE_AWS_RESOURCE_TYPES_TO_TRACK_FOR_BILLING target_cluster_compute = TEST_CLUSTER_COMPUTE.copy() - target_cluster_compute["aws"] = { + target_cluster_compute["advanced_configurations_json"] = { "TagSpecifications": [ {"ResourceType": "instance", "Tags": [{"Key": "foo", "Value": "bar"}]}, {"ResourceType": "volume", "Tags": [{"Key": "foo", "Value": "bar"}]}, ] } 
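+        # The billing tags must now appear under "advanced_configurations_json";
+        # a top-level "aws" key is rejected by _annotate_cluster_compute.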
        self.assertEqual(
-            self.cluster_manager.cluster_compute["aws"], target_cluster_compute["aws"]
+            self.cluster_manager.cluster_compute["advanced_configurations_json"],
+            target_cluster_compute["advanced_configurations_json"],
         )

         # Test merging with already existing tags
         cluster_compute_with_tags = TEST_CLUSTER_COMPUTE.copy()
-        cluster_compute_with_tags["aws"] = {
+        cluster_compute_with_tags["advanced_configurations_json"] = {
             "TagSpecifications": [
                 {"ResourceType": "fake", "Tags": []},
                 {"ResourceType": "instance", "Tags": [{"Key": "key", "Value": "val"}]},
@@ -299,7 +300,7 @@ def testClusterComputeExtraTags(self):

         # All ResourceTypes as in RELEASE_AWS_RESOURCE_TYPES_TO_TRACK_FOR_BILLING
         target_cluster_compute = TEST_CLUSTER_COMPUTE.copy()
-        target_cluster_compute["aws"] = {
+        target_cluster_compute["advanced_configurations_json"] = {
             "TagSpecifications": [
                 {"ResourceType": "fake", "Tags": []},
                 {
@@ -313,7 +314,8 @@ def testClusterComputeExtraTags(self):
             ]
         }
         self.assertEqual(
-            self.cluster_manager.cluster_compute["aws"], target_cluster_compute["aws"]
+            self.cluster_manager.cluster_compute["advanced_configurations_json"],
+            target_cluster_compute["advanced_configurations_json"],
         )

     @patch("time.sleep", lambda *a, **kw: None)
diff --git a/release/ray_release/tests/test_config.py b/release/ray_release/tests/test_config.py
index f922d6512d99..c7884b116880 100644
--- a/release/ray_release/tests/test_config.py
+++ b/release/ray_release/tests/test_config.py
@@ -14,6 +14,7 @@
 _TEST_COLLECTION_FILES = [
     "release/release_tests.yaml",
+    "release/release_data_tests.yaml",
     "release/ray_release/tests/test_collection_data.yaml",
 ]
@@ -92,6 +93,42 @@ def test_parse_test_definition():
         parse_test_definition([invalid_test_definition])


+def test_parse_test_definition_with_defaults():
+    test_definitions = yaml.safe_load(
+        """
+        - name: DEFAULTS
+          working_dir: default_working_dir
+        - name: sample_test_with_default_working_dir
+          frequency: nightly
+          team: sample
+          cluster:
+            byod:
+              type: gpu
+            cluster_compute: compute.yaml
+          run:
+            timeout: 100
+            script: python script.py
+        - name: sample_test_with_overridden_working_dir
+          working_dir: overridden_working_dir
+          frequency: nightly
+          team: sample
+          cluster:
+            byod:
+              type: gpu
+            cluster_compute: compute.yaml
+          run:
+            timeout: 100
+            script: python script.py
+        """
+    )
+    test_with_default, test_with_override = parse_test_definition(test_definitions)
+    schema = load_schema_file()
+    assert not validate_test(test_with_default, schema)
+    assert not validate_test(test_with_override, schema)
+    assert test_with_default["working_dir"] == "default_working_dir"
+    assert test_with_override["working_dir"] == "overridden_working_dir"
+
+
 def test_schema_validation():
     test = VALID_TEST.copy()
diff --git a/release/ray_release/tests/test_retry.py b/release/ray_release/tests/test_retry.py
new file mode 100644
index 000000000000..b630e19f2dd0
--- /dev/null
+++ b/release/ray_release/tests/test_retry.py
@@ -0,0 +1,75 @@
+from ray_release import retry
+
+import sys
+import pytest
+
+
+def test_retry_with_no_error():
+    invocation_count = 0
+
+    # Function doesn't raise an exception; use a dummy value to check invocation.
+    @retry.retry()
+    def no_error_func() -> int:
+        nonlocal invocation_count
+        invocation_count += 1
+        return 1
+
+    assert no_error_func() == 1
+    assert invocation_count == 1
+
+
+# Test scenario: exception count is less than retry count.
+def test_retry_with_limited_error():
+    invocation_count = 0
+
+    # Function raises once and then succeeds; the retry should absorb the failure.
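+    # retry.retry sleeps min(init_delay_sec * backoff**attempt + jitter_sec,
+    # max_delay_sec) between attempts, so the single failure here costs
+    # roughly a 2 second sleep.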
+    @retry.retry(init_delay_sec=1, jitter_sec=1)
+    def limited_error() -> int:
+        nonlocal invocation_count
+
+        invocation_count += 1
+
+        if invocation_count == 1:
+            raise Exception("Manual exception")
+        return 1
+
+    assert limited_error() == 1
+    assert invocation_count == 2
+
+
+# Test scenario: exception count exceeds retry count.
+def test_retry_with_unlimited_error():
+    invocation_count = 0
+
+    @retry.retry(init_delay_sec=1, jitter_sec=1, backoff=1, max_retry_count=3)
+    def unlimited_error() -> int:
+        nonlocal invocation_count
+
+        invocation_count += 1
+        raise Exception("Manual exception")
+
+    with pytest.raises(Exception, match="Manual exception"):
+        unlimited_error()
+    assert invocation_count == 3
+
+
+def test_retry_on_certain_errors():
+    invocation_count = 0
+
+    # Function raises KeyError once and then succeeds; only KeyError is retried.
+    @retry.retry(init_delay_sec=1, jitter_sec=1, exceptions=(KeyError,))
+    def limited_error() -> int:
+        nonlocal invocation_count
+
+        invocation_count += 1
+
+        if invocation_count == 1:
+            raise KeyError("Manual exception")
+        return 1
+
+    assert limited_error() == 1
+    assert invocation_count == 2
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(["-sv", __file__]))
diff --git a/release/release_data_tests.yaml b/release/release_data_tests.yaml
new file mode 100644
index 000000000000..9ea55e27af8e
--- /dev/null
+++ b/release/release_data_tests.yaml
@@ -0,0 +1,721 @@
+- name: DEFAULTS
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: multi_node_autoscaling_compute.yaml
+
+###############
+# Reading tests
+###############
+
+- name: read_parquet
+  run:
+    timeout: 3600
+    script: python read_and_consume_benchmark.py s3://ray-benchmark-data/parquet/10TiB --format parquet --iterate
+
+- name: read_images
+  run:
+    timeout: 3600
+    script: python read_and_consume_benchmark.py s3://air-example-data-2/300G-image-data-synthetic-raw --format image --iterate
+
+###############
+# Dataset tests
+###############
+
+- name: count_parquet
+  run:
+    timeout: 600
+    script: python read_and_consume_benchmark.py s3://ray-benchmark-data/parquet/10TiB --format parquet --count
+
+- name: stable_diffusion_benchmark
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_stable_diffusion.sh
+    cluster_compute: stable_diffusion_benchmark_compute.yaml
+
+  run:
+    timeout: 1800
+    script: python stable_diffusion_benchmark.py
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: stable_diffusion_benchmark_compute_gce.yaml
+
+- name: streaming_data_ingest_benchmark_1tb
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: data_ingest_benchmark_compute.yaml
+
+  run:
+    timeout: 300
+    script: python data_ingest_benchmark.py --dataset-size-gb=1000 --num-workers=20 --streaming
+    wait_for_nodes:
+      num_nodes: 20
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: data_ingest_benchmark_compute_gce.yaml
+
+- name: streaming_data_ingest_benchmark_100gb_gpu
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: data_ingest_benchmark_compute_gpu.yaml
+
+  run:
+    timeout: 300
+    script: python data_ingest_benchmark.py --dataset-size-gb=100 --num-workers=4 --streaming --use-gpu
+    wait_for_nodes:
+      num_nodes: 3
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml
+
+# This test case stops the data ingestion iteration early on the GPU actors.
+# This is a common usage pattern in PyTorch Lightning
+# (https://lightning.ai/docs/pytorch/stable/common/trainer.html#limit-train-batches).
+# There was a bug in Ray Data that caused a GPU memory leak (see #3.919).
+# We add this test case to cover this scenario.
+- name: streaming_data_ingest_benchmark_100gb_gpu_early_stop
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: data_ingest_benchmark_compute_gpu.yaml
+
+  run:
+    timeout: 300
+    script: python data_ingest_benchmark.py --dataset-size-gb=100 --num-workers=4 --streaming --use-gpu --early-stop
+    wait_for_nodes:
+      num_nodes: 3
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml
+
+- name: read_images_comparison_microbenchmark_single_node
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: single_worker_node_0_head_node_benchmark_compute.yaml
+
+  run:
+    timeout: 1800
+    script: bash run_image_loader_microbenchmark.sh
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: single_node_benchmark_compute_gce.yaml
+
+- name: read_images_train_4_gpu
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: multi_node_train_4_workers.yaml
+
+  run:
+    timeout: 18000
+    script: python multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 2
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
+
+- name: read_images_train_4_gpu_worker_chaos
+  group: data-tests
+  working_dir: nightly_tests
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: dataset/multi_node_train_4_workers.yaml
+
+  run:
+    timeout: 18000
+    prepare: python setup_chaos.py --kill-workers --kill-interval 100 --max-to-kill 3 --task-names "ReadImage->Map(wnid_to_index)->Map(crop_and_flip_image)"
+    script: python dataset/multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 1
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
+
+- name: read_images_train_4_gpu_node_chaos
+  group: data-tests
+  working_dir: nightly_tests
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: dataset/multi_node_train_4_workers.yaml
+
+  run:
+    timeout: 18000
+    prepare: python setup_chaos.py --kill-interval 200 --max-to-kill 1 --task-names "_RayTrainWorker__execute.get_next"
+    script: python dataset/multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 1
+
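+  # The gce variation below swaps in a GCE compute config and is only run
+  # manually; the aws variation runs on the nightly schedule.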
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
+
+- name: read_images_train_16_gpu
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: multi_node_train_16_workers.yaml
+
+  run:
+    timeout: 18000
+    script: python multi_node_train_benchmark.py --num-workers 16 --file-type image --use-gpu --num-epochs 2
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
+
+- name: read_images_train_16_gpu_preserve_order
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: multi_node_train_16_workers.yaml
+
+  run:
+    timeout: 18000
+    script: python multi_node_train_benchmark.py --num-workers 16 --file-type image --preserve-order --use-gpu --num-epochs 2
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
+
+- name: read_parquet_train_4_gpu
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: multi_node_train_4_workers.yaml
+
+  run:
+    timeout: 3600
+    script: python multi_node_train_benchmark.py --num-workers 4 --file-type parquet --target-worker-gb 50 --use-gpu
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
+
+- name: read_parquet_train_16_gpu
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: multi_node_train_16_workers.yaml
+
+  run:
+    timeout: 3600
+    script: python multi_node_train_benchmark.py --num-workers 16 --file-type parquet --target-worker-gb 50 --use-gpu
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
+
+- name: read_images_train_1_gpu_5_cpu
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: multi_node_train_1g5c.yaml
+
+  run:
+    timeout: 2400
+    script: python multi_node_train_benchmark.py --num-workers 1 --file-type image --use-gpu --num-epochs 2 --skip-train-model --prefetch-batches 16 --batch-size -1 --disable-locality-with-output
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: compute_gpu_1g5c_gce.yaml
+
+- name: read_tfrecords_benchmark_single_node
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: single_node_benchmark_compute.yaml
+
+  run:
+    # Expect the benchmark to finish in around 30 minutes.
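+    # Timeouts are in seconds: 2700 s is 45 minutes, leaving headroom over
+    # the expected ~30 minute runtime.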
+    timeout: 2700
+    script: python read_tfrecords_benchmark.py
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: single_node_benchmark_compute_gce.yaml
+
+- name: map_batches_benchmark_single_node
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: single_node_benchmark_compute.yaml
+
+  run:
+    # Expect the benchmark to finish in around 30 minutes.
+    timeout: 2400
+    script: python map_batches_benchmark.py
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: single_node_benchmark_compute_gce.yaml
+
+- name: iter_tensor_batches_benchmark_single_node
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: single_node_benchmark_compute.yaml
+
+  run:
+    # Expect the benchmark to finish in around 30 minutes.
+    timeout: 2400
+    script: python iter_tensor_batches_benchmark.py
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: single_node_benchmark_compute_gce.yaml
+
+- name: iter_tensor_batches_benchmark_multi_node
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: multi_node_benchmark_compute.yaml
+
+  run:
+    # Expect the benchmark to finish within 90 minutes.
+    timeout: 5400
+    script: python iter_tensor_batches_benchmark.py --data-size-gb=10
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: multi_node_benchmark_compute_gce.yaml
+
+- name: iter_batches_benchmark_single_node
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: single_node_benchmark_compute.yaml
+
+  run:
+    # Expect the benchmark to finish in around 12 minutes.
+ timeout: 1080 + script: python iter_batches_benchmark.py + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: single_node_benchmark_compute_gce.yaml + +- name: dataset_shuffle_random_shuffle_1tb + group: data-tests + working_dir: nightly_tests + + frequency: nightly + team: data + + cluster: + byod: + runtime_env: + - RAY_worker_killing_policy=retriable_lifo + pip: + - ray[default] + cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml + + run: + timeout: 7200 + script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle + wait_for_nodes: + num_nodes: 20 + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml + +- name: dataset_shuffle_sort_1tb + group: data-tests + working_dir: nightly_tests + + frequency: nightly + team: data + stable: False + + cluster: + byod: + runtime_env: + - RAY_worker_killing_policy=retriable_lifo + pip: + - ray[default] + cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml + + run: + timeout: 7200 + script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 + wait_for_nodes: + num_nodes: 20 + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml + + +############################ +# Batch Inference Benchmarks +############################ + +# 10 GB image classification raw images with 1 GPU. +# 1 g4dn.4xlarge +- name: torch_batch_inference_1_gpu_10gb_raw + group: data-tests + working_dir: nightly_tests/dataset + + frequency: nightly + team: data + cluster: + byod: + type: gpu + cluster_compute: compute_gpu_1_cpu_16_aws.yaml + + run: + timeout: 500 + script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw --data-format raw + + alert: default + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: compute_gpu_1_cpu_16_gce.yaml + +# 10 GB image classification parquet with 1 GPU. 
+# 1 g4dn.4xlarge
+- name: torch_batch_inference_1_gpu_10gb_parquet
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: compute_gpu_1_cpu_16_aws.yaml
+
+  run:
+    timeout: 500
+    script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw-parquet --data-format parquet
+
+  alert: default
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: compute_gpu_1_cpu_16_gce.yaml
+
+
+# 300 GB image classification raw images with 16 GPUs
+# 4 g4dn.12xlarge
+- name: torch_batch_inference_16_gpu_300gb_raw
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: compute_gpu_4x4_aws.yaml
+
+  run:
+    timeout: 1000
+    script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw
+
+    wait_for_nodes:
+      num_nodes: 4
+
+  alert: default
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: compute_gpu_4x4_gce.yaml
+
+
+- name: chaos_torch_batch_inference_16_gpu_300gb_raw
+  group: data-tests
+  working_dir: nightly_tests
+  stable: false
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: dataset/compute_gpu_4x4_aws.yaml
+
+  run:
+    timeout: 1000
+    prepare: python setup_chaos.py --max-to-kill 2 --kill-delay 30
+    script: python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw
+
+    wait_for_nodes:
+      num_nodes: 4
+
+  alert: default
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: dataset/compute_gpu_4x4_gce.yaml
+
+
+# 300 GB image classification parquet data with 16 GPUs
+# 4 g4dn.12xlarge
+- name: torch_batch_inference_16_gpu_300gb_parquet
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: compute_gpu_4x4_aws.yaml
+
+  run:
+    timeout: 1000
+    script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet
+
+    wait_for_nodes:
+      num_nodes: 4
+
+  alert: default
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: compute_gpu_4x4_gce.yaml
+
+# 10 TB image classification parquet data with heterogeneous cluster
+# 10 g4dn.12xlarge, 10 m5.16xlarge
+- name: torch_batch_inference_hetero_10tb_parquet
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: weekly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: compute_hetero_10x10_aws.yaml
+
+  run:
+    timeout: 2000
+    script: python gpu_batch_inference.py --data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet
+
+    wait_for_nodes:
+      num_nodes: 20
+
+  alert: default
diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index 33dc2486eae6..1c150aedf601 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -106,181 +106,6 @@

   alert: default

-############################
-# Batch Inference Benchmarks
-############################
-
-# 10 GB image classification raw images with 1 GPU.
-# 1 g4dn.4xlarge -- name: torch_batch_inference_1_gpu_10gb_raw - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - cluster_compute: compute_gpu_1_cpu_16_aws.yaml - - run: - timeout: 500 - script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw --data-format raw - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: compute_gpu_1_cpu_16_gce.yaml - -# 10 GB image classification parquet with 1 GPU. -# 1 g4dn.4xlarge -- name: torch_batch_inference_1_gpu_10gb_parquet - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - cluster_compute: compute_gpu_1_cpu_16_aws.yaml - - run: - timeout: 500 - script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw-parquet --data-format parquet - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: compute_gpu_1_cpu_16_gce.yaml - - -# 300 GB image classification raw images with 16 GPUs -# 4 g4dn.12xlarge -- name: torch_batch_inference_16_gpu_300gb_raw - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - cluster_compute: compute_gpu_4x4_aws.yaml - - run: - timeout: 1000 - script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw - - wait_for_nodes: - num_nodes: 4 - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: compute_gpu_4x4_gce.yaml - - -- name: chaos_torch_batch_inference_16_gpu_300gb_raw - group: data-tests - working_dir: nightly_tests - stable: false - - frequency: nightly - team: data - cluster: - byod: - type: gpu - cluster_compute: dataset/compute_gpu_4x4_aws.yaml - - run: - timeout: 1000 - prepare: python setup_chaos.py --max-to-kill 2 --kill-delay 30 - script: python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw - - wait_for_nodes: - num_nodes: 4 - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: dataset/compute_gpu_4x4_gce.yaml - - -# 300 GB image classification parquet data with 16 GPUs -# 4 g4dn.12xlarge -- name: torch_batch_inference_16_gpu_300gb_parquet - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - cluster_compute: compute_gpu_4x4_aws.yaml - - run: - timeout: 1000 - script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet - - wait_for_nodes: - num_nodes: 4 - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: compute_gpu_4x4_gce.yaml - -# 10 TB image classification parquet data with heterogenous cluster -# 10 g4dn.12xlarge, 10 m5.16xlarge -- name: torch_batch_inference_hetero_10tb_parquet - group: data-tests - working_dir: nightly_tests/dataset - - frequency: weekly - team: data - - cluster: - byod: - type: gpu - cluster_compute: compute_hetero_10x10_aws.yaml - - run: - timeout: 2000 - script: python gpu_batch_inference.py --data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet - - wait_for_nodes: - num_nodes: 20 - - 
alert: default - ######################### # AIR release tests @@ -1469,7 +1294,7 @@ group: Golden Notebook tests working_dir: golden_notebook_tests - frequency: nightly-3x + frequency: manual team: ml cluster: @@ -3876,26 +3701,27 @@ # stable: false -############### -# Dataset tests -############### -- name: parquet_metadata_resolution - group: data-tests - working_dir: nightly_tests/dataset +################## +# Core Chaos tests +################## - frequency: nightly - team: data +- name: chaos_many_tasks_kill_raylet + group: core-nightly-test + working_dir: nightly_tests + frequency: nightly + team: core cluster: - byod: - type: gpu - cluster_compute: single_node_benchmark_compute.yaml + byod: {} + cluster_compute: chaos_test/compute_template.yaml run: - # Expect the test to finish around 40 seconds. - timeout: 100 - script: python parquet_metadata_resolution.py --num-files 915 --cloud aws + timeout: 3600 + wait_for_nodes: + num_nodes: 10 + prepare: python setup_chaos.py --no-start + script: python chaos_test/test_chaos_basic.py --workload=tasks variations: - __suffix__: aws @@ -3903,52 +3729,44 @@ env: gce frequency: manual cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - run: - script: python parquet_metadata_resolution.py --num-files 915 --cloud gcp + cluster_compute: chaos_test/compute_template_gce.yaml -- name: stable_diffusion_benchmark - group: data-tests - working_dir: nightly_tests/dataset +- name: chaos_many_tasks_terminate_instance + group: core-nightly-test + working_dir: nightly_tests frequency: nightly - team: data - + team: core cluster: - byod: - type: gpu - post_build_script: byod_stable_diffusion.sh - cluster_compute: stable_diffusion_benchmark_compute.yaml + byod: {} + cluster_compute: chaos_test/compute_template.yaml run: - timeout: 1800 - script: python stable_diffusion_benchmark.py + timeout: 3600 + wait_for_nodes: + num_nodes: 10 + prepare: python setup_chaos.py --no-start --chaos TerminateEC2Instance + script: python chaos_test/test_chaos_basic.py --workload=tasks variations: - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: stable_diffusion_benchmark_compute_gce.yaml -- name: streaming_data_ingest_benchmark_1tb - group: data-tests - working_dir: nightly_tests/dataset +- name: chaos_many_actors_kill_raylet + group: core-nightly-test + working_dir: nightly_tests frequency: nightly - team: data - + team: core cluster: - byod: - type: gpu - cluster_compute: data_ingest_benchmark_compute.yaml + byod: {} + cluster_compute: chaos_test/compute_template.yaml run: - timeout: 300 - script: python data_ingest_benchmark.py --dataset-size-gb=1000 --num-workers=20 --streaming + timeout: 4200 wait_for_nodes: - num_nodes: 20 + num_nodes: 10 + prepare: python setup_chaos.py --no-start + script: python chaos_test/test_chaos_basic.py --workload=actors variations: - __suffix__: aws @@ -3956,56 +3774,48 @@ env: gce frequency: manual cluster: - cluster_compute: data_ingest_benchmark_compute_gce.yaml + cluster_compute: chaos_test/compute_template_gce.yaml -- name: streaming_data_ingest_benchmark_100gb_gpu - group: data-tests - working_dir: nightly_tests/dataset +- name: chaos_many_actors_terminate_instance + group: core-nightly-test + working_dir: nightly_tests frequency: nightly - team: data - + team: core cluster: - byod: - type: gpu - cluster_compute: data_ingest_benchmark_compute_gpu.yaml + byod: {} + cluster_compute: chaos_test/compute_template.yaml run: - timeout: 300 - script: python data_ingest_benchmark.py 
--dataset-size-gb=100 --num-workers=4 --streaming --use-gpu + timeout: 4200 wait_for_nodes: - num_nodes: 3 + num_nodes: 10 + prepare: python setup_chaos.py --no-start --chaos TerminateEC2Instance + script: python chaos_test/test_chaos_basic.py --workload=actors variations: - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml - -# This test case will early stop the data ingestion iteration on the GPU actors. -# This is a common usage in PyTorch Lightning -# (https://lightning.ai/docs/pytorch/stable/common/trainer.html#limit-train-batches). -# There was a bug in Ray Data that caused GPU memoy leak (see #3.919). -# We add this test case to cover this scenario. -- name: streaming_data_ingest_benchmark_100gb_gpu_early_stop + +- name: chaos_dask_on_ray_large_scale_test_no_spilling group: data-tests - working_dir: nightly_tests/dataset + working_dir: nightly_tests frequency: nightly team: data cluster: byod: - type: gpu - cluster_compute: data_ingest_benchmark_compute_gpu.yaml + runtime_env: + - RAY_lineage_pinning_enabled=1 + cluster_compute: dask_on_ray/chaos_dask_on_ray_stress_compute.yaml run: - timeout: 300 - script: python data_ingest_benchmark.py --dataset-size-gb=100 --num-workers=4 --streaming --use-gpu --early-stop + timeout: 7200 wait_for_nodes: - num_nodes: 3 + num_nodes: 21 + prepare: python setup_chaos.py --kill-interval 100 + script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb + 20 --error_rate 0 --data_save_path /tmp/ray variations: - __suffix__: aws @@ -4013,23 +3823,28 @@ env: gce frequency: manual cluster: - cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml + cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml -- name: aggregate_benchmark +- name: chaos_dask_on_ray_large_scale_test_spilling group: data-tests - working_dir: nightly_tests/dataset + working_dir: nightly_tests frequency: nightly team: data cluster: byod: - type: gpu - cluster_compute: single_node_benchmark_compute.yaml + runtime_env: + - RAY_lineage_pinning_enabled=1 + cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml run: - timeout: 1800 - script: python aggregate_benchmark.py + timeout: 7200 + wait_for_nodes: + num_nodes: 21 + prepare: python setup_chaos.py --kill-interval 100 + script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb + 70 --error_rate 0 --data_save_path /tmp/ray variations: - __suffix__: aws @@ -4037,749 +3852,24 @@ env: gce frequency: manual cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml + cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml -- name: read_parquet_benchmark_single_node +- name: chaos_dataset_shuffle_push_based_sort_1tb group: data-tests - working_dir: nightly_tests/dataset + working_dir: nightly_tests + + stable: false frequency: nightly team: data cluster: byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: single_node_benchmark_compute.yaml - - run: - # Expect the benchmark to finish in 400 seconds. 
- timeout: 400 - script: python read_parquet_benchmark.py - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -- name: read_images_benchmark_single_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: single_node_benchmark_compute.yaml - - run: - timeout: 1800 - script: python read_images_benchmark.py --single-node - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -# TODO: Re-enable this test once we fix https://github.com/ray-project/ray/issues/40686. -# - name: read_images_benchmark_multi_node -# group: data-tests -# working_dir: nightly_tests/dataset - -# frequency: nightly-3x -# team: data - -# cluster: -# byod: -# type: gpu -# cluster_compute: multi_node_read_images_benchmark_compute.yaml - -# run: -# timeout: 28800 -# script: python read_images_benchmark.py --multi-node - -# variations: -# - __suffix__: aws -# - __suffix__: gce -# env: gce -# frequency: manual -# cluster: -# cluster_compute: multi_node_read_images_benchmark_compute_gce.yaml - -- name: read_images_comparison_microbenchmark_single_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: single_worker_node_0_head_node_benchmark_compute.yaml - - run: - timeout: 1800 - script: bash run_image_loader_microbenchmark.sh - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -- name: read_images_train_4_gpu - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: multi_node_train_4_workers.yaml - - run: - timeout: 18000 - script: python multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 2 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml - -- name: read_images_train_4_gpu_worker_chaos - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: dataset/multi_node_train_4_workers.yaml - - run: - timeout: 18000 - prepare: python setup_chaos.py --kill-workers --kill-interval 100 --max-to-kill 3 --task-names "ReadImage->Map(wnid_to_index)->Map(crop_and_flip_image)" - script: python dataset/multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 1 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml - -- name: read_images_train_4_gpu_node_chaos - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: dataset/multi_node_train_4_workers.yaml - - run: - timeout: 18000 - prepare: python setup_chaos.py --kill-interval 200 --max-to-kill 1 --task-names 
"_RayTrainWorker__execute.get_next" - script: python dataset/multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 1 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml - -- name: read_images_train_16_gpu - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: multi_node_train_16_workers.yaml - - run: - timeout: 18000 - script: python multi_node_train_benchmark.py --num-workers 16 --file-type image --use-gpu --num-epochs 2 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml - -- name: read_images_train_16_gpu_preserve_order - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: multi_node_train_16_workers.yaml - - run: - timeout: 18000 - script: python multi_node_train_benchmark.py --num-workers 16 --file-type image --preserve-order --use-gpu --num-epochs 2 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml - -- name: read_parquet_train_4_gpu - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: multi_node_train_4_workers.yaml - - run: - timeout: 3600 - script: python multi_node_train_benchmark.py --num-workers 4 --file-type parquet --target-worker-gb 50 --use-gpu - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml - -- name: read_parquet_train_16_gpu - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: multi_node_train_16_workers.yaml - - run: - timeout: 3600 - script: python multi_node_train_benchmark.py --num-workers 16 --file-type parquet --target-worker-gb 50 --use-gpu - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml - -- name: read_images_train_1_gpu_5_cpu - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: multi_node_train_1g5c.yaml - - run: - timeout: 2400 - script: python multi_node_train_benchmark.py --num-workers 1 --file-type image --use-gpu --num-epochs 2 --skip-train-model --prefetch-batches 16 --batch-size -1 --disable-locality-with-output - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: compute_gpu_1g5c_gce.yaml - -- name: read_tfrecords_benchmark_single_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: single_node_benchmark_compute.yaml - - run: - # Expect the benchmark to 
finish around 30 minutes. - timeout: 2700 - script: python read_tfrecords_benchmark.py - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -- name: map_batches_benchmark_single_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - cluster_compute: single_node_benchmark_compute.yaml - - run: - # Expect the benchmark to finish around 30 minutes. - timeout: 2400 - script: python map_batches_benchmark.py - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -- name: iter_tensor_batches_benchmark_single_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - cluster_compute: single_node_benchmark_compute.yaml - - run: - # Expect the benchmark to finish around 30 minutes. - timeout: 2400 - script: python iter_tensor_batches_benchmark.py - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -- name: iter_tensor_batches_benchmark_multi_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - cluster_compute: multi_node_benchmark_compute.yaml - - run: - # Expect the benchmark to finish within 90 minutes. - timeout: 5400 - script: python iter_tensor_batches_benchmark.py --data-size-gb=10 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: multi_node_benchmark_compute_gce.yaml - -- name: iter_batches_benchmark_single_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - cluster_compute: single_node_benchmark_compute.yaml - - run: - # Expect the benchmark to finish around 12 minutes. 
- timeout: 1080 - script: python iter_batches_benchmark.py - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -- name: dataset_shuffle_random_shuffle_1tb - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - - cluster: - byod: - runtime_env: - - RAY_worker_killing_policy=retriable_lifo - pip: - - ray[default] - cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml - - run: - timeout: 7200 - script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle - wait_for_nodes: - num_nodes: 20 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml - -- name: dataset_shuffle_sort_1tb - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - stable: False - - cluster: - byod: - runtime_env: - - RAY_worker_killing_policy=retriable_lifo - pip: - - ray[default] - cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml - - run: - timeout: 7200 - script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 - wait_for_nodes: - num_nodes: 20 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml - -- name: dataset_shuffle_push_based_random_shuffle_1tb - group: data-tests - working_dir: nightly_tests - - stable: false - - frequency: nightly - team: data - - cluster: - byod: - runtime_env: - - RAY_worker_killing_policy=retriable_lifo - pip: - - ray[default] - cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml - - run: - timeout: 7200 - script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle - wait_for_nodes: - num_nodes: 20 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml - -- name: dataset_shuffle_push_based_sort_1tb - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - stable: False - - cluster: - byod: - runtime_env: - - RAY_worker_killing_policy=retriable_lifo - pip: - - ray[default] - cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml - - run: - timeout: 7200 - script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9 - wait_for_nodes: - num_nodes: 20 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml - -- name: dataset_shuffle_push_based_random_shuffle_100tb - group: data-tests - working_dir: nightly_tests - stable: false - - frequency: weekly - team: data - cluster: - byod: - runtime_env: - - RAY_object_spilling_config={"type":"filesystem","params":{"directory_path":["/tmp/data0","/tmp/data1"]}} - post_build_script: byod_dataset_shuffle.sh - cluster_compute: shuffle/100tb_shuffle_compute.yaml - - run: - timeout: 28800 - script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=100000 --partition-size=1e9 --shuffle - wait_for_nodes: - num_nodes: 100 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 
shuffle/100tb_shuffle_compute_gce.yaml - run: - timeout: 28800 - script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=40000 --partition-size=1e9 --shuffle - wait_for_nodes: - num_nodes: 100 - -################## -# Core Chaos tests -################## - -- name: chaos_many_tasks_kill_raylet - group: core-nightly-test - working_dir: nightly_tests - - frequency: nightly - team: core - cluster: - byod: {} - cluster_compute: chaos_test/compute_template.yaml - - run: - timeout: 3600 - wait_for_nodes: - num_nodes: 10 - prepare: python setup_chaos.py --no-start - script: python chaos_test/test_chaos_basic.py --workload=tasks - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: chaos_test/compute_template_gce.yaml - -- name: chaos_many_tasks_terminate_instance - group: core-nightly-test - working_dir: nightly_tests - - frequency: nightly - team: core - cluster: - byod: {} - cluster_compute: chaos_test/compute_template.yaml - - run: - timeout: 3600 - wait_for_nodes: - num_nodes: 10 - prepare: python setup_chaos.py --no-start --chaos TerminateEC2Instance - script: python chaos_test/test_chaos_basic.py --workload=tasks - - variations: - - __suffix__: aws - -- name: chaos_many_actors_kill_raylet - group: core-nightly-test - working_dir: nightly_tests - - frequency: nightly - team: core - cluster: - byod: {} - cluster_compute: chaos_test/compute_template.yaml - - run: - timeout: 4200 - wait_for_nodes: - num_nodes: 10 - prepare: python setup_chaos.py --no-start - script: python chaos_test/test_chaos_basic.py --workload=actors - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: chaos_test/compute_template_gce.yaml - -- name: chaos_many_actors_terminate_instance - group: core-nightly-test - working_dir: nightly_tests - - frequency: nightly - team: core - cluster: - byod: {} - cluster_compute: chaos_test/compute_template.yaml - - run: - timeout: 4200 - wait_for_nodes: - num_nodes: 10 - prepare: python setup_chaos.py --no-start --chaos TerminateEC2Instance - script: python chaos_test/test_chaos_basic.py --workload=actors - - variations: - - __suffix__: aws - -- name: chaos_dask_on_ray_large_scale_test_no_spilling - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - - cluster: - byod: - runtime_env: - - RAY_lineage_pinning_enabled=1 - cluster_compute: dask_on_ray/chaos_dask_on_ray_stress_compute.yaml - - run: - timeout: 7200 - wait_for_nodes: - num_nodes: 21 - prepare: python setup_chaos.py --kill-interval 100 - script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb - 20 --error_rate 0 --data_save_path /tmp/ray - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml - -- name: chaos_dask_on_ray_large_scale_test_spilling - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - - cluster: - byod: - runtime_env: - - RAY_lineage_pinning_enabled=1 - cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml - - run: - timeout: 7200 - wait_for_nodes: - num_nodes: 21 - prepare: python setup_chaos.py --kill-interval 100 - script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb - 70 --error_rate 0 --data_save_path /tmp/ray - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - 
cluster: - cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml - -- name: chaos_dataset_shuffle_push_based_sort_1tb - group: data-tests - working_dir: nightly_tests - - stable: false - - frequency: nightly - team: data - - cluster: - byod: - runtime_env: - - RAY_worker_killing_policy=retriable_lifo - pip: - - ray[default] - cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml + runtime_env: + - RAY_worker_killing_policy=retriable_lifo + pip: + - ray[default] + cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml run: timeout: 7200 @@ -4956,7 +4046,7 @@ group: cluster-launcher-test working_dir: ../python/ray/autoscaler/ - frequency: nightly + frequency: manual team: clusters cluster: byod: {} @@ -4971,7 +4061,7 @@ group: cluster-launcher-test working_dir: ../python/ray/autoscaler/ - frequency: nightly + frequency: manual team: clusters cluster: byod: {} @@ -5041,7 +4131,7 @@ run: timeout: 1200 - script: python launch_and_verify_cluster.py gcp/example-minimal.yaml + script: python launch_and_verify_cluster.py gcp/example-minimal-pinned.yaml - name: gcp_cluster_launcher_full group: cluster-launcher-test @@ -5067,7 +4157,7 @@ stable: true env: gce - frequency: nightly + frequency: manual team: clusters cluster: byod: {} @@ -5084,7 +4174,7 @@ stable: true env: gce - frequency: nightly + frequency: manual team: clusters cluster: byod: {} diff --git a/release/requirements_buildkite.in b/release/requirements_buildkite.in index 96d05e27bb96..0c20af4d9088 100644 --- a/release/requirements_buildkite.in +++ b/release/requirements_buildkite.in @@ -14,7 +14,6 @@ pyyaml pybuildkite PyGithub requests -retry twine == 5.0.0 docker >= 7.1.0 diff --git a/release/requirements_buildkite.txt b/release/requirements_buildkite.txt index 9bfaada37880..659db4c5e8db 100644 --- a/release/requirements_buildkite.txt +++ b/release/requirements_buildkite.txt @@ -424,9 +424,7 @@ debugpy==1.8.2 \ decorator==5.1.1 \ --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 - # via - # ipython - # retry + # via ipython deprecated==1.2.14 \ --hash=sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c \ --hash=sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3 @@ -1174,10 +1172,6 @@ pure-eval==0.2.3 \ --hash=sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0 \ --hash=sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42 # via stack-data -py==1.11.0 \ - --hash=sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719 \ - --hash=sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378 - # via retry pyasn1==0.6.0 \ --hash=sha256:3a35ab2c4b5ef98e17dfdec8ab074046fbda76e281c5a706ccd82328cfc8f64c \ --hash=sha256:cca4bb0f2df5504f02f6f8a775b6e416ff9b0b3b16f7ee80b5a3153d9b804473 @@ -1539,10 +1533,6 @@ requests-toolbelt==1.0.0 \ --hash=sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6 \ --hash=sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06 # via twine -retry==0.9.2 \ - --hash=sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606 \ - --hash=sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4 - # via -r release/requirements_buildkite.in rfc3986==2.0.0 \ --hash=sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd \ 
--hash=sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c diff --git a/release/rllib_tests/1gpu_16cpus.yaml b/release/rllib_tests/1gpu_16cpus.yaml index 2a0cdea1c0b3..1b11511cdcb8 100644 --- a/release/rllib_tests/1gpu_16cpus.yaml +++ b/release/rllib_tests/1gpu_16cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/1gpu_16cpus_gce.yaml b/release/rllib_tests/1gpu_16cpus_gce.yaml index f0ad9d505d4a..26b262ab8bed 100644 --- a/release/rllib_tests/1gpu_16cpus_gce.yaml +++ b/release/rllib_tests/1gpu_16cpus_gce.yaml @@ -19,7 +19,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/1gpu_24cpus.yaml b/release/rllib_tests/1gpu_24cpus.yaml index af4def71489d..d8d8cb4b866a 100644 --- a/release/rllib_tests/1gpu_24cpus.yaml +++ b/release/rllib_tests/1gpu_24cpus.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/1gpu_24cpus_gce.yaml b/release/rllib_tests/1gpu_24cpus_gce.yaml index ec79552e4984..11e5dc4283d6 100644 --- a/release/rllib_tests/1gpu_24cpus_gce.yaml +++ b/release/rllib_tests/1gpu_24cpus_gce.yaml @@ -24,7 +24,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/1gpu_32cpus.yaml b/release/rllib_tests/1gpu_32cpus.yaml index 660791a6cc2f..d1b1349f284f 100644 --- a/release/rllib_tests/1gpu_32cpus.yaml +++ b/release/rllib_tests/1gpu_32cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/1gpu_4cpus.yaml b/release/rllib_tests/1gpu_4cpus.yaml index fa1b042a9260..dcc8baf66e9b 100644 --- a/release/rllib_tests/1gpu_4cpus.yaml +++ b/release/rllib_tests/1gpu_4cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/1gpu_4cpus_gce.yaml b/release/rllib_tests/1gpu_4cpus_gce.yaml index b239d699f91c..7613f5062cfa 100644 --- a/release/rllib_tests/1gpu_4cpus_gce.yaml +++ b/release/rllib_tests/1gpu_4cpus_gce.yaml @@ -19,7 +19,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/2gpus_32cpus.yaml b/release/rllib_tests/2gpus_32cpus.yaml index 1c74596c4c8a..02065ef9dc8f 100644 --- a/release/rllib_tests/2gpus_32cpus.yaml +++ b/release/rllib_tests/2gpus_32cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/2gpus_32cpus_gce.yaml b/release/rllib_tests/2gpus_32cpus_gce.yaml index 7086e8d82fbf..fe56a4b11161 100644 --- a/release/rllib_tests/2gpus_32cpus_gce.yaml +++ b/release/rllib_tests/2gpus_32cpus_gce.yaml @@ -24,7 +24,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git 
a/release/rllib_tests/2gpus_64cpus.yaml b/release/rllib_tests/2gpus_64cpus.yaml index 67392db81700..bd7f534c1fdf 100644 --- a/release/rllib_tests/2gpus_64cpus.yaml +++ b/release/rllib_tests/2gpus_64cpus.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/2gpus_64cpus_gce.yaml b/release/rllib_tests/2gpus_64cpus_gce.yaml index 825124fabbad..484cbf999bd8 100644 --- a/release/rllib_tests/2gpus_64cpus_gce.yaml +++ b/release/rllib_tests/2gpus_64cpus_gce.yaml @@ -28,7 +28,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/32cpus.yaml b/release/rllib_tests/32cpus.yaml index f1e092047f78..d67b810f66e1 100644 --- a/release/rllib_tests/32cpus.yaml +++ b/release/rllib_tests/32cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/32cpus_gce.yaml b/release/rllib_tests/32cpus_gce.yaml index c6d1a6729fa0..466d7fe8602c 100644 --- a/release/rllib_tests/32cpus_gce.yaml +++ b/release/rllib_tests/32cpus_gce.yaml @@ -19,7 +19,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/4gpus_512_cpus_gce.yaml b/release/rllib_tests/4gpus_512_cpus_gce.yaml index a42e4805795f..92ddfdde89e7 100644 --- a/release/rllib_tests/4gpus_512_cpus_gce.yaml +++ b/release/rllib_tests/4gpus_512_cpus_gce.yaml @@ -24,7 +24,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/4gpus_544_cpus.yaml b/release/rllib_tests/4gpus_544_cpus.yaml index dd30af32c19a..c4559737cc17 100644 --- a/release/rllib_tests/4gpus_544_cpus.yaml +++ b/release/rllib_tests/4gpus_544_cpus.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 5 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/4gpus_64cpus.yaml b/release/rllib_tests/4gpus_64cpus.yaml index c0f4c76ee300..60d03f122263 100644 --- a/release/rllib_tests/4gpus_64cpus.yaml +++ b/release/rllib_tests/4gpus_64cpus.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/4gpus_64cpus_gce.yaml b/release/rllib_tests/4gpus_64cpus_gce.yaml index 82b95a8b4fdc..a4453843a482 100644 --- a/release/rllib_tests/4gpus_64cpus_gce.yaml +++ b/release/rllib_tests/4gpus_64cpus_gce.yaml @@ -19,7 +19,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/4gpus_96cpus.yaml b/release/rllib_tests/4gpus_96cpus.yaml index 80916596a054..e699e2588b8b 100644 --- a/release/rllib_tests/4gpus_96cpus.yaml +++ b/release/rllib_tests/4gpus_96cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/4gpus_96cpus_gce.yaml 
b/release/rllib_tests/4gpus_96cpus_gce.yaml index 5d1e5b00182c..df83eba5f0f7 100644 --- a/release/rllib_tests/4gpus_96cpus_gce.yaml +++ b/release/rllib_tests/4gpus_96cpus_gce.yaml @@ -19,7 +19,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/8gpus_96cpus.yaml b/release/rllib_tests/8gpus_96cpus.yaml index d9509c85dbda..614944b6d20c 100644 --- a/release/rllib_tests/8gpus_96cpus.yaml +++ b/release/rllib_tests/8gpus_96cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/multi_node_checkpointing_compute_config.yaml b/release/rllib_tests/multi_node_checkpointing_compute_config.yaml index 60784554811e..36f37a8738c7 100644 --- a/release/rllib_tests/multi_node_checkpointing_compute_config.yaml +++ b/release/rllib_tests/multi_node_checkpointing_compute_config.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 2 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/serve_tests/compute_tpl_32_cpu.yaml b/release/serve_tests/compute_tpl_32_cpu.yaml index adcb0ee49569..442ba4222534 100644 --- a/release/serve_tests/compute_tpl_32_cpu.yaml +++ b/release/serve_tests/compute_tpl_32_cpu.yaml @@ -20,7 +20,7 @@ worker_node_types: custom_resources: worker: 1 -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/serve_tests/compute_tpl_32_cpu_autoscaling.yaml b/release/serve_tests/compute_tpl_32_cpu_autoscaling.yaml index 001ff48fef96..c65682bf76b8 100644 --- a/release/serve_tests/compute_tpl_32_cpu_autoscaling.yaml +++ b/release/serve_tests/compute_tpl_32_cpu_autoscaling.yaml @@ -20,7 +20,7 @@ worker_node_types: max_workers: 35 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/serve_tests/compute_tpl_8_cpu_autoscaling.yaml b/release/serve_tests/compute_tpl_8_cpu_autoscaling.yaml index ab5b59006a93..851620488b20 100644 --- a/release/serve_tests/compute_tpl_8_cpu_autoscaling.yaml +++ b/release/serve_tests/compute_tpl_8_cpu_autoscaling.yaml @@ -26,7 +26,7 @@ worker_node_types: custom_resources: proxy: 1 -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/serve_tests/compute_tpl_gpu_node.yaml b/release/serve_tests/compute_tpl_gpu_node.yaml index c1450feb62d4..96f7dc057a45 100644 --- a/release/serve_tests/compute_tpl_gpu_node.yaml +++ b/release/serve_tests/compute_tpl_gpu_node.yaml @@ -13,7 +13,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/serve_tests/compute_tpl_single_node.yaml b/release/serve_tests/compute_tpl_single_node.yaml index c9f7f84179a0..5f46c956ab7d 100644 --- a/release/serve_tests/compute_tpl_single_node.yaml +++ b/release/serve_tests/compute_tpl_single_node.yaml @@ -10,7 +10,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/serve_tests/compute_tpl_single_node_32_cpu.yaml b/release/serve_tests/compute_tpl_single_node_32_cpu.yaml index c9e1ec1d0f7f..d40e806ea617 100644 --- a/release/serve_tests/compute_tpl_single_node_32_cpu.yaml +++ 
b/release/serve_tests/compute_tpl_single_node_32_cpu.yaml @@ -10,7 +10,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/serve_tests/compute_tpl_single_node_k8s.yaml b/release/serve_tests/compute_tpl_single_node_k8s.yaml index bbccdfb95a07..7dd79bb0adf1 100644 --- a/release/serve_tests/compute_tpl_single_node_k8s.yaml +++ b/release/serve_tests/compute_tpl_single_node_k8s.yaml @@ -12,7 +12,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/train_tests/colocate_trainer/compute_aws.yaml b/release/train_tests/colocate_trainer/compute_aws.yaml index e2542caf5000..abe241ebddfd 100644 --- a/release/train_tests/colocate_trainer/compute_aws.yaml +++ b/release/train_tests/colocate_trainer/compute_aws.yaml @@ -14,7 +14,7 @@ worker_node_types: min_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/train_tests/horovod/compute_tpl_aws.yaml b/release/train_tests/horovod/compute_tpl_aws.yaml index 8424f13fbf61..b7065014230c 100644 --- a/release/train_tests/horovod/compute_tpl_aws.yaml +++ b/release/train_tests/horovod/compute_tpl_aws.yaml @@ -15,7 +15,7 @@ worker_node_types: min_workers: 1 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/train_tests/horovod/compute_tpl_gce.yaml b/release/train_tests/horovod/compute_tpl_gce.yaml index 31730aac6e79..59e43741f983 100644 --- a/release/train_tests/horovod/compute_tpl_gce.yaml +++ b/release/train_tests/horovod/compute_tpl_gce.yaml @@ -17,7 +17,7 @@ worker_node_types: min_workers: 1 use_spot: false -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/train_tests/multinode_persistence/compute_aws.yaml b/release/train_tests/multinode_persistence/compute_aws.yaml index ad578f3feba0..a0e4116acafb 100644 --- a/release/train_tests/multinode_persistence/compute_aws.yaml +++ b/release/train_tests/multinode_persistence/compute_aws.yaml @@ -14,7 +14,7 @@ worker_node_types: min_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/train_tests/xgboost_lightgbm/compute_aws_10workers.yaml b/release/train_tests/xgboost_lightgbm/compute_aws_10workers.yaml index 7888ba7cbec3..55fa05e163f0 100644 --- a/release/train_tests/xgboost_lightgbm/compute_aws_10workers.yaml +++ b/release/train_tests/xgboost_lightgbm/compute_aws_10workers.yaml @@ -17,7 +17,7 @@ worker_node_types: min_workers: 10 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/train_tests/xgboost_lightgbm/compute_aws_1worker.yaml b/release/train_tests/xgboost_lightgbm/compute_aws_1worker.yaml index 28d8b4a66016..8796876a7de5 100644 --- a/release/train_tests/xgboost_lightgbm/compute_aws_1worker.yaml +++ b/release/train_tests/xgboost_lightgbm/compute_aws_1worker.yaml @@ -17,7 +17,7 @@ worker_node_types: min_workers: 1 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/tune_tests/cloud_tests/tpl_aws_1x4.yaml b/release/tune_tests/cloud_tests/tpl_aws_1x4.yaml index 1672eb48ba78..0307acc970a5 100644 --- a/release/tune_tests/cloud_tests/tpl_aws_1x4.yaml +++ 
b/release/tune_tests/cloud_tests/tpl_aws_1x4.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/tune_tests/fault_tolerance_tests/tpl_aws_16x1.yaml b/release/tune_tests/fault_tolerance_tests/tpl_aws_16x1.yaml index abe801c950f1..302b975fa5a4 100644 --- a/release/tune_tests/fault_tolerance_tests/tpl_aws_16x1.yaml +++ b/release/tune_tests/fault_tolerance_tests/tpl_aws_16x1.yaml @@ -18,7 +18,7 @@ worker_node_types: use_spot: true # Required to allow nodes to terminate themselves. -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/tune_tests/fault_tolerance_tests/tpl_gce_16x1.yaml b/release/tune_tests/fault_tolerance_tests/tpl_gce_16x1.yaml index dbccfa496b2d..d99976a529e4 100644 --- a/release/tune_tests/fault_tolerance_tests/tpl_gce_16x1.yaml +++ b/release/tune_tests/fault_tolerance_tests/tpl_gce_16x1.yaml @@ -20,7 +20,7 @@ worker_node_types: use_spot: true ## Required to allow nodes to terminate themselves. -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/tune_tests/scalability_tests/tpl_1x32_hd.yaml b/release/tune_tests/scalability_tests/tpl_1x32_hd.yaml index 8fed22723702..de26a8b43484 100644 --- a/release/tune_tests/scalability_tests/tpl_1x32_hd.yaml +++ b/release/tune_tests/scalability_tests/tpl_1x32_hd.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/tune_tests/scalability_tests/tpl_gce_1x32_hd.yaml b/release/tune_tests/scalability_tests/tpl_gce_1x32_hd.yaml index 387f47a83437..60ade7f48b9c 100644 --- a/release/tune_tests/scalability_tests/tpl_gce_1x32_hd.yaml +++ b/release/tune_tests/scalability_tests/tpl_gce_1x32_hd.yaml @@ -11,7 +11,7 @@ head_node_type: worker_node_types: [] -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/rllib/BUILD b/rllib/BUILD index e2ec7386ae0a..dc3fd6830245 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -164,23 +164,24 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_discrete", "torch_only"], size = "large", srcs = ["tuned_examples/appo/cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=1"] -) -py_test( - name = "learning_tests_cartpole_appo_gpu", - main = "tuned_examples/appo/cartpole_appo.py", - tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"], - size = "large", - srcs = ["tuned_examples/appo/cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=0", "--num-gpus-per-learner=1"] + args = ["--as-test", "--num-learners=1", "--num-cpus=8", "--num-env-runners=6"] ) +# TODO (sven): For some weird reason, this test runs extremely slow on the CI (not on cluster, not locally) -> taking this out for now ... 
+# py_test( +# name = "learning_tests_cartpole_appo_gpu", +# main = "tuned_examples/appo/cartpole_appo.py", +# tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"], +# size = "large", +# srcs = ["tuned_examples/appo/cartpole_appo.py"], +# args = ["--as-test", "--num-learners=0", "--num-gpus-per-learner=1", "--num-cpus=7", "--num-env-runners=6"] +# ) py_test( name = "learning_tests_cartpole_appo_multi_cpu", main = "tuned_examples/appo/cartpole_appo.py", tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"], size = "large", srcs = ["tuned_examples/appo/cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=2"] + args = ["--as-test", "--num-learners=2", "--num-cpus=9", "--num-env-runners=6"] ) py_test( name = "learning_tests_cartpole_appo_multi_gpu", @@ -188,7 +189,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"], size = "large", srcs = ["tuned_examples/appo/cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=2", "--num-gpus-per-learner=1"] + args = ["--as-test", "--num-learners=2", "--num-gpus-per-learner=1", "--num-cpus=7", "--num-env-runners=6"] ) # MultiAgentCartPole py_test( @@ -197,7 +198,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_discrete", "torch_only"], size = "large", srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-learners=1"] + args = ["--as-test", "--num-agents=2", "--num-learners=1", "--num-cpus=8", "--num-env-runners=6"] ) py_test( name = "learning_tests_multi_agent_cartpole_appo_gpu", @@ -205,7 +206,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"], size = "large", srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-learners=0", "--num-gpus-per-learner=1", "--num-cpus=6"] + args = ["--as-test", "--num-agents=2", "--num-learners=0", "--num-gpus-per-learner=1", "--num-cpus=7", "--num-env-runners=6"] ) py_test( name = "learning_tests_multi_agent_cartpole_appo_multi_cpu", @@ -213,7 +214,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"], size = "large", srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-learners=2", "--num-cpus=7"] + args = ["--as-test", "--num-agents=2", "--num-learners=2", "--num-cpus=9", "--num-env-runners=6"] ) py_test( name = "learning_tests_multi_agent_cartpole_appo_multi_gpu", @@ -221,7 +222,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"], size = "large", srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-learners=2", "--num-gpus-per-learner=1", "--num-cpus=7"] + args = ["--as-test", "--num-agents=2", "--num-learners=2", "--num-gpus-per-learner=1", "--num-cpus=7", "--num-env-runners=6"] ) # StatelessCartPole py_test( @@ -230,7 +231,7 @@ py_test( 
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"], size = "large", srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=1"] + args = ["--as-test", "--num-learners=1", "--num-cpus=8", "--num-env-runners=6"] ) py_test( name = "learning_tests_stateless_cartpole_appo_gpu", @@ -238,7 +239,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"], size = "large", srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-learners=0", "--num-gpus-per-learner=1"] + args = ["--as-test", "--num-agents=2", "--num-learners=0", "--num-gpus-per-learner=1", "--num-cpus=7", "--num-env-runners=6"] ) py_test( name = "learning_tests_stateless_cartpole_appo_multi_cpu", @@ -246,7 +247,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"], size = "large", srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=2"] + args = ["--as-test", "--num-learners=2", "--num-cpus=9", "--num-env-runners=6"] ) py_test( name = "learning_tests_stateless_cartpole_appo_multi_gpu", @@ -254,7 +255,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"], size = "large", srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=2", "--num-gpus-per-learner=1"] + args = ["--as-test", "--num-learners=2", "--num-gpus-per-learner=1", "--num-cpus=7", "--num-env-runners=6"] ) # MultiAgentStatelessCartPole # py_test( @@ -2928,6 +2929,17 @@ py_test( # subdirectory: offline_rl/ # .................................... +# Does run into scheduling problems in CI tests. Works on local +# and GCP cloud. +# py_test( +# name = "examples/offline_rl/cartpole_recording", +# main = "examples/offline_rl/cartpole_recording.py", +# tags = ["team:rllib", "examples", "exclusive"], +# size = "large", +# srcs = ["examples/offline_rl/cartpole_recording.py"], +# args = ["--enable-new-api-stack", "--as-test", "--framework=torch", "--num-cpus=12"], +# ) + py_test( name = "examples/offline_rl/train_w_bc_finetune_w_ppo", main = "examples/offline_rl/train_w_bc_finetune_w_ppo.py", diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index bf01aff4be89..7ed33f749713 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -129,6 +129,7 @@ NUM_ENV_STEPS_TRAINED_LIFETIME, NUM_EPISODES, NUM_EPISODES_LIFETIME, + NUM_TRAINING_STEP_CALLS_PER_ITERATION, RESTORE_WORKERS_TIMER, RESTORE_EVAL_WORKERS_TIMER, SYNCH_ENV_CONNECTOR_STATES_TIMER, @@ -3215,7 +3216,17 @@ def _run_one_training_iteration(self) -> Tuple[ResultDict, "TrainIterCtx"]: "one single result dict per training iteration." ) - # Only here, reduce the results into a single result dict. + # TODO (sven): Resolve this metric through log_time's future + # ability to compute throughput. + self.metrics.log_value( + NUM_TRAINING_STEP_CALLS_PER_ITERATION, + 1, + reduce="sum", + clear_on_reduce=True, + ) + + # Only here (at the end of the iteration), reduce the results into a single + # result dict. 
return self.metrics.reduce(), train_iter_ctx def _run_one_evaluation( @@ -3527,13 +3538,16 @@ def _compile_iteration_results_new_api_stack(self, *, train_results, eval_result # Fault tolerance stats. results[FAULT_TOLERANCE_STATS] = { "num_healthy_workers": self.env_runner_group.num_healthy_remote_workers(), - "num_in_flight_async_reqs": ( - self.env_runner_group.num_in_flight_async_reqs() - ), "num_remote_worker_restarts": ( self.env_runner_group.num_remote_worker_restarts() ), } + results["env_runner_group"] = { + "actor_manager_num_outstanding_async_reqs": ( + self.env_runner_group.num_in_flight_async_reqs() + ), + } + # Resolve all `Stats` leaves by peeking (get their reduced values). return tree.map_structure( lambda s: s.peek() if isinstance(s, Stats) else s, diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 542240a00dac..a9c3b5598ef5 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -357,6 +357,11 @@ def __init__(self, algo_class: Optional[type] = None): self.num_gpus_per_learner = 0 self.num_cpus_per_learner = 1 self.local_gpu_idx = 0 + # TODO (sven): This probably works even without any restriction + # (allowing for any arbitrary number of requests in-flight). Test with + # 3 first, then with unlimited, and if both show the same behavior on + # an async algo, remove this restriction entirely. + self.max_requests_in_flight_per_learner = 3 # `self.training()` self.gamma = 0.99 @@ -463,6 +468,7 @@ def __init__(self, algo_class: Optional[type] = None): self.output_compress_columns = [Columns.OBS, Columns.NEXT_OBS] self.output_max_file_size = 64 * 1024 * 1024 self.output_max_rows_per_file = None + self.output_write_remaining_data = False self.output_write_method = "write_parquet" self.output_write_method_kwargs = {} self.output_filesystem = None @@ -1798,15 +1804,16 @@ def env_runners( synchronously in turn with their update step (e.g., PPO or DQN). Not relevant for any algos that sample asynchronously, such as APPO or IMPALA. - max_requests_in_flight_per_env_runner: Max number of inflight requests - to each EnvRunner worker. See the FaultTolerantActorManager class for - more details. + max_requests_in_flight_per_env_runner: Max number of in-flight requests + to each EnvRunner (actor). See the + `ray.rllib.utils.actor_manager.FaultTolerantActorManager` class for more + details. Tuning these values is important when running experiments with large sample batches, where there is the risk that the object store may fill up, causing spilling of objects to disk. This can cause any asynchronous requests to become very slow, making your experiment run slowly as well. You can inspect the object store during your experiment via a call to `ray memory` on your head node, and by using the Ray dashboard. If you're seeing that the object store is filling up, turn down the number of remote requests in flight or enable compression. sample_collector: For the old API stack only. The SampleCollector class to @@ -2110,6 +2117,7 @@ def learners( num_cpus_per_learner: Optional[Union[float, int]] = NotProvided, num_gpus_per_learner: Optional[Union[float, int]] = NotProvided, local_gpu_idx: Optional[int] = NotProvided, + max_requests_in_flight_per_learner: Optional[int] = NotProvided, ): """Sets LearnerGroup and Learner worker related configurations. @@ -2135,6 +2143,10 @@ def learners( an index into the available CUDA devices.
For example if `os.environ["CUDA_VISIBLE_DEVICES"] = "1"` and `local_gpu_idx=0`, RLlib uses the GPU with ID=1 on the node. + max_requests_in_flight_per_learner: Max number of in-flight requests + to each Learner (actor). See the + `ray.rllib.utils.actor_manager.FaultTolerantActorManager` class for more + details. Returns: This updated AlgorithmConfig object. @@ -2147,6 +2159,8 @@ self.num_gpus_per_learner = num_gpus_per_learner if local_gpu_idx is not NotProvided: self.local_gpu_idx = local_gpu_idx + if max_requests_in_flight_per_learner is not NotProvided: + self.max_requests_in_flight_per_learner = max_requests_in_flight_per_learner return self @@ -2566,6 +2580,7 @@ def offline_data( output_compress_columns: Optional[List[str]] = NotProvided, output_max_file_size: Optional[float] = NotProvided, output_max_rows_per_file: Optional[int] = NotProvided, + output_write_remaining_data: Optional[bool] = NotProvided, output_write_method: Optional[str] = NotProvided, output_write_method_kwargs: Optional[Dict] = NotProvided, output_filesystem: Optional[str] = NotProvided, @@ -2735,6 +2750,15 @@ to a new file. output_max_rows_per_file: Max output row numbers before rolling over to a new file. + output_write_remaining_data: Determines whether any remaining data in the + recording buffers should be stored to disk. It is only applicable if + `output_max_rows_per_file` is defined. When sampling data, it is + buffered until the threshold specified by `output_max_rows_per_file` + is reached. Only complete multiples of `output_max_rows_per_file` are + written to disk, while any leftover data remains in the buffers. If a + recording session is stopped, residual data may still reside in these + buffers. Setting `output_write_remaining_data` to `True` ensures this + data is flushed to disk. By default, this attribute is set to `False`. output_write_method: Write method for the `ray.data.Dataset` to write the offline data to `output`. The default is `read_parquet` for Parquet files. See https://docs.ray.io/en/latest/data/api/input_output.html for @@ -2842,6 +2866,8 @@ self.output_max_file_size = output_max_file_size if output_max_rows_per_file is not NotProvided: self.output_max_rows_per_file = output_max_rows_per_file + if output_write_remaining_data is not NotProvided: + self.output_write_remaining_data = output_write_remaining_data if output_write_method is not NotProvided: self.output_write_method = output_write_method if output_write_method_kwargs is not NotProvided: diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index 37b8fd863c66..5302f7540248 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -1,13 +1,13 @@ -""" -Asynchronous Proximal Policy Optimization (APPO) -================================================ +"""Asynchronous Proximal Policy Optimization (APPO) -This file defines the distributed Algorithm class for the asynchronous version -of proximal policy optimization (APPO). -See `appo_[tf|torch]_policy.py` for the definition of the policy loss. +The algorithm is described in [1] (under the name of "IMPACT"): Detailed documentation: https://docs.ray.io/en/master/rllib-algorithms.html#appo + +[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks. +Luo et al.
2020 +https://arxiv.org/pdf/1912.00167 """ from typing import Optional, Type @@ -32,8 +32,7 @@ LEARNER_RESULTS_KL_KEY = "mean_kl_loss" LEARNER_RESULTS_CURR_KL_COEFF_KEY = "curr_kl_coeff" -OLD_ACTION_DIST_KEY = "old_action_dist" -OLD_ACTION_DIST_LOGITS_KEY = "old_action_dist_logits" +TARGET_ACTION_DIST_LOGITS_KEY = "target_action_dist_logits" class APPOConfig(IMPALAConfig): @@ -101,25 +100,25 @@ def __init__(self, algo_class=None): # __sphinx_doc_begin__ # APPO specific settings: self.vtrace = True - self.use_critic = True self.use_gae = True self.lambda_ = 1.0 self.clip_param = 0.4 self.use_kl_loss = False self.kl_coeff = 1.0 self.kl_target = 0.01 - # TODO (sven): Activate once v-trace sequences in non-RNN batch are solved. - # If we switch this on right now, the shuffling would destroy the rollout - # sequences (non-zero-padded!) needed in the batch for v-trace. - # self.shuffle_batch_per_epoch = True + self.target_worker_clipping = 2.0 + + # Circular replay buffer settings. + # Used in [1] for discrete action tasks: + # `circular_buffer_num_batches=4` and `circular_buffer_iterations_per_batch=2` + # For cont. action tasks: + # `circular_buffer_num_batches=16` and `circular_buffer_iterations_per_batch=20` + self.circular_buffer_num_batches = 4 + self.circular_buffer_iterations_per_batch = 2 # Override some of IMPALAConfig's default values with APPO-specific values. self.num_env_runners = 2 - self.min_time_s_per_iteration = 10 - self.target_network_update_freq = 1 - self.learner_queue_size = 16 - self.learner_queue_timeout = 300 - self.max_sample_requests_in_flight_per_worker = 2 + self.target_network_update_freq = 2 self.broadcast_interval = 1 self.grad_clip = 40.0 # Note: Only when using enable_rl_module_and_learner=True can the clipping mode @@ -145,26 +144,32 @@ def __init__(self, algo_class=None): self.minibatch_buffer_size = 1 # @OldAPIStack self.replay_proportion = 0.0 # @OldAPIStack self.replay_buffer_num_slots = 100 # @OldAPIStack + self.learner_queue_size = 16 # @OldAPIStack + self.learner_queue_timeout = 300 # @OldAPIStack # Deprecated keys. self.target_update_frequency = DEPRECATED_VALUE + self.use_critic = DEPRECATED_VALUE @override(IMPALAConfig) def training( self, *, vtrace: Optional[bool] = NotProvided, - use_critic: Optional[bool] = NotProvided, use_gae: Optional[bool] = NotProvided, lambda_: Optional[float] = NotProvided, clip_param: Optional[float] = NotProvided, use_kl_loss: Optional[bool] = NotProvided, kl_coeff: Optional[float] = NotProvided, kl_target: Optional[float] = NotProvided, - tau: Optional[float] = NotProvided, target_network_update_freq: Optional[int] = NotProvided, + tau: Optional[float] = NotProvided, + target_worker_clipping: Optional[float] = NotProvided, + circular_buffer_num_batches: Optional[int] = NotProvided, + circular_buffer_iterations_per_batch: Optional[int] = NotProvided, # Deprecated keys. target_update_frequency=DEPRECATED_VALUE, + use_critic=DEPRECATED_VALUE, **kwargs, ) -> "APPOConfig": """Sets the training related configuration. @@ -172,8 +177,6 @@ def training( Args: vtrace: Whether to use V-trace weighted advantages. If false, PPO GAE advantages will be used instead. - use_critic: Should use a critic as a baseline (otherwise don't use value - baseline; required for using GAE). Only applies if vtrace=False. use_gae: If true, use the Generalized Advantage Estimator (GAE) with a value function, see https://arxiv.org/pdf/1506.02438.pdf. Only applies if vtrace=False. 
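A minimal usage sketch (not part of this diff) of the config surface this PR touches, assuming the new API stack. Only the option names (`max_requests_in_flight_per_learner`, `circular_buffer_num_batches`, `circular_buffer_iterations_per_batch`, `target_network_update_freq`) and their defaults come from the additions above; the environment and learner counts are illustrative only:

from ray.rllib.algorithms.appo import APPOConfig

# Hypothetical wiring of the new options; values mirror the PR's defaults.
config = (
    APPOConfig()
    .environment("CartPole-v1")  # illustrative env only
    .learners(
        num_learners=2,
        # New in this PR: cap on in-flight async requests per Learner actor
        # (default 3, see the TODO above about possibly lifting the cap).
        max_requests_in_flight_per_learner=3,
    )
    .training(
        train_batch_size_per_learner=500,
        # Circular buffer: N batches, each trained on at most K times.
        circular_buffer_num_batches=4,  # N
        circular_buffer_iterations_per_batch=2,  # K
        # Per the docstring added further below in this diff, target nets then
        # sync every freq * N * K * batch size = 2 * 4 * 2 * 500 = 8000
        # trained env steps.
        target_network_update_freq=2,
    )
)
algo = config.build()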
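Likewise, a hypothetical sketch of the new `output_write_remaining_data` recording flag documented in the `offline_data()` changes above; only the option names come from the diff, the algorithm, path, and row counts are made up:

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")  # illustrative env only
    .offline_data(
        output="/tmp/cartpole-records",  # hypothetical output path
        output_max_rows_per_file=1000,
        # While recording, only complete multiples of 1000 rows get written:
        # e.g., 2500 buffered rows -> two 1000-row files, 500 rows held back.
        # This flag flushes those remaining 500 rows when recording stops.
        output_write_remaining_data=True,
    )
)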
@@ -183,9 +186,18 @@ kl_coeff: Coefficient for weighting the KL-loss term. kl_target: Target term for the KL-term to reach (via adjusting the `kl_coeff` automatically). - tau: The factor by which to update the target policy network towards - the current policy network. Can range between 0 and 1. - e.g. updated_param = tau * current_param + (1 - tau) * target_param + target_network_update_freq: NOTE: This parameter is only applicable on + the new API stack. The frequency with which to update the target + policy network from the main trained policy network. The metric + used is `NUM_ENV_STEPS_TRAINED_LIFETIME` and the unit is `n` (see [1] + 4.1.1), where: `n = [circular_buffer_num_batches (N)] * + [circular_buffer_iterations_per_batch (K)] * [train batch size]` + For example, if you set `target_network_update_freq=2`, and N=4, K=2, + and `train_batch_size_per_learner=500`, then the target net is updated + every 2*4*2*500=8000 trained env steps (every 16 batch updates on each + learner). + The authors in [1] suggest that this setting is robust to a range of + choices (try values between 0.125 and 4). target_network_update_freq: The frequency to update the target policy and tune the kl loss coefficients that are used during training. After setting this parameter, the algorithm waits for at least @@ -193,6 +205,20 @@ on before updating the target networks and tune the kl loss coefficients. NOTE: This parameter is only applicable when using the Learner API (enable_rl_module_and_learner=True). + tau: The factor by which to update the target policy network towards + the current policy network. Can range between 0 and 1. + e.g. updated_param = tau * current_param + (1 - tau) * target_param + target_worker_clipping: The maximum value for the target-worker-clipping + used for computing the IS ratio, described in [1] + IS = min(π(i) / π(target), ρ) * (π / π(i)) + circular_buffer_num_batches: The number of train batches that fit + into the circular buffer. Each such train batch can be sampled for + training max. `circular_buffer_iterations_per_batch` times. + circular_buffer_iterations_per_batch: The number of times any train + batch in the circular buffer can be sampled for training. A batch gets + evicted from the buffer either if it's the oldest batch in the buffer + and a new batch is added OR if the batch reaches this max. number of + times it can be sampled. Returns: This updated AlgorithmConfig object. @@ -203,14 +229,19 @@ new="target_network_update_freq", error=True, ) + if use_critic != DEPRECATED_VALUE: + deprecation_warning( + old="use_critic", + help="`use_critic` no longer supported! APPO always uses a value " + "function (critic).", + error=True, + ) # Pass kwargs onto super's `training()` method.
super().training(**kwargs) if vtrace is not NotProvided: self.vtrace = vtrace - if use_critic is not NotProvided: - self.use_critic = use_critic if use_gae is not NotProvided: self.use_gae = use_gae if lambda_ is not NotProvided: @@ -223,13 +254,56 @@ self.kl_coeff = kl_coeff if kl_target is not NotProvided: self.kl_target = kl_target - if tau is not NotProvided: - self.tau = tau if target_network_update_freq is not NotProvided: self.target_network_update_freq = target_network_update_freq + if tau is not NotProvided: + self.tau = tau + if target_worker_clipping is not NotProvided: + self.target_worker_clipping = target_worker_clipping + if circular_buffer_num_batches is not NotProvided: + self.circular_buffer_num_batches = circular_buffer_num_batches + if circular_buffer_iterations_per_batch is not NotProvided: + self.circular_buffer_iterations_per_batch = ( + circular_buffer_iterations_per_batch + ) return self + @override(IMPALAConfig) + def validate(self) -> None: + super().validate() + + # On new API stack, circular buffer should be used, not `minibatch_buffer_size`. + if self.enable_rl_module_and_learner: + if self.minibatch_buffer_size != 1 or self.replay_proportion != 0.0: + raise ValueError( + "`minibatch_buffer_size/replay_proportion` not valid on new API " + "stack with APPO! " + "Use `circular_buffer_num_batches` for the number of train batches " + "in the circular buffer. To change the maximum number of times " + "any batch may be sampled, set " + "`circular_buffer_iterations_per_batch`." + ) + if self.num_multi_gpu_tower_stacks != 1: + raise ValueError( + "`num_multi_gpu_tower_stacks` not supported on new API stack with " + "APPO! In order to train on multi-GPU, use " + "`config.learners(num_learners=[number of GPUs], " + "num_gpus_per_learner=1)`. To scale the throughput of batch-to-GPU-" + "pre-loading on each of your `Learners`, set " + "`num_gpu_loader_threads` to a higher number (recommended values: " + "1-8)." + ) + if self.learner_queue_size != 16: + raise ValueError( + "`learner_queue_size` not supported on new API stack with " + "APPO! In order to set the size of the circular buffer (which acts " + "as a 'learner queue'), use " + "`config.training(circular_buffer_num_batches=..)`. To change the " + "maximum number of times any batch may be sampled, set " + "`config.training(circular_buffer_iterations_per_batch=..)`."
+ ) + @override(IMPALAConfig) def get_default_learner_class(self): if self.framework_str == "torch": diff --git a/rllib/algorithms/appo/appo_learner.py b/rllib/algorithms/appo/appo_learner.py index 7b4cf2b14d8f..431449893264 100644 --- a/rllib/algorithms/appo/appo_learner.py +++ b/rllib/algorithms/appo/appo_learner.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Optional from ray.rllib.algorithms.appo.appo import APPOConfig +from ray.rllib.algorithms.appo.utils import CircularBuffer from ray.rllib.algorithms.impala.impala_learner import IMPALALearner from ray.rllib.core.learner.learner import Learner from ray.rllib.core.learner.utils import update_target_network @@ -11,8 +12,9 @@ from ray.rllib.utils.annotations import override from ray.rllib.utils.lambda_defaultdict import LambdaDefaultDict from ray.rllib.utils.metrics import ( + ALL_MODULES, LAST_TARGET_UPDATE_TS, - NUM_ENV_STEPS_SAMPLED_LIFETIME, + NUM_ENV_STEPS_TRAINED_LIFETIME, NUM_MODULE_STEPS_TRAINED, NUM_TARGET_UPDATES, ) @@ -28,6 +30,11 @@ class APPOLearner(IMPALALearner): @override(IMPALALearner) def build(self): + self._learner_thread_in_queue = CircularBuffer( + num_batches=self.config.circular_buffer_num_batches, + iterations_per_batch=self.config.circular_buffer_iterations_per_batch, + ) + super().build() # Make target networks. @@ -80,30 +87,22 @@ def after_gradient_based_update(self, *, timesteps: Dict[str, Any]) -> None: """Updates the target Q Networks.""" super().after_gradient_based_update(timesteps=timesteps) - timestep = timesteps.get(NUM_ENV_STEPS_SAMPLED_LIFETIME, 0) - # TODO (sven): Maybe we should have a `after_gradient_based_update` # method per module? + curr_timestep = self.metrics.peek((ALL_MODULES, NUM_ENV_STEPS_TRAINED_LIFETIME)) for module_id, module in self.module._rl_modules.items(): config = self.config.get_config_for_module(module_id) - # TODO (avnish) Using steps trained here instead of sampled ... I'm not sure - # why the other implementation uses sampled. - # The difference in steps sampled/trained is pretty - # much always going to be larger than self.config.num_epochs * - # self.config.minibatch_buffer_size unless the number of steps collected - # is really small. The thing is that the default rollout fragment length - # is 50, so the minibatch buffer size * num_epochs is going to be - # have to be 50 to even meet the threshold of having delayed target - # updates. - # We should instead have the target / kl threshold update be based off - # of the train_batch_size * some target update frequency * num_epochs. - last_update_ts_key = (module_id, LAST_TARGET_UPDATE_TS) - if timestep - self.metrics.peek( - last_update_ts_key, default=0 - ) >= config.target_network_update_freq and isinstance( - module.unwrapped(), TargetNetworkAPI + if isinstance(module.unwrapped(), TargetNetworkAPI) and ( + curr_timestep - self.metrics.peek(last_update_ts_key, default=0) + >= ( + config.target_network_update_freq + * config.circular_buffer_num_batches + * config.circular_buffer_iterations_per_batch + * config.total_train_batch_size + / (config.num_learners or 1) + ) ): for ( main_net, @@ -117,7 +116,7 @@ def after_gradient_based_update(self, *, timesteps: Dict[str, Any]) -> None: # Increase lifetime target network update counter by one. self.metrics.log_value((module_id, NUM_TARGET_UPDATES), 1, reduce="sum") # Update the (single-value -> window=1) last updated timestep metric. 
- self.metrics.log_value(last_update_ts_key, timestep, window=1) + self.metrics.log_value(last_update_ts_key, curr_timestep, window=1) if ( config.use_kl_loss diff --git a/rllib/algorithms/appo/appo_rl_module.py b/rllib/algorithms/appo/appo_rl_module.py index a3a34bb37735..178f3d0951fb 100644 --- a/rllib/algorithms/appo/appo_rl_module.py +++ b/rllib/algorithms/appo/appo_rl_module.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List, Tuple from ray.rllib.algorithms.ppo.ppo_rl_module import PPORLModule -from ray.rllib.algorithms.appo.appo import OLD_ACTION_DIST_LOGITS_KEY +from ray.rllib.algorithms.appo.appo import TARGET_ACTION_DIST_LOGITS_KEY from ray.rllib.core.learner.utils import make_target_network from ray.rllib.core.models.base import ACTOR from ray.rllib.core.models.tf.encoder import ENCODER_OUT @@ -32,7 +32,7 @@ def get_target_network_pairs(self) -> List[Tuple[NetworkType, NetworkType]]: def forward_target(self, batch: Dict[str, Any]) -> Dict[str, Any]: old_pi_inputs_encoded = self._old_encoder(batch)[ENCODER_OUT][ACTOR] old_action_dist_logits = self._old_pi(old_pi_inputs_encoded) - return {OLD_ACTION_DIST_LOGITS_KEY: old_action_dist_logits} + return {TARGET_ACTION_DIST_LOGITS_KEY: old_action_dist_logits} @OverrideToImplementCustomLogic_CallToSuperRecommended @override(PPORLModule) diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py index d53815989e09..67d585424343 100644 --- a/rllib/algorithms/appo/torch/appo_torch_learner.py +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -1,10 +1,21 @@ +"""Asynchronous Proximal Policy Optimization (APPO) + +The algorithm is described in [1] (under the name of "IMPACT"): + +Detailed documentation: +https://docs.ray.io/en/master/rllib-algorithms.html#appo + +[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks. +Luo et al. 2020 +https://arxiv.org/pdf/1912.00167 +""" from typing import Dict from ray.rllib.algorithms.appo.appo import ( APPOConfig, LEARNER_RESULTS_CURR_KL_COEFF_KEY, LEARNER_RESULTS_KL_KEY, - OLD_ACTION_DIST_LOGITS_KEY, + TARGET_ACTION_DIST_LOGITS_KEY, ) from ray.rllib.algorithms.appo.appo_learner import APPOLearner from ray.rllib.algorithms.impala.torch.impala_torch_learner import IMPALATorchLearner @@ -60,45 +71,49 @@ def compute_loss_for_module( ) action_dist_cls_train = module.get_train_action_dist_cls() - target_policy_dist = action_dist_cls_train.from_logits( - fwd_out[Columns.ACTION_DIST_INPUTS] - ) - old_target_policy_dist = action_dist_cls_train.from_logits( - module.forward_target(batch)[OLD_ACTION_DIST_LOGITS_KEY] - ) - old_target_policy_actions_logp = old_target_policy_dist.logp( - batch[Columns.ACTIONS] + # Policy being trained (current). + current_action_dist = action_dist_cls_train.from_logits( + fwd_out[Columns.ACTION_DIST_INPUTS] ) - behaviour_actions_logp = batch[Columns.ACTION_LOGP] - target_actions_logp = target_policy_dist.logp(batch[Columns.ACTIONS]) - - behaviour_actions_logp_time_major = make_time_major( - behaviour_actions_logp, + current_actions_logp = current_action_dist.logp(batch[Columns.ACTIONS]) + current_actions_logp_time_major = make_time_major( + current_actions_logp, trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) + + # Target policy. 
+ target_action_dist = action_dist_cls_train.from_logits( + module.forward_target(batch)[TARGET_ACTION_DIST_LOGITS_KEY] + ) + target_actions_logp = target_action_dist.logp(batch[Columns.ACTIONS]) target_actions_logp_time_major = make_time_major( target_actions_logp, trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - old_actions_logp_time_major = make_time_major( - old_target_policy_actions_logp, + + # EnvRunner's policy (behavior). + behavior_actions_logp = batch[Columns.ACTION_LOGP] + behavior_actions_logp_time_major = make_time_major( + behavior_actions_logp, trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) + rewards_time_major = make_time_major( batch[Columns.REWARDS], trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) + + assert Columns.VALUES_BOOTSTRAPPED not in batch values_time_major = make_time_major( values, trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - assert Columns.VALUES_BOOTSTRAPPED not in batch # Use as bootstrap values the vf-preds in the next "batch row", except # for the very last row (which doesn't have a next row), for which the # bootstrap value does not matter b/c it has a +1ts value at its end @@ -112,61 +127,86 @@ def compute_loss_for_module( dim=0, ) - # The discount factor that is used should be gamma except for timesteps where - # the episode is terminated. In that case, the discount factor should be 0. + # The discount factor that is used should be `gamma * lambda_`, except for + # termination timesteps, in which case the discount factor should be 0. discounts_time_major = ( - 1.0 - - make_time_major( - batch[Columns.TERMINATEDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=recurrent_seq_len, - ).float() - ) * config.gamma + ( + 1.0 + - make_time_major( + batch[Columns.TERMINATEDS], + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ).float() + # See [1] 3.1: Discounts must contain the GAE lambda_ parameter as well. + ) + * config.gamma + * config.lambda_ + ) # Note that vtrace will compute the main loop on the CPU for better performance. vtrace_adjusted_target_values, pg_advantages = vtrace_torch( - target_action_log_probs=old_actions_logp_time_major, - behaviour_action_log_probs=behaviour_actions_logp_time_major, + # See [1] 3.1: For AˆV-GAE, the ratios used are: min(c¯, π(target)/π(i)) + # π(target) + target_action_log_probs=target_actions_logp_time_major, + # π(i) + behaviour_action_log_probs=behavior_actions_logp_time_major, + # See [1] 3.1: Discounts must contain the GAE lambda_ parameter as well. discounts=discounts_time_major, rewards=rewards_time_major, values=values_time_major, bootstrap_values=bootstrap_values, - clip_pg_rho_threshold=config.vtrace_clip_pg_rho_threshold, + # c¯ clip_rho_threshold=config.vtrace_clip_rho_threshold, + # c¯ (but we allow users to distinguish between c¯ used for + # value estimates and c¯ used for the advantages. + clip_pg_rho_threshold=config.vtrace_clip_pg_rho_threshold, ) pg_advantages = pg_advantages * loss_mask_time_major - # The policy gradients loss. - is_ratio = torch.clip( - torch.exp(behaviour_actions_logp_time_major - old_actions_logp_time_major), + # The policy gradient loss. + # As described in [1], use a logp-ratio of: + # min(π(i) / π(target), ρ) * (π / π(i)), where .. 
+        # - π are the action probs from the current (learner) policy
+        # - π(i) are the action probs from the ith EnvRunner
+        # - π(target) are the action probs from the target network
+        # - ρ is the "target-worker clipping" (2.0 in the paper)
+        target_worker_is_ratio = torch.clip(
+            torch.exp(
+                behavior_actions_logp_time_major - target_actions_logp_time_major
+            ),
             0.0,
-            2.0,
+            config.target_worker_clipping,
         )
-        logp_ratio = is_ratio * torch.exp(
-            target_actions_logp_time_major - behaviour_actions_logp_time_major
+        target_worker_logp_ratio = target_worker_is_ratio * torch.exp(
+            current_actions_logp_time_major - behavior_actions_logp_time_major
         )
-
         surrogate_loss = torch.minimum(
-            pg_advantages * logp_ratio,
+            pg_advantages * target_worker_logp_ratio,
             pg_advantages
-            * torch.clip(logp_ratio, 1 - config.clip_param, 1 + config.clip_param),
+            * torch.clip(
+                target_worker_logp_ratio,
+                1 - config.clip_param,
+                1 + config.clip_param,
+            ),
         )
+        mean_pi_loss = -(torch.sum(surrogate_loss) / size_loss_mask)

+        # Compute the KL loss (if required): KL divergence between the current
+        # action dist. and the target action dist.
         if config.use_kl_loss:
-            action_kl = old_target_policy_dist.kl(target_policy_dist) * loss_mask
+            action_kl = target_action_dist.kl(current_action_dist) * loss_mask
             mean_kl_loss = torch.sum(action_kl) / size_loss_mask
         else:
             mean_kl_loss = 0.0

-        mean_pi_loss = -(torch.sum(surrogate_loss) / size_loss_mask)
-
-        # The baseline loss.
+        # Compute the value function loss.
         delta = values_time_major - vtrace_adjusted_target_values
         vf_loss = 0.5 * torch.sum(torch.pow(delta, 2.0) * loss_mask_time_major)
         mean_vf_loss = vf_loss / size_loss_mask

-        # The entropy loss.
+        # Compute the entropy loss.
         mean_entropy_loss = (
-            -torch.sum(target_policy_dist.entropy() * loss_mask) / size_loss_mask
+            -torch.sum(current_action_dist.entropy() * loss_mask) / size_loss_mask
         )

         # The summed weighted loss.
diff --git a/rllib/algorithms/appo/utils.py b/rllib/algorithms/appo/utils.py
index cbd2efe82161..9a4f1e66d0a9 100644
--- a/rllib/algorithms/appo/utils.py
+++ b/rllib/algorithms/appo/utils.py
@@ -1,12 +1,99 @@
+"""
+[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks.
+Luo et al. 2020
+https://arxiv.org/pdf/1912.00167
+"""
+from collections import deque
+import random
+import threading
+import time
+
 from ray.rllib.models.catalog import ModelCatalog
 from ray.rllib.models.modelv2 import ModelV2
+from ray.rllib.utils.annotations import OldAPIStack

 POLICY_SCOPE = "func"
 TARGET_POLICY_SCOPE = "target_func"

-# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs.
+class CircularBuffer:
+    """A circular batch-wise buffer as described in [1] for APPO.
+
+    The buffer holds at most N batches, which are sampled at random (uniformly).
+    If full and a new batch is added, the oldest batch is discarded. Also, each batch
+    currently in the buffer can be sampled at most K times (after which it is also
+    discarded).
+    """
+
+    def __init__(self, num_batches: int, iterations_per_batch: int):
+        # N from the paper (buffer size).
+        self.num_batches = num_batches
+        # K ("replay coefficient") from the paper.
+        self.iterations_per_batch = iterations_per_batch
+
+        self._buffer = deque(maxlen=self.num_batches)
+        self._lock = threading.Lock()
+
+        # The number of valid (not expired) entries in this buffer.
+        self._num_valid_batches = 0
+
+    def add(self, batch):
+        dropped_entry = None
+        dropped_ts = 0
+
+        # Add the batch (with its k=0 sample count) to the deque.
+        with self._lock:
+            len_ = len(self._buffer)
+            if len_ == self.num_batches:
+                dropped_entry = self._buffer[0]
+            self._buffer.append([batch, 0])
+            self._num_valid_batches += 1
+
+        # A valid entry (w/ a batch whose k has not reached K yet) was dropped.
+        if dropped_entry is not None and dropped_entry[0] is not None:
+            dropped_ts += dropped_entry[0].env_steps() * (
+                self.iterations_per_batch - dropped_entry[1]
+            )
+            self._num_valid_batches -= 1
+
+        return dropped_ts
+
+    def sample(self):
+        k = entry = batch = None
+
+        while True:
+            # Only initially, the buffer may be empty -> Just wait for some time.
+            if len(self) == 0:
+                time.sleep(0.001)
+                continue
+            # Sample a random buffer index.
+            with self._lock:
+                entry = self._buffer[random.randint(0, len(self._buffer) - 1)]
+                batch, k = entry
+            # Ignore batches that have already been invalidated.
+            if batch is not None:
+                break

+        # Increase this batch's sample count (k) by 1.
+        assert k is not None
+        entry[1] += 1
+
+        # This batch has been exhausted (k == K) -> Invalidate it in the buffer.
+        if k == self.iterations_per_batch - 1:
+            entry[0] = None
+            entry[1] = None
+            self._num_valid_batches -= 1
+
+        # Return the sampled batch.
+        return batch
+
+    def __len__(self) -> int:
+        """Returns the number of actually valid (non-expired) batches in the buffer."""
+        return self._num_valid_batches
+
+
+@OldAPIStack
 def make_appo_models(policy) -> ModelV2:
     """Builds model and target model for APPO.

diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py
index 69f140fda775..862c2cf84388 100644
--- a/rllib/algorithms/impala/impala.py
+++ b/rllib/algorithms/impala/impala.py
@@ -80,12 +80,16 @@ class IMPALAConfig(AlgorithmConfig):
     .. testcode::

         from ray.rllib.algorithms.impala import IMPALAConfig
-        config = IMPALAConfig()
-        config = config.training(lr=0.0003, train_batch_size_per_learner=512)
-        config = config.learners(num_learners=1)
-        config = config.env_runners(num_env_runners=1)
+
+        config = (
+            IMPALAConfig()
+            .environment("CartPole-v1")
+            .env_runners(num_env_runners=1)
+            .training(lr=0.0003, train_batch_size_per_learner=512)
+            .learners(num_learners=1)
+        )
         # Build an Algorithm object from the config and run 1 training iteration.
-        algo = config.build(env="CartPole-v1")
+        algo = config.build()
         algo.train()
         del algo
@@ -94,16 +98,14 @@ class IMPALAConfig(AlgorithmConfig):
         from ray.rllib.algorithms.impala import IMPALAConfig
         from ray import air
         from ray import tune
-        config = IMPALAConfig()
-        # Update the config object.
-        config = config.training(
-            lr=tune.grid_search([0.0001, 0.0002]), grad_clip=20.0
+        config = (
+            IMPALAConfig()
+            .environment("CartPole-v1")
+            .env_runners(num_env_runners=1)
+            .training(lr=tune.grid_search([0.0001, 0.0002]), grad_clip=20.0)
+            .learners(num_learners=1)
         )
-        config = config.learners(num_learners=1)
-        config = config.env_runners(num_env_runners=1)
-        # Set the config object's env.
-        config = config.environment(env="CartPole-v1")
         # Run with tune.
         tune.Tuner(
             "IMPALA",
@@ -146,8 +148,6 @@ def __init__(self, algo_class=None):
         self.broadcast_interval = 1
         self.num_aggregation_workers = 0
         self.num_gpu_loader_threads = 8
-        # IMPALA takes care of its own EnvRunner (weights, connector, metrics) synching.
-        self._dont_auto_sync_env_runner_states = True
         self.grad_clip = 40.0
         # Note: Only when using enable_rl_module_and_learner=True can the clipping mode
@@ -168,6 +168,9 @@ def __init__(self, algo_class=None):
         # __sphinx_doc_end__
         # fmt: on

+        # IMPALA takes care of its own EnvRunner (weights, connector, metrics) synching.
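
A short usage sketch of the `CircularBuffer` defined above. The stub batch class is made up here for self-containment; real callers pass `MultiAgentBatch`es, whose `env_steps()` the buffer uses to count dropped timesteps:

    from ray.rllib.algorithms.appo.utils import CircularBuffer

    class _StubBatch:
        """Hypothetical stand-in for a MultiAgentBatch."""
        def env_steps(self):
            return 50

    buf = CircularBuffer(num_batches=2, iterations_per_batch=2)  # N=2, K=2
    buf.add(_StubBatch())
    buf.add(_StubBatch())
    # Buffer is full -> adding again evicts the oldest batch; its 2 unused
    # iterations x 50 env steps = 100 dropped timesteps are returned.
    dropped = buf.add(_StubBatch())
    print(dropped, len(buf))  # -> 100 2
    batch = buf.sample()  # uniform-random; a batch expires after K samples
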
+ self._dont_auto_sync_env_runner_states = True + self.lr_schedule = None # @OldAPIStack self.entropy_coeff_schedule = None # @OldAPIStack self.num_multi_gpu_tower_stacks = 1 # @OldAPIstack @@ -181,7 +184,6 @@ def __init__(self, algo_class=None): self.epsilon = 0.1 # @OldAPIstack self._separate_vf_optimizer = False # @OldAPIstack self._lr_vf = 0.0005 # @OldAPIstack - self.train_batch_size = 500 # @OldAPIstack self.num_gpus = 1 # @OldAPIstack self._tf_policy_handles_more_than_one_loss = True # @OldAPIstack diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py index c38315d543b7..1929f9f010d6 100644 --- a/rllib/algorithms/impala/impala_learner.py +++ b/rllib/algorithms/impala/impala_learner.py @@ -3,11 +3,12 @@ import queue import threading import time -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Union import tree # pip install dm_tree import ray +from ray.rllib.algorithms.appo.utils import CircularBuffer from ray.rllib.algorithms.impala.impala import LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY from ray.rllib.core.columns import Columns from ray.rllib.core.learner.learner import Learner @@ -71,7 +72,7 @@ def build(self) -> None: ): self._learner_connector.prepend(AddOneTsToEpisodesAndTruncate()) # Leave all batches on the CPU (they'll be moved to the GPU, if applicable, - # by the n GPU loader threads. + # by the n GPU loader threads). numpy_to_tensor_connector = self._learner_connector[NumpyToTensor][0] numpy_to_tensor_connector._device = "cpu" # TODO (sven): Provide API? @@ -80,7 +81,9 @@ def build(self) -> None: # on the "update queue" for the actual RLModule forward pass and loss # computations. self._gpu_loader_in_queue = queue.Queue() - self._learner_thread_in_queue = deque(maxlen=self.config.learner_queue_size) + # Default is to have a learner thread. + if not hasattr(self, "_learner_thread_in_queue"): + self._learner_thread_in_queue = deque(maxlen=self.config.learner_queue_size) self._learner_thread_out_queue = queue.Queue() # Create and start the GPU loader thread(s). @@ -103,9 +106,6 @@ def build(self) -> None: in_queue=self._learner_thread_in_queue, out_queue=self._learner_thread_out_queue, metrics_logger=self.metrics, - num_epochs=self.config.num_epochs, - minibatch_size=self.config.minibatch_size, - shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, ) self._learner_thread.start() @@ -115,13 +115,6 @@ def update_from_episodes( episodes: List[EpisodeType], *, timesteps: Dict[str, Any], - # TODO (sven): Deprecate these in favor of config attributes for only those - # algos that actually need (and know how) to do minibatching. - minibatch_size: Optional[int] = None, - num_epochs: int = 1, - shuffle_batch_per_epoch: bool = False, - num_total_minibatches: int = 0, - reduce_fn=None, # Deprecated args. **kwargs, ) -> ResultDict: self.metrics.set_value( @@ -175,15 +168,25 @@ def update_from_episodes( self._gpu_loader_in_queue.qsize(), ) else: - # Enqueue to Learner thread's in-queue. - _LearnerThread.enqueue( - self._learner_thread_in_queue, - MultiAgentBatch( - {mid: SampleBatch(b) for mid, b in batch.items()}, - env_steps=env_steps, - ), - self.metrics, + ma_batch = MultiAgentBatch( + {mid: SampleBatch(b) for mid, b in batch.items()}, + env_steps=env_steps, ) + # Add the batch directly to the circular buffer. 
+ if isinstance(self._learner_thread_in_queue, CircularBuffer): + ts_dropped = self._learner_thread_in_queue.add(ma_batch) + self.metrics.log_value( + (ALL_MODULES, LEARNER_THREAD_ENV_STEPS_DROPPED), + ts_dropped, + reduce="sum", + ) + else: + # Enqueue to Learner thread's in-queue. + _LearnerThread.enqueue( + self._learner_thread_in_queue, + ma_batch, + self.metrics, + ) # Return all queued result dicts thus far (after reducing over them). results = {} @@ -263,8 +266,17 @@ def _step(self) -> None: policy_batches={mid: SampleBatch(b) for mid, b in batch_on_gpu.items()}, env_steps=env_steps, ) - # Enqueue to Learner thread's in-queue. - _LearnerThread.enqueue(self._out_queue, ma_batch_on_gpu, self.metrics) + + if isinstance(self._out_queue, CircularBuffer): + ts_dropped = self._out_queue.add(ma_batch_on_gpu) + self.metrics.log_value( + (ALL_MODULES, LEARNER_THREAD_ENV_STEPS_DROPPED), + ts_dropped, + reduce="sum", + ) + else: + # Enqueue to Learner thread's in-queue. + _LearnerThread.enqueue(self._out_queue, ma_batch_on_gpu, self.metrics) class _LearnerThread(threading.Thread): @@ -275,9 +287,6 @@ def __init__( in_queue: deque, out_queue: queue.Queue, metrics_logger, - num_epochs, - minibatch_size, - shuffle_batch_per_epoch, ): super().__init__() self.daemon = True @@ -285,13 +294,9 @@ def __init__( self.stopped = False self._update_method = update_method - self._in_queue: deque = in_queue + self._in_queue: Union[deque, CircularBuffer] = in_queue self._out_queue: queue.Queue = out_queue - self._num_epochs = num_epochs - self._minibatch_size = minibatch_size - self._shuffle_batch_per_epoch = shuffle_batch_per_epoch - def run(self) -> None: while not self.stopped: self.step() @@ -299,14 +304,19 @@ def run(self) -> None: def step(self): # Get a new batch from the GPU-data (deque.pop -> newest item first). with self.metrics.log_time((ALL_MODULES, LEARNER_THREAD_IN_QUEUE_WAIT_TIMER)): - if not self._in_queue: - time.sleep(0.001) - return - # Consume from the left (oldest batches first). - # If we consumed from the right, we would run into the danger of learning - # from newer batches (left side) most times, BUT sometimes grabbing a - # really old batches (right area of deque). - ma_batch_on_gpu = self._in_queue.popleft() + # Get a new batch from the GPU-data (learner queue OR circular buffer). + if isinstance(self._in_queue, CircularBuffer): + ma_batch_on_gpu = self._in_queue.sample() + else: + # Queue is empty: Sleep a tiny bit to avoid CPU-thrashing. + if not self._in_queue: + time.sleep(0.001) + return + # Consume from the left (oldest batches first). + # If we consumed from the right, we would run into the danger of + # learning from newer batches (left side) most times, BUT sometimes + # grabbing older batches (right area of deque). + ma_batch_on_gpu = self._in_queue.popleft() # Call the update method on the batch. 
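
For the plain-deque path, a small pure-Python illustration of the two properties the comments above rely on: `maxlen` silently drops the oldest entries on append, and `popleft()` consumes oldest-first:

    from collections import deque

    q = deque(maxlen=3)
    for i in range(5):
        q.append(i)      # entries 0 and 1 get pushed out once the deque is full
    print(list(q))       # -> [2, 3, 4]
    print(q.popleft())   # -> 2 (the oldest remaining batch is consumed first)
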
with self.metrics.log_time((ALL_MODULES, LEARNER_THREAD_UPDATE_TIMER)): @@ -321,9 +331,6 @@ def step(self): (ALL_MODULES, NUM_ENV_STEPS_SAMPLED_LIFETIME), default=0 ) }, - num_epochs=self._num_epochs, - minibatch_size=self._minibatch_size, - shuffle_batch_per_epoch=self._shuffle_batch_per_epoch, ) # We have to deepcopy the results dict, b/c we must avoid having a returned # Stats object sit in the queue and getting a new (possibly even tensor) diff --git a/rllib/algorithms/impala/vtrace_torch.py b/rllib/algorithms/impala/vtrace_torch.py index 35d88822ca89..b63a5181c7ac 100644 --- a/rllib/algorithms/impala/vtrace_torch.py +++ b/rllib/algorithms/impala/vtrace_torch.py @@ -228,6 +228,7 @@ def multi_from_logits( behaviour_action_log_probs, device="cpu" ) behaviour_action_log_probs = force_list(behaviour_action_log_probs) + # log_rhos = target_logp - behavior_logp log_rhos = get_log_rhos(target_action_log_probs, behaviour_action_log_probs) vtrace_returns = from_importance_weights( diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index c26cd6a22a94..5c3aa575ea9e 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -1134,6 +1134,9 @@ def _finalize_fn(batch: Dict[str, numpy.ndarray]) -> Dict[str, Any]: fwd_out, loss_per_module, tensor_metrics = self._update( batch.policy_batches ) + # Convert logged tensor metrics (logged during tensor-mode of MetricsLogger) + # to actual (numpy) values. + self.metrics.tensors_to_numpy(tensor_metrics) self._set_slicing_by_batch_id(batch, value=False) # If `num_iters` is reached break and return. @@ -1143,9 +1146,6 @@ def _finalize_fn(batch: Dict[str, numpy.ndarray]) -> Dict[str, Any]: logger.info( f"===> [Learner {id(self)}] number of iterations run in this epoch: {i}" ) - # Convert logged tensor metrics (logged during tensor-mode of MetricsLogger) - # to actual (numpy) values. - self.metrics.tensors_to_numpy(tensor_metrics) # Log all individual RLModules' loss terms and its registered optimizers' # current learning rates. @@ -1350,15 +1350,6 @@ def _update_from_batch_or_episodes( {next(iter(self.module.keys())): batch}, env_steps=len(batch) ) - # TODO (sven): Remove this leftover hack here for the situation in which we - # did not go through the learner connector. - # Options: - # a) Either also pass given batches through the learner connector (even if - # episodes is None). (preferred solution) - # b) Get rid of the option to pass in a batch altogether. - # if episodes is None: - # batch = self._convert_batch_type(batch) - # Check the MultiAgentBatch, whether our RLModule contains all ModuleIDs # found in this batch. If not, throw an error. unknown_module_ids = set(batch.policy_batches.keys()) - set(self.module.keys()) diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index 31994fa5dcce..a80de4cd2e76 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -171,11 +171,9 @@ def __init__( self._worker_manager = FaultTolerantActorManager( self._workers, - # TODO (sven): This probably works even without any restriction - # (allowing for any arbitrary number of requests in-flight). Test with - # 3 first, then with unlimited, and if both show the same behavior on - # an async algo, remove this restriction entirely. - max_remote_requests_in_flight_per_actor=3, + max_remote_requests_in_flight_per_actor=( + self.config.max_requests_in_flight_per_learner + ), ) # Counters for the tags for asynchronous update requests that are # in-flight. 
Used for keeping track of and grouping together the results of
diff --git a/rllib/core/learner/torch/torch_learner.py b/rllib/core/learner/torch/torch_learner.py
index c8cdff3b8ca3..e04a8b491c9a 100644
--- a/rllib/core/learner/torch/torch_learner.py
+++ b/rllib/core/learner/torch/torch_learner.py
@@ -147,23 +147,7 @@ def _uncompiled_update(
         # Activate tensor-mode on our MetricsLogger.
         self.metrics.activate_tensor_mode()

-        # Log off-policy'ness of this update.
-        off_policyness = {
-            (mid, DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY): (
-                (self._weights_seq_no - module_batch[WEIGHTS_SEQ_NO]).float()
-            )
-            for mid, module_batch in batch.items()
-            if WEIGHTS_SEQ_NO in module_batch
-        }
-        for key in off_policyness.keys():
-            mid = key[0]
-            if Columns.LOSS_MASK not in batch[mid]:
-                off_policyness[key] = torch.mean(off_policyness[key])
-            else:
-                mask = batch[mid][Columns.LOSS_MASK]
-                num_valid = torch.sum(mask)
-                off_policyness[key] = torch.sum(off_policyness[key][mask]) / num_valid
-        self.metrics.log_dict(off_policyness, window=1)
+        self._compute_off_policyness(batch)

         fwd_out = self.module.forward_train(batch)
         loss_per_module = self.compute_losses(fwd_out=fwd_out, batch=batch)
@@ -630,6 +614,25 @@ def _map_module_to_device(self, module: MultiRLModule) -> None:
             if isinstance(module[key], torch.nn.Module):
                 module[key].to(self._device)

+    def _compute_off_policyness(self, batch):
+        # Log off-policy'ness of this batch wrt the current weights.
+        off_policyness = {
+            (mid, DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY): (
+                (self._weights_seq_no - module_batch[WEIGHTS_SEQ_NO]).float()
+            )
+            for mid, module_batch in batch.items()
+            if WEIGHTS_SEQ_NO in module_batch
+        }
+        for key in off_policyness.keys():
+            mid = key[0]
+            if Columns.LOSS_MASK not in batch[mid]:
+                off_policyness[key] = torch.mean(off_policyness[key])
+            else:
+                mask = batch[mid][Columns.LOSS_MASK]
+                num_valid = torch.sum(mask)
+                off_policyness[key] = torch.sum(off_policyness[key][mask]) / num_valid
+        self.metrics.log_dict(off_policyness, window=1)
+
     @override(Learner)
     def _get_tensor_variable(
         self, value, dtype=None, trainable=False
diff --git a/rllib/examples/envs/custom_gym_env.py b/rllib/examples/envs/custom_gym_env.py
index 01fa5ecc452f..2612575adb63 100644
--- a/rllib/examples/envs/custom_gym_env.py
+++ b/rllib/examples/envs/custom_gym_env.py
@@ -45,6 +45,8 @@
 | 18.3034          | 28000 | 0.908918 | 12.9676            |
 +------------------+-------+----------+--------------------+
 """
+# These tags allow extracting portions of this script on Anyscale.
+# ws-template-imports-start
 import gymnasium as gym
 from gymnasium.spaces import Discrete, Box
 import numpy as np
@@ -52,6 +54,8 @@
 from typing import Optional

+# ws-template-imports-end
+
 from ray.rllib.utils.test_utils import (
     add_rllib_example_script_args,
     run_rllib_example_script_experiment,
@@ -71,6 +75,8 @@
 )

+# These tags allow extracting portions of this script on Anyscale.
+# ws-template-code-start
 class SimpleCorridor(gym.Env):
     """Example of a custom env in which the agent has to walk down a corridor.
@@ -126,6 +132,8 @@ def step(self, action):
     )

+# ws-template-code-end
+
 if __name__ == "__main__":
     args = parser.parse_args()

diff --git a/rllib/examples/offline_rl/cartpole_recording.py b/rllib/examples/offline_rl/cartpole_recording.py
new file mode 100644
index 000000000000..42258ac46fe0
--- /dev/null
+++ b/rllib/examples/offline_rl/cartpole_recording.py
@@ -0,0 +1,163 @@
+"""Example showing how to record expert data from a trained policy.
+
+This example:
+    - demonstrates how you can train a single-agent expert PPO Policy (RLModule)
+    and checkpoint it.
+    - shows how you can then record expert data from the trained PPO Policy to
+    disk during evaluation.
+
+How to run this script
+----------------------
+`python [script file name].py --checkpoint-at-end`
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+Results to expect
+-----------------
+In the console output, you can see that the episode return of 350.0 is reached
+before the timestep stop criterion is hit. Afterwards, evaluation starts and
+runs for 10 iterations while recording the data. The number of recorded
+experiences may differ from one evaluation run to the next, because evaluation
+`EnvRunner`s sample entire episodes while recording timesteps, and episodes
+usually contain different numbers of timesteps. Note that this is different
+when recording episodes - in that case, each row is one episode.
+
++-----------------------------+------------+----------------------+
+| Trial name                  | status     | loc                  |
+|                             |            |                      |
+|-----------------------------+------------+----------------------+
+| PPO_CartPole-v1_df83f_00000 | TERMINATED | 192.168.0.119:233661 |
++-----------------------------+------------+----------------------+
++--------+------------------+------------------------+------------------------+
+|   iter |   total time (s) |   num_training_step_ca |   num_env_steps_sample |
+|        |                  |      lls_per_iteration |             d_lifetime |
++--------+------------------+------------------------+------------------------|
+|     21 |          25.9162 |                      1 |                  84000 |
++--------+------------------+------------------------+------------------------+
+
+...
+
+Number of experiences recorded: 26644
+"""
+
+import ray
+
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.core import COMPONENT_RL_MODULE
+from ray.rllib.core.columns import Columns
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EPISODE_RETURN_MEAN,
+    EVALUATION_RESULTS,
+    NUM_ENV_STEPS_SAMPLED_LIFETIME,
+)
+from ray.rllib.utils.test_utils import add_rllib_example_script_args
+
+parser = add_rllib_example_script_args(
+    default_timesteps=200000,
+    default_reward=350.0,
+)
+parser.set_defaults(checkpoint_at_end=True, max_concurrent_trials=1)
+# Use `parser` to add your own custom command line options to this script
+# and (if needed) use their values to set up `config` below.
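
One note on consuming such a recording later: the training-side config has to mirror the compression settings used while writing. A minimal sketch, assuming BC as the consumer (the algo choice and path here are illustrative, not part of this example script):

    from ray.rllib.algorithms.bc import BCConfig
    from ray.rllib.core.columns import Columns

    bc_config = (
        BCConfig()
        .environment("CartPole-v1")
        .offline_data(
            input_="local:///tmp/cartpole/",
            # Must mirror `output_compress_columns` used while recording.
            input_compress_columns=[Columns.OBS, Columns.ACTIONS],
        )
    )
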
+args = parser.parse_args() + +config = ( + PPOConfig() + .env_runners( + num_env_runners=5, + ) + .environment("CartPole-v1") + .rl_module( + model_config=DefaultModelConfig( + fcnet_hiddens=[32], + fcnet_activation="linear", + vf_share_layers=True, + ), + ) + .training( + lr=0.0003, + num_epochs=6, + vf_loss_coeff=0.01, + ) + .evaluation( + evaluation_num_env_runners=1, + evaluation_interval=1, + evaluation_parallel_to_training=True, + evaluation_config=PPOConfig.overrides(explore=False), + ) +) + +stop = { + f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": args.stop_timesteps, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": ( + args.stop_reward + ), +} + + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + results = run_rllib_example_script_experiment(config, args, stop=stop) + + # Store the best checkpoint for recording. + best_checkpoint = results.get_best_result( + metric=f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}", + mode="max", + ).checkpoint.path + + # Configure the algorithm for offline recording. + config.offline_data( + output="local:///tmp/cartpole/", + # Store columnar (tabular) data. + output_write_episodes=False, + # Each file should hold 1,000 rows. + output_max_rows_per_file=1000, + output_write_remaining_data=True, + # LZ4-compress columns 'obs', 'new_obs', and 'actions' to + # save disk space and increase performance. Note, this means + # that you have to use `input_compress_columns` in the same + # way when using the data for training in `RLlib`. + output_compress_columns=[Columns.OBS, Columns.ACTIONS], + ) + # Change the evaluation settings to sample exactly 50 episodes + # per evaluation iteration and increase the number of evaluation + # env-runners to 5. + config.evaluation( + evaluation_num_env_runners=5, + evaluation_duration=50, + evaluation_duration_unit="episodes", + evaluation_interval=1, + evaluation_parallel_to_training=False, + evaluation_config=PPOConfig.overrides(explore=False), + ) + + # Build the algorithm for evaluation. + algo = config.build() + # Load the checkpoint stored above. + algo.restore_from_path( + best_checkpoint, + component=COMPONENT_RL_MODULE, + ) + + # Evaluate over 10 iterations and record the data. + for i in range(10): + print(f"Iteration: {i + 1}:\n") + res = algo.evaluate() + print(res) + + # Stop the algorithm. + algo.stop() + + # Check the number of rows in the dataset. + ds = ray.data.read_parquet("local:///tmp/cartpole") + print(f"Number of experiences recorded: {ds.count()}") diff --git a/rllib/offline/offline_env_runner.py b/rllib/offline/offline_env_runner.py index 9da38b60bd6a..a0fee273c007 100644 --- a/rllib/offline/offline_env_runner.py +++ b/rllib/offline/offline_env_runner.py @@ -29,6 +29,13 @@ def __init__(self, config: AlgorithmConfig, **kwargs): # Initialize the parent. super().__init__(config, **kwargs) + # Get the data context for this `EnvRunner`. + data_context = ray.data.DataContext.get_current() + # Limit the resources for Ray Data to the CPUs given to this `EnvRunner`. + data_context.execution_options.resource_limits.cpu = ( + config.num_cpus_per_env_runner + ) + # Set the output write method. self.output_write_method = self.config.output_write_method self.output_write_method_kwargs = self.config.output_write_method_kwargs @@ -92,6 +99,10 @@ def __init__(self, config: AlgorithmConfig, **kwargs): else: self.write_data_this_iter = True + # If the remaining data should be stored. 
Note, this is only
+        # relevant if `output_max_rows_per_file` is defined.
+        self.write_remaining_data = self.config.output_write_remaining_data
+
         # Counts how often `sample` is called to define the output path for
         # each file.
         self._sample_counter = 0
@@ -155,15 +166,18 @@ def sample(
             if self.output_max_rows_per_file:
                 # Reset the event.
                 self.write_data_this_iter = False
-
-                # Extract the number of samples to be written to disk this iteration.
-                samples_to_write = self._samples[: self.output_max_rows_per_file]
-                # Reset the buffer to the remaining data. This only makes sense, if
-                # `rollout_fragment_length` is smaller `output_max_rows_per_file` or
-                # a 2 x `output_max_rows_per_file`.
-                # TODO (simon): Find a better way to write these data.
-                self._samples = self._samples[self.output_max_rows_per_file :]
-                samples_ds = ray.data.from_items(samples_to_write)
+                # Ensure that all data ready to be written is released from
+                # the buffer. Note, this is important in case many episodes
+                # were sampled and `output_max_rows_per_file` is relatively
+                # small.
+                while len(self._samples) >= self.output_max_rows_per_file:
+                    # Extract the number of samples to be written to disk this
+                    # iteration.
+                    samples_to_write = self._samples[: self.output_max_rows_per_file]
+                    # Reset the buffer to the remaining data. This only makes sense
+                    # if `rollout_fragment_length` is smaller than
+                    # `output_max_rows_per_file` or at most
+                    # 2 x `output_max_rows_per_file`.
+                    self._samples = self._samples[self.output_max_rows_per_file :]
+                    samples_ds = ray.data.from_items(samples_to_write)
             # Otherwise, write the complete data.
             else:
                 samples_ds = ray.data.from_items(self._samples)
@@ -183,6 +197,11 @@ def sample(
             except Exception as e:
                 logger.error(e)

+        self.metrics.log_value(
+            key="recording_buffer_size",
+            value=len(self._samples),
+        )
+
         # Finally return the samples as usual.
         return samples
@@ -196,11 +215,11 @@ def stop(self) -> None:
         """
         # If there are samples left over, we have to write them to a dataset.
-        if self._samples:
+        if self._samples and self.write_remaining_data:
             # Convert them to a `ray.data.Dataset`.
             samples_ds = ray.data.from_items(self._samples)
             # Increase the sample counter for the folder/file name.
-            self._sample_counter += 1.0
+            self._sample_counter += 1
             # Try to write the dataset to disk/cloud storage.
             try:
                 # Setup the path for writing data. Each run will be written to
diff --git a/rllib/offline/offline_prelearner.py b/rllib/offline/offline_prelearner.py
index b000f2c965fc..f5ffca03e75a 100644
--- a/rllib/offline/offline_prelearner.py
+++ b/rllib/offline/offline_prelearner.py
@@ -1,10 +1,10 @@
 import gymnasium as gym
 import logging
 import numpy as np
-import random
+import uuid
+
 from typing import Any, Dict, List, Optional, Union, Set, Tuple, TYPE_CHECKING

-import ray
 from ray.actor import ActorHandle
 from ray.rllib.core.columns import Columns
 from ray.rllib.core.learner import Learner
@@ -86,8 +86,8 @@ def __init__(
         self,
         config: "AlgorithmConfig",
         learner: Union[Learner, list[ActorHandle]],
+        locality_hints: Optional[List[str]] = None,
         spaces: Optional[Tuple[gym.Space, gym.Space]] = None,
-        locality_hints: Optional[list] = None,
         module_spec: Optional[MultiRLModuleSpec] = None,
         module_state: Optional[Dict[ModuleID, Any]] = None,
     ):
@@ -103,24 +103,6 @@ def __init__(
             self._module = self._learner._module
         # Otherwise we have remote `Learner`s.
         else:
-            # TODO (simon): Check with the data team how to get at
-            # initialization the data block location.
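
The `while` loop in `sample()` above, in miniature (pure Python, toy numbers): every full chunk of `output_max_rows_per_file` rows is flushed, and only the remainder stays buffered:

    samples = list(range(10))  # pretend 10 buffered rows
    max_rows = 4
    files = []
    while len(samples) >= max_rows:
        files.append(samples[:max_rows])  # write one file of max_rows rows
        samples = samples[max_rows:]      # keep the remainder buffered
    print(files)    # -> [[0, 1, 2, 3], [4, 5, 6, 7]]
    print(samples)  # -> [8, 9]  (written on stop(), if so configured)
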
- node_id = ray.get_runtime_context().get_node_id() - # Shuffle indices such that not each data block syncs weights - # with the same learner in case there are multiple learners - # on the same node like the `PreLearner`. - indices = list(range(len(locality_hints))) - random.shuffle(indices) - locality_hints = [locality_hints[i] for i in indices] - learner = [learner[i] for i in indices] - # Choose a learner from the same node. - for i, hint in enumerate(locality_hints): - if hint == node_id: - self._learner = learner[i] - # If no learner has been chosen, there is none on the same node. - if not self._learner: - # Then choose a learner randomly. - self._learner = learner[random.randint(0, len(learner) - 1)] self.learner_is_remote = True # Build the module from spec. Note, this will be a MultiRLModule. self._module = module_spec.build() @@ -525,21 +507,83 @@ def _map_sample_batch_to_episode( # TODO (simon): Add support for multi-agent episodes. NotImplementedError else: - # Unpack observations, if needed. - obs = ( - unpack_if_needed(obs.tolist()) - if schema[Columns.OBS] in input_compress_columns - else obs.tolist() - ) - # Append the last `new_obs` to get the correct length of observations. - obs.append( - unpack_if_needed(batch[schema[Columns.NEXT_OBS]][i][-1]) - if schema[Columns.OBS] in input_compress_columns - else batch[schema[Columns.NEXT_OBS]][i][-1] - ) + # Unpack observations, if needed. Note, observations could + # be either compressed by their entirety (the complete batch + # column) or individually (each column entry). + if isinstance(obs, str): + # Decompress the observations if we have a string, i.e. + # observations are compressed in their entirety. + obs = unpack_if_needed(obs) + # Convert to a list of arrays. This is needed as input by + # the `SingleAgentEpisode`. + obs = [obs[i, ...] for i in range(obs.shape[0])] + # Otherwise observations are only compressed inside of the + # batch column (if at all). + elif isinstance(obs, np.ndarray): + # Unpack observations, if they are compressed otherwise we + # simply convert to a list, which is needed by the + # `SingleAgentEpisode`. + obs = ( + unpack_if_needed(obs.tolist()) + if schema[Columns.OBS] in input_compress_columns + else obs.tolist() + ) + else: + raise TypeError( + f"Unknown observation type: {type(obs)}. When mapping " + "from old recorded `SampleBatches` batched " + "observations should be either of type `np.array` " + "or - if the column is compressed - of `str` type." + ) + + if schema[Columns.NEXT_OBS] in batch: + # Append the last `new_obs` to get the correct length of + # observations. + obs.append( + unpack_if_needed(batch[schema[Columns.NEXT_OBS]][i][-1]) + if schema[Columns.OBS] in input_compress_columns + else batch[schema[Columns.NEXT_OBS]][i][-1] + ) + else: + # Otherwise we duplicate the last observation. + obs.append(obs[-1]) + + # Check, if we have `done`, `truncated`, or `terminated`s in + # the batch. 
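
The str-vs-ndarray observation branching earlier in this hunk handles the two ways a recorded column can arrive. RLlib's compression helpers show the round trip; a small sketch (the array is made up):

    import numpy as np
    from ray.rllib.utils.compression import pack, unpack_if_needed

    obs = np.arange(6, dtype=np.float32).reshape(2, 3)
    blob = pack(obs)                   # LZ4-compressed, base64-encoded string
    restored = unpack_if_needed(blob)  # str/bytes input -> decompressed array
    untouched = unpack_if_needed(obs)  # non-compressed input passes through
    print(np.allclose(restored, obs), np.allclose(untouched, obs))  # -> True True
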
+ if ( + schema[Columns.TRUNCATEDS] in batch + and schema[Columns.TERMINATEDS] in batch + ): + truncated = batch[schema[Columns.TRUNCATEDS]][i][-1] + terminated = batch[schema[Columns.TERMINATEDS]][i][-1] + elif ( + schema[Columns.TRUNCATEDS] in batch + and schema[Columns.TERMINATEDS] not in batch + ): + truncated = batch[schema[Columns.TRUNCATEDS]][i][-1] + terminated = False + elif ( + schema[Columns.TRUNCATEDS] not in batch + and schema[Columns.TERMINATEDS] in batch + ): + terminated = batch[schema[Columns.TERMINATEDS]][i][-1] + truncated = False + elif "done" in batch: + terminated = batch["done"][i][-1] + truncated = False + # Otherwise, if no `terminated`, nor `truncated` nor `done` + # is given, we consider the episode as terminated. + else: + terminated = True + truncated = False + # Create a `SingleAgentEpisode`. episode = SingleAgentEpisode( - id_=str(batch[schema[Columns.EPS_ID]][i][0]), + # If the recorded episode has an ID we use this ID, + # otherwise we generate a new one. + id_=str(batch[schema[Columns.EPS_ID]][i][0]) + if schema[Columns.EPS_ID] in batch + else uuid.uuid4().hex, agent_id=agent_id, observations=obs, infos=( @@ -554,16 +598,8 @@ def _map_sample_batch_to_episode( else batch[schema[Columns.ACTIONS]][i] ), rewards=batch[schema[Columns.REWARDS]][i], - terminated=( - any(batch[schema[Columns.TERMINATEDS]][i]) - if schema[Columns.TERMINATEDS] in batch - else any(batch["dones"][i]) - ), - truncated=( - any(batch[schema[Columns.TRUNCATEDS]][i]) - if schema[Columns.TRUNCATEDS] in batch - else False - ), + terminated=terminated, + truncated=truncated, # TODO (simon): Results in zero-length episodes in connector. # t_started=batch[Columns.T if Columns.T in batch else # "unroll_id"][i][0], diff --git a/rllib/offline/tests/test_offline_data.py b/rllib/offline/tests/test_offline_data.py index fad307cc7745..038e9cef383f 100644 --- a/rllib/offline/tests/test_offline_data.py +++ b/rllib/offline/tests/test_offline_data.py @@ -124,7 +124,7 @@ def test_sample_multiple_learners(self): num_samples=10, return_iterator=2, num_shards=2 ) self.assertIsInstance(batch, list) - # Ensure we have indeed two such `SStreamSplitDataIterator` instances. + # Ensure we have indeed two such `StreamSplitDataIterator` instances. self.assertEqual(len(batch), 2) from ray.data._internal.iterator.stream_split_iterator import ( StreamSplitDataIterator, diff --git a/rllib/tuned_examples/appo/cartpole_appo.py b/rllib/tuned_examples/appo/cartpole_appo.py index 0af651b6c607..a85a9120ba2a 100644 --- a/rllib/tuned_examples/appo/cartpole_appo.py +++ b/rllib/tuned_examples/appo/cartpole_appo.py @@ -16,6 +16,7 @@ APPOConfig() .environment("CartPole-v1") .training( + circular_buffer_iterations_per_batch=2, vf_loss_coeff=0.05, entropy_coeff=0.0, ) diff --git a/rllib/tuned_examples/appo/pong_appo.py b/rllib/tuned_examples/appo/pong_appo.py index d79dbaa13fc7..ca36ca60fb7c 100644 --- a/rllib/tuned_examples/appo/pong_appo.py +++ b/rllib/tuned_examples/appo/pong_appo.py @@ -65,7 +65,7 @@ def _env_creator(cfg): entropy_coeff=[[0, 0.05], [3000000, 0.0]], # <- crucial parameter to finetune # Only update connector states and model weights every n training_step calls. 
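
The tuned examples in this hunk switch from the old learner-queue knobs to the new circular-buffer ones. A minimal config sketch on the new API stack (all values illustrative only):

    from ray.rllib.algorithms.appo import APPOConfig

    config = (
        APPOConfig()
        .environment("CartPole-v1")
        .training(
            circular_buffer_num_batches=4,           # N from the IMPACT paper
            circular_buffer_iterations_per_batch=2,  # K from the IMPACT paper
            target_worker_clipping=2.0,              # rho (the paper's default)
        )
    )
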
broadcast_interval=5, - learner_queue_size=1, + circular_buffer_num_batches=1, ) .rl_module( model_config=DefaultModelConfig( diff --git a/rllib/tuned_examples/bc/cartpole_recording.py b/rllib/tuned_examples/bc/cartpole_recording.py deleted file mode 100644 index a75cb31a9228..000000000000 --- a/rllib/tuned_examples/bc/cartpole_recording.py +++ /dev/null @@ -1,61 +0,0 @@ -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - EVALUATION_RESULTS, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) -from ray.rllib.utils.test_utils import add_rllib_example_script_args - -parser = add_rllib_example_script_args() -# Use `parser` to add your own custom command line options to this script -# and (if needed) use their values to set up `config` below. -args = parser.parse_args() - -config = ( - PPOConfig() - .env_runners( - rollout_fragment_length=1000, num_env_runners=0, batch_mode="truncate_episodes" - ) - .environment("CartPole-v1") - .rl_module( - model_config=DefaultModelConfig( - fcnet_hiddens=[32], - fcnet_activation="linear", - vf_share_layers=True, - ), - ) - .training( - lr=0.0003, - num_epochs=6, - vf_loss_coeff=0.01, - ) - .evaluation( - evaluation_num_env_runners=1, - evaluation_interval=1, - evaluation_parallel_to_training=True, - evaluation_config=PPOConfig.overrides(exploration=False), - ) - .offline_data( - output="local:///tmp/cartpole/", - output_write_episodes=False, - output_max_rows_per_file=1000, - # LZ4-compress columns 'obs', 'new_obs', and 'actions' to - # save disk space and increase performance. Note, this means - # that you have to use `input_compress_columns` in the same - # way when using the data for training in `RLlib`. - output_compress_columns=["obs", "new_obs", "actions"], - ) -) - -stop = { - f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 200000, - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 350.0, -} - - -if __name__ == "__main__": - from ray.rllib.utils.test_utils import run_rllib_example_script_experiment - - run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/rllib/tuned_examples/ppo/atari_ppo.py b/rllib/tuned_examples/ppo/atari_ppo.py index b4d881574f4e..c58c47898a1a 100644 --- a/rllib/tuned_examples/ppo/atari_ppo.py +++ b/rllib/tuned_examples/ppo/atari_ppo.py @@ -1,3 +1,5 @@ +# These tags allow extracting portions of this script on Anyscale. +# ws-template-imports-start import gymnasium as gym from ray import tune @@ -8,6 +10,7 @@ from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack from ray.rllib.utils.test_utils import add_rllib_example_script_args +# ws-template-imports-end parser = add_rllib_example_script_args( default_reward=float("inf"), @@ -22,7 +25,12 @@ # and (if needed) use their values to set up `config` below. args = parser.parse_args() +NUM_LEARNERS = args.num_learners or 1 +ENV = args.env + +# These tags allow extracting portions of this script on Anyscale. +# ws-template-code-start def _make_env_to_module_connector(env): return FrameStackingEnvToModule(num_frames=4) @@ -35,7 +43,7 @@ def _make_learner_connector(input_observation_space, input_action_space): # We would like our frame stacking connector to do this job. def _env_creator(cfg): return wrap_atari_for_new_api_stack( - gym.make(args.env, **cfg, render_mode="rgb_array"), + gym.make(ENV, **cfg, render_mode="rgb_array"), # Perform frame-stacking through ConnectorV2 API. 
framestack=None, ) @@ -43,7 +51,6 @@ def _env_creator(cfg): tune.register_env("env", _env_creator) - config = ( PPOConfig() .environment( @@ -57,20 +64,19 @@ def _env_creator(cfg): clip_rewards=True, ) .env_runners( - # num_envs_per_env_runner=5, # 5 on old yaml example env_to_module_connector=_make_env_to_module_connector, ) .training( learner_connector=_make_learner_connector, - train_batch_size_per_learner=4000, # 5000 on old yaml example - minibatch_size=128, # 500 on old yaml example + train_batch_size_per_learner=4000, + minibatch_size=128, lambda_=0.95, kl_coeff=0.5, clip_param=0.1, vf_clip_param=10.0, entropy_coeff=0.01, num_epochs=10, - lr=0.00015 * (args.num_learners or 1), + lr=0.00015 * NUM_LEARNERS, grad_clip=100.0, grad_clip_by="global_norm", ) @@ -83,7 +89,7 @@ def _env_creator(cfg): ), ) ) - +# ws-template-code-end if __name__ == "__main__": from ray.rllib.utils.test_utils import run_rllib_example_script_experiment diff --git a/rllib/utils/images.py b/rllib/utils/images.py index 0716ea5c45b4..7b0f1601d574 100644 --- a/rllib/utils/images.py +++ b/rllib/utils/images.py @@ -1,4 +1,5 @@ import logging +import importlib import numpy as np @@ -6,13 +7,29 @@ logger = logging.getLogger(__name__) + +@DeveloperAPI +def is_package_installed(package_name): + try: + importlib.metadata.version(package_name) + return True + except importlib.metadata.PackageNotFoundError: + return False + + try: import cv2 cv2.ocl.setUseOpenCL(False) logger.debug("CV2 found for image processing.") -except ImportError: +except ImportError as e: + if is_package_installed("opencv-python"): + raise ImportError( + f"OpenCV is installed, but we failed to import it. This may be because " + f"you need to install `opencv-python-headless` instead of " + f"`opencv-python`. Error message: {e}", + ) cv2 = None diff --git a/rllib/utils/metrics/__init__.py b/rllib/utils/metrics/__init__.py index d9a67933998c..5e7e3ad071bd 100644 --- a/rllib/utils/metrics/__init__.py +++ b/rllib/utils/metrics/__init__.py @@ -11,6 +11,9 @@ NUM_TRAINABLE_PARAMETERS = "num_trainable_parameters" NUM_NON_TRAINABLE_PARAMETERS = "num_non_trainable_parameters" +# Number of times `training_step()` was called in one iteration. +NUM_TRAINING_STEP_CALLS_PER_ITERATION = "num_training_step_calls_per_iteration" + # Counters for sampling, sampling (on eval workers) and # training steps (env- and agent steps). MEAN_NUM_EPISODE_LISTS_RECEIVED = "mean_num_episode_lists_received" diff --git a/rllib/utils/metrics/metrics_logger.py b/rllib/utils/metrics/metrics_logger.py index f1f6f4cc12e8..276d6891b14a 100644 --- a/rllib/utils/metrics/metrics_logger.py +++ b/rllib/utils/metrics/metrics_logger.py @@ -937,10 +937,10 @@ def deactivate_tensor_mode(self): assert self.tensor_mode self._tensor_mode = False # Return all logged tensors (logged during the tensor-mode phase). - ret = {key: self._get_key(key).peek() for key in self._tensor_keys} + logged_tensors = {key: self._get_key(key).peek() for key in self._tensor_keys} # Clear out logged tensor keys. self._tensor_keys.clear() - return ret + return logged_tensors def tensors_to_numpy(self, tensor_metrics): """Converts all previously logged and returned tensors back to numpy values.""" diff --git a/rllib/utils/metrics/stats.py b/rllib/utils/metrics/stats.py index 1929cec2b063..b13b7edb3b48 100644 --- a/rllib/utils/metrics/stats.py +++ b/rllib/utils/metrics/stats.py @@ -216,7 +216,8 @@ def __init__( # Code to execute when exiting a with-context. 
self._on_exit = on_exit - # On each `.reduce()` call, we store the result of this call in + # On each `.reduce()` call, we store the result of this call in hist[0] and the + # previous `reduce()` result in hist[1]. self._hist = (0, 0) def push(self, value) -> None: @@ -274,6 +275,7 @@ def peek(self, *, previous: bool = False) -> Any: The result of reducing the internal values list (or the previously computed reduced result, if `previous` is True). """ + # Return previously reduced value. if previous: return self._hist[1] return self._reduced_values()[0] @@ -355,10 +357,8 @@ def merge_in_parallel(self, *others: "Stats") -> None: # - Thereby always reducing across the different Stats objects' at the # current index. # - The resulting reduced value (across Stats at current index) is then - # repeated AND - # added to the new merged-values list n times (where n is the number of - # Stats, across - # which we merge). + # repeated AND added to the new merged-values list n times (where n is + # the number of Stats, across which we merge). # - The merged-values list is reversed. # Here: # index -1: [3, 6] -> [4.5, 4.5] @@ -381,13 +381,11 @@ def merge_in_parallel(self, *others: "Stats") -> None: stats.merge_in_parallel(stats1, stats2) # Same here: Fill new merged-values list: # - Start with index -1, moving to the start. - # - Thereby always reducing across the different Stats objects' at the + # - Thereby always reduce across the different Stats objects' at the # current index. # - The resulting reduced value (across Stats at current index) is then - # repeated AND - # added to the new merged-values list n times (where n is the number of - # Stats, across - # which we merge). + # repeated AND added to the new merged-values list n times (where n is the + # number of Stats, across which we merge). # - The merged-values list is reversed. # Here: # index -1: [3, 6] -> [6, 6] @@ -420,7 +418,7 @@ def merge_in_parallel(self, *others: "Stats") -> None: # Parallel-merge two (reduce=sum) stats with no window. # Note that when reduce="sum", we do NOT reduce across the indices of the - # parallel + # parallel values. stats = Stats(reduce="sum") stats1 = Stats(reduce="sum") stats1.push(1) @@ -435,7 +433,6 @@ def merge_in_parallel(self, *others: "Stats") -> None: # index -2: [0, 5] -> [3, 6, 0, 5] # index -3: [2, 4] -> [3, 6, 0, 5, 2, 4] # index -4: [1] -> [3, 6, 0, 5, 2, 4, 1] - # STOP after merged list contains >= 4 items (window size) # reverse: [1, 4, 2, 5, 0, 6, 3] stats.merge_in_parallel(stats1, stats2) check(stats.values, [1, 4, 2, 5, 0, 6, 3]) @@ -443,7 +440,7 @@ def merge_in_parallel(self, *others: "Stats") -> None: # Parallel-merge two "concat" (reduce=None) stats with no window. # Note that when reduce=None, we do NOT reduce across the indices of the - # parallel + # parallel values. stats = Stats(reduce=None, window=float("inf"), clear_on_reduce=True) stats1 = Stats(reduce=None, window=float("inf"), clear_on_reduce=True) stats1.push(1) @@ -586,7 +583,6 @@ def from_state(state: Dict[str, Any]) -> "Stats": def similar_to( other: "Stats", init_value: Optional[Any] = None, - prev_values: Optional[Tuple[Any, Any]] = None, ) -> "Stats": """Returns a new Stats object that's similar to `other`. 
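
A rough usage sketch of the two-slot reduce history documented above (the exact reduced values depend on the Stats settings; the comments only indicate which slot each call returns):

    from ray.rllib.utils.metrics.stats import Stats

    s = Stats(reduce="sum")
    s.push(1)
    s.push(2)
    s.reduce()                    # newest result -> hist[0]
    s.push(4)
    s.reduce()                    # prior result shifts to hist[1]
    print(s.peek(previous=True))  # returns hist[1]: the reduce() before last
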
diff --git a/src/mock/ray/core_worker/core_worker.h b/src/mock/ray/core_worker/core_worker.h index 60817fb7af1c..ff1e1d7ab130 100644 --- a/src/mock/ray/core_worker/core_worker.h +++ b/src/mock/ray/core_worker/core_worker.h @@ -14,28 +14,11 @@ #pragma once #include "gmock/gmock.h" #include "mock/ray/gcs/gcs_client/gcs_client.h" -namespace ray { -namespace core { -class MockCoreWorkerOptions : public CoreWorkerOptions { - public: -}; - -} // namespace core -} // namespace ray - -namespace ray { -namespace core { - -class MockCoreWorkerProcess : public CoreWorkerProcess { - public: -}; - -} // namespace core -} // namespace ray +namespace ray::core { -namespace ray { -namespace core { +class MockCoreWorkerOptions : public CoreWorkerOptions {}; +class MockCoreWorkerProcess : public CoreWorkerProcess {}; class MockCoreWorker : public CoreWorker { public: @@ -179,5 +162,4 @@ class MockCoreWorker : public CoreWorker { (override)); }; -} // namespace core -} // namespace ray +} // namespace ray::core diff --git a/src/mock/ray/core_worker/reference_count.h b/src/mock/ray/core_worker/reference_count.h index c0679dec135f..c9f7a1d0b415 100644 --- a/src/mock/ray/core_worker/reference_count.h +++ b/src/mock/ray/core_worker/reference_count.h @@ -41,7 +41,7 @@ class MockReferenceCounter : public ReferenceCounterInterface { bool add_local_ref, const absl::optional &pinned_at_raylet_id)); - MOCK_METHOD2(AddObjectPrimaryCopyDeleteCallback, + MOCK_METHOD2(AddObjectOutOfScopeOrFreedCallback, bool(const ObjectID &object_id, const std::function callback)); diff --git a/src/mock/ray/gcs/gcs_server/gcs_node_manager.h b/src/mock/ray/gcs/gcs_server/gcs_node_manager.h index 3a8f22949fae..7a3efe197529 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_node_manager.h +++ b/src/mock/ray/gcs/gcs_server/gcs_node_manager.h @@ -18,7 +18,11 @@ namespace gcs { class MockGcsNodeManager : public GcsNodeManager { public: - MockGcsNodeManager() : GcsNodeManager(nullptr, nullptr, nullptr, ClusterID::Nil()) {} + MockGcsNodeManager() + : GcsNodeManager(/*gcs_publisher=*/nullptr, + /*gcs_table_storage=*/nullptr, + /*raylet_client_pool=*/nullptr, + /*cluster_id=*/ClusterID::Nil()) {} MOCK_METHOD(void, HandleRegisterNode, (rpc::RegisterNodeRequest request, diff --git a/src/ray/common/BUILD b/src/ray/common/BUILD index 416dd9659983..563c53007ecc 100644 --- a/src/ray/common/BUILD +++ b/src/ray/common/BUILD @@ -181,6 +181,7 @@ ray_cc_library( "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/synchronization", ], ) diff --git a/src/ray/common/runtime_env_manager.cc b/src/ray/common/runtime_env_manager.cc index d765dbc4f1de..08187cf29932 100644 --- a/src/ray/common/runtime_env_manager.cc +++ b/src/ray/common/runtime_env_manager.cc @@ -31,12 +31,13 @@ void RuntimeEnvManager::AddURIReference(const std::string &hex_id, if (!uris.working_dir_uri().empty()) { const auto &uri = uris.working_dir_uri(); uri_reference_[uri]++; - id_to_uris_[hex_id].push_back(uri); + id_to_uris_[hex_id].emplace_back(uri); RAY_LOG(DEBUG) << "[working_dir] Added URI Reference " << uri << " for id " << hex_id; } + for (const auto &uri : uris.py_modules_uris()) { uri_reference_[uri]++; - id_to_uris_[hex_id].push_back(uri); + id_to_uris_[hex_id].emplace_back(uri); RAY_LOG(DEBUG) << "[py_modules] Added URI Reference " << uri << " for id " << hex_id; } PrintDebugString(); @@ -51,21 +52,24 @@ const std::vector 
&RuntimeEnvManager::GetReferences( void RuntimeEnvManager::RemoveURIReference(const std::string &hex_id) { RAY_LOG(DEBUG) << "Subtracting 1 from URI Reference for id " << hex_id; - if (!id_to_uris_.count(hex_id)) { + auto iter = id_to_uris_.find(hex_id); + if (iter == id_to_uris_.end()) { return; } - for (const auto &uri : id_to_uris_[hex_id]) { - --uri_reference_[uri]; - auto ref_count = uri_reference_[uri]; - RAY_CHECK(ref_count >= 0); - if (ref_count == 0) { - uri_reference_.erase(uri); + for (const auto &uri : iter->second) { + auto uri_ref_iter = uri_reference_.find(uri); + RAY_CHECK(uri_ref_iter != uri_reference_.end()); + --uri_ref_iter->second; + const auto new_ref_count = uri_ref_iter->second; + RAY_CHECK_GE(new_ref_count, 0); + if (new_ref_count == 0) { + uri_reference_.erase(uri_ref_iter); RAY_LOG(DEBUG) << "Deleting URI Reference " << uri; deleter_(uri, [](bool success) {}); } } - id_to_uris_.erase(hex_id); + id_to_uris_.erase(iter); PrintDebugString(); } diff --git a/src/ray/common/runtime_env_manager.h b/src/ray/common/runtime_env_manager.h index a6b282863307..ab58409d8d91 100644 --- a/src/ray/common/runtime_env_manager.h +++ b/src/ray/common/runtime_env_manager.h @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once + #include +#include #include "absl/container/flat_hash_map.h" #include "ray/common/id.h" @@ -32,9 +34,9 @@ class RuntimeEnvManager { public: using DeleteFunc = std::function)>; - explicit RuntimeEnvManager(DeleteFunc deleter) : deleter_(deleter) {} + explicit RuntimeEnvManager(DeleteFunc deleter) : deleter_(std::move(deleter)) {} - /// Increase the reference of URI by job or actor ID and runtime_env. + /// Increase the reference count of URI by job or actor ID and runtime_env. /// /// \param[in] hex_id The id of the runtime env. It can be an actor or job id. /// \param[in] runtime_env_info The runtime env used by the id. @@ -53,7 +55,7 @@ class RuntimeEnvManager { /// \return The URIs referenced by the id. const std::vector &GetReferences(const std::string &hex_id) const; - /// Decrease the reference of URI by job_id + /// Decrease the reference count of URI by job_id /// \param[in] hex_id The id of the runtime env. void RemoveURIReference(const std::string &hex_id); diff --git a/src/ray/common/task/task.cc b/src/ray/common/task/task.cc index e2ac8571c4e5..812c0598cc35 100644 --- a/src/ray/common/task/task.cc +++ b/src/ray/common/task/task.cc @@ -14,7 +14,7 @@ #include "ray/common/task/task.h" -#include +#include "absl/strings/str_format.h" namespace ray { @@ -42,9 +42,7 @@ const std::string &RayTask::GetPreferredNodeID() const { return preferred_node_i void RayTask::ComputeDependencies() { dependencies_ = task_spec_.GetDependencies(); } std::string RayTask::DebugString() const { - std::ostringstream stream; - stream << "task_spec={" << task_spec_.DebugString() << "}"; - return stream.str(); + return absl::StrFormat("task_spec={%s}", task_spec_.DebugString()); } } // namespace ray diff --git a/src/ray/common/task/task.h b/src/ray/common/task/task.h index 5a4a9e323de5..52165665da2c 100644 --- a/src/ray/common/task/task.h +++ b/src/ray/common/task/task.h @@ -21,8 +21,6 @@ namespace ray { -typedef std::function CancelTaskCallback; - /// \class RayTask /// /// A RayTask represents a Ray task and a specification of its execution (e.g., @@ -33,7 +31,7 @@ class RayTask { public: /// Construct an empty task. 
This should only be used to pass a task /// as an out parameter to a function or method. - RayTask() {} + RayTask() = default; /// Construct a `RayTask` object from a protobuf message. /// @@ -41,7 +39,7 @@ class RayTask { explicit RayTask(const rpc::Task &message); /// Construct a `RayTask` object from a `TaskSpecification`. - RayTask(TaskSpecification task_spec); + explicit RayTask(TaskSpecification task_spec); RayTask(TaskSpecification task_spec, std::string preferred_node_id); diff --git a/src/ray/common/task/task_spec.cc b/src/ray/common/task/task_spec.cc index a1b3c04f80b3..6ffc13f5a8e6 100644 --- a/src/ray/common/task/task_spec.cc +++ b/src/ray/common/task/task_spec.cc @@ -181,15 +181,15 @@ ray::FunctionDescriptor TaskSpecification::FunctionDescriptor() const { return ray::FunctionDescriptorBuilder::FromProto(message_->function_descriptor()); } -rpc::RuntimeEnvInfo TaskSpecification::RuntimeEnvInfo() const { +const rpc::RuntimeEnvInfo &TaskSpecification::RuntimeEnvInfo() const { return message_->runtime_env_info(); } -std::string TaskSpecification::SerializedRuntimeEnv() const { +const std::string &TaskSpecification::SerializedRuntimeEnv() const { return message_->runtime_env_info().serialized_runtime_env(); } -rpc::RuntimeEnvConfig TaskSpecification::RuntimeEnvConfig() const { +const rpc::RuntimeEnvConfig &TaskSpecification::RuntimeEnvConfig() const { return message_->runtime_env_info().runtime_env_config(); } @@ -208,7 +208,7 @@ int TaskSpecification::GetRuntimeEnvHash() const { return runtime_env_hash_; } const SchedulingClass TaskSpecification::GetSchedulingClass() const { if (!IsActorTask()) { // Actor task doesn't have scheduling id, so we don't need to check this. - RAY_CHECK(sched_cls_id_ > 0); + RAY_CHECK_GT(sched_cls_id_, 0); } return sched_cls_id_; } diff --git a/src/ray/common/task/task_spec.h b/src/ray/common/task/task_spec.h index 019835062d31..ac4a38c92cee 100644 --- a/src/ray/common/task/task_spec.h +++ b/src/ray/common/task/task_spec.h @@ -306,11 +306,11 @@ class TaskSpecification : public MessageWrapper { ray::FunctionDescriptor FunctionDescriptor() const; - [[nodiscard]] rpc::RuntimeEnvInfo RuntimeEnvInfo() const; + [[nodiscard]] const rpc::RuntimeEnvInfo &RuntimeEnvInfo() const; - std::string SerializedRuntimeEnv() const; + const std::string &SerializedRuntimeEnv() const; - rpc::RuntimeEnvConfig RuntimeEnvConfig() const; + const rpc::RuntimeEnvConfig &RuntimeEnvConfig() const; bool HasRuntimeEnv() const; diff --git a/src/ray/common/task/task_util.h b/src/ray/common/task/task_util.h index 488c52069aa4..4ecfab358c7b 100644 --- a/src/ray/common/task/task_util.h +++ b/src/ray/common/task/task_util.h @@ -135,7 +135,8 @@ class TaskSpecBuilder { const TaskID &submitter_task_id, const std::shared_ptr runtime_env_info = nullptr, const std::string &concurrency_group_name = "", - bool enable_task_events = true) { + bool enable_task_events = true, + const std::unordered_map &labels = {}) { message_->set_type(TaskType::NORMAL_TASK); message_->set_name(name); message_->set_language(language); @@ -165,6 +166,7 @@ class TaskSpecBuilder { } message_->set_concurrency_group_name(concurrency_group_name); message_->set_enable_task_events(enable_task_events); + message_->mutable_labels()->insert(labels.begin(), labels.end()); return *this; } diff --git a/src/ray/core_worker/actor_handle.cc b/src/ray/core_worker/actor_handle.cc index 4228cd392351..39257bbb7fcc 100644 --- a/src/ray/core_worker/actor_handle.cc +++ b/src/ray/core_worker/actor_handle.cc @@ -33,7 +33,8 @@ rpc::ActorHandle
CreateInnerActorHandle( const std::string &ray_namespace, int32_t max_pending_calls, bool execute_out_of_order, - absl::optional enable_task_events) { + absl::optional enable_task_events, + const std::unordered_map &labels) { rpc::ActorHandle inner; inner.set_actor_id(actor_id.Data(), actor_id.Size()); inner.set_owner_id(owner_id.Binary()); @@ -50,6 +51,7 @@ rpc::ActorHandle CreateInnerActorHandle( inner.set_execute_out_of_order(execute_out_of_order); inner.set_max_pending_calls(max_pending_calls); inner.set_enable_task_events(enable_task_events.value_or(kDefaultTaskEventEnabled)); + inner.mutable_labels()->insert(labels.begin(), labels.end()); return inner; } @@ -82,6 +84,7 @@ rpc::ActorHandle CreateInnerActorHandleFromActorData( inner.set_execute_out_of_order( task_spec.actor_creation_task_spec().execute_out_of_order()); inner.set_max_pending_calls(task_spec.actor_creation_task_spec().max_pending_calls()); + inner.mutable_labels()->insert(task_spec.labels().begin(), task_spec.labels().end()); return inner; } } // namespace @@ -100,7 +103,8 @@ ActorHandle::ActorHandle( const std::string &ray_namespace, int32_t max_pending_calls, bool execute_out_of_order, - absl::optional enable_task_events) + absl::optional enable_task_events, + const std::unordered_map &labels) : ActorHandle(CreateInnerActorHandle(actor_id, owner_id, owner_address, @@ -114,7 +118,8 @@ ActorHandle::ActorHandle( ray_namespace, max_pending_calls, execute_out_of_order, - enable_task_events)) {} + enable_task_events, + labels)) {} ActorHandle::ActorHandle(const std::string &serialized) : ActorHandle(CreateInnerActorHandleFromString(serialized)) {} diff --git a/src/ray/core_worker/actor_handle.h b/src/ray/core_worker/actor_handle.h index 22f00c066dff..98306cb6d6b6 100644 --- a/src/ray/core_worker/actor_handle.h +++ b/src/ray/core_worker/actor_handle.h @@ -45,7 +45,8 @@ class ActorHandle { const std::string &ray_namespace, int32_t max_pending_calls, bool execute_out_of_order = false, - absl::optional enable_task_events = absl::nullopt); + absl::optional enable_task_events = absl::nullopt, + const std::unordered_map &labels = {}); /// Constructs an ActorHandle from a serialized string. explicit ActorHandle(const std::string &serialized); @@ -105,6 +106,10 @@ class ActorHandle { bool ExecuteOutOfOrder() const { return inner_.execute_out_of_order(); } + const ::google::protobuf::Map &GetLabels() const { + return inner_.labels(); + } + private: // Protobuf-defined persistent state of the actor handle. 
const rpc::ActorHandle inner_; diff --git a/src/ray/core_worker/actor_manager.cc b/src/ray/core_worker/actor_manager.cc index 02a89a7c65c9..a31c402fae11 100644 --- a/src/ray/core_worker/actor_manager.cc +++ b/src/ray/core_worker/actor_manager.cc @@ -172,7 +172,7 @@ bool ActorManager::AddActorHandle(std::unique_ptr actor_handle, } if (inserted && owned) { - RAY_CHECK(reference_counter_->AddObjectPrimaryCopyDeleteCallback( + RAY_CHECK(reference_counter_->AddObjectOutOfScopeOrFreedCallback( actor_creation_return_id, [this, actor_id](const ObjectID &object_id) { MarkActorKilledOrOutOfScope(GetActorHandle(actor_id)); })); diff --git a/src/ray/core_worker/actor_manager.h b/src/ray/core_worker/actor_manager.h index dcfe8e11a68d..a42cdcc13d6b 100644 --- a/src/ray/core_worker/actor_manager.h +++ b/src/ray/core_worker/actor_manager.h @@ -20,6 +20,7 @@ #include "ray/core_worker/actor_creator.h" #include "ray/core_worker/actor_handle.h" #include "ray/core_worker/reference_count.h" +#include "ray/core_worker/transport/actor_task_submitter.h" #include "ray/core_worker/transport/task_receiver.h" #include "ray/gcs/gcs_client/gcs_client.h" namespace ray { diff --git a/src/ray/core_worker/common.h b/src/ray/core_worker/common.h index 3a160cd302d8..bc4c18c22bb1 100644 --- a/src/ray/core_worker/common.h +++ b/src/ray/core_worker/common.h @@ -67,14 +67,16 @@ struct TaskOptions { const std::string &concurrency_group_name = "", int64_t generator_backpressure_num_objects = -1, const std::string &serialized_runtime_env_info = "{}", - bool enable_task_events = kDefaultTaskEventEnabled) + bool enable_task_events = kDefaultTaskEventEnabled, + const std::unordered_map &labels = {}) : name(name), num_returns(num_returns), resources(resources), concurrency_group_name(concurrency_group_name), serialized_runtime_env_info(serialized_runtime_env_info), generator_backpressure_num_objects(generator_backpressure_num_objects), - enable_task_events(enable_task_events) {} + enable_task_events(enable_task_events), + labels(labels) {} /// The name of this task. std::string name; @@ -95,6 +97,7 @@ struct TaskOptions { /// True if task events (worker::TaskEvent) from this task should be reported, default /// to true. bool enable_task_events = kDefaultTaskEventEnabled; + std::unordered_map labels; }; /// Options for actor creation tasks. @@ -115,7 +118,8 @@ struct ActorCreationOptions { const std::vector &concurrency_groups = {}, bool execute_out_of_order = false, int32_t max_pending_calls = -1, - bool enable_task_events = kDefaultTaskEventEnabled) + bool enable_task_events = kDefaultTaskEventEnabled, + const std::unordered_map &labels = {}) : max_restarts(max_restarts), max_task_retries(max_task_retries), max_concurrency(max_concurrency), @@ -132,7 +136,8 @@ struct ActorCreationOptions { execute_out_of_order(execute_out_of_order), max_pending_calls(max_pending_calls), scheduling_strategy(scheduling_strategy), - enable_task_events(enable_task_events) { + enable_task_events(enable_task_events), + labels(labels) { // Check that resources is a subset of placement resources. for (auto &resource : resources) { auto it = this->placement_resources.find(resource.first); @@ -187,6 +192,7 @@ struct ActorCreationOptions { /// True if task events (worker::TaskEvent) from this creation task should be reported /// default to true. 
const bool enable_task_events = kDefaultTaskEventEnabled; + const std::unordered_map labels; }; using PlacementStrategy = rpc::PlacementStrategy; @@ -285,11 +291,11 @@ template <> struct hash { size_t operator()(const ray::rpc::LineageReconstructionTask &task) const { size_t hash = std::hash()(task.name()); - for (const auto &resource : task.resources()) { - hash ^= std::hash()(resource.first); - hash ^= std::hash()(resource.second); - } hash ^= std::hash()(task.status()); + for (const auto &label : task.labels()) { + hash ^= std::hash()(label.first); + hash ^= std::hash()(label.second); + } return hash; } }; diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 9a15e8702892..e04ade96a0fe 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -126,8 +126,129 @@ std::optional TryGetLocalObjectLocation( } // namespace -CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_id) - : options_(options), +TaskCounter::TaskCounter() { + counter_.SetOnChangeCallback( + [this](const std::tuple + &key) ABSL_EXCLUSIVE_LOCKS_REQUIRED(&mu_) mutable { + if (std::get<1>(key) != TaskStatusType::kRunning) { + return; + } + const auto &func_name = std::get<0>(key); + const auto is_retry = std::get<2>(key); + const int64_t running_total = counter_.Get(key); + const int64_t num_in_get = running_in_get_counter_.Get({func_name, is_retry}); + const int64_t num_in_wait = running_in_wait_counter_.Get({func_name, is_retry}); + const auto is_retry_label = is_retry ? "1" : "0"; + // RUNNING_IN_RAY_GET/WAIT are sub-states of RUNNING, so we need to subtract + // them out to avoid double-counting. + ray::stats::STATS_tasks.Record( + running_total - num_in_get - num_in_wait, + {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING)}, + {"Name", func_name}, + {"IsRetry", is_retry_label}, + {"JobId", job_id_}, + {"Source", "executor"}}); + // Negate the metrics recorded from the submitter process for these tasks. + ray::stats::STATS_tasks.Record( + -running_total, + {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::SUBMITTED_TO_WORKER)}, + {"Name", func_name}, + {"IsRetry", is_retry_label}, + {"JobId", job_id_}, + {"Source", "executor"}}); + // Record sub-state for get. + ray::stats::STATS_tasks.Record( + num_in_get, + {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING_IN_RAY_GET)}, + {"Name", func_name}, + {"IsRetry", is_retry_label}, + {"JobId", job_id_}, + {"Source", "executor"}}); + // Record sub-state for wait. 
+ ray::stats::STATS_tasks.Record( + num_in_wait, + {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING_IN_RAY_WAIT)}, + {"Name", func_name}, + {"IsRetry", is_retry_label}, + {"JobId", job_id_}, + {"Source", "executor"}}); + }); +} + +void TaskCounter::RecordMetrics() { + absl::MutexLock l(&mu_); + counter_.FlushOnChangeCallbacks(); + if (IsActor()) { + float running = 0.0; + float in_get = 0.0; + float in_wait = 0.0; + float idle = 0.0; + if (running_in_wait_counter_.Total() > 0) { + in_wait = 1.0; + } else if (running_in_get_counter_.Total() > 0) { + in_get = 1.0; + } else if (num_tasks_running_ > 0) { + running = 1.0; + } else { + idle = 1.0; + } + ray::stats::STATS_actors.Record(idle, + {{"State", "IDLE"}, + {"Name", actor_name_}, + {"Source", "executor"}, + {"JobId", job_id_}}); + ray::stats::STATS_actors.Record(running, + {{"State", "RUNNING_TASK"}, + {"Name", actor_name_}, + {"Source", "executor"}, + {"JobId", job_id_}}); + ray::stats::STATS_actors.Record(in_get, + {{"State", "RUNNING_IN_RAY_GET"}, + {"Name", actor_name_}, + {"Source", "executor"}, + {"JobId", job_id_}}); + ray::stats::STATS_actors.Record(in_wait, + {{"State", "RUNNING_IN_RAY_WAIT"}, + {"Name", actor_name_}, + {"Source", "executor"}, + {"JobId", job_id_}}); + } +} + +void TaskCounter::SetMetricStatus(const std::string &func_name, + rpc::TaskStatus status, + bool is_retry) { + absl::MutexLock l(&mu_); + // Add a no-op increment to counter_ so that + // it will invoke a callback upon RecordMetrics. + counter_.Increment({func_name, TaskStatusType::kRunning, is_retry}, 0); + if (status == rpc::TaskStatus::RUNNING_IN_RAY_GET) { + running_in_get_counter_.Increment({func_name, is_retry}); + } else if (status == rpc::TaskStatus::RUNNING_IN_RAY_WAIT) { + running_in_wait_counter_.Increment({func_name, is_retry}); + } else { + RAY_CHECK(false) << "Unexpected status " << rpc::TaskStatus_Name(status); + } +} + +void TaskCounter::UnsetMetricStatus(const std::string &func_name, + rpc::TaskStatus status, + bool is_retry) { + absl::MutexLock l(&mu_); + // Add a no-op decrement to counter_ so that + // it will invoke a callback upon RecordMetrics. + counter_.Decrement({func_name, TaskStatusType::kRunning, is_retry}, 0); + if (status == rpc::TaskStatus::RUNNING_IN_RAY_GET) { + running_in_get_counter_.Decrement({func_name, is_retry}); + } else if (status == rpc::TaskStatus::RUNNING_IN_RAY_WAIT) { + running_in_wait_counter_.Decrement({func_name, is_retry}); + } else { + RAY_LOG(FATAL) << "Unexpected status " << rpc::TaskStatus_Name(status); + } +} + +CoreWorker::CoreWorker(CoreWorkerOptions options, const WorkerID &worker_id) + : options_(std::move(options)), get_call_site_(RayConfig::instance().record_ref_creation_sites() ? options_.get_lang_stack : nullptr), @@ -339,7 +460,7 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ << "max_pending_lease_requests_per_scheduling_category can't be 0"; lease_request_rate_limiter_ = std::make_shared( - /*kMinConcurrentLeaseCap*/ 10); + /*min_concurrent_lease_cap_*/ 10); } // Register a callback to monitor add/removed nodes. 
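Note on the TaskCounter callback above: RUNNING_IN_RAY_GET and RUNNING_IN_RAY_WAIT are sub-states of RUNNING, so the RUNNING gauge is reported net of both. A minimal self-contained sketch of that accounting rule, using illustrative names rather than Ray's ray::stats API:

#include <cassert>
#include <cstdint>

// Tasks blocked in ray.get()/ray.wait() still count toward the raw running
// total, but are reported under their own sub-states, so the RUNNING gauge
// subtracts them to avoid double-counting (hypothetical helper, not Ray's).
int64_t ReportedRunning(int64_t running_total, int64_t num_in_get, int64_t num_in_wait) {
  return running_total - num_in_get - num_in_wait;
}

int main() {
  // 5 tasks on the executor, 2 blocked in ray.get(), 1 in ray.wait():
  // the reported gauges 2 (RUNNING) + 2 (IN_GET) + 1 (IN_WAIT) partition the 5.
  assert(ReportedRunning(5, 2, 1) == 2);
  return 0;
}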
@@ -516,14 +637,13 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ actor_creator_ = std::make_shared(gcs_client_); - actor_task_submitter_ = std::shared_ptr( - new ActorTaskSubmitter(*core_worker_client_pool_, - *memory_store_, - *task_manager_, - *actor_creator_, - on_excess_queueing, - io_service_, - reference_counter_)); + actor_task_submitter_ = std::make_shared(*core_worker_client_pool_, + *memory_store_, + *task_manager_, + *actor_creator_, + on_excess_queueing, + io_service_, + reference_counter_); auto node_addr_factory = [this](const NodeID &node_id) { absl::optional addr; @@ -564,15 +684,16 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ uint64_t object_size) { reference_counter_->ReportLocalityData(object_id, locations, object_size); }; - future_resolver_.reset(new FutureResolver(memory_store_, - reference_counter_, - std::move(report_locality_data_callback), - core_worker_client_pool_, - rpc_address_)); + future_resolver_ = + std::make_unique(memory_store_, + reference_counter_, + std::move(report_locality_data_callback), + core_worker_client_pool_, + rpc_address_); // Unfortunately the raylet client has to be constructed after the receivers. if (task_receiver_ != nullptr) { - task_argument_waiter_.reset(new DependencyWaiterImpl(*local_raylet_client_)); + task_argument_waiter_ = std::make_unique(*local_raylet_client_); task_receiver_->Init(core_worker_client_pool_, rpc_address_, task_argument_waiter_); } @@ -592,13 +713,13 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ for (const auto &node_id : object_locations.value()) { absl::optional addr = node_addr_factory(node_id); if (addr.has_value()) { - locations.push_back(addr.value()); - } else { - // We're getting potentially stale locations directly from the reference - // counter, so the location might be a dead node. - RAY_LOG(DEBUG).WithField(object_id).WithField(node_id) - << "Object location is dead, not using it in the recovery of object"; + locations.emplace_back(std::move(addr.value())); + continue; } + // We're getting potentially stale locations directly from the reference + // counter, so the location might be a dead node. 
+ RAY_LOG(DEBUG).WithField(object_id).WithField(node_id) + << "Object location is dead, not using it in the recovery of object"; } } callback(object_id, locations); @@ -1014,7 +1135,7 @@ void CoreWorker::RegisterToGcs(int64_t worker_launch_time_ms, std::chrono::system_clock::now().time_since_epoch()) .count(); worker_info.emplace("driver_id", worker_id.Binary()); - worker_info.emplace("start_time", std::to_string(start_time)); + worker_info.emplace("start_time", absl::StrFormat("%d", start_time)); if (!options_.driver_name.empty()) { worker_info.emplace("name", options_.driver_name); } @@ -1033,7 +1154,8 @@ void CoreWorker::RegisterToGcs(int64_t worker_launch_time_ms, worker_data->mutable_worker_address()->set_port(rpc_address_.port()); worker_data->mutable_worker_address()->set_worker_id(worker_id.Binary()); worker_data->set_worker_type(options_.worker_type); - worker_data->mutable_worker_info()->insert(worker_info.begin(), worker_info.end()); + worker_data->mutable_worker_info()->insert(std::make_move_iterator(worker_info.begin()), + std::make_move_iterator(worker_info.end())); worker_data->set_is_alive(true); worker_data->set_pid(pid_); @@ -1067,7 +1189,7 @@ void CoreWorker::InternalHeartbeat() { absl::MutexLock lock(&mutex_); while (!to_resubmit_.empty() && current_time_ms() > to_resubmit_.top().execution_time_ms) { - tasks_to_resubmit.push_back(std::move(to_resubmit_.top())); + tasks_to_resubmit.emplace_back(to_resubmit_.top()); to_resubmit_.pop(); } } @@ -1168,15 +1290,16 @@ Status CoreWorker::GetOwnerAddress(const ObjectID &object_id, std::vector CoreWorker::GetObjectRefs( const std::vector &object_ids) const { std::vector refs; + refs.reserve(object_ids.size()); for (const auto &object_id : object_ids) { rpc::ObjectReference ref; ref.set_object_id(object_id.Binary()); rpc::Address owner_address; if (reference_counter_->GetOwner(object_id, &owner_address)) { // NOTE(swang): Detached actors do not have an owner address set. - ref.mutable_owner_address()->CopyFrom(owner_address); + *ref.mutable_owner_address() = std::move(owner_address); } - refs.push_back(std::move(ref)); + refs.emplace_back(std::move(ref)); } return refs; } @@ -2115,7 +2238,8 @@ void CoreWorker::BuildCommonTaskSpec( const std::string &concurrency_group_name, bool include_job_config, int64_t generator_backpressure_num_objects, - bool enable_task_events) { + bool enable_task_events, + const std::unordered_map &labels) { // Build common task spec. auto override_runtime_env_info = OverrideTaskOrActorRuntimeEnvInfo(serialized_runtime_env_info); @@ -2161,7 +2285,8 @@ void CoreWorker::BuildCommonTaskSpec( main_thread_current_task_id, override_runtime_env_info, concurrency_group_name, - enable_task_events); + enable_task_events, + labels); // Set task arguments. 
for (const auto &arg : args) { builder.AddArg(*arg); @@ -2217,7 +2342,8 @@ std::vector CoreWorker::SubmitTask( /*include_job_config*/ true, /*generator_backpressure_num_objects*/ task_options.generator_backpressure_num_objects, - /*enable_task_event*/ task_options.enable_task_events); + /*enable_task_event*/ task_options.enable_task_events, + task_options.labels); ActorID root_detached_actor_id; if (!worker_context_.GetRootDetachedActorID().IsNil()) { root_detached_actor_id = worker_context_.GetRootDetachedActorID(); @@ -2308,7 +2434,8 @@ Status CoreWorker::CreateActor(const RayFunction &function, /*concurrency_group_name*/ "", /*include_job_config*/ true, /*generator_backpressure_num_objects*/ -1, - /*enable_task_events*/ actor_creation_options.enable_task_events); + /*enable_task_events*/ actor_creation_options.enable_task_events, + actor_creation_options.labels); // If the namespace is not specified, get it from the job. const auto ray_namespace = (actor_creation_options.ray_namespace.empty() @@ -2328,7 +2455,8 @@ Status CoreWorker::CreateActor(const RayFunction &function, ray_namespace, actor_creation_options.max_pending_calls, actor_creation_options.execute_out_of_order, - actor_creation_options.enable_task_events); + actor_creation_options.enable_task_events, + actor_creation_options.labels); std::string serialized_actor_handle; actor_handle->Serialize(&serialized_actor_handle); ActorID root_detached_actor_id; @@ -2456,9 +2584,8 @@ Status CoreWorker::CreatePlacementGroup( << ". It is probably " "because GCS server is dead or there's a high load there."; return Status::TimedOut(stream.str()); - } else { - return status; } + return status; } Status CoreWorker::RemovePlacementGroup(const PlacementGroupID &placement_group_id) { @@ -2472,9 +2599,8 @@ Status CoreWorker::RemovePlacementGroup(const PlacementGroupID &placement_group_ << ". 
It is probably " "because GCS server is dead or there's a high load there."; return Status::TimedOut(stream.str()); - } else { - return status; } + return status; } Status CoreWorker::WaitPlacementGroupReady(const PlacementGroupID &placement_group_id, @@ -2486,9 +2612,8 @@ Status CoreWorker::WaitPlacementGroupReady(const PlacementGroupID &placement_gro stream << "There was timeout in waiting for placement group " << placement_group_id << " creation."; return Status::TimedOut(stream.str()); - } else { - return status; } + return status; } Status CoreWorker::SubmitActorTask( @@ -2741,7 +2866,7 @@ std::optional CoreWorker::GetLocalActorState( ActorID CoreWorker::DeserializeAndRegisterActorHandle(const std::string &serialized, const ObjectID &outer_object_id, bool add_local_ref) { - std::unique_ptr actor_handle(new ActorHandle(serialized)); + auto actor_handle = std::make_unique(serialized); return actor_manager_->RegisterActorHandle(std::move(actor_handle), outer_object_id, CurrentCallSite(), @@ -2796,20 +2921,16 @@ CoreWorker::ListNamedActors(bool all_namespaces) { stream << "There was timeout in getting the list of named actors, " "probably because the GCS server is dead or under high load ."; return std::make_pair(std::move(actors), Status::TimedOut(stream.str())); - } else if (!status.ok()) { - return std::make_pair(std::move(actors), status); - } else { - return std::make_pair(std::move(actors), status); } + return std::make_pair(std::move(actors), std::move(status)); } std::pair, Status> CoreWorker::GetNamedActorHandleLocalMode(const std::string &name) { auto it = local_mode_named_actor_registry_.find(name); if (it == local_mode_named_actor_registry_.end()) { - std::ostringstream stream; - stream << "Failed to look up actor with name '" << name; - return std::make_pair(nullptr, Status::NotFound(stream.str())); + std::string err_msg = absl::StrFormat("Failed to look up actor with name %s", name); + return std::make_pair(nullptr, Status::NotFound(std::move(err_msg))); } return std::make_pair(GetActorHandle(it->second), Status::OK()); @@ -2818,12 +2939,13 @@ CoreWorker::GetNamedActorHandleLocalMode(const std::string &name) { std::pair>, Status> CoreWorker::ListNamedActorsLocalMode() { std::vector> actors; + actors.reserve(local_mode_named_actor_registry_.size()); for (auto it = local_mode_named_actor_registry_.begin(); it != local_mode_named_actor_registry_.end(); it++) { - actors.push_back(std::make_pair(/*namespace=*/"", it->first)); + actors.emplace_back(/*namespace=*/"", it->first); } - return std::make_pair(actors, Status::OK()); + return std::make_pair(std::move(actors), Status::OK()); } const std::string CoreWorker::GetActorName() const { @@ -2853,7 +2975,8 @@ void CoreWorker::RunTaskExecutionLoop() { Exit(rpc::WorkerExitType::INTENDED_USER_EXIT, absl::StrCat("Worker exits by a signal. ", status.message()), nullptr); - } else if (status.IsUnexpectedSystemExit()) { + } + if (status.IsUnexpectedSystemExit()) { Exit( rpc::WorkerExitType::SYSTEM_ERROR, absl::StrCat("Worker exits unexpectedly by a signal. 
", status.message()), @@ -3009,14 +3132,14 @@ Status CoreWorker::ExecuteTask( Status status; TaskType task_type = TaskType::NORMAL_TASK; if (task_spec.IsActorCreationTask()) { - RAY_CHECK(return_objects->size() > 0); + RAY_CHECK_GT(return_objects->size(), static_cast(0)); return_objects->pop_back(); task_type = TaskType::ACTOR_CREATION_TASK; SetActorId(task_spec.ActorCreationId()); task_counter_.BecomeActor(task_spec.FunctionDescriptor()->ClassName()); { - std::unique_ptr self_actor_handle( - new ActorHandle(task_spec.GetSerializedActorHandle())); + auto self_actor_handle = + std::make_unique(task_spec.GetSerializedActorHandle()); // Register the handle to the current actor itself. actor_manager_->RegisterActorHandle(std::move(self_actor_handle), ObjectID::Nil(), @@ -3058,9 +3181,9 @@ Status CoreWorker::ExecuteTask( defined_concurrency_groups, name_of_concurrency_group_to_execute, /*is_reattempt=*/task_spec.AttemptNumber() > 0, - /*is_streaming_generator*/ task_spec.IsStreamingGenerator(), - /*retry_exception*/ task_spec.ShouldRetryExceptions(), - /*generator_backpressure_num_objects*/ + /*is_streaming_generator=*/task_spec.IsStreamingGenerator(), + /*retry_exception=*/task_spec.ShouldRetryExceptions(), + /*generator_backpressure_num_objects=*/ task_spec.GeneratorBackpressureNumObjects()); // Get the reference counts for any IDs that we borrowed during this task, @@ -3073,7 +3196,7 @@ Status CoreWorker::ExecuteTask( if (!borrowed_ids.empty()) { reference_counter_->PopAndClearLocalBorrowers(borrowed_ids, borrowed_refs, &deleted); } - if (dynamic_return_objects != NULL) { + if (dynamic_return_objects != nullptr) { for (const auto &dynamic_return : *dynamic_return_objects) { reference_counter_->PopAndClearLocalBorrowers( {dynamic_return.first}, borrowed_refs, &deleted); @@ -3092,7 +3215,7 @@ Status CoreWorker::ExecuteTask( } if (!options_.is_local_mode) { - SetCurrentTaskId(TaskID::Nil(), /*attempt_number=*/0, ""); + SetCurrentTaskId(TaskID::Nil(), /*attempt_number=*/0, /*task_name=*/""); worker_context_.ResetCurrentTask(); } { @@ -3737,7 +3860,7 @@ void CoreWorker::ProcessSubscribeForObjectEviction( // Returns true if the object was present and the callback was added. It might have // already been evicted by the time we get this request, in which case we should // respond immediately so the raylet unpins the object. - if (!reference_counter_->AddObjectPrimaryCopyDeleteCallback(object_id, unpin_object)) { + if (!reference_counter_->AddObjectOutOfScopeOrFreedCallback(object_id, unpin_object)) { // If the object is already evicted (callback cannot be set), unregister the // subscription & publish the message so that the subscriber knows it. 
unpin_object(object_id); @@ -3939,7 +4062,7 @@ void CoreWorker::ProcessSubscribeObjectLocations( std::unordered_map CoreWorker::GetLocalOngoingLineageReconstructionTasks() const { - return task_manager_->GetOngoingLineageReconstructionTasks(); + return task_manager_->GetOngoingLineageReconstructionTasks(*actor_manager_); } Status CoreWorker::GetLocalObjectLocations( @@ -4731,11 +4854,11 @@ void CoreWorker::UpdateTaskIsDebuggerPaused(const TaskID &task_id, ClusterSizeBasedLeaseRequestRateLimiter::ClusterSizeBasedLeaseRequestRateLimiter( size_t min_concurrent_lease_limit) - : kMinConcurrentLeaseCap(min_concurrent_lease_limit), num_alive_nodes_(0) {} + : min_concurrent_lease_cap_(min_concurrent_lease_limit), num_alive_nodes_(0) {} size_t ClusterSizeBasedLeaseRequestRateLimiter:: GetMaxPendingLeaseRequestsPerSchedulingCategory() { - return std::max(kMinConcurrentLeaseCap, num_alive_nodes_.load()); + return std::max(min_concurrent_lease_cap_, num_alive_nodes_.load()); } void ClusterSizeBasedLeaseRequestRateLimiter::OnNodeChanges( diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 2d135a5983f8..4ab8a5cd0f3c 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -59,8 +59,7 @@ /// "RPC_SERVICE_HANDLER(CoreWorkerService, ExampleCall, 1)" /// 4) Add a method to the CoreWorker class below: "CoreWorker::HandleExampleCall" -namespace ray { -namespace core { +namespace ray::core { JobID GetProcessJobID(const CoreWorkerOptions &options); @@ -69,57 +68,10 @@ JobID GetProcessJobID(const CoreWorkerOptions &options); class TaskCounter { /// A task can only be one of the following state. Received state in particular /// covers from the point of RPC call to beginning execution. - enum TaskStatusType { kPending, kRunning, kFinished }; + enum class TaskStatusType { kPending, kRunning, kFinished }; public: - TaskCounter() { - counter_.SetOnChangeCallback( - [this](const std::tuple &key) - ABSL_EXCLUSIVE_LOCKS_REQUIRED(&mu_) mutable { - if (std::get<1>(key) != kRunning) { - return; - } - auto func_name = std::get<0>(key); - auto is_retry = std::get<2>(key); - int64_t running_total = counter_.Get(key); - int64_t num_in_get = running_in_get_counter_.Get({func_name, is_retry}); - int64_t num_in_wait = running_in_wait_counter_.Get({func_name, is_retry}); - auto is_retry_label = is_retry ? "1" : "0"; - // RUNNING_IN_RAY_GET/WAIT are sub-states of RUNNING, so we need to subtract - // them out to avoid double-counting. - ray::stats::STATS_tasks.Record( - running_total - num_in_get - num_in_wait, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING)}, - {"Name", func_name}, - {"IsRetry", is_retry_label}, - {"JobId", job_id_}, - {"Source", "executor"}}); - // Negate the metrics recorded from the submitter process for these tasks. - ray::stats::STATS_tasks.Record( - -running_total, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::SUBMITTED_TO_WORKER)}, - {"Name", func_name}, - {"IsRetry", is_retry_label}, - {"JobId", job_id_}, - {"Source", "executor"}}); - // Record sub-state for get. - ray::stats::STATS_tasks.Record( - num_in_get, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING_IN_RAY_GET)}, - {"Name", func_name}, - {"IsRetry", is_retry_label}, - {"JobId", job_id_}, - {"Source", "executor"}}); - // Record sub-state for wait. 
- ray::stats::STATS_tasks.Record( - num_in_wait, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING_IN_RAY_WAIT)}, - {"Name", func_name}, - {"IsRetry", is_retry_label}, - {"JobId", job_id_}, - {"Source", "executor"}}); - }); - } + TaskCounter(); void BecomeActor(const std::string &actor_name) { absl::MutexLock l(&mu_); @@ -133,95 +85,35 @@ class TaskCounter { bool IsActor() ABSL_EXCLUSIVE_LOCKS_REQUIRED(&mu_) { return actor_name_.size() > 0; } - void RecordMetrics() { - absl::MutexLock l(&mu_); - counter_.FlushOnChangeCallbacks(); - if (IsActor()) { - float running = 0.0; - float in_get = 0.0; - float in_wait = 0.0; - float idle = 0.0; - if (running_in_wait_counter_.Total() > 0) { - in_wait = 1.0; - } else if (running_in_get_counter_.Total() > 0) { - in_get = 1.0; - } else if (num_tasks_running_ > 0) { - running = 1.0; - } else { - idle = 1.0; - } - ray::stats::STATS_actors.Record(idle, - {{"State", "IDLE"}, - {"Name", actor_name_}, - {"Source", "executor"}, - {"JobId", job_id_}}); - ray::stats::STATS_actors.Record(running, - {{"State", "RUNNING_TASK"}, - {"Name", actor_name_}, - {"Source", "executor"}, - {"JobId", job_id_}}); - ray::stats::STATS_actors.Record(in_get, - {{"State", "RUNNING_IN_RAY_GET"}, - {"Name", actor_name_}, - {"Source", "executor"}, - {"JobId", job_id_}}); - ray::stats::STATS_actors.Record(in_wait, - {{"State", "RUNNING_IN_RAY_WAIT"}, - {"Name", actor_name_}, - {"Source", "executor"}, - {"JobId", job_id_}}); - } - } + void RecordMetrics(); void IncPending(const std::string &func_name, bool is_retry) { absl::MutexLock l(&mu_); - counter_.Increment({func_name, kPending, is_retry}); + counter_.Increment({func_name, TaskStatusType::kPending, is_retry}); } void MovePendingToRunning(const std::string &func_name, bool is_retry) { absl::MutexLock l(&mu_); - counter_.Swap({func_name, kPending, is_retry}, {func_name, kRunning, is_retry}); + counter_.Swap({func_name, TaskStatusType::kPending, is_retry}, + {func_name, TaskStatusType::kRunning, is_retry}); num_tasks_running_++; } void MoveRunningToFinished(const std::string &func_name, bool is_retry) { absl::MutexLock l(&mu_); - counter_.Swap({func_name, kRunning, is_retry}, {func_name, kFinished, is_retry}); + counter_.Swap({func_name, TaskStatusType::kRunning, is_retry}, + {func_name, TaskStatusType::kFinished, is_retry}); num_tasks_running_--; - RAY_CHECK(num_tasks_running_ >= 0); + RAY_CHECK_GE(num_tasks_running_, 0); } void SetMetricStatus(const std::string &func_name, rpc::TaskStatus status, - bool is_retry) { - absl::MutexLock l(&mu_); - // Add a no-op increment to counter_ so that - // it will invoke a callback upon RecordMetrics. - counter_.Increment({func_name, TaskStatusType::kRunning, is_retry}, 0); - if (status == rpc::TaskStatus::RUNNING_IN_RAY_GET) { - running_in_get_counter_.Increment({func_name, is_retry}); - } else if (status == rpc::TaskStatus::RUNNING_IN_RAY_WAIT) { - running_in_wait_counter_.Increment({func_name, is_retry}); - } else { - RAY_CHECK(false) << "Unexpected status " << rpc::TaskStatus_Name(status); - } - } + bool is_retry); void UnsetMetricStatus(const std::string &func_name, rpc::TaskStatus status, - bool is_retry) { - absl::MutexLock l(&mu_); - // Add a no-op decrement to counter_ so that - // it will invoke a callback upon RecordMetrics. 
- counter_.Decrement({func_name, TaskStatusType::kRunning, is_retry}, 0); - if (status == rpc::TaskStatus::RUNNING_IN_RAY_GET) { - running_in_get_counter_.Decrement({func_name, is_retry}); - } else if (status == rpc::TaskStatus::RUNNING_IN_RAY_WAIT) { - running_in_wait_counter_.Decrement({func_name, is_retry}); - } else { - RAY_CHECK(false) << "Unexpected status " << rpc::TaskStatus_Name(status); - } - } + bool is_retry); private: mutable absl::Mutex mu_; @@ -274,7 +166,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// /// \param[in] options The various initialization options. /// \param[in] worker_id ID of this worker. - CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_id); + CoreWorker(CoreWorkerOptions options, const WorkerID &worker_id); CoreWorker(CoreWorker const &) = delete; @@ -335,6 +227,19 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { const TaskID &GetCurrentTaskId() const { return worker_context_.GetCurrentTaskID(); } + const std::string GetCurrentTaskName() const { + return worker_context_.GetCurrentTask() != nullptr + ? worker_context_.GetCurrentTask()->GetName() + : ""; + } + + const std::string GetCurrentTaskFunctionName() const { + return (worker_context_.GetCurrentTask() != nullptr && + worker_context_.GetCurrentTask()->FunctionDescriptor() != nullptr) + ? worker_context_.GetCurrentTask()->FunctionDescriptor()->CallSiteString() + : ""; + } + /// Controls the is debugger paused flag. /// /// \param task_id The task id of the task to update. @@ -402,11 +307,10 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { } bool GetCurrentTaskRetryExceptions() const { - if (!options_.is_local_mode) { - return worker_context_.GetCurrentTask()->ShouldRetryExceptions(); - } else { + if (options_.is_local_mode) { return false; } + return worker_context_.GetCurrentTask()->ShouldRetryExceptions(); } void SetWebuiDisplay(const std::string &key, const std::string &message); @@ -1100,9 +1004,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { const ResourceMappingType GetResourceIDs() const; /// Create a profile event and push it to the TaskEventBuffer when the event is destructed.
- std::unique_ptr CreateProfileEvent( - - const std::string &event_name); + std::unique_ptr CreateProfileEvent(const std::string &event_name); int64_t GetNumTasksSubmitted() const { return normal_task_submitter_->GetNumTasksSubmitted(); @@ -1467,7 +1369,8 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { const std::string &concurrency_group_name = "", bool include_job_config = false, int64_t generator_backpressure_num_objects = -1, - bool enable_task_events = true); + bool enable_task_events = true, + const std::unordered_map &labels = {}); void SetCurrentTaskId(const TaskID &task_id, uint64_t attempt_number, const std::string &task_name); @@ -1970,8 +1873,7 @@ class ClusterSizeBasedLeaseRequestRateLimiter : public LeaseRequestRateLimiter { void OnNodeChanges(const rpc::GcsNodeInfo &data); private: - const size_t kMinConcurrentLeaseCap; + const size_t min_concurrent_lease_cap_; std::atomic num_alive_nodes_; }; -} // namespace core -} // namespace ray +} // namespace ray::core diff --git a/src/ray/core_worker/reference_count.cc b/src/ray/core_worker/reference_count.cc index eb5abe2ea387..6dd4c8bf6b7d 100644 --- a/src/ray/core_worker/reference_count.cc +++ b/src/ray/core_worker/reference_count.cc @@ -543,7 +543,7 @@ int64_t ReferenceCounter::ReleaseLineageReferences(ReferenceTable::iterator ref) RAY_LOG(DEBUG) << "Releasing lineage internal for argument " << argument_id; arg_it->second.lineage_ref_count--; if (arg_it->second.OutOfScope(lineage_pinning_enabled_)) { - DeleteObjectPrimaryCopy(arg_it); + OnObjectOutOfScopeOrFreed(arg_it); } if (arg_it->second.ShouldDelete(lineage_pinning_enabled_)) { RAY_CHECK(arg_it->second.on_ref_removed == nullptr); @@ -663,7 +663,7 @@ void ReferenceCounter::FreePlasmaObjects(const std::vector &object_ids } // Free only the plasma value. We must keep the reference around so that we // have the ownership information. - DeleteObjectPrimaryCopy(it); + OnObjectOutOfScopeOrFreed(it); } } @@ -700,8 +700,7 @@ void ReferenceCounter::DeleteReferenceInternal(ReferenceTable::iterator it, DeleteReferenceInternal(inner_it, deleted); } } - // Perform the deletion. 
- DeleteObjectPrimaryCopy(it); + OnObjectOutOfScopeOrFreed(it); if (deleted) { deleted->push_back(id); } @@ -764,20 +763,20 @@ int64_t ReferenceCounter::EvictLineage(int64_t min_bytes_to_evict) { return lineage_bytes_evicted; } -void ReferenceCounter::DeleteObjectPrimaryCopy(ReferenceTable::iterator it) { - RAY_LOG(DEBUG) << "Calling on_object_primary_copy_delete for object " << it->first - << " num callbacks: " - << it->second.on_object_primary_copy_delete_callbacks.size(); - for (const auto &callback : it->second.on_object_primary_copy_delete_callbacks) { +void ReferenceCounter::OnObjectOutOfScopeOrFreed(ReferenceTable::iterator it) { + RAY_LOG(DEBUG) << "Calling on_object_out_of_scope_or_freed_callbacks for object " + << it->first << " num callbacks: " + << it->second.on_object_out_of_scope_or_freed_callbacks.size(); + for (const auto &callback : it->second.on_object_out_of_scope_or_freed_callbacks) { callback(it->first); } - it->second.on_object_primary_copy_delete_callbacks.clear(); + it->second.on_object_out_of_scope_or_freed_callbacks.clear(); + UnsetObjectPrimaryCopy(it); +} + +void ReferenceCounter::UnsetObjectPrimaryCopy(ReferenceTable::iterator it) { it->second.pinned_at_raylet_id.reset(); if (it->second.spilled && !it->second.spilled_node_id.IsNil()) { - // The spilled copy of the object should get deleted during the - // on_object_primary_copy_delete callback, so reset the spill location metadata here. - // NOTE(swang): Spilled copies in cloud storage are not GCed, so we do not - // reset the spilled metadata. it->second.spilled = false; it->second.spilled_url = ""; it->second.spilled_node_id = NodeID::Nil(); @@ -795,7 +794,7 @@ bool ReferenceCounter::SetObjectRefDeletedCallback( return true; } -bool ReferenceCounter::AddObjectPrimaryCopyDeleteCallback( +bool ReferenceCounter::AddObjectOutOfScopeOrFreedCallback( const ObjectID &object_id, const std::function callback) { absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); @@ -812,7 +811,7 @@ bool ReferenceCounter::AddObjectPrimaryCopyDeleteCallback( return false; } - it->second.on_object_primary_copy_delete_callbacks.emplace_back(callback); + it->second.on_object_out_of_scope_or_freed_callbacks.emplace_back(callback); return true; } @@ -822,7 +821,7 @@ void ReferenceCounter::ResetObjectsOnRemovedNode(const NodeID &raylet_id) { const auto &object_id = it->first; if (it->second.pinned_at_raylet_id.value_or(NodeID::Nil()) == raylet_id || it->second.spilled_node_id == raylet_id) { - DeleteObjectPrimaryCopy(it); + UnsetObjectPrimaryCopy(it); if (!it->second.OutOfScope(lineage_pinning_enabled_)) { objects_to_recover_.push_back(object_id); } @@ -862,7 +861,7 @@ void ReferenceCounter::UpdateObjectPinnedAtRaylet(const ObjectID &object_id, if (check_node_alive_(raylet_id)) { it->second.pinned_at_raylet_id = raylet_id; } else { - DeleteObjectPrimaryCopy(it); + UnsetObjectPrimaryCopy(it); objects_to_recover_.push_back(object_id); } } @@ -1429,7 +1428,7 @@ bool ReferenceCounter::HandleObjectSpilled(const ObjectID &object_id, } else { RAY_LOG(DEBUG).WithField(spilled_node_id).WithField(object_id) << "Object spilled to dead node "; - DeleteObjectPrimaryCopy(it); + UnsetObjectPrimaryCopy(it); objects_to_recover_.push_back(object_id); } return true; diff --git a/src/ray/core_worker/reference_count.h b/src/ray/core_worker/reference_count.h index 4ef6e14a0016..5eb228301c34 100644 --- a/src/ray/core_worker/reference_count.h +++ b/src/ray/core_worker/reference_count.h @@ -49,7 +49,7 @@ class ReferenceCounterInterface { bool 
is_reconstructable, bool add_local_ref, const absl::optional &pinned_at_raylet_id = absl::optional()) = 0; - virtual bool AddObjectPrimaryCopyDeleteCallback( + virtual bool AddObjectOutOfScopeOrFreedCallback( const ObjectID &object_id, const std::function callback) = 0; virtual bool SetObjectRefDeletedCallback( @@ -320,7 +320,7 @@ class ReferenceCounter : public ReferenceCounterInterface, /// Adds the callback that will be run when the object goes out of scope /// (Reference.OutOfScope() returns true). /// Returns true if the object was in scope and the callback was added, else false. - bool AddObjectPrimaryCopyDeleteCallback( + bool AddObjectOutOfScopeOrFreedCallback( const ObjectID &object_id, const std::function callback) ABSL_LOCKS_EXCLUDED(mutex_); @@ -783,13 +783,13 @@ class ReferenceCounter : public ReferenceCounterInterface, /// Metadata related to borrowing. std::unique_ptr borrow_info; - /// Callback that will be called when this Object's primary copy - /// should be deleted: out of scope or internal_api.free + /// Callback that will be called when this object + /// is out of scope or manually freed. /// Note: when an object is out of scope, it can still /// have lineage ref count and on_object_ref_delete /// will be called when lineage ref count is also 0. std::vector> - on_object_primary_copy_delete_callbacks; + on_object_out_of_scope_or_freed_callbacks; /// Callback that will be called when the object ref is deleted /// from the reference table (all refs including lineage ref count go to 0). std::function on_object_ref_delete; @@ -847,9 +847,12 @@ class ReferenceCounter : public ReferenceCounterInterface, rpc::Address *owner_address = nullptr) const ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - /// Delete the object primary copy, if any. Also unsets the raylet address - /// that the object was pinned at, if the address was set. - void DeleteObjectPrimaryCopy(ReferenceTable::iterator it); + /// Unsets the raylet address + /// that the object was pinned at or spilled at, if the address was set. + void UnsetObjectPrimaryCopy(ReferenceTable::iterator it); + + /// This should be called whenever the object is out of scope or manually freed. + void OnObjectOutOfScopeOrFreed(ReferenceTable::iterator it); /// Shutdown if all references have gone out of scope and shutdown /// is scheduled. 
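The reference counter changes above split the old DeleteObjectPrimaryCopy into two paths: OnObjectOutOfScopeOrFreed runs the registered out-of-scope-or-freed callbacks exactly once and then clears the primary-copy location, while UnsetObjectPrimaryCopy only clears the location. Node-failure paths (ResetObjectsOnRemovedNode, spill to a dead node) now call the latter, so a lost-but-recoverable object no longer fires out-of-scope callbacks. A simplified sketch of the split, using toy types rather than the real ReferenceCounter:

#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Toy stand-in for a ReferenceCounter entry (illustrative, not Ray's struct).
struct Reference {
  std::vector<std::function<void()>> out_of_scope_or_freed_callbacks;
  std::optional<std::string> pinned_at;  // raylet holding the primary copy
};

// Only forget where the primary copy was; keep the callbacks for a later
// out-of-scope event (what the node-removal paths call).
void UnsetPrimaryCopy(Reference &ref) { ref.pinned_at.reset(); }

// Fire the callbacks once, then clear the location metadata (what the
// out-of-scope and free paths call).
void OnOutOfScopeOrFreed(Reference &ref) {
  for (auto &cb : ref.out_of_scope_or_freed_callbacks) cb();
  ref.out_of_scope_or_freed_callbacks.clear();
  UnsetPrimaryCopy(ref);
}

int main() {
  Reference ref;
  ref.pinned_at = "raylet-1";
  ref.out_of_scope_or_freed_callbacks.push_back(
      [] { std::cout << "unpin object / notify GCS\n"; });

  UnsetPrimaryCopy(ref);     // node died: location dropped, callbacks kept
  OnOutOfScopeOrFreed(ref);  // ref count hit zero: callbacks fire once
  return 0;
}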
diff --git a/src/ray/core_worker/task_manager.cc b/src/ray/core_worker/task_manager.cc index bc5a78c7862e..cb175bfd2ebb 100644 --- a/src/ray/core_worker/task_manager.cc +++ b/src/ray/core_worker/task_manager.cc @@ -16,6 +16,7 @@ #include "ray/common/buffer.h" #include "ray/common/common_protocol.h" +#include "ray/core_worker/actor_manager.h" #include "ray/gcs/pb_util.h" #include "ray/util/exponential_backoff.h" #include "ray/util/util.h" @@ -1475,7 +1476,8 @@ void TaskManager::SetTaskStatus( } std::unordered_map -TaskManager::GetOngoingLineageReconstructionTasks() const { +TaskManager::GetOngoingLineageReconstructionTasks( + const ActorManager &actor_manager) const { absl::MutexLock lock(&mu_); std::unordered_map result; for (const auto &task_it : submissible_tasks_) { @@ -1491,9 +1493,16 @@ TaskManager::GetOngoingLineageReconstructionTasks() const { rpc::LineageReconstructionTask task; task.set_name(task_entry.spec.GetName()); - auto resources = task_entry.spec.GetRequiredResources().GetResourceUnorderedMap(); - task.mutable_resources()->insert(resources.begin(), resources.end()); task.set_status(task_entry.GetStatus()); + if (task_entry.spec.IsNormalTask()) { + task.mutable_labels()->insert(task_entry.spec.GetMessage().labels().begin(), + task_entry.spec.GetMessage().labels().end()); + } else if (task_entry.spec.IsActorTask()) { + auto actor_handle = actor_manager.GetActorHandle(task_entry.spec.ActorId()); + RAY_CHECK(actor_handle) << "Actor task must be submitted via actor handle"; + const auto &labels = actor_handle->GetLabels(); + task.mutable_labels()->insert(labels.begin(), labels.end()); + } if (result.find(task) != result.end()) { result[task] += 1; diff --git a/src/ray/core_worker/task_manager.h b/src/ray/core_worker/task_manager.h index 196e18beb277..49188e9a630f 100644 --- a/src/ray/core_worker/task_manager.h +++ b/src/ray/core_worker/task_manager.h @@ -30,6 +30,8 @@ namespace ray { namespace core { +class ActorManager; + class TaskFinisherInterface { public: virtual void CompletePendingTask(const TaskID &task_id, @@ -603,7 +605,7 @@ class TaskManager : public TaskFinisherInterface, public TaskResubmissionInterfa /// Key is the lineage reconstruction task info. /// Value is the number of ongoing lineage reconstruction tasks of this type. std::unordered_map - GetOngoingLineageReconstructionTasks() const; + GetOngoingLineageReconstructionTasks(const ActorManager &actor_manager) const; /// Returns the generator ID that contains the dynamically allocated /// ObjectRefs, if the task is dynamic. Else, returns Nil. diff --git a/src/ray/core_worker/test/actor_manager_test.cc b/src/ray/core_worker/test/actor_manager_test.cc index b5c938c6c0ce..8f68f28cd845 100644 --- a/src/ray/core_worker/test/actor_manager_test.cc +++ b/src/ray/core_worker/test/actor_manager_test.cc @@ -168,7 +168,7 @@ class ActorManagerTest : public ::testing::Test { ray_namespace, -1, false); - EXPECT_CALL(*reference_counter_, AddObjectPrimaryCopyDeleteCallback(_, _)) + EXPECT_CALL(*reference_counter_, AddObjectOutOfScopeOrFreedCallback(_, _)) .WillRepeatedly(testing::Return(true)); actor_manager_->AddNewActorHandle(std::move(actor_handle), call_site, @@ -207,7 +207,7 @@ TEST_F(ActorManagerTest, TestAddAndGetActorHandleEndToEnd) { "", -1, false); - EXPECT_CALL(*reference_counter_, AddObjectPrimaryCopyDeleteCallback(_, _)) + EXPECT_CALL(*reference_counter_, AddObjectOutOfScopeOrFreedCallback(_, _)) .WillRepeatedly(testing::Return(true)); // Add an actor handle. 
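The expectations above stub the renamed AddObjectOutOfScopeOrFreedCallback to return true. A self-contained gmock sketch of the same pattern (toy ObjectID and interface, not Ray's headers), extended to capture the registered callback so a test could fire it by hand:

#include <functional>
#include <string>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

using ObjectID = std::string;  // stand-in for ray::ObjectID

class ReferenceCounterInterface {
 public:
  virtual ~ReferenceCounterInterface() = default;
  virtual bool AddObjectOutOfScopeOrFreedCallback(
      const ObjectID &object_id,
      const std::function<void(const ObjectID &)> &callback) = 0;
};

class MockReferenceCounter : public ReferenceCounterInterface {
 public:
  MOCK_METHOD(bool,
              AddObjectOutOfScopeOrFreedCallback,
              (const ObjectID &, const std::function<void(const ObjectID &)> &),
              (override));
};

// Link against gtest_main; no explicit main() needed.
TEST(OutOfScopeOrFreedCallback, CanBeCapturedAndFired) {
  MockReferenceCounter rc;
  std::function<void(const ObjectID &)> captured;
  EXPECT_CALL(rc, AddObjectOutOfScopeOrFreedCallback(testing::_, testing::_))
      .WillOnce([&captured](const ObjectID &, const auto &cb) {
        captured = cb;  // keep the callback so the test can invoke it
        return true;
      });

  bool fired = false;
  rc.AddObjectOutOfScopeOrFreedCallback(
      "obj-1", [&fired](const ObjectID &) { fired = true; });
  captured("obj-1");  // simulate the object going out of scope
  EXPECT_TRUE(fired);
}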
@@ -284,7 +284,7 @@ TEST_F(ActorManagerTest, RegisterActorHandles) { "", -1, false); - EXPECT_CALL(*reference_counter_, AddObjectPrimaryCopyDeleteCallback(_, _)) + EXPECT_CALL(*reference_counter_, AddObjectOutOfScopeOrFreedCallback(_, _)) .WillRepeatedly(testing::Return(true)); ObjectID outer_object_id = ObjectID::Nil(); diff --git a/src/ray/core_worker/test/reference_count_test.cc b/src/ray/core_worker/test/reference_count_test.cc index 4351692284e2..67423a3ed75a 100644 --- a/src/ray/core_worker/test/reference_count_test.cc +++ b/src/ray/core_worker/test/reference_count_test.cc @@ -572,9 +572,9 @@ TEST_F(ReferenceCountTest, TestUnreconstructableObjectOutOfScope) { // The object goes out of scope once it has no more refs. std::vector out; - ASSERT_FALSE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_FALSE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->AddOwnedObject(id, {}, address, "", 0, false, /*add_local_ref=*/true); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); ASSERT_FALSE(*out_of_scope); rc->RemoveLocalReference(id, &out); ASSERT_TRUE(*out_of_scope); @@ -582,9 +582,9 @@ TEST_F(ReferenceCountTest, TestUnreconstructableObjectOutOfScope) { // Unreconstructable objects go out of scope even if they have a nonzero // lineage ref count. *out_of_scope = false; - ASSERT_FALSE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_FALSE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->AddOwnedObject(id, {}, address, "", 0, false, /*add_local_ref=*/false); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->UpdateSubmittedTaskReferences({}, {id}); ASSERT_FALSE(*out_of_scope); rc->UpdateFinishedTaskReferences({}, {id}, false, empty_borrower, empty_refs, &out); @@ -2437,9 +2437,9 @@ TEST_F(ReferenceCountLineageEnabledTest, TestUnreconstructableObjectOutOfScope) // The object goes out of scope once it has no more refs. std::vector out; - ASSERT_FALSE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_FALSE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->AddOwnedObject(id, {}, address, "", 0, false, /*add_local_ref=*/true); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); ASSERT_FALSE(*out_of_scope); ASSERT_FALSE(*out_of_scope); rc->RemoveLocalReference(id, &out); @@ -2450,9 +2450,9 @@ TEST_F(ReferenceCountLineageEnabledTest, TestUnreconstructableObjectOutOfScope) // Unreconstructable objects stay in scope if they have a nonzero lineage ref // count. *out_of_scope = false; - ASSERT_FALSE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_FALSE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->AddOwnedObject(id, {}, address, "", 0, false, /*add_local_ref=*/false); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->UpdateSubmittedTaskReferences({return_id}, {id}); ASSERT_TRUE(rc->IsObjectPendingCreation(return_id)); ASSERT_FALSE(*out_of_scope); @@ -2541,7 +2541,7 @@ TEST_F(ReferenceCountLineageEnabledTest, TestPinLineageRecursive) { rc->UpdateFinishedTaskReferences({}, {id}, false, empty_borrower, empty_refs, &out); // We should fail to set the deletion callback because the object has // already gone out of scope. 
- ASSERT_FALSE(rc->AddObjectPrimaryCopyDeleteCallback( + ASSERT_FALSE(rc->AddObjectOutOfScopeOrFreedCallback( id, [&](const ObjectID &object_id) { ASSERT_FALSE(true); })); ASSERT_EQ(out.size(), 1); @@ -2658,7 +2658,7 @@ TEST_F(ReferenceCountLineageEnabledTest, TestPlasmaLocation) { ObjectID id = ObjectID::FromRandom(); NodeID node_id = NodeID::FromRandom(); rc->AddOwnedObject(id, {}, rpc::Address(), "", 0, true, /*add_local_ref=*/true); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); ASSERT_TRUE(rc->IsPlasmaObjectPinnedOrSpilled(id, &owned_by_us, &pinned_at, &spilled)); ASSERT_TRUE(owned_by_us); ASSERT_TRUE(pinned_at.IsNil()); @@ -2674,7 +2674,7 @@ TEST_F(ReferenceCountLineageEnabledTest, TestPlasmaLocation) { deleted->clear(); rc->AddOwnedObject(id, {}, rpc::Address(), "", 0, true, /*add_local_ref=*/true); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->UpdateObjectPinnedAtRaylet(id, node_id); rc->ResetObjectsOnRemovedNode(node_id); auto objects = rc->FlushObjectsToRecover(); @@ -2683,7 +2683,7 @@ TEST_F(ReferenceCountLineageEnabledTest, TestPlasmaLocation) { ASSERT_TRUE(rc->IsPlasmaObjectPinnedOrSpilled(id, &owned_by_us, &pinned_at, &spilled)); ASSERT_TRUE(owned_by_us); ASSERT_TRUE(pinned_at.IsNil()); - ASSERT_TRUE(deleted->count(id) > 0); + ASSERT_TRUE(deleted->empty()); deleted->clear(); } @@ -2699,7 +2699,7 @@ TEST_F(ReferenceCountTest, TestFree) { ASSERT_FALSE(rc->IsPlasmaObjectFreed(id)); rc->FreePlasmaObjects({id}); ASSERT_TRUE(rc->IsPlasmaObjectFreed(id)); - ASSERT_FALSE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_FALSE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); ASSERT_EQ(deleted->count(id), 0); rc->UpdateObjectPinnedAtRaylet(id, node_id); bool owned_by_us; @@ -2714,7 +2714,7 @@ TEST_F(ReferenceCountTest, TestFree) { // Test free after receiving information about where the object is pinned. 
rc->AddOwnedObject(id, {}, rpc::Address(), "", 0, true, /*add_local_ref=*/true); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->UpdateObjectPinnedAtRaylet(id, node_id); ASSERT_FALSE(rc->IsPlasmaObjectFreed(id)); rc->FreePlasmaObjects({id}); diff --git a/src/ray/core_worker/transport/actor_task_submitter.cc b/src/ray/core_worker/transport/actor_task_submitter.cc index c54b9deb16ec..af2600e0a6b8 100644 --- a/src/ray/core_worker/transport/actor_task_submitter.cc +++ b/src/ray/core_worker/transport/actor_task_submitter.cc @@ -44,7 +44,7 @@ void ActorTaskSubmitter::NotifyGCSWhenActorOutOfScope( })); }; - if (!reference_counter_->AddObjectPrimaryCopyDeleteCallback( + if (!reference_counter_->AddObjectOutOfScopeOrFreedCallback( actor_creation_return_id, [actor_out_of_scope_callback](const ObjectID &object_id) { actor_out_of_scope_callback(object_id); diff --git a/src/ray/gcs/gcs_client/test/gcs_client_reconnection_test.cc b/src/ray/gcs/gcs_client/test/gcs_client_reconnection_test.cc index 99a92034aed2..2e003ec39977 100644 --- a/src/ray/gcs/gcs_client/test/gcs_client_reconnection_test.cc +++ b/src/ray/gcs/gcs_client/test/gcs_client_reconnection_test.cc @@ -14,6 +14,7 @@ #include #include +#include #include "absl/strings/substitute.h" #include "gtest/gtest.h" @@ -198,7 +199,7 @@ TEST_F(GcsClientReconnectionTest, ReconnectionBasic) { ASSERT_TRUE(status.ok()) << status.ToString(); p1.set_value(*p); })); - ASSERT_EQ(f1.wait_for(1s), std::future_status::timeout); + ASSERT_EQ(std::future_status::timeout, f1.wait_for(1s)); // Make sure io context is not blocked std::promise p2; @@ -210,7 +211,7 @@ TEST_F(GcsClientReconnectionTest, ReconnectionBasic) { StartGCS(); // Make sure the request is executed - ASSERT_EQ(f1.get(), "B"); + ASSERT_EQ("B", f1.get()); } TEST_F(GcsClientReconnectionTest, ReconnectionBackoff) { @@ -241,8 +242,14 @@ TEST_F(GcsClientReconnectionTest, ReconnectionBackoff) { ShutdownGCS(); + std::promise p2; + auto f2 = p2.get_future(); RAY_UNUSED(client->InternalKV().AsyncInternalKVPut( - "", "A", "B", false, gcs::GetGcsTimeoutMs(), [](auto, auto) {})); + "", "A", "B", false, gcs::GetGcsTimeoutMs(), [&p2](auto status, auto) { + ASSERT_TRUE(status.ok()) << status.ToString(); + p2.set_value(); + })); + ASSERT_EQ(std::future_status::timeout, f2.wait_for(1s)); ASSERT_TRUE(WaitUntil( [channel]() { @@ -305,27 +312,41 @@ TEST_F(GcsClientReconnectionTest, QueueingAndBlocking) { ShutdownGCS(); // Send one request which should fail - RAY_UNUSED(client->InternalKV().AsyncInternalKVPut( - "", "A", "B", false, gcs::GetGcsTimeoutMs(), [](auto status, auto) {})); - - // Make sure it's not blocking std::promise p2; - client_io_service_->post([&p2]() { p2.set_value(); }, ""); auto f2 = p2.get_future(); - ASSERT_EQ(std::future_status::ready, f2.wait_for(1s)); - - // Send the second one and it should block the thread RAY_UNUSED(client->InternalKV().AsyncInternalKVPut( - "", "A", "B", false, gcs::GetGcsTimeoutMs(), [](auto status, auto) {})); - std::this_thread::sleep_for(1s); + "", "A", "B", false, gcs::GetGcsTimeoutMs(), [&p2](auto status, auto) { + ASSERT_TRUE(status.ok()) << status.ToString(); + p2.set_value(); + })); + ASSERT_EQ(std::future_status::timeout, f2.wait_for(1s)); + + // Make sure it's not blocking std::promise p3; client_io_service_->post([&p3]() { p3.set_value(); }, ""); auto f3 = p3.get_future(); - ASSERT_EQ(std::future_status::timeout, f3.wait_for(1s)); + ASSERT_EQ(std::future_status::ready, 
f3.wait_for(1s)); + + // Send the second one and it should block the thread + std::promise p4; + auto f4 = p4.get_future(); + RAY_UNUSED(client->InternalKV().AsyncInternalKVPut( + "", "A", "B", false, gcs::GetGcsTimeoutMs(), [&p4](auto status, auto) { + ASSERT_TRUE(status.ok()) << status.ToString(); + p4.set_value(); + })); + ASSERT_EQ(std::future_status::timeout, f4.wait_for(1s)); + + std::promise p5; + client_io_service_->post([&p5]() { p5.set_value(); }, ""); + auto f5 = p5.get_future(); + ASSERT_EQ(std::future_status::timeout, f5.wait_for(1s)); // Resume GCS server and it should unblock StartGCS(); - ASSERT_EQ(std::future_status::ready, f3.wait_for(5s)); + ASSERT_EQ(std::future_status::ready, f5.wait_for(5s)); + ASSERT_EQ(std::future_status::ready, f2.wait_for(1s)); + ASSERT_EQ(std::future_status::ready, f4.wait_for(1s)); } TEST_F(GcsClientReconnectionTest, Timeout) { @@ -335,7 +356,7 @@ TEST_F(GcsClientReconnectionTest, Timeout) { "gcs_rpc_server_reconnect_timeout_s": 60, "gcs_storage": "redis", "gcs_grpc_max_request_queued_max_bytes": 10, - "gcs_server_request_timeout_seconds": 3 + "gcs_server_request_timeout_seconds": 10 } )"); StartGCS(); @@ -346,11 +367,11 @@ TEST_F(GcsClientReconnectionTest, Timeout) { ASSERT_TRUE(added); ShutdownGCS(); - std::vector values; ASSERT_TRUE( client->InternalKV().Keys("", "A", gcs::GetGcsTimeoutMs(), values).IsTimedOut()); ASSERT_TRUE(values.empty()); + StartGCS(); ASSERT_TRUE(client->InternalKV().Keys("", "A", gcs::GetGcsTimeoutMs(), values).ok()); ASSERT_EQ(std::vector{"A"}, values); diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.cc b/src/ray/gcs/gcs_server/gcs_actor_manager.cc index aa609bacb445..8bf3f3d484f1 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.cc @@ -749,11 +749,8 @@ Status GcsActorManager::RegisterActor(const ray::rpc::RegisterActorRequest &requ "explicitly connect to this namespace with ray.init(namespace=\"" << actor->GetRayNamespace() << "\", ...)"; - auto error_data_ptr = - gcs::CreateErrorTableData("detached_actor_anonymous_namespace", - stream.str(), - absl::GetCurrentTimeNanos(), - job_id); + auto error_data_ptr = gcs::CreateErrorTableData( + "detached_actor_anonymous_namespace", stream.str(), absl::Now(), job_id); RAY_LOG(WARNING) << error_data_ptr->SerializeAsString(); RAY_CHECK_OK( diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.h b/src/ray/gcs/gcs_server/gcs_actor_manager.h index dadb90498379..afdf55be80d6 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.h +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.h @@ -321,7 +321,7 @@ class GcsActorManager : public rpc::ActorInfoHandler { std::function destroy_owned_placement_group_if_needed, const rpc::CoreWorkerClientFactoryFn &worker_client_factory = nullptr); - ~GcsActorManager() = default; + ~GcsActorManager() override = default; void HandleRegisterActor(rpc::RegisterActorRequest request, rpc::RegisterActorReply *reply, diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc index f562c0f9034e..5daa1a992257 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc @@ -27,17 +27,17 @@ GcsActorScheduler::GcsActorScheduler( instrumented_io_context &io_context, GcsActorTable &gcs_actor_table, const GcsNodeManager &gcs_node_manager, - std::shared_ptr cluster_task_manager, + ClusterTaskManager &cluster_task_manager, GcsActorSchedulerFailureCallback schedule_failure_handler, GcsActorSchedulerSuccessCallback 
schedule_success_handler, - std::shared_ptr raylet_client_pool, + rpc::NodeManagerClientPool &raylet_client_pool, rpc::CoreWorkerClientFactoryFn client_factory, std::function normal_task_resources_changed_callback) : io_context_(io_context), gcs_actor_table_(gcs_actor_table), gcs_node_manager_(gcs_node_manager), - cluster_task_manager_(std::move(cluster_task_manager)), + cluster_task_manager_(cluster_task_manager), schedule_failure_handler_(std::move(schedule_failure_handler)), schedule_success_handler_(std::move(schedule_success_handler)), raylet_client_pool_(raylet_client_pool), @@ -97,11 +97,11 @@ void GcsActorScheduler::ScheduleByGcs(std::shared_ptr actor) { const auto &owner_node = gcs_node_manager_.GetAliveNode(actor->GetOwnerNodeID()); RayTask task(actor->GetCreationTaskSpecification(), owner_node.has_value() ? actor->GetOwnerNodeID().Binary() : std::string()); - cluster_task_manager_->QueueAndScheduleTask(task, - /*grant_or_reject*/ false, - /*is_selected_based_on_locality*/ false, - /*reply*/ reply.get(), - send_reply_callback); + cluster_task_manager_.QueueAndScheduleTask(task, + /*grant_or_reject*/ false, + /*is_selected_based_on_locality*/ false, + /*reply*/ reply.get(), + send_reply_callback); } void GcsActorScheduler::ScheduleByRaylet(std::shared_ptr actor) { @@ -218,7 +218,7 @@ std::vector GcsActorScheduler::CancelOnNode(const NodeID &node_id) { } } - raylet_client_pool_->Disconnect(node_id); + raylet_client_pool_.Disconnect(node_id); return actor_ids; } @@ -531,7 +531,7 @@ void GcsActorScheduler::DoRetryCreatingActorOnWorker( std::shared_ptr GcsActorScheduler::GetOrConnectLeaseClient( const rpc::Address &raylet_address) { - return raylet_client_pool_->GetOrConnectByAddress(raylet_address); + return raylet_client_pool_.GetOrConnectByAddress(raylet_address); } bool GcsActorScheduler::KillActorOnWorker(const rpc::Address &worker_address, @@ -664,13 +664,13 @@ void GcsActorScheduler::HandleWorkerLeaseRejectedReply( void GcsActorScheduler::OnActorDestruction(std::shared_ptr actor) { if (!actor->GetAcquiredResources().IsEmpty()) { ReturnActorAcquiredResources(actor); - cluster_task_manager_->ScheduleAndDispatchTasks(); + cluster_task_manager_.ScheduleAndDispatchTasks(); } } void GcsActorScheduler::ReturnActorAcquiredResources(std::shared_ptr actor) { auto &cluster_resource_manager = - cluster_task_manager_->GetClusterResourceScheduler()->GetClusterResourceManager(); + cluster_task_manager_.GetClusterResourceScheduler()->GetClusterResourceManager(); cluster_resource_manager.AddNodeAvailableResources( scheduling::NodeID(actor->GetNodeID().Binary()), actor->GetAcquiredResources().GetResourceSet()); @@ -678,14 +678,13 @@ void GcsActorScheduler::ReturnActorAcquiredResources(std::shared_ptr a } size_t GcsActorScheduler::GetPendingActorsCount() const { - return cluster_task_manager_->GetInfeasibleQueueSize() + - cluster_task_manager_->GetPendingQueueSize(); + return cluster_task_manager_.GetInfeasibleQueueSize() + + cluster_task_manager_.GetPendingQueueSize(); } bool GcsActorScheduler::CancelInFlightActorScheduling( const std::shared_ptr &actor) { - return cluster_task_manager_->CancelTask( - actor->GetCreationTaskSpecification().TaskId()); + return cluster_task_manager_.CancelTask(actor->GetCreationTaskSpecification().TaskId()); } } // namespace gcs diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h b/src/ray/gcs/gcs_server/gcs_actor_scheduler.h index 1ea66d0ddbe0..048d1da8939c 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h +++ 
b/src/ray/gcs/gcs_server/gcs_actor_scheduler.h @@ -129,14 +129,14 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { instrumented_io_context &io_context, GcsActorTable &gcs_actor_table, const GcsNodeManager &gcs_node_manager, - std::shared_ptr cluster_task_manager_, + ClusterTaskManager &cluster_task_manager_, GcsActorSchedulerFailureCallback schedule_failure_handler, GcsActorSchedulerSuccessCallback schedule_success_handler, - std::shared_ptr raylet_client_pool, + rpc::NodeManagerClientPool &raylet_client_pool, rpc::CoreWorkerClientFactoryFn client_factory = nullptr, std::function normal_task_resources_changed_callback = nullptr); - virtual ~GcsActorScheduler() = default; + ~GcsActorScheduler() override = default; /// Schedule the specified actor. /// If there is no available nodes then the actor would be queued in the @@ -377,7 +377,7 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { /// Reference of GcsNodeManager. const GcsNodeManager &gcs_node_manager_; /// The cluster task manager. - std::shared_ptr cluster_task_manager_; + ClusterTaskManager &cluster_task_manager_; /// The handler to handle the scheduling failures. GcsActorSchedulerFailureCallback schedule_failure_handler_; /// The handler to handle the successful scheduling. @@ -385,7 +385,7 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { /// The nodes which are releasing unused workers. absl::flat_hash_set nodes_of_releasing_unused_workers_; /// The cached raylet clients used to communicate with raylet. - std::shared_ptr raylet_client_pool_; + rpc::NodeManagerClientPool &raylet_client_pool_; /// The cached core worker clients which are used to communicate with leased worker. rpc::CoreWorkerClientPool core_worker_clients_; diff --git a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc b/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc index e4ef13371b53..c166dbbf6398 100644 --- a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc @@ -28,12 +28,12 @@ GcsAutoscalerStateManager::GcsAutoscalerStateManager( GcsNodeManager &gcs_node_manager, GcsActorManager &gcs_actor_manager, const GcsPlacementGroupManager &gcs_placement_group_manager, - std::shared_ptr raylet_client_pool) + rpc::NodeManagerClientPool &raylet_client_pool) : session_name_(session_name), gcs_node_manager_(gcs_node_manager), gcs_actor_manager_(gcs_actor_manager), gcs_placement_group_manager_(gcs_placement_group_manager), - raylet_client_pool_(std::move(raylet_client_pool)), + raylet_client_pool_(raylet_client_pool), last_cluster_resource_state_version_(0), last_seen_autoscaler_state_version_(0) {} @@ -396,7 +396,7 @@ void GcsAutoscalerStateManager::HandleDrainNode( raylet_address.set_ip_address(node->node_manager_address()); raylet_address.set_port(node->node_manager_port()); - const auto raylet_client = raylet_client_pool_->GetOrConnectByAddress(raylet_address); + const auto raylet_client = raylet_client_pool_.GetOrConnectByAddress(raylet_address); raylet_client->DrainRaylet( request.reason(), request.reason_message(), diff --git a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.h b/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.h index c00d8d465202..c592a7a484d6 100644 --- a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.h +++ b/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.h @@ -29,12 +29,11 @@ class GcsResourceManager; class GcsAutoscalerStateManager : public rpc::autoscaler::AutoscalerStateHandler { public: - 
GcsAutoscalerStateManager( - const std::string &session_name, - GcsNodeManager &gcs_node_manager, - GcsActorManager &gcs_actor_manager, - const GcsPlacementGroupManager &gcs_placement_group_manager, - std::shared_ptr<rpc::NodeManagerClientPool> raylet_client_pool); + GcsAutoscalerStateManager(const std::string &session_name, + GcsNodeManager &gcs_node_manager, + GcsActorManager &gcs_actor_manager, + const GcsPlacementGroupManager &gcs_placement_group_manager, + rpc::NodeManagerClientPool &raylet_client_pool); void HandleGetClusterResourceState( rpc::autoscaler::GetClusterResourceStateRequest request, @@ -152,7 +151,7 @@ class GcsAutoscalerStateManager : public rpc::autoscaler::AutoscalerStateHandler const GcsPlacementGroupManager &gcs_placement_group_manager_; /// Raylet client pool. - std::shared_ptr<rpc::NodeManagerClientPool> raylet_client_pool_; + rpc::NodeManagerClientPool &raylet_client_pool_; // The default value of the last seen version for the request is 0, which indicates // no version has been reported. So the first reported version should be 1. diff --git a/src/ray/gcs/gcs_server/gcs_health_check_manager.cc b/src/ray/gcs/gcs_server/gcs_health_check_manager.cc index 2cefb37f6b7c..d6e858482185 100644 --- a/src/ray/gcs/gcs_server/gcs_health_check_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_health_check_manager.cc @@ -14,15 +14,17 @@ #include "ray/gcs/gcs_server/gcs_health_check_manager.h" +#include + #include "ray/stats/metric.h" + DEFINE_stats(health_check_rpc_latency_ms, "Latency of rpc request for health check.", (), ({1, 10, 100, 1000, 10000}, ), ray::stats::HISTOGRAM); -namespace ray { -namespace gcs { +namespace ray::gcs { GcsHealthCheckManager::GcsHealthCheckManager( instrumented_io_context &io_service, @@ -38,17 +40,18 @@ GcsHealthCheckManager::GcsHealthCheckManager( period_ms_(period_ms), failure_threshold_(failure_threshold) { RAY_CHECK(on_node_death_callback != nullptr); - RAY_CHECK(initial_delay_ms >= 0); - RAY_CHECK(timeout_ms >= 0); - RAY_CHECK(period_ms >= 0); - RAY_CHECK(failure_threshold >= 0); + RAY_CHECK_GE(initial_delay_ms, 0); + RAY_CHECK_GE(timeout_ms, 0); + RAY_CHECK_GE(period_ms, 0); + RAY_CHECK_GE(failure_threshold, 0); } -GcsHealthCheckManager::~GcsHealthCheckManager() {} +GcsHealthCheckManager::~GcsHealthCheckManager() = default; void GcsHealthCheckManager::RemoveNode(const NodeID &node_id) { io_service_.dispatch( [this, node_id]() { + RAY_CHECK(thread_checker_.IsOnSameThread()); auto iter = health_check_contexts_.find(node_id); if (iter == health_check_contexts_.end()) { return; } @@ -61,6 +64,7 @@ void GcsHealthCheckManager::RemoveNode(const NodeID &node_id) { void GcsHealthCheckManager::FailNode(const NodeID &node_id) { RAY_LOG(WARNING).WithField(node_id) << "Node is dead because the health check failed."; + RAY_CHECK(thread_checker_.IsOnSameThread()); auto iter = health_check_contexts_.find(node_id); if (iter != health_check_contexts_.end()) { on_node_death_callback_(node_id); @@ -69,7 +73,9 @@ } std::vector<NodeID> GcsHealthCheckManager::GetAllNodes() const { + RAY_CHECK(thread_checker_.IsOnSameThread()); std::vector<NodeID> nodes; + nodes.reserve(health_check_contexts_.size()); for (const auto &[node_id, _] : health_check_contexts_) { nodes.emplace_back(node_id); } @@ -84,24 +90,26 @@ void GcsHealthCheckManager::HealthCheckContext::StartHealthCheck() { new (&context_) grpc::ClientContext(); response_.Clear(); - auto deadline = - std::chrono::system_clock::now() + std::chrono::milliseconds(manager_->timeout_ms_); - context_.set_deadline(deadline); + const auto now = absl::Now(); + const auto
deadline = now + absl::Milliseconds(manager_->timeout_ms_); + context_.set_deadline(absl::ToChronoTime(deadline)); stub_->async()->Check( - &context_, &request_, &response_, [this, now = absl::Now()](::grpc::Status status) { + &context_, &request_, &response_, [this, start = now](::grpc::Status status) { // This callback is done in gRPC's thread pool. STATS_health_check_rpc_latency_ms.Record( - absl::ToInt64Milliseconds(absl::Now() - now)); + absl::ToInt64Milliseconds(absl::Now() - start)); manager_->io_service_.post( [this, status]() { if (stopped_) { delete this; return; } - RAY_LOG(DEBUG) << "Health check status: " << int(response_.status()); + RAY_LOG(DEBUG) << "Health check status: " + << HealthCheckResponse_ServingStatus_Name( + response_.status()); if (status.ok() && response_.status() == HealthCheckResponse::SERVING) { - // Health check passed + // Health check passed. health_check_remaining_ = manager_->failure_threshold_; } else { --health_check_remaining_; @@ -118,6 +126,9 @@ void GcsHealthCheckManager::HealthCheckContext::StartHealthCheck() { delete this; } else { // Do another health check. + // + // TODO(hjiang): We may be able to skip some health checks based on the known + // resource usage communication between GCS and raylet. timer_.expires_from_now( boost::posix_time::milliseconds(manager_->period_ms_)); timer_.async_wait([this](auto) { StartHealthCheck(); }); @@ -132,13 +143,13 @@ void GcsHealthCheckManager::HealthCheckContext::Stop() { stopped_ = true; } void GcsHealthCheckManager::AddNode(const NodeID &node_id, std::shared_ptr<grpc::Channel> channel) { io_service_.dispatch( - [this, channel, node_id]() { - RAY_CHECK(health_check_contexts_.count(node_id) == 0); + [this, channel = std::move(channel), node_id]() { + RAY_CHECK(thread_checker_.IsOnSameThread()); auto context = new HealthCheckContext(this, channel, node_id); - health_check_contexts_.emplace(std::make_pair(node_id, context)); + auto [_, is_new] = health_check_contexts_.emplace(node_id, context); + RAY_CHECK(is_new); }, "GcsHealthCheckManager::AddNode"); } -} // namespace gcs -} // namespace ray +} // namespace ray::gcs diff --git a/src/ray/gcs/gcs_server/gcs_health_check_manager.h b/src/ray/gcs/gcs_server/gcs_health_check_manager.h index d877a217d803..a6e36d82972a 100644 --- a/src/ray/gcs/gcs_server/gcs_health_check_manager.h +++ b/src/ray/gcs/gcs_server/gcs_health_check_manager.h @@ -16,16 +16,19 @@ #include +#include +#include +#include +#include + #include "absl/container/flat_hash_map.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/id.h" #include "ray/common/ray_config.h" +#include "ray/util/thread_checker.h" #include "src/proto/grpc/health/v1/health.grpc.pb.h" -class GcsHealthCheckManagerTest; - -namespace ray { -namespace gcs { +namespace ray::gcs { /// GcsHealthCheckManager is used to track the healthiness of the nodes in the ray /// cluster. The health check is done in pull based way, which means this module will send /// @@ -35,6 +38,9 @@ namespace gcs { /// node will be removed from GcsHealthCheckManager. The node can be added into this class /// later. Although the same node id is not supposed to be reused in ray cluster, this is /// not enforced in this class. +/// +/// All IO operations happen on the same thread, which is managed by the passed-in +/// [io_service]. /// TODO (iycheng): Move the GcsHealthCheckManager to ray/common. class GcsHealthCheckManager { public: @@ -58,24 +64,27 @@ class GcsHealthCheckManager { ~GcsHealthCheckManager();
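For the StartHealthCheck() hunk above: the deadline and the latency sample are now derived from a single absl::Now() reading, so the recorded RPC latency measures exactly the interval the deadline was armed against. A minimal sketch of the pattern, assuming stock Abseil and gRPC (the helper names are illustrative, not Ray APIs):

```cpp
#include <cstdint>
#include <grpcpp/client_context.h>
#include "absl/time/clock.h"
#include "absl/time/time.h"

// Arm the RPC deadline from one absl::Now() reading taken by the caller...
void ArmDeadline(grpc::ClientContext &context, absl::Time start, int64_t timeout_ms) {
  context.set_deadline(absl::ToChronoTime(start + absl::Milliseconds(timeout_ms)));
}

// ...and later, in the completion callback, measure latency from the same start.
int64_t LatencyMs(absl::Time start) {
  return absl::ToInt64Milliseconds(absl::Now() - start);
}
```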
/// Start to track the healthiness of a node. + /// Safe to call from non-io-context threads. /// /// \param node_id The id of the node. /// \param channel The gRPC channel to the node. void AddNode(const NodeID &node_id, std::shared_ptr<grpc::Channel> channel); /// Stop tracking the healthiness of a node. + /// Safe to call from non-io-context threads. /// /// \param node_id The id of the node to stop tracking. void RemoveNode(const NodeID &node_id); - /// Return all the nodes monitored. + /// Return all the nodes monitored and alive. + /// Note: must be invoked from the io-context thread. /// /// \return A list of node id which are being monitored by this class. std::vector<NodeID> GetAllNodes() const; private: /// Fail a node when health check failed. It'll stop the health checking and - /// call on_node_death_callback. + /// call `on_node_death_callback_`. /// /// \param node_id The id of the node. void FailNode(const NodeID &node_id); @@ -133,8 +142,12 @@ class GcsHealthCheckManager { std::function<void(const NodeID &)> on_node_death_callback_; /// The context of the health check for each nodes. + /// Only live nodes are tracked; failed nodes are removed. absl::flat_hash_map<NodeID, HealthCheckContext *> health_check_contexts_; + /// Checker to make sure there's no concurrent access for node addition and removal. + const ThreadChecker thread_checker_; + /// The delay for the first health check request. const int64_t initial_delay_ms_; /// Timeout for each health check request. @@ -145,5 +158,4 @@ class GcsHealthCheckManager { const int64_t failure_threshold_; }; -} // namespace gcs -} // namespace ray +} // namespace ray::gcs diff --git a/src/ray/gcs/gcs_server/gcs_node_manager.cc b/src/ray/gcs/gcs_server/gcs_node_manager.cc index a232ecf10903..15aa488cb65f 100644 --- a/src/ray/gcs/gcs_server/gcs_node_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_node_manager.cc @@ -29,14 +29,13 @@ namespace ray { namespace gcs { ////////////////////////////////////////////////////////////////////////////////////////// -GcsNodeManager::GcsNodeManager( - std::shared_ptr<GcsPublisher> gcs_publisher, - std::shared_ptr<GcsTableStorage> gcs_table_storage, - std::shared_ptr<rpc::NodeManagerClientPool> raylet_client_pool, - const ClusterID &cluster_id) +GcsNodeManager::GcsNodeManager(std::shared_ptr<GcsPublisher> gcs_publisher, + std::shared_ptr<GcsTableStorage> gcs_table_storage, + rpc::NodeManagerClientPool *raylet_client_pool, + const ClusterID &cluster_id) : gcs_publisher_(std::move(gcs_publisher)), gcs_table_storage_(std::move(gcs_table_storage)), - raylet_client_pool_(std::move(raylet_client_pool)), + raylet_client_pool_(raylet_client_pool), cluster_id_(cluster_id) {} void GcsNodeManager::WriteNodeExportEvent(rpc::GcsNodeInfo node_info) const { @@ -393,8 +392,8 @@ std::shared_ptr<rpc::GcsNodeInfo> GcsNodeManager::RemoveNode( .WithField("ip", removed_node->node_manager_address()) << error_message.str(); RAY_LOG(WARNING) << error_message.str(); - auto error_data_ptr = - gcs::CreateErrorTableData(type, error_message.str(), current_time_ms()); + auto error_data_ptr = gcs::CreateErrorTableData( + type, error_message.str(), absl::FromUnixMillis(current_time_ms())); RAY_CHECK_OK(gcs_publisher_->PublishError(node_id.Hex(), *error_data_ptr, nullptr)); } diff --git a/src/ray/gcs/gcs_server/gcs_node_manager.h b/src/ray/gcs/gcs_server/gcs_node_manager.h index db258d4cb00c..b924fec264c9 100644 --- a/src/ray/gcs/gcs_server/gcs_node_manager.h +++ b/src/ray/gcs/gcs_server/gcs_node_manager.h @@ -50,7 +50,7 @@ class GcsNodeManager : public rpc::NodeInfoHandler { /// \param gcs_table_storage GCS table external storage accessor.
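The ThreadChecker member introduced above backs the single-thread invariant documented in the class comment. Ray's actual ray/util/thread_checker.h may differ in detail; this sketch assumes the common design where the first call binds the checker to the calling thread and later calls report whether they run on that thread, so call sites assert on the return value:

```cpp
#include <atomic>
#include <thread>

// Sketch of a thread-affinity checker (illustrative, not Ray's exact class).
// The first call binds the checker to the calling thread; subsequent calls
// return whether they execute on that same bound thread.
class ThreadCheckerSketch {
 public:
  bool IsOnSameThread() const {
    const std::thread::id current = std::this_thread::get_id();
    std::thread::id expected{};  // default-constructed id == "not bound yet"
    if (bound_.compare_exchange_strong(expected, current)) {
      return true;  // first caller binds the thread
    }
    return expected == current;  // later callers must match the bound thread
  }

 private:
  mutable std::atomic<std::thread::id> bound_{std::thread::id{}};
};
```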
explicit GcsNodeManager(std::shared_ptr<GcsPublisher> gcs_publisher, std::shared_ptr<GcsTableStorage> gcs_table_storage, - std::shared_ptr<rpc::NodeManagerClientPool> raylet_client_pool, + rpc::NodeManagerClientPool *raylet_client_pool, const ClusterID &cluster_id); /// Handle register rpc request come from raylet. @@ -248,7 +248,7 @@ class GcsNodeManager : public rpc::NodeInfoHandler { /// Storage for GCS tables. std::shared_ptr<GcsTableStorage> gcs_table_storage_; /// Raylet client pool. - std::shared_ptr<rpc::NodeManagerClientPool> raylet_client_pool_; + rpc::NodeManagerClientPool *raylet_client_pool_ = nullptr; /// Cluster ID to be shared with clients when connecting. const ClusterID cluster_id_; diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc index c60bcd43cc45..1aec60e9603c 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc @@ -14,6 +14,8 @@ #include "ray/gcs/gcs_server/gcs_placement_group_manager.h" +#include + #include "ray/common/asio/asio_util.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/ray_config.h" @@ -181,15 +183,15 @@ rpc::PlacementGroupStats *GcsPlacementGroup::GetMutableStats() { GcsPlacementGroupManager::GcsPlacementGroupManager( instrumented_io_context &io_context, - std::shared_ptr<GcsPlacementGroupSchedulerInterface> scheduler, + GcsPlacementGroupSchedulerInterface *scheduler, std::shared_ptr<GcsTableStorage> gcs_table_storage, GcsResourceManager &gcs_resource_manager, std::function<std::string(const JobID &)> get_ray_namespace) : io_context_(io_context), - gcs_placement_group_scheduler_(std::move(scheduler)), + gcs_placement_group_scheduler_(scheduler), gcs_table_storage_(std::move(gcs_table_storage)), gcs_resource_manager_(gcs_resource_manager), - get_ray_namespace_(get_ray_namespace) { + get_ray_namespace_(std::move(get_ray_namespace)) { placement_group_state_counter_.reset( new CounterMap()); placement_group_state_counter_->SetOnChangeCallback( diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h index d90fdccf3a8a..a7d91388e264 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h @@ -237,7 +237,7 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { /// \param gcs_resource_manager Reference of GcsResourceManager. /// \param get_ray_namespace A callback to get the ray namespace. GcsPlacementGroupManager(instrumented_io_context &io_context, - std::shared_ptr<GcsPlacementGroupSchedulerInterface> scheduler, + GcsPlacementGroupSchedulerInterface *scheduler, std::shared_ptr<GcsTableStorage> gcs_table_storage, GcsResourceManager &gcs_resource_manager, std::function<std::string(const JobID &)> get_ray_namespace); @@ -480,8 +480,8 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { std::deque<std::shared_ptr<GcsPlacementGroup>> infeasible_placement_groups_; /// The scheduler to schedule all registered placement_groups. - std::shared_ptr<GcsPlacementGroupSchedulerInterface> - gcs_placement_group_scheduler_; + /// The scheduler's lifetime is owned by [GcsServer]. + gcs::GcsPlacementGroupSchedulerInterface *gcs_placement_group_scheduler_ = nullptr; /// Used to update placement group information upon creation, deletion, etc.
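The placement-group hunks above replace shared ownership of the scheduler with a non-owning raw pointer whose lifetime is pinned by GcsServer. A sketch of that ownership shape with illustrative names (not Ray's classes): the owner holds unique_ptr members, hands borrowers a raw pointer, and relies on declaration order for a safe destruction order:

```cpp
#include <memory>

struct Scheduler {};

class Manager {
 public:
  explicit Manager(Scheduler *scheduler) : scheduler_(scheduler) {}

 private:
  Scheduler *scheduler_ = nullptr;  // non-owning; the server below outlives us
};

class Server {
 private:
  // Members are destroyed in reverse declaration order, so manager_ (the
  // borrower) is torn down before scheduler_ (the owner of the real object).
  std::unique_ptr<Scheduler> scheduler_ = std::make_unique<Scheduler>();
  std::unique_ptr<Manager> manager_ = std::make_unique<Manager>(scheduler_.get());
};

int main() { Server server; }
```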
std::shared_ptr gcs_table_storage_; diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc b/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc index 6bc2737c14a6..85a94f863598 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc +++ b/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc @@ -26,13 +26,13 @@ GcsPlacementGroupScheduler::GcsPlacementGroupScheduler( std::shared_ptr gcs_table_storage, const gcs::GcsNodeManager &gcs_node_manager, ClusterResourceScheduler &cluster_resource_scheduler, - std::shared_ptr raylet_client_pool) + rpc::NodeManagerClientPool &raylet_client_pool) : io_context_(io_context), return_timer_(io_context), gcs_table_storage_(std::move(gcs_table_storage)), gcs_node_manager_(gcs_node_manager), cluster_resource_scheduler_(cluster_resource_scheduler), - raylet_client_pool_(std::move(raylet_client_pool)) {} + raylet_client_pool_(raylet_client_pool) {} void GcsPlacementGroupScheduler::ScheduleUnplacedBundles( const SchedulePgRequest &request) { @@ -279,7 +279,7 @@ void GcsPlacementGroupScheduler::CancelResourceReserve( std::shared_ptr GcsPlacementGroupScheduler::GetOrConnectLeaseClient(const rpc::Address &raylet_address) { - return raylet_client_pool_->GetOrConnectByAddress(raylet_address); + return raylet_client_pool_.GetOrConnectByAddress(raylet_address); } std::shared_ptr diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h b/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h index ec7ac53941bd..df16f025d082 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h +++ b/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h @@ -290,12 +290,11 @@ class GcsPlacementGroupScheduler : public GcsPlacementGroupSchedulerInterface { /// \param cluster_resource_scheduler The resource scheduler which is used when /// scheduling. /// \param lease_client_factory Factory to create remote lease client. - GcsPlacementGroupScheduler( - instrumented_io_context &io_context, - std::shared_ptr gcs_table_storage, - const GcsNodeManager &gcs_node_manager, - ClusterResourceScheduler &cluster_resource_scheduler, - std::shared_ptr raylet_client_pool); + GcsPlacementGroupScheduler(instrumented_io_context &io_context, + std::shared_ptr gcs_table_storage, + const GcsNodeManager &gcs_node_manager, + ClusterResourceScheduler &cluster_resource_scheduler, + rpc::NodeManagerClientPool &raylet_client_pool); virtual ~GcsPlacementGroupScheduler() = default; @@ -502,7 +501,7 @@ class GcsPlacementGroupScheduler : public GcsPlacementGroupSchedulerInterface { placement_group_leasing_in_progress_; /// The cached raylet clients used to communicate with raylets. - std::shared_ptr raylet_client_pool_; + rpc::NodeManagerClientPool &raylet_client_pool_; /// The nodes which are releasing unused bundles. absl::flat_hash_set nodes_of_releasing_unused_bundles_; diff --git a/src/ray/gcs/gcs_server/gcs_resource_manager.h b/src/ray/gcs/gcs_server/gcs_resource_manager.h index dff95380cd21..47ecf9fff3a3 100644 --- a/src/ray/gcs/gcs_server/gcs_resource_manager.h +++ b/src/ray/gcs/gcs_server/gcs_resource_manager.h @@ -67,7 +67,7 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler, NodeID local_node_id, std::shared_ptr cluster_task_manager = nullptr); - virtual ~GcsResourceManager() {} + virtual ~GcsResourceManager() = default; /// Handle the resource update. 
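Several headers in this patch (GcsActorManager and GcsActorScheduler earlier, GcsResourceManager above) normalize destructors to `= default` and mark them `override`. A compact illustration of what `override` buys on a destructor: the compiler rejects the code unless the base class destructor is virtual:

```cpp
// `= default` replaces an empty body; `override` on the derived destructor
// fails to compile if HandlerInterface's destructor were not virtual.
struct HandlerInterface {
  virtual ~HandlerInterface() = default;
};

struct ConcreteManager : public HandlerInterface {
  ~ConcreteManager() override = default;
};

int main() {
  ConcreteManager manager;
  return 0;
}
```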
void ConsumeSyncMessage(std::shared_ptr message) override; diff --git a/src/ray/gcs/gcs_server/gcs_server.cc b/src/ray/gcs/gcs_server/gcs_server.cc index c51c14bbb0dd..a8ad05ff3c9d 100644 --- a/src/ray/gcs/gcs_server/gcs_server.cc +++ b/src/ray/gcs/gcs_server/gcs_server.cc @@ -64,7 +64,7 @@ GcsServer::GcsServer(const ray::gcs::GcsServerConfig &config, ClusterID::Nil(), RayConfig::instance().gcs_server_rpc_client_thread_num()), raylet_client_pool_( - std::make_shared(client_call_manager_)), + std::make_unique(client_call_manager_)), pubsub_periodical_runner_(io_context_provider_.GetIOContext()), periodical_runner_(io_context_provider_.GetDefaultIOContext()), is_started_(false), @@ -289,7 +289,7 @@ void GcsServer::InitGcsNodeManager(const GcsInitData &gcs_init_data) { RAY_CHECK(gcs_table_storage_ && gcs_publisher_); gcs_node_manager_ = std::make_unique(gcs_publisher_, gcs_table_storage_, - raylet_client_pool_, + raylet_client_pool_.get(), rpc_server_.GetClusterId()); // Initialize by gcs tables data. gcs_node_manager_->Initialize(gcs_init_data); @@ -323,7 +323,7 @@ void GcsServer::InitGcsHealthCheckManager(const GcsInitData &gcs_init_data) { void GcsServer::InitGcsResourceManager(const GcsInitData &gcs_init_data) { RAY_CHECK(cluster_resource_scheduler_ && cluster_task_manager_); - gcs_resource_manager_ = std::make_shared( + gcs_resource_manager_ = std::make_unique( io_context_provider_.GetDefaultIOContext(), cluster_resource_scheduler_->GetClusterResourceManager(), *gcs_node_manager_, @@ -446,25 +446,25 @@ void GcsServer::InitGcsActorManager(const GcsInitData &gcs_init_data) { const rpc::PushTaskReply &reply) { gcs_actor_manager_->OnActorCreationSuccess(std::move(actor), reply); }; - auto client_factory = [this](const rpc::Address &address) { - return std::make_shared(address, client_call_manager_); - }; RAY_CHECK(gcs_resource_manager_ && cluster_task_manager_); scheduler = std::make_unique( io_context_provider_.GetDefaultIOContext(), gcs_table_storage_->ActorTable(), *gcs_node_manager_, - cluster_task_manager_, + *cluster_task_manager_, schedule_failure_handler, schedule_success_handler, - raylet_client_pool_, - client_factory, + *raylet_client_pool_, + /*factory=*/ + [this](const rpc::Address &address) { + return std::make_shared(address, client_call_manager_); + }, /*normal_task_resources_changed_callback=*/ [this](const NodeID &node_id, const rpc::ResourcesData &resources) { gcs_resource_manager_->UpdateNodeNormalTaskResources(node_id, resources); }); - gcs_actor_manager_ = std::make_shared( + gcs_actor_manager_ = std::make_unique( std::move(scheduler), gcs_table_storage_, gcs_publisher_, @@ -480,23 +480,23 @@ void GcsServer::InitGcsActorManager(const GcsInitData &gcs_init_data) { // Initialize by gcs tables data. gcs_actor_manager_->Initialize(gcs_init_data); // Register service. 
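The gcs_server.cc hunks here and in the following section consistently replace `reset(new T(...))` and `std::make_shared` with `std::make_unique`. A small before/after sketch with an illustrative Service type:

```cpp
#include <memory>
#include <string>

struct Service {
  Service(int port, std::string name) {}
};

int main() {
  std::unique_ptr<Service> service;
  service.reset(new Service(8080, "gcs"));           // before: raw new, type repeated
  service = std::make_unique<Service>(8080, "gcs");  // after: no raw new, exception-safe
  return 0;
}
```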
- actor_info_service_.reset(new rpc::ActorInfoGrpcService( - io_context_provider_.GetDefaultIOContext(), *gcs_actor_manager_)); + actor_info_service_ = std::make_unique( + io_context_provider_.GetDefaultIOContext(), *gcs_actor_manager_); rpc_server_.RegisterService(*actor_info_service_); } void GcsServer::InitGcsPlacementGroupManager(const GcsInitData &gcs_init_data) { RAY_CHECK(gcs_table_storage_ && gcs_node_manager_); - gcs_placement_group_scheduler_ = std::make_shared( + gcs_placement_group_scheduler_ = std::make_unique( io_context_provider_.GetDefaultIOContext(), gcs_table_storage_, *gcs_node_manager_, *cluster_resource_scheduler_, - raylet_client_pool_); + *raylet_client_pool_); - gcs_placement_group_manager_ = std::make_shared( + gcs_placement_group_manager_ = std::make_unique( io_context_provider_.GetDefaultIOContext(), - gcs_placement_group_scheduler_, + gcs_placement_group_scheduler_.get(), gcs_table_storage_, *gcs_resource_manager_, [this](const JobID &job_id) { @@ -671,7 +671,7 @@ void GcsServer::InitGcsAutoscalerStateManager(const GcsInitData &gcs_init_data) *gcs_node_manager_, *gcs_actor_manager_, *gcs_placement_group_manager_, - raylet_client_pool_); + *raylet_client_pool_); gcs_autoscaler_state_manager_->Initialize(gcs_init_data); autoscaler_state_service_.reset(new rpc::autoscaler::AutoscalerStateGrpcService( @@ -826,7 +826,7 @@ std::shared_ptr GcsServer::GetOrConnectRedis() { RAY_CHECK(status.ok()) << "Failed to init redis gcs client as " << status; // Init redis failure detector. - gcs_redis_failure_detector_ = std::make_shared( + gcs_redis_failure_detector_ = std::make_unique( io_context_provider_.GetDefaultIOContext(), redis_client_, []() { RAY_LOG(FATAL) << "Redis connection failed. Shutdown GCS."; }); diff --git a/src/ray/gcs/gcs_server/gcs_server.h b/src/ray/gcs/gcs_server/gcs_server.h index 22ece4dda229..6c37e9d8210c 100644 --- a/src/ray/gcs/gcs_server/gcs_server.h +++ b/src/ray/gcs/gcs_server/gcs_server.h @@ -14,6 +14,8 @@ #pragma once +#include + #include "ray/common/asio/asio_util.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/ray_syncer/ray_syncer.h" @@ -218,9 +220,9 @@ class GcsServer { /// The `ClientCallManager` object that is shared by all `NodeManagerWorkerClient`s. rpc::ClientCallManager client_call_manager_; /// Node manager client pool. - std::shared_ptr raylet_client_pool_; + std::unique_ptr raylet_client_pool_; /// The gcs resource manager. - std::shared_ptr gcs_resource_manager_; + std::unique_ptr gcs_resource_manager_; /// The cluster resource scheduler. std::shared_ptr cluster_resource_scheduler_; /// The cluster task manager. @@ -230,15 +232,17 @@ class GcsServer { /// The gcs node manager. std::unique_ptr gcs_node_manager_; /// The health check manager. - std::shared_ptr gcs_healthcheck_manager_; + std::unique_ptr gcs_healthcheck_manager_; /// The gcs redis failure detector. - std::shared_ptr gcs_redis_failure_detector_; + std::unique_ptr gcs_redis_failure_detector_; /// The gcs actor manager. - std::shared_ptr gcs_actor_manager_; + std::unique_ptr gcs_actor_manager_; /// The gcs placement group scheduler. - std::shared_ptr gcs_placement_group_scheduler_; + /// [gcs_placement_group_scheduler_] depends on [raylet_client_pool_]. + std::unique_ptr gcs_placement_group_scheduler_; /// The gcs placement group manager. - std::shared_ptr gcs_placement_group_manager_; + /// [gcs_placement_group_manager_] depends on [gcs_placement_group_scheduler_]. 
+ std::unique_ptr gcs_placement_group_manager_; /// Job info handler and service. std::unique_ptr gcs_job_manager_; std::unique_ptr job_info_service_; diff --git a/src/ray/gcs/gcs_server/test/export_api/gcs_node_manager_export_event_test.cc b/src/ray/gcs/gcs_server/test/export_api/gcs_node_manager_export_event_test.cc index 742716da4b45..61d2d0e8b932 100644 --- a/src/ray/gcs/gcs_server/test/export_api/gcs_node_manager_export_event_test.cc +++ b/src/ray/gcs/gcs_server/test/export_api/gcs_node_manager_export_event_test.cc @@ -41,7 +41,7 @@ class GcsNodeManagerExportAPITest : public ::testing::Test { public: GcsNodeManagerExportAPITest() { raylet_client_ = std::make_shared(); - client_pool_ = std::make_shared( + client_pool_ = std::make_unique( [this](const rpc::Address &) { return raylet_client_; }); gcs_publisher_ = std::make_shared( std::make_unique()); @@ -72,7 +72,7 @@ class GcsNodeManagerExportAPITest : public ::testing::Test { protected: std::shared_ptr gcs_table_storage_; std::shared_ptr raylet_client_; - std::shared_ptr client_pool_; + std::unique_ptr client_pool_; std::shared_ptr gcs_publisher_; instrumented_io_context io_service_; std::string log_dir_; @@ -81,7 +81,7 @@ class GcsNodeManagerExportAPITest : public ::testing::Test { TEST_F(GcsNodeManagerExportAPITest, TestExportEventRegisterNode) { // Test export event is written when a node is added with HandleRegisterNode gcs::GcsNodeManager node_manager( - gcs_publisher_, gcs_table_storage_, client_pool_, ClusterID::Nil()); + gcs_publisher_, gcs_table_storage_, client_pool_.get(), ClusterID::Nil()); auto node = Mocker::GenNodeInfo(); rpc::RegisterNodeRequest register_request; @@ -103,7 +103,7 @@ TEST_F(GcsNodeManagerExportAPITest, TestExportEventRegisterNode) { TEST_F(GcsNodeManagerExportAPITest, TestExportEventUnregisterNode) { // Test export event is written when a node is removed with HandleUnregisterNode gcs::GcsNodeManager node_manager( - gcs_publisher_, gcs_table_storage_, client_pool_, ClusterID::Nil()); + gcs_publisher_, gcs_table_storage_, client_pool_.get(), ClusterID::Nil()); auto node = Mocker::GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); node_manager.AddNode(node); diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc index c3b8c9f2a421..9bb274af97bd 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc @@ -1424,8 +1424,3 @@ TEST_F(GcsActorManagerTest, TestDestroyActorWhenActorIsCreating) { } // namespace gcs } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc index 6beeb8b7504c..aca66ca39c09 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc @@ -42,7 +42,7 @@ class GcsActorSchedulerMockTest : public Test { std::make_unique(nullptr, nullptr, nullptr, ClusterID::Nil()); raylet_client = std::make_shared(); core_worker_client = std::make_shared(); - client_pool = std::make_shared( + client_pool = std::make_unique( [this](const rpc::Address &) { return raylet_client; }); local_node_id = NodeID::FromRandom(); auto cluster_resource_scheduler = std::make_shared( @@ -52,7 +52,7 @@ class GcsActorSchedulerMockTest : public Test { /*is_node_available_fn=*/ [](auto) { 
return true; }, /*is_local_node_with_raylet=*/false); - auto cluster_task_manager = std::make_shared( + cluster_task_manager = std::make_unique( local_node_id, cluster_resource_scheduler, /*get_node_info=*/ @@ -70,10 +70,10 @@ class GcsActorSchedulerMockTest : public Test { io_context, *actor_table, *gcs_node_manager, - cluster_task_manager, + *cluster_task_manager, [this](auto a, auto b, auto c) { schedule_failure_handler(a); }, [this](auto a, const rpc::PushTaskReply) { schedule_success_handler(a); }, - client_pool, + *client_pool, [this](const rpc::Address &) { return core_worker_client; }); auto node_info = std::make_shared(); node_info->set_state(rpc::GcsNodeInfo::ALIVE); @@ -82,14 +82,16 @@ class GcsActorSchedulerMockTest : public Test { worker_id = WorkerID::FromRandom(); gcs_node_manager->AddNode(node_info); } + std::shared_ptr raylet_client; instrumented_io_context io_context; std::shared_ptr store_client; std::unique_ptr actor_table; - std::unique_ptr actor_scheduler; std::unique_ptr gcs_node_manager; + std::unique_ptr cluster_task_manager; + std::unique_ptr actor_scheduler; std::shared_ptr core_worker_client; - std::shared_ptr client_pool; + std::unique_ptr client_pool; std::shared_ptr>> counter; MockCallback schedule_failure_handler; diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc index c14497db7eaa..6302ee02ed63 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include // clang-format off -#include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/gcs/gcs_server/gcs_actor_scheduler.h" #include "ray/gcs/gcs_server/test/gcs_server_test_util.h" @@ -39,7 +40,7 @@ class GcsActorSchedulerTest : public ::testing::Test { store_client_ = std::make_shared(io_service_); gcs_table_storage_ = std::make_shared(io_service_); gcs_node_manager_ = std::make_shared( - gcs_publisher_, gcs_table_storage_, raylet_client_pool_, ClusterID::Nil()); + gcs_publisher_, gcs_table_storage_, raylet_client_pool_.get(), ClusterID::Nil()); gcs_actor_table_ = std::make_shared(store_client_); local_node_id_ = NodeID::FromRandom(); @@ -73,7 +74,7 @@ class GcsActorSchedulerTest : public ::testing::Test { io_service_, *gcs_actor_table_, *gcs_node_manager_, - cluster_task_manager_, + *cluster_task_manager_, /*schedule_failure_handler=*/ [this](std::shared_ptr actor, const rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, @@ -84,7 +85,7 @@ class GcsActorSchedulerTest : public ::testing::Test { [this](std::shared_ptr actor, const rpc::PushTaskReply &reply) { success_actors_.emplace_back(std::move(actor)); }, - raylet_client_pool_, + *raylet_client_pool_, /*client_factory=*/ [this](const rpc::Address &address) { return worker_client_; }, /*normal_task_resources_changed_callback=*/ @@ -1191,8 +1192,3 @@ TEST_F(GcsActorSchedulerTest, TestReleaseUnusedActorWorkersByGcs) { } // namespace gcs } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc index b0b31b182d77..2f281fa31844 100644 --- a/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc +++ 
b/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc @@ -61,7 +61,7 @@ class GcsAutoscalerStateManagerTest : public ::testing::Test { void SetUp() override { raylet_client_ = std::make_shared(); - client_pool_ = std::make_shared( + client_pool_ = std::make_unique( [this](const rpc::Address &) { return raylet_client_; }); cluster_resource_manager_ = std::make_unique(io_service_); gcs_node_manager_ = std::make_shared(); @@ -86,7 +86,7 @@ class GcsAutoscalerStateManagerTest : public ::testing::Test { *gcs_node_manager_, *gcs_actor_manager_, *gcs_placement_group_manager_, - client_pool_)); + *client_pool_)); } public: @@ -847,8 +847,3 @@ TEST_F(GcsAutoscalerStateManagerTest, TestGcsKvManagerInternalConfig) { } // namespace gcs } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_health_check_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_health_check_manager_test.cc index 1f0722a6d6d7..35fc308f28a9 100644 --- a/src/ray/gcs/gcs_server/test/gcs_health_check_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_health_check_manager_test.cc @@ -271,17 +271,3 @@ TEST_F(GcsHealthCheckManagerTest, StressTest) { io_service.stop(); t->join(); } - -int main(int argc, char **argv) { - InitShutdownRAII ray_log_shutdown_raii(ray::RayLog::StartRayLog, - ray::RayLog::ShutDownRayLog, - argv[0], - ray::RayLogLevel::INFO, - /*log_dir=*/""); - - ray::RayLog::InstallFailureSignalHandler(argv[0]); - ray::RayLog::InstallTerminateHandler(); - - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_job_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_job_manager_test.cc index 107af6752a5d..b18658dffc95 100644 --- a/src/ray/gcs/gcs_server/test/gcs_job_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_job_manager_test.cc @@ -720,8 +720,4 @@ TEST_F(GcsJobManagerTest, TestNodeFailure) { EXPECT_TRUE(WaitForCondition(condition, 2000)); } -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} } // namespace ray diff --git a/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc index a8a0157e0d54..eb12d59dbdb3 100644 --- a/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc @@ -28,7 +28,7 @@ class GcsNodeManagerTest : public ::testing::Test { public: GcsNodeManagerTest() { raylet_client_ = std::make_shared(); - client_pool_ = std::make_shared( + client_pool_ = std::make_unique( [this](const rpc::Address &) { return raylet_client_; }); gcs_publisher_ = std::make_shared( std::make_unique()); @@ -37,13 +37,13 @@ class GcsNodeManagerTest : public ::testing::Test { protected: std::shared_ptr gcs_table_storage_; std::shared_ptr raylet_client_; - std::shared_ptr client_pool_; + std::unique_ptr client_pool_; std::shared_ptr gcs_publisher_; }; TEST_F(GcsNodeManagerTest, TestManagement) { gcs::GcsNodeManager node_manager( - gcs_publisher_, gcs_table_storage_, client_pool_, ClusterID::Nil()); + gcs_publisher_, gcs_table_storage_, client_pool_.get(), ClusterID::Nil()); // Test Add/Get/Remove functionality. 
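The `int main` stubs deleted from these test files were all the boilerplate below (the health-check test's version additionally installed Ray's log, signal, and terminate handlers); linking GoogleTest's gtest_main target supplies an equivalent main automatically:

```cpp
#include "gtest/gtest.h"

// The main() that gtest_main provides, equivalent to the deleted stubs.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
```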
auto node = Mocker::GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); @@ -58,7 +58,7 @@ TEST_F(GcsNodeManagerTest, TestManagement) { TEST_F(GcsNodeManagerTest, TestListener) { gcs::GcsNodeManager node_manager( - gcs_publisher_, gcs_table_storage_, client_pool_, ClusterID::Nil()); + gcs_publisher_, gcs_table_storage_, client_pool_.get(), ClusterID::Nil()); // Test AddNodeAddedListener. int node_count = 1000; std::vector> added_nodes; @@ -97,8 +97,3 @@ TEST_F(GcsNodeManagerTest, TestListener) { } } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_mock_test.cc b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_mock_test.cc index 6cfd689ac168..1e3ef61060c8 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_mock_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_mock_test.cc @@ -47,7 +47,7 @@ class GcsPlacementGroupManagerMockTest : public Test { gcs_placement_group_manager_ = std::make_unique(io_context_, - gcs_placement_group_scheduler_, + gcs_placement_group_scheduler_.get(), gcs_table_storage_, *resource_manager_, [](auto &) { return ""; }); diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc index 268096815cbe..ad808b644b67 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc @@ -89,7 +89,7 @@ class GcsPlacementGroupManagerTest : public ::testing::Test { io_service_, cluster_resource_manager_, *gcs_node_manager_, NodeID::FromRandom()); gcs_placement_group_manager_.reset(new gcs::GcsPlacementGroupManager( io_service_, - mock_placement_group_scheduler_, + mock_placement_group_scheduler_.get(), gcs_table_storage_, *gcs_resource_manager_, [this](const JobID &job_id) { return job_namespace_table_[job_id]; })); @@ -1011,8 +1011,3 @@ TEST_F(GcsPlacementGroupManagerTest, TestCheckCreatorJobIsDeadWhenGcsRestart) { } // namespace gcs } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc b/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc index 5d3f11ed39b0..093bdaf13fcc 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc @@ -55,21 +55,21 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { [](auto) { return true; }, /*is_local_node_with_raylet=*/false); gcs_node_manager_ = std::make_shared( - gcs_publisher_, gcs_table_storage_, raylet_client_pool_, ClusterID::Nil()); + gcs_publisher_, gcs_table_storage_, raylet_client_pool_.get(), ClusterID::Nil()); gcs_resource_manager_ = std::make_shared( io_service_, cluster_resource_scheduler_->GetClusterResourceManager(), *gcs_node_manager_, local_node_id); store_client_ = std::make_shared(io_service_); - raylet_client_pool_ = std::make_shared( + raylet_client_pool_ = std::make_unique( [this](const rpc::Address &addr) { return raylet_clients_[addr.port()]; }); scheduler_ = std::make_shared( io_service_, gcs_table_storage_, *gcs_node_manager_, *cluster_resource_scheduler_, - raylet_client_pool_); + *raylet_client_pool_); counter_.reset(new CounterMap()); } @@ -296,7 +296,7 @@ class 
GcsPlacementGroupSchedulerTest : public ::testing::Test { ABSL_GUARDED_BY(placement_group_requests_mutex_); std::shared_ptr gcs_publisher_; std::shared_ptr gcs_table_storage_; - std::shared_ptr raylet_client_pool_; + std::unique_ptr raylet_client_pool_; std::shared_ptr> counter_; }; @@ -1476,8 +1476,3 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestBundlesRemovedWhenNodeDead) { } } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc index 31fdc58530d8..cc2d3dec33a8 100644 --- a/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc @@ -255,8 +255,3 @@ TEST_F(GcsResourceManagerTest, TestGetDrainingNodes) { } } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_worker_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_worker_manager_test.cc index 3dd5c1e720e2..37d6a67b7b0d 100644 --- a/src/ray/gcs/gcs_server/test/gcs_worker_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_worker_manager_test.cc @@ -297,8 +297,3 @@ TEST_F(GcsWorkerManagerTest, TestUpdateWorkerNumPausedThreads) { ASSERT_EQ(reply.worker_table_data(0).num_paused_threads(), num_paused_threads_delta); } } - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/in_memory_gcs_table_storage_test.cc b/src/ray/gcs/gcs_server/test/in_memory_gcs_table_storage_test.cc index 0a63277536c8..dba6ddce5922 100644 --- a/src/ray/gcs/gcs_server/test/in_memory_gcs_table_storage_test.cc +++ b/src/ray/gcs/gcs_server/test/in_memory_gcs_table_storage_test.cc @@ -35,8 +35,3 @@ TEST_F(InMemoryGcsTableStorageTest, TestGcsTableWithJobIdApi) { } } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/usage_stats_client_test.cc b/src/ray/gcs/gcs_server/test/usage_stats_client_test.cc index 21e660b1f1cd..9448f0000b9f 100644 --- a/src/ray/gcs/gcs_server/test/usage_stats_client_test.cc +++ b/src/ray/gcs/gcs_server/test/usage_stats_client_test.cc @@ -45,8 +45,3 @@ TEST_F(UsageStatsClientTest, TestRecordExtraUsageTag) { ASSERT_EQ(value.value(), "value2"); }); } - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/pb_util.h b/src/ray/gcs/pb_util.h index cb3c518072b2..0bdd056b0b8d 100644 --- a/src/ray/gcs/pb_util.h +++ b/src/ray/gcs/pb_util.h @@ -16,6 +16,7 @@ #include +#include "absl/time/time.h" #include "ray/common/constants.h" #include "ray/common/id.h" #include "ray/common/ray_config.h" @@ -61,27 +62,11 @@ inline std::shared_ptr CreateJobTableData( } /// Helper function to produce error table data. 
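For the CreateErrorTableData change that follows, the definition moves out of the header and the timestamp parameter becomes a typed absl::Time rather than a raw double. A hedged usage sketch (the surrounding function and message text are illustrative, not from this patch):

```cpp
#include "absl/time/clock.h"
#include "ray/common/id.h"
#include "ray/gcs/pb_util.h"

void PublishAnonymousNamespaceWarning(const ray::JobID &job_id) {
  // Callers now pass absl::Now() directly; the implementation converts the
  // absl::Time to Unix milliseconds before storing it in the proto.
  auto error_data = ray::gcs::CreateErrorTableData(
      "detached_actor_anonymous_namespace",
      "detached actor created in anonymous namespace",
      /*timestamp=*/absl::Now(),
      job_id);
  (void)error_data;
}
```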
-inline std::shared_ptr CreateErrorTableData( +std::shared_ptr CreateErrorTableData( const std::string &error_type, const std::string &error_msg, - double timestamp, - const JobID &job_id = JobID::Nil()) { - uint32_t max_error_msg_size_bytes = RayConfig::instance().max_error_msg_size_bytes(); - auto error_info_ptr = std::make_shared(); - error_info_ptr->set_type(error_type); - if (error_msg.length() > max_error_msg_size_bytes) { - std::ostringstream stream; - stream << "The message size exceeds " << std::to_string(max_error_msg_size_bytes) - << " bytes. Find the full log from the log files. Here is abstract: " - << error_msg.substr(0, max_error_msg_size_bytes); - error_info_ptr->set_error_message(stream.str()); - } else { - error_info_ptr->set_error_message(error_msg); - } - error_info_ptr->set_timestamp(timestamp); - error_info_ptr->set_job_id(job_id.Binary()); - return error_info_ptr; -} + absl::Time timestamp, + const JobID &job_id = JobID::Nil()); /// Helper function to produce worker failure data. inline std::shared_ptr CreateWorkerFailureData( diff --git a/src/ray/gcs/pb_utils.cc b/src/ray/gcs/pb_utils.cc new file mode 100644 index 000000000000..9330f80c27ce --- /dev/null +++ b/src/ray/gcs/pb_utils.cc @@ -0,0 +1,47 @@ +// Copyright 2024 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// TODO(hjiang): Move all functions from `pb_utils.h` to this implementation file. + +#include + +#include "absl/strings/str_format.h" +#include "ray/gcs/pb_util.h" + +namespace ray::gcs { + +std::shared_ptr CreateErrorTableData( + const std::string &error_type, + const std::string &error_msg, + absl::Time timestamp, + const JobID &job_id) { + uint32_t max_error_msg_size_bytes = RayConfig::instance().max_error_msg_size_bytes(); + auto error_info_ptr = std::make_shared(); + error_info_ptr->set_type(error_type); + if (error_msg.length() > max_error_msg_size_bytes) { + std::string formatted_error_message = absl::StrFormat( + "The message size exceeds %d bytes. Find the full log from the log files. 
Here " + "is abstract: %s", + max_error_msg_size_bytes, + std::string_view{error_msg}.substr(0, max_error_msg_size_bytes)); + error_info_ptr->set_error_message(std::move(formatted_error_message)); + } else { + error_info_ptr->set_error_message(error_msg); + } + error_info_ptr->set_timestamp(absl::ToUnixMillis(timestamp)); + error_info_ptr->set_job_id(job_id.Binary()); + return error_info_ptr; +} + +} // namespace ray::gcs diff --git a/src/ray/gcs/redis_context.cc b/src/ray/gcs/redis_context.cc index 6de20bfe34af..f26333299b11 100644 --- a/src/ray/gcs/redis_context.cc +++ b/src/ray/gcs/redis_context.cc @@ -431,6 +431,114 @@ void ValidateRedisDB(RedisContext &context) { } } +bool isRedisSentinel(RedisContext &context) { + auto reply = context.RunArgvSync(std::vector{"INFO", "SENTINEL"}); + if (reply->IsNil() || reply->IsError() || reply->ReadAsString().length() == 0) { + return false; + } else { + return true; + } +} + +Status ConnectRedisCluster(RedisContext &context, + const std::string &password, + bool enable_ssl, + const std::string &redis_address) { + RAY_LOG(INFO) << "Connect to Redis Cluster"; + // Ray has some restrictions for RedisDB. Validate it here. + ValidateRedisDB(context); + + // Find the true leader + std::vector argv; + std::vector argc; + std::vector cmds = {"DEL", "DUMMY"}; + for (const auto &arg : cmds) { + argv.push_back(arg.data()); + argc.push_back(arg.size()); + } + + auto redis_reply = reinterpret_cast( + ::redisCommandArgv(context.sync_context(), cmds.size(), argv.data(), argc.data())); + + if (redis_reply->type == REDIS_REPLY_ERROR) { + // This should be a MOVED error + // MOVED 14946 10.xx.xx.xx:7001 + std::string error_msg(redis_reply->str, redis_reply->len); + freeReplyObject(redis_reply); + auto maybe_ip_port = ParseIffMovedError(error_msg); + RAY_CHECK(maybe_ip_port.has_value()) + << "Setup Redis cluster failed in the dummy deletion: " << error_msg; + context.Disconnect(); + const auto &[ip, port] = maybe_ip_port.value(); + // Connect to the true leader. + RAY_LOG(INFO) << "Redis cluster leader is " << ip << ":" << port + << ". Reconnect to it."; + return context.Connect(ip, port, password, enable_ssl); + } else { + RAY_LOG(INFO) << "Redis cluster leader is " << redis_address; + freeReplyObject(redis_reply); + } + + return Status::OK(); +} + +Status ConnectRedisSentinel(RedisContext &context, + const std::string &password, + bool enable_ssl) { + RAY_LOG(INFO) << "Connect to Redis sentinel"; + + std::vector argv; + std::vector argc; + std::vector cmds = {"SENTINEL", "MASTERS"}; + for (const auto &arg : cmds) { + argv.push_back(arg.data()); + argc.push_back(arg.size()); + } + + // use raw redis context since we need to parse a complex reply. 
+  // sample reply (array of arrays):
+  // 1) 1) "name"
+  //    2) "redis-ha"
+  //    3) "ip"
+  //    4) "10.112.202.115"
+  //    5) "port"
+  //    6) "6379"
+  //    7) "runid"
+  //    8) "18a76cedbf445bd25bbd412c92e237137b5c7d4d"
+  auto redis_reply = reinterpret_cast<redisReply *>(
+      ::redisCommandArgv(context.sync_context(), cmds.size(), argv.data(), argc.data()));
+
+  RAY_CHECK(redis_reply) << "Failed to get redis sentinel masters info";
+  RAY_CHECK_EQ(redis_reply->type, REDIS_REPLY_ARRAY)
+      << "Redis sentinel master info should be REDIS_REPLY_ARRAY but got "
+      << redis_reply->type;
+  RAY_CHECK_EQ(redis_reply->elements, 1UL)
+      << "There should be only one primary behind the Redis sentinel";
+  auto primary = redis_reply->element[0];
+  std::string actual_ip, actual_port;
+  for (size_t i = 0; i < primary->elements; i += 2) {
+    std::string key = primary->element[i]->str;        // Key (e.g., "name", "ip")
+    std::string value = primary->element[i + 1]->str;  // Value corresponding to the key
+    if ("ip" == key) {
+      actual_ip = value;
+    } else if ("port" == key) {
+      actual_port = value;
+    }
+  }
+  freeReplyObject(redis_reply);
+  if (actual_ip.empty() || actual_port.empty()) {
+    RAY_LOG(ERROR)
+        << "Failed to get the ip and port of the primary node from Redis sentinel";
+    return Status::RedisError(
+        "Failed to get the ip and port of the primary node from Redis sentinel");
+  } else {
+    RAY_LOG(INFO) << "Connecting to the Redis primary node behind sentinel: " << actual_ip
+                  << ":" << actual_port;
+    context.Disconnect();
+    return context.Connect(actual_ip, std::stoi(actual_port), password, enable_ssl);
+  }
+}
+
 std::vector<std::string> ResolveDNS(const std::string &address, int port) {
   using namespace boost::asio;
   io_context ctx;
@@ -503,41 +611,13 @@ Status RedisContext::Connect(const std::string &address,
   redis_async_context_.reset(new RedisAsyncContext(std::move(async_context)));
   SetDisconnectCallback(redis_async_context_.get());
 
-  // Ray has some restrictions for RedisDB. Validate it here.
-  ValidateRedisDB(*this);
-
-  // Find the true leader
-  std::vector<const char *> argv;
-  std::vector<size_t> argc;
-  std::vector<std::string> cmds = {"DEL", "DUMMY"};
-  for (const auto &arg : cmds) {
-    argv.push_back(arg.data());
-    argc.push_back(arg.size());
-  }
-
-  auto redis_reply = reinterpret_cast<redisReply *>(
-      ::redisCommandArgv(context_.get(), cmds.size(), argv.data(), argc.data()));
-
-  if (redis_reply->type == REDIS_REPLY_ERROR) {
-    // This should be a MOVED error
-    // MOVED 14946 10.xx.xx.xx:7001
-    std::string error_msg(redis_reply->str, redis_reply->len);
-    freeReplyObject(redis_reply);
-    auto maybe_ip_port = ParseIffMovedError(error_msg);
-    RAY_CHECK(maybe_ip_port.has_value())
-        << "Setup Redis cluster failed in the dummy deletion: " << error_msg;
-    Disconnect();
-    const auto &[ip, port] = maybe_ip_port.value();
-    // Connect to the true leader.
-    RAY_LOG(INFO) << "Redis cluster leader is " << ip << ":" << port
-                  << ". Reconnect to it.";
-    return Connect(ip, port, password, enable_ssl);
+  // handle validation and primary connection for different types of redis
+  if (isRedisSentinel(*this)) {
+    return ConnectRedisSentinel(*this, password, enable_ssl);
   } else {
-    RAY_LOG(INFO) << "Redis cluster leader is " << ip_addresses[0] << ":" << port;
-    freeReplyObject(redis_reply);
+    return ConnectRedisCluster(
+        *this, password, enable_ssl, ip_addresses[0] + ":" + std::to_string(port));
   }
-
-  return Status::OK();
 }
 
 std::unique_ptr<CallbackReply> RedisContext::RunArgvSync(
diff --git a/src/ray/gcs/store_client/in_memory_store_client.cc b/src/ray/gcs/store_client/in_memory_store_client.cc
index 39306b1254c9..1d1cb3451350 100644
--- a/src/ray/gcs/store_client/in_memory_store_client.cc
+++ b/src/ray/gcs/store_client/in_memory_store_client.cc
@@ -14,9 +14,7 @@
 
 #include "ray/gcs/store_client/in_memory_store_client.h"
 
-namespace ray {
-
-namespace gcs {
+namespace ray::gcs {
 
 Status InMemoryStoreClient::AsyncPut(const std::string &table_name,
                                      const std::string &key,
@@ -35,9 +33,10 @@ Status InMemoryStoreClient::AsyncPut(const std::string &table_name,
     table->records_[key] = data;
     inserted = true;
   }
-  if (callback != nullptr) {
-    main_io_service_.post([callback, inserted]() { callback(inserted); },
-                          "GcsInMemoryStore.Put");
+  if (callback) {
+    main_io_service_.post(
+        [callback = std::move(callback), inserted]() { callback(inserted); },
+        "GcsInMemoryStore.Put");
   }
   return Status::OK();
 }
@@ -45,7 +44,7 @@ Status InMemoryStoreClient::AsyncPut(const std::string &table_name,
 Status InMemoryStoreClient::AsyncGet(const std::string &table_name,
                                      const std::string &key,
                                      const OptionalItemCallback<std::string> &callback) {
-  RAY_CHECK(callback != nullptr);
+  RAY_CHECK(callback);
   auto table = GetOrCreateTable(table_name);
   absl::MutexLock lock(&(table->mutex_));
   auto iter = table->records_.find(key);
@@ -66,9 +65,10 @@ Status InMemoryStoreClient::AsyncGetAll(
     const std::string &table_name,
     const MapCallback<std::string, std::string> &callback) {
   RAY_CHECK(callback);
+  auto result = absl::flat_hash_map<std::string, std::string>();
   auto table = GetOrCreateTable(table_name);
   absl::MutexLock lock(&(table->mutex_));
-  auto result = absl::flat_hash_map<std::string, std::string>();
+  result.reserve(table->records_.size());
   result.insert(table->records_.begin(), table->records_.end());
   main_io_service_.post(
       [result = std::move(result), callback]() mutable { callback(std::move(result)); },
@@ -81,10 +81,10 @@ Status InMemoryStoreClient::AsyncMultiGet(
     const std::vector<std::string> &keys,
     const MapCallback<std::string, std::string> &callback) {
   RAY_CHECK(callback);
+  auto result = absl::flat_hash_map<std::string, std::string>();
   auto table = GetOrCreateTable(table_name);
   absl::MutexLock lock(&(table->mutex_));
-  auto result = absl::flat_hash_map<std::string, std::string>();
-  for (auto &key : keys) {
+  for (const auto &key : keys) {
     auto it = table->records_.find(key);
     if (it == table->records_.end()) {
       continue;
@@ -104,7 +104,7 @@ Status InMemoryStoreClient::AsyncDelete(const std::string &table_name,
   absl::MutexLock lock(&(table->mutex_));
   auto num = table->records_.erase(key);
   if (callback != nullptr) {
-    main_io_service_.post([callback, num]() { callback(num > 0); },
+    main_io_service_.post([callback = std::move(callback), num]() { callback(num > 0); },
                           "GcsInMemoryStore.Delete");
   }
   return Status::OK();
@@ -120,7 +120,7 @@ Status InMemoryStoreClient::AsyncBatchDelete(const std::string &table_name,
     num += table->records_.erase(key);
   }
   if (callback != nullptr) {
-    main_io_service_.post([callback, num]() { callback(num); },
+    main_io_service_.post([callback = std::move(callback), num]() { callback(num); },
"GcsInMemoryStore.BatchDelete"); } return Status::OK(); @@ -138,11 +138,10 @@ std::shared_ptr InMemoryStoreClient::GetOrCr auto iter = tables_.find(table_name); if (iter != tables_.end()) { return iter->second; - } else { - auto table = std::make_shared(); - tables_[table_name] = table; - return table; } + auto table = std::make_shared(); + tables_[table_name] = table; + return table; } Status InMemoryStoreClient::AsyncGetKeys( @@ -150,16 +149,18 @@ Status InMemoryStoreClient::AsyncGetKeys( const std::string &prefix, std::function)> callback) { RAY_CHECK(callback); - auto table = GetOrCreateTable(table_name); std::vector result; + auto table = GetOrCreateTable(table_name); absl::MutexLock lock(&(table->mutex_)); - for (auto &pair : table->records_) { - if (pair.first.find(prefix) == 0) { - result.push_back(pair.first); + for (const auto &[key, _] : table->records_) { + if (key.find(prefix) == 0) { + result.emplace_back(key); } } main_io_service_.post( - [result = std::move(result), callback]() mutable { callback(std::move(result)); }, + [result = std::move(result), callback = std::move(callback)]() mutable { + callback(std::move(result)); + }, "GcsInMemoryStore.Keys"); return Status::OK(); } @@ -171,11 +172,9 @@ Status InMemoryStoreClient::AsyncExists(const std::string &table_name, auto table = GetOrCreateTable(table_name); absl::MutexLock lock(&(table->mutex_)); bool result = table->records_.contains(key); - main_io_service_.post([result, callback]() mutable { callback(result); }, + main_io_service_.post([result, callback = std::move(callback)]() { callback(result); }, "GcsInMemoryStore.Exists"); return Status::OK(); } -} // namespace gcs - -} // namespace ray +} // namespace ray::gcs diff --git a/src/ray/gcs/store_client/in_memory_store_client.h b/src/ray/gcs/store_client/in_memory_store_client.h index a4ea7bc47ac6..6590d608f833 100644 --- a/src/ray/gcs/store_client/in_memory_store_client.h +++ b/src/ray/gcs/store_client/in_memory_store_client.h @@ -14,15 +14,17 @@ #pragma once +#include +#include +#include + #include "absl/container/flat_hash_map.h" #include "absl/synchronization/mutex.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/gcs/store_client/store_client.h" #include "src/ray/protobuf/gcs.pb.h" -namespace ray { - -namespace gcs { +namespace ray::gcs { /// \class InMemoryStoreClient /// Please refer to StoreClient for API semantics. @@ -77,7 +79,7 @@ class InMemoryStoreClient : public StoreClient { }; std::shared_ptr GetOrCreateTable( - const std::string &table_name); + const std::string &table_name) ABSL_LOCKS_EXCLUDED(mutex_); /// Mutex to protect the tables_ field. absl::Mutex mutex_; @@ -88,9 +90,8 @@ class InMemoryStoreClient : public StoreClient { /// of the callback. instrumented_io_context &main_io_service_; - int job_id_ = 0; + /// Current job id, auto-increment when request next-id. + int job_id_ ABSL_GUARDED_BY(mutex_) = 0; }; -} // namespace gcs - -} // namespace ray +} // namespace ray::gcs diff --git a/src/ray/protobuf/common.proto b/src/ray/protobuf/common.proto index 604c92d0d4db..f18175a12f2c 100644 --- a/src/ray/protobuf/common.proto +++ b/src/ray/protobuf/common.proto @@ -556,6 +556,8 @@ message TaskSpec { // this field contains the detached actor id. // Otherwise it's empty and is originated from a driver. bytes root_detached_actor_id = 40; + // The key-value labels for task and actor. 
+  map<string, string> labels = 41;
 }
 
 message TaskInfoEntry {
@@ -986,6 +988,10 @@ message NamedActorInfo {
 
 message LineageReconstructionTask {
   string name = 1;
-  map<string, double> resources = 2;
-  TaskStatus status = 3;
+  TaskStatus status = 2;
+  // If the task is a normal task,
+  // this has the labels of the normal task.
+  // If the task is an actor task,
+  // this has the labels of the corresponding actor.
+  map<string, string> labels = 3;
 }
diff --git a/src/ray/protobuf/core_worker.proto b/src/ray/protobuf/core_worker.proto
index caa869ce18e9..9b6dad5191dd 100644
--- a/src/ray/protobuf/core_worker.proto
+++ b/src/ray/protobuf/core_worker.proto
@@ -70,6 +70,9 @@ message ActorHandle {
 
   // Whether task events will be reported from this actor.
   bool enable_task_events = 14;
+
+  // The key-value labels for actor.
+  map<string, string> labels = 15;
 }
 
 message PushTaskRequest {
diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto
index b34f5bd9f39b..2d0ff6dfac4a 100644
--- a/src/ray/protobuf/gcs.proto
+++ b/src/ray/protobuf/gcs.proto
@@ -165,6 +165,8 @@ message ErrorTableData {
   // The error message.
   string error_message = 3;
   // The timestamp of the error message.
+  // Unit: millisecond.
+  // TODO(hjiang): Update field naming from `timestamp` to `timestamp_millisec`.
   double timestamp = 4;
 }
diff --git a/src/ray/raylet/local_task_manager.cc b/src/ray/raylet/local_task_manager.cc
index f53fca5a365a..aae37371e29f 100644
--- a/src/ray/raylet/local_task_manager.cc
+++ b/src/ray/raylet/local_task_manager.cc
@@ -252,7 +252,7 @@ void LocalTaskManager::DispatchScheduledTasksToWorkers() {
   for (auto work_it = dispatch_queue.begin(); work_it != dispatch_queue.end();) {
     auto &work = *work_it;
     const auto &task = work->task;
-    const auto spec = task.GetTaskSpecification();
+    const auto &spec = task.GetTaskSpecification();
     TaskID task_id = spec.TaskId();
     if (work->GetState() == internal::WorkStatus::WAITING_FOR_WORKER) {
       work_it++;
@@ -436,7 +436,8 @@ void LocalTaskManager::SpillWaitingTasks() {
   while (it != waiting_task_queue_.begin()) {
     it--;
     const auto &task = (*it)->task;
-    const auto &task_id = task.GetTaskSpecification().TaskId();
+    const auto &spec = task.GetTaskSpecification();
+    const auto &task_id = spec.TaskId();
 
     // Check whether this task's dependencies are blocked (not being actively
     // pulled).  If this is true, then we should force the task onto a remote
@@ -452,9 +453,9 @@ void LocalTaskManager::SpillWaitingTasks() {
     // object store memory availability. Ideally, we should pick the node with
     // the most memory availability.
     scheduling::NodeID scheduling_node_id;
-    if (!task.GetTaskSpecification().IsSpreadSchedulingStrategy()) {
+    if (!spec.IsSpreadSchedulingStrategy()) {
       scheduling_node_id = cluster_resource_scheduler_->GetBestSchedulableNode(
-          task.GetTaskSpecification(),
+          spec,
           /*preferred_node_id*/ self_node_id_.Binary(),
           /*exclude_local_node*/ task_dependencies_blocked,
           /*requires_object_store_memory*/ true,
@@ -470,9 +471,8 @@ void LocalTaskManager::SpillWaitingTasks() {
         scheduling_node_id.Binary() != self_node_id_.Binary()) {
       NodeID node_id = NodeID::FromBinary(scheduling_node_id.Binary());
       Spillback(node_id, *it);
-      if (!task.GetTaskSpecification().GetDependencies().empty()) {
-        task_dependency_manager_.RemoveTaskDependencies(
-            task.GetTaskSpecification().TaskId());
+      if (!spec.GetDependencies().empty()) {
+        task_dependency_manager_.RemoveTaskDependencies(spec.TaskId());
       }
       num_waiting_task_spilled_++;
       waiting_tasks_index_.erase(task_id);
@@ -495,14 +495,15 @@ void LocalTaskManager::SpillWaitingTasks() {
 
 bool LocalTaskManager::TrySpillback(const std::shared_ptr<internal::Work> &work,
                                     bool &is_infeasible) {
+  const auto &spec = work->task.GetTaskSpecification();
   auto scheduling_node_id = cluster_resource_scheduler_->GetBestSchedulableNode(
-      work->task.GetTaskSpecification(),
+      spec,
       // We should prefer to stay local if possible
       // to avoid unnecessary spillback
       // since this node is already selected by the cluster scheduler.
-      /*preferred_node_id*/ self_node_id_.Binary(),
-      /*exclude_local_node*/ false,
-      /*requires_object_store_memory*/ false,
+      /*preferred_node_id=*/self_node_id_.Binary(),
+      /*exclude_local_node=*/false,
+      /*requires_object_store_memory=*/false,
       &is_infeasible);
 
   if (is_infeasible || scheduling_node_id.IsNil() ||
@@ -513,9 +514,8 @@ bool LocalTaskManager::TrySpillback(const std::shared_ptr<internal::Work> &work,
   NodeID node_id = NodeID::FromBinary(scheduling_node_id.Binary());
   Spillback(node_id, work);
   num_unschedulable_task_spilled_++;
-  if (!work->task.GetTaskSpecification().GetDependencies().empty()) {
-    task_dependency_manager_.RemoveTaskDependencies(
-        work->task.GetTaskSpecification().TaskId());
+  if (!spec.GetDependencies().empty()) {
+    task_dependency_manager_.RemoveTaskDependencies(spec.TaskId());
   }
   return true;
 }
diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc
index 879edff0bb6c..18384711de58 100644
--- a/src/ray/raylet/node_manager.cc
+++ b/src/ray/raylet/node_manager.cc
@@ -143,7 +143,7 @@ NodeManager::NodeManager(
       /*starting_worker_timeout_callback=*/
       [this] { cluster_task_manager_->ScheduleAndDispatchTasks(); },
       config.ray_debugger_external,
-      /*get_time=*/[]() { return absl::GetCurrentTimeNanos() / 1e6; }),
+      /*get_time=*/[]() { return absl::Now(); }),
       client_call_manager_(io_service),
       worker_rpc_pool_(client_call_manager_),
       core_worker_subscriber_(std::make_unique<pubsub::Subscriber>(
@@ -1562,8 +1562,8 @@ void NodeManager::DisconnectClient(const std::shared_ptr<ClientConnection> &clie
         .WithField("node_id", self_node_id_.Hex())
         .WithField("job_id", worker->GetAssignedJobId().Hex())
         << error_message_str;
-    auto error_data_ptr =
-        gcs::CreateErrorTableData(type, error_message_str, current_time_ms(), job_id);
+    auto error_data_ptr = gcs::CreateErrorTableData(
+        type, error_message_str, absl::FromUnixMillis(current_time_ms()), job_id);
     RAY_CHECK_OK(gcs_client_->Errors().AsyncReportJobError(error_data_ptr, nullptr));
   }
 }
@@ -1762,9 +1762,11 @@ void NodeManager::ProcessPushErrorRequestMessage(const uint8_t *message_data) {
   auto const &type = string_from_flatbuf(*message->type());
   auto const &error_message =
       string_from_flatbuf(*message->error_message());
+  // TODO(hjiang): Figure out what's the unit for `PushErrorRequest`.
   double timestamp = message->timestamp();
   JobID job_id = from_flatbuf<JobID>(*message->job_id());
-  auto error_data_ptr = gcs::CreateErrorTableData(type, error_message, timestamp, job_id);
+  auto error_data_ptr = gcs::CreateErrorTableData(
+      type, error_message, absl::FromUnixMillis(timestamp), job_id);
   RAY_CHECK_OK(gcs_client_->Errors().AsyncReportJobError(error_data_ptr, nullptr));
 }
@@ -2126,8 +2128,8 @@ void NodeManager::MarkObjectsAsFailed(
                     << " object may hang forever.";
     std::string error_message = stream.str();
     RAY_LOG(ERROR) << error_message;
-    auto error_data_ptr =
-        gcs::CreateErrorTableData("task", error_message, current_time_ms(), job_id);
+    auto error_data_ptr = gcs::CreateErrorTableData(
+        "task", error_message, absl::FromUnixMillis(current_time_ms()), job_id);
     RAY_CHECK_OK(gcs_client_->Errors().AsyncReportJobError(error_data_ptr, nullptr));
   }
 }
diff --git a/src/ray/raylet/raylet.h b/src/ray/raylet/raylet.h
index c220e2bc2fab..619fcd7b69ec 100644
--- a/src/ray/raylet/raylet.h
+++ b/src/ray/raylet/raylet.h
@@ -23,9 +23,7 @@
 #include "ray/common/asio/instrumented_io_context.h"
 // clang-format on
 
-namespace ray {
-
-namespace raylet {
+namespace ray::raylet {
 
 using rpc::GcsNodeInfo;
 using rpc::NodeSnapshot;
@@ -109,6 +107,4 @@ class Raylet {
   local_stream_socket socket_;
 };
 
-}  // namespace raylet
-
-}  // namespace ray
+}  // namespace ray::raylet
diff --git a/src/ray/raylet/scheduling/cluster_task_manager.h b/src/ray/raylet/scheduling/cluster_task_manager.h
index 058c40f97fcf..752cbd6b3e98 100644
--- a/src/ray/raylet/scheduling/cluster_task_manager.h
+++ b/src/ray/raylet/scheduling/cluster_task_manager.h
@@ -161,6 +161,7 @@ class ClusterTaskManager : public ClusterTaskManagerInterface {
   const NodeID &self_node_id_;
 
   /// Responsible for resource tracking/view of the cluster.
+  /// TODO(hjiang): Use reference instead of shared pointer.
   std::shared_ptr<ClusterResourceScheduler> cluster_resource_scheduler_;
 
   /// Function to get the node information of a given node id.
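
An aside for reviewers on the Redis cluster hunks above: leader discovery works by issuing a throwaway `DEL DUMMY` and reading the `MOVED <slot> <ip>:<port>` error that a non-leader returns. `ParseIffMovedError` is defined elsewhere in the Ray tree and is not shown in this diff; the standalone sketch below only illustrates the parsing shape such a helper plausibly has. `ParseMovedReply` is a hypothetical name, not the real function.

```cpp
// Illustrative sketch only: parse a Redis cluster MOVED reply of the form
// "MOVED 14946 10.xx.xx.xx:7001" into an (ip, port) pair.
#include <cctype>
#include <optional>
#include <sstream>
#include <string>
#include <utility>

std::optional<std::pair<std::string, int>> ParseMovedReply(const std::string &error_msg) {
  std::istringstream iss(error_msg);
  std::string keyword, slot, ip_port;
  // Expect three whitespace-separated tokens: MOVED <slot> <ip>:<port>.
  if (!(iss >> keyword >> slot >> ip_port) || keyword != "MOVED") {
    return std::nullopt;
  }
  const auto colon = ip_port.rfind(':');
  if (colon == std::string::npos || colon + 1 >= ip_port.size()) {
    return std::nullopt;
  }
  // Parse the port digit by digit so a malformed reply yields nullopt
  // instead of throwing (as std::stoi would).
  int port = 0;
  for (size_t i = colon + 1; i < ip_port.size(); ++i) {
    if (!std::isdigit(static_cast<unsigned char>(ip_port[i]))) {
      return std::nullopt;
    }
    port = port * 10 + (ip_port[i] - '0');
  }
  return std::make_pair(ip_port.substr(0, colon), port);
}
```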
diff --git a/src/ray/raylet/worker_pool.cc b/src/ray/raylet/worker_pool.cc
index 943ea89b24b5..5964063a3205 100644
--- a/src/ray/raylet/worker_pool.cc
+++ b/src/ray/raylet/worker_pool.cc
@@ -89,10 +89,10 @@ WorkerPool::WorkerPool(instrumented_io_context &io_service,
                        const std::vector<int> &worker_ports,
                        std::shared_ptr<gcs::GcsClient> gcs_client,
                        const WorkerCommandMap &worker_commands,
-                       const std::string &native_library_path,
+                       std::string native_library_path,
                        std::function<void()> starting_worker_timeout_callback,
                        int ray_debugger_external,
-                       const std::function<int64_t()> get_time)
+                       std::function<absl::Time()> get_time)
     : worker_startup_token_counter_(0),
       io_service_(&io_service),
       node_id_(node_id),
@@ -105,15 +105,15 @@ WorkerPool::WorkerPool(instrumented_io_context &io_service,
               ? RayConfig::instance().worker_maximum_startup_concurrency()
               : maximum_startup_concurrency),
       gcs_client_(std::move(gcs_client)),
-      native_library_path_(native_library_path),
-      starting_worker_timeout_callback_(starting_worker_timeout_callback),
+      native_library_path_(std::move(native_library_path)),
+      starting_worker_timeout_callback_(std::move(starting_worker_timeout_callback)),
       ray_debugger_external(ray_debugger_external),
       first_job_registered_python_worker_count_(0),
       first_job_driver_wait_num_python_workers_(
           std::min(num_prestarted_python_workers, maximum_startup_concurrency_)),
       num_prestart_python_workers(num_prestarted_python_workers),
       periodical_runner_(io_service),
-      get_time_(get_time) {
+      get_time_(std::move(get_time)) {
   RAY_CHECK_GT(maximum_startup_concurrency_, 0);
   // We need to record so that the metric exists. This way, we report that 0
   // processes have started before a task runs on the node (as opposed to the
@@ -593,7 +593,8 @@ void WorkerPool::MonitorPopWorkerRequestForRegistration(
     auto &requests = state.pending_registration_requests;
     auto it = std::find(requests.begin(), requests.end(), pop_worker_request);
     if (it != requests.end()) {
-      // Fail the task...
+      // Pop and fail the task...
+      requests.erase(it);
       PopWorkerStatus status = PopWorkerStatus::WorkerPendingRegistration;
       PopWorkerCallbackAsync(pop_worker_request->callback, nullptr, status);
     }
@@ -1042,7 +1043,7 @@ void WorkerPool::PushWorker(const std::shared_ptr<WorkerInterface> &worker) {
 }
 
 void WorkerPool::TryKillingIdleWorkers() {
-  int64_t now = get_time_();
+  const auto now = get_time_();
 
   // Filter out all idle workers that are already dead and/or associated with
   // jobs that have already finished.
@@ -1055,14 +1056,14 @@ void WorkerPool::TryKillingIdleWorkers() {
     }
 
     const auto &job_id = idle_worker->GetAssignedJobId();
-    if (finished_jobs_.count(job_id) > 0) {
+    if (finished_jobs_.contains(job_id)) {
       // The job has finished, so we should kill the worker immediately.
       KillIdleWorker(idle_worker, it->second);
       it = idle_of_all_languages_.erase(it);
     } else {
-      if (it->second == -1 ||
-          now - it->second >
-              RayConfig::instance().idle_worker_killing_time_threshold_ms()) {
+      if (now - it->second >
+          absl::Milliseconds(
+              RayConfig::instance().idle_worker_killing_time_threshold_ms())) {
         // The job has not yet finished and the worker has been idle for longer
         // than the timeout.
         num_killable_idle_workers++;
@@ -1084,9 +1085,9 @@ void WorkerPool::TryKillingIdleWorkers() {
   auto it = idle_of_all_languages_.begin();
   while (num_killable_idle_workers > num_desired_idle_workers &&
          it != idle_of_all_languages_.end()) {
-    if (it->second == -1 ||
-        now - it->second >
-            RayConfig::instance().idle_worker_killing_time_threshold_ms()) {
+    if (now - it->second >
+        absl::Milliseconds(
+            RayConfig::instance().idle_worker_killing_time_threshold_ms())) {
       RAY_LOG(DEBUG) << "Number of idle workers " << num_killable_idle_workers
                      << " is larger than the number of desired workers "
                      << num_desired_idle_workers << " killing idle worker with PID "
@@ -1101,7 +1102,7 @@ void WorkerPool::TryKillingIdleWorkers() {
 }
 
 void WorkerPool::KillIdleWorker(std::shared_ptr<WorkerInterface> idle_worker,
-                                int64_t last_time_used_ms) {
+                                absl::Time last_time_used) {
   // To avoid object lost issue caused by forcibly killing, send an RPC request to the
   // worker to allow it to do cleanup before exiting. We kill it anyway if the driver
   // is already exited.
@@ -1121,8 +1122,8 @@ void WorkerPool::KillIdleWorker(std::shared_ptr<WorkerInterface> idle_worker,
   }
   rpc_client->Exit(
       request,
-      [this, idle_worker, last_time_used_ms](const ray::Status &status,
-                                             const rpc::ExitReply &r) {
+      [this, idle_worker, last_time_used](const ray::Status &status,
+                                          const rpc::ExitReply &r) {
         RAY_CHECK(pending_exit_idle_workers_.erase(idle_worker->WorkerId()));
         if (!status.ok()) {
           RAY_LOG(ERROR) << "Failed to send exit request: " << status.ToString();
@@ -1148,8 +1149,7 @@ void WorkerPool::KillIdleWorker(std::shared_ptr<WorkerInterface> idle_worker,
           // kill the worker (e.g., when the worker owns the object). Without this,
           // if the first N workers own objects, it can't kill idle workers that are
           // >= N+1.
-          idle_of_all_languages_.push_back(
-              std::make_pair(idle_worker, last_time_used_ms));
+          idle_of_all_languages_.emplace_back(idle_worker, last_time_used);
         }
       });
 }
@@ -1310,7 +1310,7 @@ void WorkerPool::PopWorker(const TaskSpecification &task_spec,
   auto worker_fits_for_task_fn =
       [this, &pop_worker_request, &skip_reason_count](
-          const std::pair<std::shared_ptr<WorkerInterface>, int64_t> &pair) -> bool {
+          const std::pair<std::shared_ptr<WorkerInterface>, absl::Time> &pair) -> bool {
         const auto &worker = pair.first;
         WorkerUnfitForTaskReason reason = WorkerFitsForTask(*worker, *pop_worker_request);
         if (reason == WorkerUnfitForTaskReason::NONE) {
@@ -1543,6 +1543,7 @@ void WorkerPool::WarnAboutSize() {
                     << "some discussion of workarounds).";
     std::string warning_message_str = warning_message.str();
     RAY_LOG(WARNING) << warning_message_str;
+
     auto error_data_ptr = gcs::CreateErrorTableData(
         "worker_pool_large", warning_message_str, get_time_());
     RAY_CHECK_OK(gcs_client_->Errors().AsyncReportJobError(error_data_ptr, nullptr));
diff --git a/src/ray/raylet/worker_pool.h b/src/ray/raylet/worker_pool.h
index ef2e1e048635..3d7f456f82cb 100644
--- a/src/ray/raylet/worker_pool.h
+++ b/src/ray/raylet/worker_pool.h
@@ -25,6 +25,7 @@
 #include <utility>
 #include <vector>
 
+#include "absl/time/time.h"
 #include "ray/common/asio/instrumented_io_context.h"
 #include "ray/common/asio/periodical_runner.h"
 #include "ray/common/client_connection.h"
@@ -200,7 +201,7 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface {
   /// it times out to start a worker.
   /// \param ray_debugger_external Ray debugger in workers will be started in a way
   /// that they are accessible from outside the node.
-  /// \param get_time A callback to get the current time.
+  /// \param get_time A callback to get the current time as an absl::Time.
   WorkerPool(instrumented_io_context &io_service,
              const NodeID node_id,
              const std::string node_address,
@@ -212,10 +213,10 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface {
              const std::vector<int> &worker_ports,
              std::shared_ptr<gcs::GcsClient> gcs_client,
              const WorkerCommandMap &worker_commands,
-             const std::string &native_library_path,
+             std::string native_library_path,
              std::function<void()> starting_worker_timeout_callback,
              int ray_debugger_external,
-             const std::function<int64_t()> get_time);
+             std::function<absl::Time()> get_time);
 
   /// Destructor responsible for freeing a set of workers owned by this class.
   virtual ~WorkerPool() override;
@@ -472,7 +473,7 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface {
   /// TODO(scv119): replace dynamic options by runtime_env.
   const std::vector<std::string> &LookupWorkerDynamicOptions(StartupToken token) const;
 
-  void KillIdleWorker(std::shared_ptr<WorkerInterface> worker, int64_t last_time_used_ms);
+  void KillIdleWorker(std::shared_ptr<WorkerInterface> worker, absl::Time last_time_used);
 
   /// Global startup token variable. Incremented once assigned
   /// to a worker process and is added to
@@ -587,7 +588,8 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface {
   /// The pool of idle non-actor workers of all languages. This is used to kill idle
   /// workers in FIFO order. The second element of std::pair is the time a worker becomes
   /// idle.
-  std::list<std::pair<std::shared_ptr<WorkerInterface>, int64_t>> idle_of_all_languages_;
+  std::list<std::pair<std::shared_ptr<WorkerInterface>, absl::Time>>
+      idle_of_all_languages_;
 
  private:
   /// A helper function that returns the reference of the pool state
@@ -792,7 +794,7 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface {
   PeriodicalRunner periodical_runner_;
   /// A callback to get the current time.
-  const std::function<int64_t()> get_time_;
+  const std::function<absl::Time()> get_time_;
   /// Runtime env manager client.
   std::shared_ptr<RuntimeEnvAgentClient> runtime_env_agent_client_;
   /// Stats
diff --git a/src/ray/raylet/worker_pool_test.cc b/src/ray/raylet/worker_pool_test.cc
index d945384b7277..022c5055522a 100644
--- a/src/ray/raylet/worker_pool_test.cc
+++ b/src/ray/raylet/worker_pool_test.cc
@@ -14,8 +14,10 @@
 
 #include "ray/raylet/worker_pool.h"
 
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "absl/time/time.h"
 #include "nlohmann/json.hpp"
 #include "ray/common/asio/asio_util.h"
 #include "ray/common/asio/instrumented_io_context.h"
@@ -26,9 +28,8 @@
 #include "src/ray/protobuf/runtime_env_agent.pb.h"
 
 using json = nlohmann::json;
-namespace ray {
 
-namespace raylet {
+namespace ray::raylet {
 
 int MAXIMUM_STARTUP_CONCURRENCY = 15;
 int PYTHON_PRESTART_WORKERS = 15;
@@ -140,7 +141,7 @@ class WorkerPoolMock : public WorkerPool {
                    "",
                    []() {},
                    0,
-                   [this]() { return current_time_ms_; }),
+                   [this]() { return absl::FromUnixMillis(current_time_ms_); }),
         last_worker_process_(),
         instrumented_io_service_(io_service),
         error_message_type_(1),
@@ -241,7 +242,7 @@ class WorkerPoolMock : public WorkerPool {
 
   size_t GetIdleWorkerSize() { return idle_of_all_languages_.size(); }
 
-  std::list<std::pair<std::shared_ptr<WorkerInterface>, int64_t>> &GetIdleWorkers() {
+  std::list<std::pair<std::shared_ptr<WorkerInterface>, absl::Time>> &GetIdleWorkers() {
     return idle_of_all_languages_;
   }
@@ -1928,6 +1929,19 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerStatus) {
   worker_pool_->ClearProcesses();
 }
 
+TEST_F(WorkerPoolDriverRegisteredTest, WorkerPendingRegistrationErasesRequest) {
+  std::shared_ptr<WorkerInterface> popped_worker;
+  PopWorkerStatus status;
+  auto task_spec = ExampleTaskSpec();
+  // Create a task without push worker. It should time out (WorkerPendingRegistration).
+  popped_worker = worker_pool_->PopWorkerSync(task_spec, false, &status);
+  ASSERT_EQ(popped_worker, nullptr);
+  ASSERT_EQ(status, PopWorkerStatus::WorkerPendingRegistration);
+  // The request should be erased.
+  ASSERT_EQ(worker_pool_->NumPendingRegistrationRequests(), 0);
+  worker_pool_->ClearProcesses();
+}
+
 TEST_F(WorkerPoolDriverRegisteredTest, TestIOWorkerFailureAndSpawn) {
   std::unordered_set<std::shared_ptr<WorkerInterface>> spill_worker_set;
   auto spill_worker_callback =
@@ -2138,9 +2152,7 @@ TEST_F(WorkerPoolTest, RegisterFirstJavaDriverCallbackImmediately) {
   ASSERT_TRUE(callback_called);
 }
 
-}  // namespace raylet
-
-}  // namespace ray
+}  // namespace ray::raylet
 
 int main(int argc, char **argv) {
   InitShutdownRAII ray_log_shutdown_raii(
diff --git a/src/ray/raylet_client/raylet_client.cc b/src/ray/raylet_client/raylet_client.cc
index 7911a3ce0a86..5eff4538f837 100644
--- a/src/ray/raylet_client/raylet_client.cc
+++ b/src/ray/raylet_client/raylet_client.cc
@@ -370,10 +370,10 @@ void raylet::RayletClient::ReportWorkerBacklog(
   request.set_worker_id(worker_id.Binary());
   request.mutable_backlog_reports()->Add(backlog_reports.begin(), backlog_reports.end());
   grpc_client_->ReportWorkerBacklog(
-      request, [](const Status &status, rpc::ReportWorkerBacklogReply &&reply) {
-        if (!status.ok()) {
-          RAY_LOG(INFO) << "Error reporting task backlog information: " << status;
-        }
+      request,
+      [](const Status &status, rpc::ReportWorkerBacklogReply &&reply /*unused*/) {
+        RAY_LOG_IF_ERROR(INFO, status)
+            << "Error reporting task backlog information: " << status;
       });
 }
@@ -389,12 +389,10 @@ Status raylet::RayletClient::ReturnWorker(
   request.set_disconnect_worker(disconnect_worker);
   request.set_disconnect_worker_error_detail(disconnect_worker_error_detail);
   request.set_worker_exiting(worker_exiting);
-  grpc_client_->ReturnWorker(request,
-                             [](const Status &status, rpc::ReturnWorkerReply &&reply) {
-                               if (!status.ok()) {
-                                 RAY_LOG(INFO) << "Error returning worker: " << status;
-                               }
-                             });
+  grpc_client_->ReturnWorker(
+      request, [](const Status &status, rpc::ReturnWorkerReply &&reply /*unused*/) {
+        RAY_LOG_IF_ERROR(INFO, status) << "Error returning worker: " << status;
+      });
   return Status::OK();
 }
@@ -405,9 +403,7 @@ void raylet::RayletClient::GetTaskFailureCause(
   request.set_task_id(task_id.Binary());
   grpc_client_->GetTaskFailureCause(
       request, [callback](const Status &status, rpc::GetTaskFailureCauseReply &&reply) {
-        if (!status.ok()) {
-          RAY_LOG(INFO) << "Error getting task result: " << status;
-        }
+        RAY_LOG_IF_ERROR(INFO, status) << "Error getting task result: " << status;
         callback(status, std::move(reply));
       });
 }
@@ -459,9 +455,7 @@ void raylet::RayletClient::PushMutableObject(
   // TODO: Add failure recovery, retries, and timeout.
   grpc_client_->PushMutableObject(
       request, [callback](const Status &status, rpc::PushMutableObjectReply &&reply) {
-        if (!status.ok()) {
-          RAY_LOG(ERROR) << "Error pushing mutable object: " << status;
-        }
+        RAY_LOG_IF_ERROR(ERROR, status) << "Error pushing mutable object: " << status;
         if (reply.done()) {
           // The callback is only executed once the receiver node receives all chunks
           // for the mutable object write.
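
A note on the worker-pool hunks above: replacing raw `int64_t` millisecond timestamps with `absl::Time` is why the old `it->second == -1` sentinel check disappears; once the "never used" timestamp is an extreme `absl::Time` value, the single duration comparison against `absl::Milliseconds(threshold)` subsumes it. A minimal sketch of that comparison, using assumed names (`kIdleKillThreshold`, `idle_since`) rather than Ray's actual config plumbing:

```cpp
// Sketch of the typed idle-timeout check; assumes abseil is linked.
#include <iostream>

#include "absl/time/time.h"
#include "absl/time/clock.h"

int main() {
  const absl::Duration kIdleKillThreshold = absl::Milliseconds(1000);
  const absl::Time now = absl::Now();

  // A worker idle since two seconds ago exceeds the one-second threshold.
  const absl::Time idle_since = now - absl::Seconds(2);
  if (now - idle_since > kIdleKillThreshold) {
    std::cout << "kill idle worker\n";
  }

  // An unset timestamp modeled as absl::InfinitePast() also trips the check,
  // which is why a separate `== -1` sentinel comparison is no longer needed.
  const absl::Time never_used = absl::InfinitePast();
  if (now - never_used > kIdleKillThreshold) {
    std::cout << "kill never-used worker\n";
  }
  return 0;
}
```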
diff --git a/src/ray/util/BUILD b/src/ray/util/BUILD
index 23d9f1e90150..87f8a57e8dea 100644
--- a/src/ray/util/BUILD
+++ b/src/ray/util/BUILD
@@ -55,3 +55,13 @@ cc_library(
     srcs = ["thread_checker.cc"],
     visibility = ["//visibility:public"],
 )
+
+cc_library(
+    name = "shared_lru",
+    hdrs = ["shared_lru.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":util",
+        "@com_google_absl//absl/container:flat_hash_map",
+    ],
+)
diff --git a/src/ray/util/logging.cc b/src/ray/util/logging.cc
index 12737eabed7e..7562b3e2ecce 100644
--- a/src/ray/util/logging.cc
+++ b/src/ray/util/logging.cc
@@ -27,14 +27,17 @@
 #endif
 
 #include <algorithm>
+#include <array>
 #include <cstdlib>
 #include <iostream>
 #include <memory>
 #include <sstream>
+#include <string_view>
 
 #include "absl/debugging/failure_signal_handler.h"
 #include "absl/debugging/stacktrace.h"
 #include "absl/debugging/symbolize.h"
+#include "absl/strings/numbers.h"
 #include "absl/strings/str_format.h"
 #include "nlohmann/json.hpp"
 #include "ray/util/event_label.h"
@@ -301,10 +304,8 @@ void RayLog::InitLogFormat() {
   log_format_json_ = false;
   log_format_pattern_ = kLogFormatTextPattern;
 
-  const char *var_value = std::getenv("RAY_BACKEND_LOG_JSON");
-  if (var_value != nullptr) {
-    std::string data = var_value;
-    if (data == "1") {
+  if (const char *var_value = std::getenv("RAY_BACKEND_LOG_JSON"); var_value != nullptr) {
+    if (std::string_view{var_value} == std::string_view{"1"}) {
       log_format_json_ = true;
       log_format_pattern_ = kLogFormatJsonPattern;
     }
@@ -321,7 +322,9 @@ void RayLog::StartRayLog(const std::string &app_name,
   log_dir_ = log_dir;
 
   // All the logging sinks to add.
-  std::vector<spdlog::sink_ptr> sinks;
+  // One for file/stdout, another for stderr.
+  std::array<spdlog::sink_ptr, 2> sinks;  // Intentionally no initialization.
+
   auto level = GetMappedSeverity(severity_threshold_);
   std::string app_name_without_path = app_name;
   if (app_name.empty()) {
@@ -343,17 +346,20 @@ void RayLog::StartRayLog(const std::string &app_name,
 #endif
   // Reset log pattern and level and we assume a log file can be rotated with
   // 10 files in max size 512M by default.
-  if (std::getenv("RAY_ROTATION_MAX_BYTES")) {
-    long max_size = std::atol(std::getenv("RAY_ROTATION_MAX_BYTES"));
-    // 0 means no log rotation in python, but not in spdlog. We just use the default
-    // value here.
-    if (max_size != 0) {
+  if (const char *ray_rotation_max_bytes = std::getenv("RAY_ROTATION_MAX_BYTES");
+      ray_rotation_max_bytes != nullptr) {
+    long max_size = 0;
+    if (absl::SimpleAtoi(ray_rotation_max_bytes, &max_size) && max_size > 0) {
+      // 0 means no log rotation in python, but not in spdlog. We just use the default
+      // value here.
       log_rotation_max_size_ = max_size;
     }
   }
+
-  if (std::getenv("RAY_ROTATION_BACKUP_COUNT")) {
-    long file_num = std::atol(std::getenv("RAY_ROTATION_BACKUP_COUNT"));
-    if (file_num != 0) {
+  if (const char *ray_rotation_backup_count = std::getenv("RAY_ROTATION_BACKUP_COUNT");
+      ray_rotation_backup_count != nullptr) {
+    long file_num = 0;
+    if (absl::SimpleAtoi(ray_rotation_backup_count, &file_num) && file_num > 0) {
      log_rotation_file_num_ = file_num;
    }
  }
@@ -370,23 +376,24 @@ void RayLog::StartRayLog(const std::string &app_name,
                                                              log_rotation_max_size_,
                                                              log_rotation_file_num_);
     file_sink->set_level(level);
-    sinks.push_back(file_sink);
+    sinks[0] = std::move(file_sink);
   } else {
     component_name_ = app_name_without_path;
     auto console_sink = std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
     console_sink->set_level(level);
-    sinks.push_back(console_sink);
+    sinks[0] = std::move(console_sink);
   }
 
   // In all cases, log errors to the console log so they are in driver logs.
   // https://github.com/ray-project/ray/issues/12893
   auto err_sink = std::make_shared<spdlog::sinks::stderr_color_sink_mt>();
   err_sink->set_level(spdlog::level::err);
-  sinks.push_back(err_sink);
+  sinks[1] = std::move(err_sink);
 
   // Set the combined logger.
-  auto logger = std::make_shared<spdlog::logger>(
-      RayLog::GetLoggerName(), sinks.begin(), sinks.end());
+  auto logger = std::make_shared<spdlog::logger>(RayLog::GetLoggerName(),
+                                                 std::make_move_iterator(sinks.begin()),
+                                                 std::make_move_iterator(sinks.end()));
   logger->set_level(level);
   // Set the pattern of all sinks.
   logger->set_pattern(log_format_pattern_);
diff --git a/src/ray/util/logging.h b/src/ray/util/logging.h
index bea9c0b5de44..b5e06f4b083a 100644
--- a/src/ray/util/logging.h
+++ b/src/ray/util/logging.h
@@ -129,6 +129,11 @@ enum class RayLogLevel {
   if (ray::RayLog::IsLevelEnabled(ray::RayLogLevel::level)) \
   RAY_LOG_INTERNAL(ray::RayLogLevel::level)
 
+// `cond` is a `Status` class, could be `ray::Status`, or from third-party like
+// `grpc::Status`.
+#define RAY_LOG_IF_ERROR(level, cond) \
+  if (RAY_PREDICT_FALSE(!(cond).ok())) RAY_LOG(level)
+
 #define RAY_IGNORE_EXPR(expr) ((void)(expr))
 
 #define RAY_CHECK(condition) \
diff --git a/src/ray/util/shared_lru.h b/src/ray/util/shared_lru.h
new file mode 100644
index 000000000000..8132e38b6f12
--- /dev/null
+++ b/src/ray/util/shared_lru.h
@@ -0,0 +1,207 @@
+// Copyright 2024 The Ray Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SharedLruCache is an LRU cache, with all entries shared, which means a single entry
+// could be accessed by multiple getters. All values are wrapped in shared pointers to
+// avoid copies at get operations, which is also useful to maintain memory validity at
+// any time.
+//
+// Example usage:
+// SharedLruCache<std::string, std::string> cache{cap};
+// // Put a key-value pair into cache.
+// cache.Put("key", std::make_shared<std::string>("val"));
+//
+// // Get a key-value pair from cache.
+// auto val = cache.Get("key");
+// // Check and consume `val`.
+//
+// TODO(hjiang):
+// 1. Add template arguments for key hash and key equal, to pass into absl::flat_hash_map.
+// 2. Provide a key hash wrapper to save a copy.
+// 3. flat hash map supports heterogeneous lookup, expose `KeyLike` templated interface.
+// 4. Add a `GetOrCreate` interface, which takes a factory function to create the value.
+// 5. For thread-safe cache, add a sharded container wrapper to reduce lock contention.
+
+#pragma once
+
+#include <cstddef>
+#include <functional>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <type_traits>
+#include <utility>
+
+#include "absl/container/flat_hash_map.h"
+#include "src/ray/util/logging.h"
+
+namespace ray::utils::container {
+
+template <typename Key, typename Val>
+class SharedLruCache final {
+ public:
+  using key_type = Key;
+  using mapped_type = Val;
+
+  // A `max_entries` of 0 means that there is no limit on the number of entries
+  // in the cache.
+  explicit SharedLruCache(size_t max_entries) : max_entries_(max_entries) {}
+
+  SharedLruCache(const SharedLruCache &) = delete;
+  SharedLruCache &operator=(const SharedLruCache &) = delete;
+
+  ~SharedLruCache() = default;
+
+  // Insert `value` with key `key`. This will replace any previous entry with
+  // the same key.
+  void Put(Key key, std::shared_ptr<Val> value) {
+    RAY_CHECK(value != nullptr);
+    auto iter = cache_.find(key);
+    if (iter != cache_.end()) {
+      lru_list_.splice(lru_list_.begin(), lru_list_, iter->second.lru_iterator);
+      iter->second.value = std::move(value);
+      return;
+    }
+
+    lru_list_.emplace_front(key);
+    Entry new_entry{std::move(value), lru_list_.begin()};
+    cache_[std::move(key)] = std::move(new_entry);
+
+    if (max_entries_ > 0 && lru_list_.size() > max_entries_) {
+      const auto &stale_key = lru_list_.back();
+      cache_.erase(stale_key);
+      lru_list_.pop_back();
+    }
+
+    RAY_CHECK_EQ(lru_list_.size(), cache_.size());
+  }
+
+  // Delete the entry with key `key`. Return true if the entry was found for
+  // `key`, false if the entry was not found. In both cases, no entry with
+  // key `key` exists after the call.
+  bool Delete(const Key &key) {
+    auto it = cache_.find(key);
+    if (it == cache_.end()) {
+      return false;
+    }
+    lru_list_.erase(it->second.lru_iterator);
+    cache_.erase(it);
+    return true;
+  }
+
+  // Look up the entry with key `key`. Return nullptr if key doesn't exist.
+  std::shared_ptr<Val> Get(const Key &key) {
+    const auto cache_iter = cache_.find(key);
+    if (cache_iter == cache_.end()) {
+      return nullptr;
+    }
+    lru_list_.splice(lru_list_.begin(), lru_list_, cache_iter->second.lru_iterator);
+    return cache_iter->second.value;
+  }
+
+  // Clear the cache.
+  void Clear() {
+    cache_.clear();
+    lru_list_.clear();
+  }
+
+  // Accessors for cache parameters.
+  size_t max_entries() const { return max_entries_; }
+
+ private:
+  struct Entry {
+    // The entry's value.
+    std::shared_ptr<Val> value;
+
+    // A list iterator pointing to the entry's position in the LRU list.
+    typename std::list<Key>::iterator lru_iterator;
+  };
+
+  using EntryMap = absl::flat_hash_map<Key, Entry>;
+
+  // The maximum number of entries in the cache. A value of 0 means there is no
+  // limit on entry count.
+  const size_t max_entries_;
+
+  // Stores key-value pairs.
+  EntryMap cache_;
+
+  // The LRU list of entries. The front of the list identifies the most
+  // recently accessed entry.
+  std::list<Key> lru_list_;
+};
+
+// Same interfaces as `SharedLruCache`, but all cached values are
+// `const`-specified to avoid concurrent updates.
+template <typename K, typename V>
+using SharedLruConstCache = SharedLruCache<K, const V>;
+
+// Same interface and functionality as `SharedLruCache`, but thread-safe version.
+template <typename Key, typename Val>
+class ThreadSafeSharedLruCache final {
+ public:
+  using key_type = Key;
+  using mapped_type = Val;
+
+  // A `max_entries` of 0 means that there is no limit on the number of entries
+  // in the cache.
+  explicit ThreadSafeSharedLruCache(size_t max_entries) : cache_(max_entries) {}
+
+  ThreadSafeSharedLruCache(const ThreadSafeSharedLruCache &) = delete;
+  ThreadSafeSharedLruCache &operator=(const ThreadSafeSharedLruCache &) = delete;
+
+  ~ThreadSafeSharedLruCache() = default;
+
+  // Insert `value` with key `key`. This will replace any previous entry with
+  // the same key.
+  void Put(Key key, std::shared_ptr<Val> value) {
+    std::lock_guard<std::mutex> lck(mu_);
+    cache_.Put(std::move(key), std::move(value));
+  }
+
+  // Delete the entry with key `key`. Return true if the entry was found for
+  // `key`, false if the entry was not found. In both cases, no entry with
+  // key `key` exists after the call.
+  bool Delete(const Key &key) {
+    std::lock_guard<std::mutex> lck(mu_);
+    return cache_.Delete(key);
+  }
+
+  // Look up the entry with key `key`. Return nullptr if key doesn't exist.
+  // If found, return a copy of the shared pointer to the value.
+  std::shared_ptr<Val> Get(const Key &key) {
+    std::lock_guard<std::mutex> lck(mu_);
+    return cache_.Get(key);
+  }
+
+  // Clear the cache.
+  void Clear() {
+    std::lock_guard<std::mutex> lck(mu_);
+    cache_.Clear();
+  }
+
+  // Accessors for cache parameters.
+  size_t max_entries() const { return cache_.max_entries(); }
+
+ private:
+  std::mutex mu_;
+  SharedLruCache<Key, Val> cache_;
+};
+
+// Same interfaces as `SharedLruCache`, but all cached values are
+// `const`-specified to avoid concurrent updates.
+template <typename K, typename V>
+using ThreadSafeSharedLruConstCache = ThreadSafeSharedLruCache<K, const V>;
+
+}  // namespace ray::utils::container
diff --git a/src/ray/util/tests/BUILD b/src/ray/util/tests/BUILD
index 2941d105cf91..b85c01f28ebf 100644
--- a/src/ray/util/tests/BUILD
+++ b/src/ray/util/tests/BUILD
@@ -194,3 +194,15 @@ cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
+
+cc_test(
+    name = "shared_lru_test",
+    srcs = ["shared_lru_test.cc"],
+    deps = [
+        "//src/ray/util:shared_lru",
+        "@com_google_googletest//:gtest_main",
+    ],
+    size = "small",
+    copts = COPTS,
+    tags = ["team:core"],
+)
diff --git a/src/ray/util/tests/shared_lru_test.cc b/src/ray/util/tests/shared_lru_test.cc
new file mode 100644
index 000000000000..7c47f4d1daf0
--- /dev/null
+++ b/src/ray/util/tests/shared_lru_test.cc
@@ -0,0 +1,76 @@
+// Copyright 2024 The Ray Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/ray/util/shared_lru.h"
+
+#include <gtest/gtest.h>
+
+#include <string>
+#include <type_traits>
+
+namespace ray::utils::container {
+
+namespace {
+constexpr size_t kTestCacheSz = 1;
+}  // namespace
+
+TEST(SharedLruCache, PutAndGet) {
+  ThreadSafeSharedLruCache<std::string, std::string> cache{kTestCacheSz};
+
+  // No value initially.
+  auto val = cache.Get("1");
+  EXPECT_EQ(val, nullptr);
+
+  // Check put and get.
+  cache.Put("1", std::make_shared<std::string>("1"));
+  val = cache.Get("1");
+  EXPECT_NE(val, nullptr);
+  EXPECT_EQ(*val, "1");
+
+  // Check key eviction.
+  cache.Put("2", std::make_shared<std::string>("2"));
+  val = cache.Get("1");
+  EXPECT_EQ(val, nullptr);
+  val = cache.Get("2");
+  EXPECT_NE(val, nullptr);
+  EXPECT_EQ(*val, "2");
+
+  // Check deletion.
+  EXPECT_FALSE(cache.Delete("1"));
+  EXPECT_TRUE(cache.Delete("2"));
+  val = cache.Get("2");
+  EXPECT_EQ(val, nullptr);
+}
+
+// Testing scenario: put the same key into the cache multiple times.
+TEST(SharedLruCache, SameKeyTest) {
+  ThreadSafeSharedLruCache<int, int> cache{2};
+
+  cache.Put(1, std::make_shared<int>(1));
+  auto val = cache.Get(1);
+  EXPECT_NE(val, nullptr);
+  EXPECT_EQ(1, *val);
+
+  cache.Put(1, std::make_shared<int>(2));
+  val = cache.Get(1);
+  EXPECT_NE(val, nullptr);
+  EXPECT_EQ(2, *val);
+}
+
+TEST(SharedLruConstCache, TypeAliasAssertion) {
+  static_assert(
+      std::is_same_v<SharedLruConstCache<int, int>, SharedLruCache<int, const int>>);
+}
+
+}  // namespace ray::utils::container
diff --git a/src/ray/util/thread_checker.cc b/src/ray/util/thread_checker.cc
index 73a0072c7575..0f33dfd4c712 100644
--- a/src/ray/util/thread_checker.cc
+++ b/src/ray/util/thread_checker.cc
@@ -16,7 +16,7 @@
 
 namespace ray {
 
-bool ThreadChecker::IsOnSameThread() {
+bool ThreadChecker::IsOnSameThread() const {
   const auto cur_id = std::this_thread::get_id();
   std::thread::id uninitialized_id;
   return thread_id_.compare_exchange_strong(uninitialized_id, cur_id) ||
diff --git a/src/ray/util/thread_checker.h b/src/ray/util/thread_checker.h
index 622624859b75..2e3dcf1ed3df 100644
--- a/src/ray/util/thread_checker.h
+++ b/src/ray/util/thread_checker.h
@@ -34,10 +34,10 @@ class ThreadChecker {
  public:
   // Return true at initialization, or current invocation happens on the same thread as
   // initialization.
-  bool IsOnSameThread();
+  bool IsOnSameThread() const;
 
  private:
-  std::atomic<std::thread::id> thread_id_{};
+  mutable std::atomic<std::thread::id> thread_id_{};
 };
 
 }  // namespace ray
diff --git a/src/ray/util/util.cc b/src/ray/util/util.cc
index 478fec48d157..35b2d7c51144 100644
--- a/src/ray/util/util.cc
+++ b/src/ray/util/util.cc
@@ -35,11 +35,12 @@
 #include "ray/util/filesystem.h"
 #include "ray/util/logging.h"
 
+namespace {
 /// Uses sscanf() to read a token matching from the string, advancing the iterator.
 /// \param c_str A string iterator that is dereferenceable. (i.e.: c_str < string::end())
 /// \param format The pattern. It must not produce any output. (e.g., use %*d, not %d.)
 /// \return The scanned prefix of the string, if any.
-static std::string ScanToken(std::string::const_iterator &c_str, std::string format) {
+std::string ScanToken(std::string::const_iterator &c_str, std::string format) {
   int i = 0;
   std::string result;
   format += "%n";
@@ -49,6 +50,7 @@ static std::string ScanToken(std::string::const_iterator &c_str, std::string for
   }
   return result;
 }
+}  // namespace
 
 std::string EndpointToUrl(
     const boost::asio::generic::basic_endpoint<boost::asio::generic::stream_protocol> &ep,
@@ -58,7 +60,7 @@ std::string EndpointToUrl(
   case AF_INET: {
     scheme = "tcp://";
     boost::asio::ip::tcp::endpoint e(boost::asio::ip::tcp::v4(), 0);
-    RAY_CHECK(e.size() == ep.size());
+    RAY_CHECK_EQ(e.size(), ep.size());
     const sockaddr *src = ep.data();
     sockaddr *dst = e.data();
     *reinterpret_cast<sockaddr_in *>(dst) = *reinterpret_cast<const sockaddr_in *>(src);
@@ -70,7 +72,7 @@ std::string EndpointToUrl(
   case AF_INET6: {
     scheme = "tcp://";
     boost::asio::ip::tcp::endpoint e(boost::asio::ip::tcp::v6(), 0);
-    RAY_CHECK(e.size() == ep.size());
+    RAY_CHECK_EQ(e.size(), ep.size());
     const sockaddr *src = ep.data();
     sockaddr *dst = e.data();
     *reinterpret_cast<sockaddr_in6 *>(dst) = *reinterpret_cast<const sockaddr_in6 *>(src);
diff --git a/src/ray/util/util.h b/src/ray/util/util.h
index e7f7a1d96781..05ce88c7ee47 100644
--- a/src/ray/util/util.h
+++ b/src/ray/util/util.h
@@ -62,22 +62,13 @@
 #endif
 
 // Boost forward-declarations (to avoid forcing slow header inclusions)
-namespace boost {
-
-namespace asio {
-
-namespace generic {
+namespace boost::asio::generic {
 
 template <class Protocol>
 class basic_endpoint;
-
 class stream_protocol;
 
-}  // namespace generic
-
-}  // namespace asio
-
-}  // namespace boost
+}  // namespace boost::asio::generic
 
 enum class CommandLineSyntax { System, POSIX, Windows };
@@ -302,12 +293,19 @@ inline void unsetEnv(const std::string &name) {
   RAY_CHECK_EQ(ret, 0) << "Failed to unset env var " << name;
 }
 
+// Set [thread_name] for the current thread; if it fails, an error is logged.
+// NOTICE: It only works on macOS and Linux.
 inline void SetThreadName(const std::string &thread_name) {
+  int ret = 0;
 #if defined(__APPLE__)
-  pthread_setname_np(thread_name.c_str());
+  ret = pthread_setname_np(thread_name.c_str());
 #elif defined(__linux__)
-  pthread_setname_np(pthread_self(), thread_name.substr(0, 15).c_str());
+  ret = pthread_setname_np(pthread_self(), thread_name.substr(0, 15).c_str());
 #endif
+  // pthread functions return the error code directly (they do not set errno).
+  if (ret != 0) {
+    RAY_LOG(ERROR) << "Failed to set thread name to " << thread_name << " since "
+                   << strerror(ret);
+  }
 }
 
 inline std::string GetThreadName() {